From 3b8b760232a9672406fab03c25251261ae0704d2 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Tue, 27 Jul 2021 12:46:32 -0400
Subject: [PATCH 1/5] Update .clang-format

---
 cpp/.clang-format | 97 +++++++++++++++++++++++++----------------------
 1 file changed, 52 insertions(+), 45 deletions(-)

diff --git a/cpp/.clang-format b/cpp/.clang-format
index 779ca0033a..0c05436e92 100644
--- a/cpp/.clang-format
+++ b/cpp/.clang-format
@@ -1,72 +1,78 @@
 ---
 # Refer to the following link for the explanation of each params:
-#   http://releases.llvm.org/8.0.1/tools/clang/docs/ClangFormatStyleOptions.html
-Language:        Cpp
-# BasedOnStyle:  Google
+#   https://releases.llvm.org/11.0.0/tools/clang/docs/ClangFormatStyleOptions.html
+Language: Cpp
+# BasedOnStyle: Google
 AccessModifierOffset: -1
 AlignAfterOpenBracket: Align
-AlignConsecutiveAssignments: false
+AlignConsecutiveAssignments: true
+AlignConsecutiveBitFields: true
 AlignConsecutiveDeclarations: false
+AlignConsecutiveMacros: true
 AlignEscapedNewlines: Left
-AlignOperands:   true
+AlignOperands: true
 AlignTrailingComments: true
+AllowAllArgumentsOnNextLine: true
+AllowAllConstructorInitializersOnNextLine: true
 AllowAllParametersOfDeclarationOnNextLine: true
-AllowShortBlocksOnASingleLine: false
-AllowShortCaseLabelsOnASingleLine: false
+AllowShortBlocksOnASingleLine: true
+AllowShortCaseLabelsOnASingleLine: true
+AllowShortEnumsOnASingleLine: true
 AllowShortFunctionsOnASingleLine: All
 AllowShortIfStatementsOnASingleLine: true
-AllowShortLoopsOnASingleLine: true
+AllowShortLambdasOnASingleLine: true
+AllowShortLoopsOnASingleLine: false
 # This is deprecated
 AlwaysBreakAfterDefinitionReturnType: None
 AlwaysBreakAfterReturnType: None
 AlwaysBreakBeforeMultilineStrings: true
 AlwaysBreakTemplateDeclarations: Yes
-BinPackArguments: true
-BinPackParameters: true
+BinPackArguments:  false
+BinPackParameters: false
 BraceWrapping:
-  AfterClass:      false
+  AfterClass:            false
   AfterControlStatement: false
-  AfterEnum:       false
-  AfterFunction:   false
-  AfterNamespace:  false
-  AfterObjCDeclaration: false
-  AfterStruct:     false
-  AfterUnion:      false
-  AfterExternBlock: false
-  BeforeCatch:     false
-  BeforeElse:      false
-  IndentBraces:    false
+  AfterEnum:             false
+  AfterFunction:         false
+  AfterNamespace:        false
+  AfterObjCDeclaration:  false
+  AfterStruct:           false
+  AfterUnion:            false
+  AfterExternBlock:      false
+  BeforeCatch:           false
+  BeforeElse:            false
+  IndentBraces:          false
   # disabling the below splits, else, they'll just add to the vertical length of source files!
   SplitEmptyFunction: false
   SplitEmptyRecord: false
   SplitEmptyNamespace: false
+BreakAfterJavaFieldAnnotations: false
 BreakBeforeBinaryOperators: None
-BreakBeforeBraces: Attach
+BreakBeforeBraces: WebKit
 BreakBeforeInheritanceComma: false
-BreakInheritanceList: BeforeColon
 BreakBeforeTernaryOperators: true
 BreakConstructorInitializersBeforeComma: false
 BreakConstructorInitializers: BeforeColon
-BreakAfterJavaFieldAnnotations: false
+BreakInheritanceList: BeforeColon
 BreakStringLiterals: true
-ColumnLimit:     80
-CommentPragmas:  '^ IWYU pragma:'
+ColumnLimit: 100
+CommentPragmas: '^ IWYU pragma:'
 CompactNamespaces: false
 ConstructorInitializerAllOnOneLineOrOnePerLine: true
 # Kept the below 2 to be the same as `IndentWidth` to keep everything uniform
 ConstructorInitializerIndentWidth: 2
 ContinuationIndentWidth: 2
 Cpp11BracedListStyle: true
-DerivePointerAlignment: true
-DisableFormat:   false
+DerivePointerAlignment: false
+DisableFormat: false
 ExperimentalAutoDetectBinPacking: false
 FixNamespaceComments: true
-ForEachMacros:   
+ForEachMacros:
   - foreach
   - Q_FOREACH
   - BOOST_FOREACH
-IncludeBlocks:   Preserve
-IncludeCategories: 
+IncludeBlocks: Preserve
+IncludeCategories:
   - Regex:           '^<ext/.*\.h>'
     Priority:        2
   - Regex:           '^<.*\.h>'
@@ -100,9 +106,9 @@ PenaltyBreakTemplateDeclaration: 10
 PenaltyExcessCharacter: 1000000
 PenaltyReturnTypeOnItsOwnLine: 200
 PointerAlignment: Left
-RawStringFormats: 
-  - Language:        Cpp
-    Delimiters:      
+RawStringFormats:
+  - Language: Cpp
+    Delimiters:
       - cc
       - CC
       - cpp
@@ -111,7 +117,7 @@ RawStringFormats:
       - 'c++'
       - 'C++'
     CanonicalDelimiter: ''
-  - Language:        TextProto
+  - Language: TextProto
     Delimiters:
       - pb
       - PB
@@ -126,10 +132,10 @@ RawStringFormats:
       - ParseTextOrDie
       - ParseTextProtoOrDie
     CanonicalDelimiter: ''
-    BasedOnStyle:    google
+    BasedOnStyle: google
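-# Enabling comment reflow causes doxygen comments to be messed up in their formats!
+# Comment reflow is enabled: over-long comment lines (including doxygen blocks) are re-wrapped to fit `ColumnLimit`.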
-ReflowComments:  false
-SortIncludes:    true
+ReflowComments: true
+SortIncludes: true
 SortUsingDeclarations: true
 SpaceAfterCStyleCast: false
 SpaceAfterTemplateKeyword: true
@@ -139,19 +145,20 @@ SpaceBeforeCtorInitializerColon: true
 SpaceBeforeInheritanceColon: true
 SpaceBeforeParens: ControlStatements
 SpaceBeforeRangeBasedForLoopColon: true
+SpaceBeforeSquareBrackets: false
+SpaceInEmptyBlock: false
 SpaceInEmptyParentheses: false
 SpacesBeforeTrailingComments: 2
-SpacesInAngles:  false
+SpacesInAngles: false
+SpacesInConditionalStatement: false
 SpacesInContainerLiterals: true
 SpacesInCStyleCastParentheses: false
 SpacesInParentheses: false
 SpacesInSquareBrackets: false
-# We are C++14, but clang-format puts this under `Cpp11` itself
-Standard:        Cpp11
-StatementMacros: 
+Standard: c++17
+StatementMacros:
   - Q_UNUSED
   - QT_REQUIRE_VERSION
 # Be consistent with indent-width, even for people who use tab for indentation!
-TabWidth:        2
-UseTab:          Never
-...
+TabWidth: 2
+UseTab: Never

From cc03dbac0da3a25b51404fec2526c43812982be7 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Tue, 27 Jul 2021 12:47:02 -0400
Subject: [PATCH 2/5] Formatting changes

---
 cpp/include/raft.hpp                          |    3 +-
 cpp/include/raft/cache/cache_util.cuh         |  104 +-
 cpp/include/raft/common/cub_wrappers.cuh      |   42 +-
 .../raft/common/device_loads_stores.cuh       |   87 +-
 cpp/include/raft/common/scatter.cuh           |   77 +-
 cpp/include/raft/comms/comms.hpp              |  342 ++--
 cpp/include/raft/comms/helper.hpp             |   37 +-
 cpp/include/raft/comms/mpi_comms.hpp          |  300 ++--
 cpp/include/raft/comms/std_comms.hpp          |  328 ++--
 cpp/include/raft/comms/test.hpp               |  236 ++-
 cpp/include/raft/comms/ucp_helper.hpp         |  138 +-
 cpp/include/raft/comms/util.hpp               |  114 +-
 cpp/include/raft/cuda_utils.cuh               |  259 ++-
 cpp/include/raft/cudart_utils.h               |  190 +-
 cpp/include/raft/device_atomics.cuh           |  265 ++-
 cpp/include/raft/distance/canberra.cuh        |  136 +-
 cpp/include/raft/distance/chebyshev.cuh       |  136 +-
 cpp/include/raft/distance/cosine.cuh          |  175 +-
 cpp/include/raft/distance/distance.cuh        |  520 ++++--
 cpp/include/raft/distance/euclidean.cuh       |  314 ++--
 cpp/include/raft/distance/fused_l2_nn.cuh     |  254 ++-
 cpp/include/raft/distance/hellinger.cuh       |  154 +-
 cpp/include/raft/distance/l1.cuh              |  128 +-
 cpp/include/raft/distance/minkowski.cuh       |  139 +-
 .../raft/distance/pairwise_distance_base.cuh  |  159 +-
 cpp/include/raft/error.hpp                    |   50 +-
 cpp/include/raft/handle.hpp                   |  121 +-
 cpp/include/raft/integer_utils.h              |   55 +-
 cpp/include/raft/label/classlabels.cuh        |  137 +-
 cpp/include/raft/label/merge_labels.cuh       |   31 +-
 cpp/include/raft/lap/d_structs.h              |   20 +-
 cpp/include/raft/lap/lap.cuh                  |  161 +-
 cpp/include/raft/lap/lap_functions.cuh        |  399 +++--
 cpp/include/raft/lap/lap_kernels.cuh          |  343 ++--
 cpp/include/raft/linalg/add.cuh               |   35 +-
 cpp/include/raft/linalg/binary_op.cuh         |   61 +-
 .../raft/linalg/cholesky_r1_update.cuh        |   63 +-
 .../raft/linalg/coalesced_reduction.cuh       |   55 +-
 cpp/include/raft/linalg/contractions.cuh      |   76 +-
 cpp/include/raft/linalg/cublas_wrappers.h     |  921 +++++++---
 cpp/include/raft/linalg/cusolver_wrappers.h   | 1144 +++++++++---
 cpp/include/raft/linalg/divide.cuh            |    7 +-
 cpp/include/raft/linalg/eig.cuh               |  169 +-
 cpp/include/raft/linalg/eltwise.cuh           |   56 +-
 cpp/include/raft/linalg/gemm.cuh              |   85 +-
 cpp/include/raft/linalg/gemv.h                |   54 +-
 cpp/include/raft/linalg/init.h                |    6 +-
 cpp/include/raft/linalg/lanczos.hpp           |  786 +++++---
 cpp/include/raft/linalg/map.cuh               |   31 +-
 cpp/include/raft/linalg/map_then_reduce.cuh   |   92 +-
 cpp/include/raft/linalg/matrix_vector_op.cuh  |  102 +-
 .../raft/linalg/mean_squared_error.cuh        |   10 +-
 cpp/include/raft/linalg/multiply.cuh          |    7 +-
 cpp/include/raft/linalg/norm.cuh              |   92 +-
 cpp/include/raft/linalg/qr.cuh                |   87 +-
 cpp/include/raft/linalg/reduce.cuh            |   37 +-
 cpp/include/raft/linalg/strided_reduction.cuh |   74 +-
 cpp/include/raft/linalg/subtract.cuh          |   34 +-
 cpp/include/raft/linalg/svd.cuh               |  238 ++-
 cpp/include/raft/linalg/transpose.h           |   61 +-
 cpp/include/raft/linalg/unary_op.cuh          |   86 +-
 cpp/include/raft/matrix/math.cuh              |  286 ++-
 cpp/include/raft/matrix/matrix.cuh            |  208 ++-
 cpp/include/raft/mr/buffer_base.hpp           |   59 +-
 cpp/include/raft/mr/device/allocator.hpp      |    9 +-
 cpp/include/raft/mr/device/buffer.hpp         |   14 +-
 cpp/include/raft/mr/host/allocator.hpp        |   13 +-
 cpp/include/raft/mr/host/buffer.hpp           |   21 +-
 cpp/include/raft/random/rng.cuh               |  319 ++--
 cpp/include/raft/random/rng_impl.cuh          |   89 +-
 cpp/include/raft/sparse/convert/coo.cuh       |   20 +-
 cpp/include/raft/sparse/convert/csr.cuh       |  126 +-
 cpp/include/raft/sparse/convert/dense.cuh     |   35 +-
 cpp/include/raft/sparse/coo.cuh               |  192 +-
 cpp/include/raft/sparse/csr.cuh               |  131 +-
 cpp/include/raft/sparse/cusparse_wrappers.h   | 1590 ++++++++++++-----
 .../raft/sparse/distance/bin_distance.cuh     |  189 +-
 cpp/include/raft/sparse/distance/common.h     |   18 +-
 cpp/include/raft/sparse/distance/coo_spmv.cuh |  118 +-
 .../coo_spmv_strategies/base_strategy.cuh     |  138 +-
 .../coo_mask_row_iterators.cuh                |  166 +-
 .../dense_smem_strategy.cuh                   |  104 +-
 .../coo_spmv_strategies/hash_strategy.cuh     |  277 +--
 .../distance/detail/coo_spmv_kernel.cuh       |  196 +-
 cpp/include/raft/sparse/distance/distance.cuh |   48 +-
 .../raft/sparse/distance/ip_distance.cuh      |   27 +-
 .../raft/sparse/distance/l2_distance.cuh      |  386 ++--
 .../raft/sparse/distance/lp_distance.cuh      |  199 ++-
 .../raft/sparse/distance/operators.cuh        |   29 +-
 cpp/include/raft/sparse/distance/utils.cuh    |    6 +-
 cpp/include/raft/sparse/hierarchy/common.h    |   10 +-
 .../sparse/hierarchy/detail/agglomerative.cuh |  124 +-
 .../hierarchy/detail/connectivities.cuh       |   92 +-
 .../raft/sparse/hierarchy/detail/mst.cuh      |   93 +-
 .../raft/sparse/hierarchy/single_linkage.hpp  |   66 +-
 cpp/include/raft/sparse/linalg/add.cuh        |  116 +-
 cpp/include/raft/sparse/linalg/degree.cuh     |   56 +-
 cpp/include/raft/sparse/linalg/norm.cuh       |   51 +-
 cpp/include/raft/sparse/linalg/spectral.cuh   |   72 +-
 cpp/include/raft/sparse/linalg/symmetrize.cuh |  157 +-
 cpp/include/raft/sparse/linalg/transpose.h    |   56 +-
 .../raft/sparse/mst/detail/mst_kernels.cuh    |  160 +-
 .../raft/sparse/mst/detail/mst_solver_inl.cuh |  258 +--
 cpp/include/raft/sparse/mst/detail/utils.cuh  |   19 +-
 cpp/include/raft/sparse/mst/mst.cuh           |   34 +-
 cpp/include/raft/sparse/mst/mst_solver.cuh    |   48 +-
 cpp/include/raft/sparse/op/filter.cuh         |  115 +-
 cpp/include/raft/sparse/op/reduce.cuh         |   55 +-
 cpp/include/raft/sparse/op/row_op.cuh         |   16 +-
 cpp/include/raft/sparse/op/slice.h            |   34 +-
 cpp/include/raft/sparse/op/sort.h             |   35 +-
 .../sparse/selection/connect_components.cuh   |  224 ++-
 cpp/include/raft/sparse/selection/knn.cuh     |  444 +++--
 .../raft/sparse/selection/knn_graph.cuh       |   54 +-
 .../raft/sparse/selection/selection.cuh       |   99 +-
 cpp/include/raft/sparse/utils.h               |   22 +-
 cpp/include/raft/spatial/knn/ann.hpp          |   31 +-
 cpp/include/raft/spatial/knn/ann_common.h     |   10 +-
 .../knn/detail/ann_quantized_faiss.cuh        |  141 +-
 .../raft/spatial/knn/detail/common_faiss.h    |   37 +-
 .../spatial/knn/detail/haversine_distance.cuh |   56 +-
 .../knn/detail/knn_brute_force_faiss.cuh      |  178 +-
 .../raft/spatial/knn/detail/processing.hpp    |  134 +-
 cpp/include/raft/spatial/knn/knn.hpp          |   64 +-
 cpp/include/raft/spectral/cluster_solvers.hpp |   39 +-
 cpp/include/raft/spectral/eigen_solvers.hpp   |   66 +-
 cpp/include/raft/spectral/kmeans.hpp          |  476 +++--
 cpp/include/raft/spectral/lapack.hpp          |  552 ++++--
 cpp/include/raft/spectral/matrix_wrappers.hpp |  279 +--
 .../raft/spectral/modularity_maximization.hpp |   52 +-
 cpp/include/raft/spectral/partition.hpp       |   61 +-
 cpp/include/raft/spectral/spectral_util.hpp   |  125 +-
 cpp/include/raft/spectral/warn_dbg.hpp        |    4 +-
 cpp/include/raft/stats/mean.cuh               |   42 +-
 cpp/include/raft/stats/mean_center.cuh        |   45 +-
 cpp/include/raft/stats/stddev.cuh             |  102 +-
 cpp/include/raft/stats/sum.cuh                |   38 +-
 cpp/include/raft/vectorized.cuh               |  112 +-
 cpp/test/cluster_solvers.cu                   |   22 +-
 cpp/test/cudart_utils.cpp                     |    3 +-
 cpp/test/distance/dist_adj.cu                 |   78 +-
 cpp/test/distance/dist_canberra.cu            |   24 +-
 cpp/test/distance/dist_chebyshev.cu           |   24 +-
 cpp/test/distance/dist_cos.cu                 |   23 +-
 cpp/test/distance/dist_euc_exp.cu             |   22 +-
 cpp/test/distance/dist_euc_unexp.cu           |   18 +-
 cpp/test/distance/dist_hellinger.cu           |   24 +-
 cpp/test/distance/dist_l1.cu                  |   24 +-
 cpp/test/distance/dist_minkowski.cu           |   23 +-
 cpp/test/distance/distance_base.cuh           |  203 ++-
 cpp/test/distance/fused_l2_nn.cu              |  192 +-
 cpp/test/eigen_solvers.cu                     |   35 +-
 cpp/test/handle.cpp                           |   21 +-
 cpp/test/integer_utils.cpp                    |    6 +-
 cpp/test/label/label.cu                       |   31 +-
 cpp/test/label/merge_labels.cu                |   67 +-
 cpp/test/lap/lap.cu                           |   92 +-
 cpp/test/linalg/add.cu                        |   13 +-
 cpp/test/linalg/add.cuh                       |   17 +-
 cpp/test/linalg/binary_op.cu                  |   88 +-
 cpp/test/linalg/binary_op.cuh                 |   17 +-
 cpp/test/linalg/cholesky_r1.cu                |   50 +-
 cpp/test/linalg/coalesced_reduction.cu        |   60 +-
 cpp/test/linalg/divide.cu                     |   50 +-
 cpp/test/linalg/eig.cu                        |  177 +-
 cpp/test/linalg/eig_sel.cu                    |   92 +-
 cpp/test/linalg/eltwise.cu                    |   98 +-
 cpp/test/linalg/gemm_layout.cu                |   63 +-
 cpp/test/linalg/map.cu                        |   98 +-
 cpp/test/linalg/map_then_reduce.cu            |   99 +-
 cpp/test/linalg/matrix_vector_op.cu           |  109 +-
 cpp/test/linalg/matrix_vector_op.cuh          |   73 +-
 cpp/test/linalg/multiply.cu                   |   30 +-
 cpp/test/linalg/norm.cu                       |  140 +-
 cpp/test/linalg/reduce.cu                     |   84 +-
 cpp/test/linalg/reduce.cuh                    |   59 +-
 cpp/test/linalg/strided_reduction.cu          |   61 +-
 cpp/test/linalg/subtract.cu                   |   74 +-
 cpp/test/linalg/svd.cu                        |  108 +-
 cpp/test/linalg/transpose.cu                  |   51 +-
 cpp/test/linalg/unary_op.cu                   |   46 +-
 cpp/test/linalg/unary_op.cuh                  |   17 +-
 cpp/test/matrix/math.cu                       |  194 +-
 cpp/test/matrix/matrix.cu                     |   84 +-
 cpp/test/mr/device/buffer.cpp                 |   16 +-
 cpp/test/mr/host/buffer.cpp                   |    9 +-
 cpp/test/mst.cu                               |  172 +-
 cpp/test/random/rng.cu                        |  203 +--
 cpp/test/random/rng_int.cu                    |   66 +-
 cpp/test/random/sample_without_replacement.cu |   35 +-
 cpp/test/sparse/add.cu                        |   97 +-
 cpp/test/sparse/connect_components.cu         |  599 +++----
 cpp/test/sparse/convert_coo.cu                |   20 +-
 cpp/test/sparse/convert_csr.cu                |   50 +-
 cpp/test/sparse/csr_row_slice.cu              |   80 +-
 cpp/test/sparse/csr_to_dense.cu               |   63 +-
 cpp/test/sparse/csr_transpose.cu              |   80 +-
 cpp/test/sparse/degree.cu                     |   23 +-
 cpp/test/sparse/dist_coo_spmv.cu              |  936 +++++-----
 cpp/test/sparse/distance.cu                   |  248 ++-
 cpp/test/sparse/filter.cu                     |   33 +-
 cpp/test/sparse/knn.cu                        |   91 +-
 cpp/test/sparse/knn_graph.cu                  |   36 +-
 cpp/test/sparse/linkage.cu                    |  647 +++----
 cpp/test/sparse/norm.cu                       |   34 +-
 cpp/test/sparse/reduce.cu                     |   50 +-
 cpp/test/sparse/row_op.cu                     |   40 +-
 cpp/test/sparse/selection.cu                  |   59 +-
 cpp/test/sparse/sort.cu                       |   22 +-
 cpp/test/sparse/symmetrize.cu                 |   89 +-
 cpp/test/spatial/haversine.cu                 |   61 +-
 cpp/test/spatial/knn.cu                       |   89 +-
 cpp/test/spectral_matrix.cu                   |   13 +-
 cpp/test/stats/mean.cu                        |   94 +-
 cpp/test/stats/mean_center.cu                 |   63 +-
 cpp/test/stats/stddev.cu                      |   46 +-
 cpp/test/stats/sum.cu                         |   25 +-
 cpp/test/test_utils.h                         |  136 +-
 218 files changed, 16429 insertions(+), 11470 deletions(-)

diff --git a/cpp/include/raft.hpp b/cpp/include/raft.hpp
index f380d276b2..08f836d3a8 100644
--- a/cpp/include/raft.hpp
+++ b/cpp/include/raft.hpp
@@ -21,7 +21,8 @@ namespace raft {
 /* Function for testing RAFT include
  *
  * @return message indicating RAFT has been included succesfully*/
-inline std::string test_raft() {
+inline std::string test_raft()
+{
   std::string status = "RAFT Setup succesfully";
   return status;
 }
diff --git a/cpp/include/raft/cache/cache_util.cuh b/cpp/include/raft/cache/cache_util.cuh
index ce8ef9a095..f63040fa00 100644
--- a/cpp/include/raft/cache/cache_util.cuh
+++ b/cpp/include/raft/cache/cache_util.cuh
@@ -42,17 +42,15 @@ namespace cache {
  * @param [out] out vectors collected from the cache, size [n_vec * n]
  */
 template <typename math_t>
-__global__ void get_vecs(const math_t *cache, int n_vec, const int *cache_idx,
-                         int n, math_t *out) {
+__global__ void get_vecs(const math_t* cache, int n_vec, const int* cache_idx, int n, math_t* out)
+{
   int tid = threadIdx.x + blockIdx.x * blockDim.x;
   int row = tid % n_vec;  // row idx
   if (tid < n_vec * n) {
-    size_t out_col = tid / n_vec;  // col idx
+    size_t out_col   = tid / n_vec;  // col idx
     size_t cache_col = cache_idx[out_col];
     if (cache_idx[out_col] >= 0) {
-      if (row + out_col * n_vec < (size_t)n_vec * n) {
-        out[tid] = cache[row + cache_col * n_vec];
-      }
+      if (row + out_col * n_vec < (size_t)n_vec * n) { out[tid] = cache[row + cache_col * n_vec]; }
     }
   }
 }
@@ -84,21 +82,26 @@ __global__ void get_vecs(const math_t *cache, int n_vec, const int *cache_idx,
  * @param [in] n_cache_vecs
  */
 template <typename math_t>
-__global__ void store_vecs(const math_t *tile, int n_tile, int n_vec,
-                           const int *tile_idx, int n, const int *cache_idx,
-                           math_t *cache, int n_cache_vecs) {
+__global__ void store_vecs(const math_t* tile,
+                           int n_tile,
+                           int n_vec,
+                           const int* tile_idx,
+                           int n,
+                           const int* cache_idx,
+                           math_t* cache,
+                           int n_cache_vecs)
+{
   int tid = threadIdx.x + blockIdx.x * blockDim.x;
   int row = tid % n_vec;  // row idx
   if (tid < n_vec * n) {
-    int tile_col = tid / n_vec;  // col idx
-    int data_col = tile_idx ? tile_idx[tile_col] : tile_col;
+    int tile_col  = tid / n_vec;  // col idx
+    int data_col  = tile_idx ? tile_idx[tile_col] : tile_col;
     int cache_col = cache_idx[tile_col];
 
     // We ignore negative values. The rest of the checks should be fulfilled
     // if the cache is used properly
     if (cache_col >= 0 && cache_col < n_cache_vecs && data_col < n_tile) {
-      cache[row + (size_t)cache_col * n_vec] =
-        tile[row + (size_t)data_col * n_vec];
+      cache[row + (size_t)cache_col * n_vec] = tile[row + (size_t)data_col * n_vec];
     }
   }
 }
@@ -121,14 +124,15 @@ int DI hash(int key, int n_cache_sets) { return key % n_cache_sets; }
  * @return the index of the first element in the array for which
  * array[idx] >= value. If there is no such value, then return n.
  */
-int DI arg_first_ge(const int *array, int n, int val) {
+int DI arg_first_ge(const int* array, int n, int val)
+{
   int start = 0;
-  int end = n - 1;
+  int end   = n - 1;
   if (array[0] == val) return 0;
   if (array[end] < val) return n;
   while (start + 1 < end) {
     int q = (start + end + 1) / 2;
-    //invariants:
+    // invariants:
     // start < end
     // start < q <=end
     // array[start] < val && array[end] <=val
@@ -157,7 +161,8 @@ int DI arg_first_ge(const int *array, int n, int val) {
  * @return the idx of the k-th occurance of val in array, or -1 if
  * the value is not found.
  */
-int DI find_nth_occurrence(const int *array, int n, int val, int k) {
+int DI find_nth_occurrence(const int* array, int n, int val, int k)
+{
   int q = arg_first_ge(array, n, val);
   if (q + k < n && array[q + k] == val) {
     q += k;
@@ -196,10 +201,10 @@ int DI find_nth_occurrence(const int *array, int n, int val, int k) {
  *   Each block should give a different pointer for rank.
  */
 template <int nthreads, int associativity>
-DI void rank_set_entries(const int *cache_time, int n_cache_sets, int *rank) {
+DI void rank_set_entries(const int* cache_time, int n_cache_sets, int* rank)
+{
   const int items_per_thread = raft::ceildiv(associativity, nthreads);
-  typedef cub::BlockRadixSort<int, nthreads, items_per_thread, int>
-    BlockRadixSort;
+  typedef cub::BlockRadixSort<int, nthreads, items_per_thread, int> BlockRadixSort;
   __shared__ typename BlockRadixSort::TempStorage temp_storage;
 
   int key[items_per_thread];
@@ -208,8 +213,8 @@ DI void rank_set_entries(const int *cache_time, int n_cache_sets, int *rank) {
   int block_offset = blockIdx.x * associativity;
 
   for (int j = 0; j < items_per_thread; j++) {
-    int k = threadIdx.x + j * nthreads;
-    int t = (k < associativity) ? cache_time[block_offset + k] : 32768;
+    int k  = threadIdx.x + j * nthreads;
+    int t  = (k < associativity) ? cache_time[block_offset + k] : 32768;
     key[j] = t;
     val[j] = k;
   }
@@ -217,9 +222,7 @@ DI void rank_set_entries(const int *cache_time, int n_cache_sets, int *rank) {
   BlockRadixSort(temp_storage).Sort(key, val);
 
   for (int j = 0; j < items_per_thread; j++) {
-    if (val[j] < associativity) {
-      rank[val[j]] = threadIdx.x * items_per_thread + j;
-    }
+    if (val[j] < associativity) { rank[val[j]] = threadIdx.x * items_per_thread + j; }
   }
   __syncthreads();
 }
@@ -252,9 +255,15 @@ DI void rank_set_entries(const int *cache_time, int n_cache_sets, int *rank) {
  *   not be cached, size [n]
  */
 template <int nthreads, int associativity>
-__global__ void assign_cache_idx(const int *keys, int n, const int *cache_set,
-                                 int *cached_keys, int n_cache_sets,
-                                 int *cache_time, int time, int *cache_idx) {
+__global__ void assign_cache_idx(const int* keys,
+                                 int n,
+                                 const int* cache_set,
+                                 int* cached_keys,
+                                 int n_cache_sets,
+                                 int* cache_time,
+                                 int time,
+                                 int* cache_idx)
+{
   int block_offset = blockIdx.x * associativity;
 
   const int items_per_thread = raft::ceildiv(associativity, nthreads);
@@ -273,7 +282,7 @@ __global__ void assign_cache_idx(const int *keys, int n, const int *cache_set,
   // these elements are assigned -1.
 
   for (int j = 0; j < items_per_thread; j++) {
-    int i = threadIdx.x + j * nthreads;
+    int i     = threadIdx.x + j * nthreads;
     int t_idx = block_offset + i;
     bool mask = (i < associativity);
     // whether this slot is available for writing
@@ -284,10 +293,10 @@ __global__ void assign_cache_idx(const int *keys, int n, const int *cache_set,
     if (mask) {
       int k = find_nth_occurrence(cache_set, n, blockIdx.x, rank[i]);
       if (k > -1) {
-        int key_val = keys[k];
+        int key_val        = keys[k];
         cached_keys[t_idx] = key_val;
-        cache_idx[k] = t_idx;
-        cache_time[t_idx] = time;
+        cache_idx[k]       = t_idx;
+        cache_time[t_idx]  = time;
       }
     }
   }
@@ -315,21 +324,28 @@ namespace {
  * @param [inout] cached_keys keys stored in the cache, size [n_cache_sets * associativity]
  * @param [in] n_cache_sets number of cache sets
  * @param [in] associativity number of keys in cache set
- * @param [inout] cache_time time stamp when the indices were cached, size [n_cache_sets * associativity]
+ * @param [inout] cache_time time stamp when the indices were cached, size [n_cache_sets *
+ * associativity]
  * @param [out] cache_idx cache indices of the working set elements, size [n]
  * @param [out] is_cached whether the element is cached size[n]
  * @param [in] time iteration counter (used for time stamping)
  */
-__global__ void get_cache_idx(int *keys, int n, int *cached_keys,
-                              int n_cache_sets, int associativity,
-                              int *cache_time, int *cache_idx, bool *is_cached,
-                              int time) {
+__global__ void get_cache_idx(int* keys,
+                              int n,
+                              int* cached_keys,
+                              int n_cache_sets,
+                              int associativity,
+                              int* cache_time,
+                              int* cache_idx,
+                              bool* is_cached,
+                              int time)
+{
   int tid = threadIdx.x + blockIdx.x * blockDim.x;
   if (tid < n) {
-    int widx = keys[tid];
-    int sidx = hash(widx, n_cache_sets);
-    int cidx = sidx * associativity;
-    int i = 0;
+    int widx   = keys[tid];
+    int sidx   = hash(widx, n_cache_sets);
+    int cidx   = sidx * associativity;
+    int i      = 0;
     bool found = false;
     // search for empty spot and the least recently used spot
     while (i < associativity && !found) {
@@ -338,9 +354,9 @@ __global__ void get_cache_idx(int *keys, int n, int *cached_keys,
     }
     is_cached[tid] = found;
     if (found) {
-      cidx = cidx + i - 1;
-      cache_time[cidx] = time;  //update time stamp
-      cache_idx[tid] = cidx;    //exact cache idx
+      cidx             = cidx + i - 1;
+      cache_time[cidx] = time;  // update time stamp
+      cache_idx[tid]   = cidx;  // exact cache idx
     } else {
       cache_idx[tid] = sidx;  // assign cache set
     }
diff --git a/cpp/include/raft/common/cub_wrappers.cuh b/cpp/include/raft/common/cub_wrappers.cuh
index 8d5b29f700..4767c7f254 100644
--- a/cpp/include/raft/common/cub_wrappers.cuh
+++ b/cpp/include/raft/common/cub_wrappers.cuh
@@ -22,28 +22,32 @@
 namespace raft {
 
 /**
-     * @brief Convenience wrapper over cub's SortPairs method
-     * @tparam KeyT key type
-     * @tparam ValueT value type
-     * @param workspace workspace buffer which will get resized if not enough space
-     * @param inKeys input keys array
-     * @param outKeys output keys array
-     * @param inVals input values array
-     * @param outVals output values array
-     * @param len array length
-     * @param stream cuda stream
-     */
+ * @brief Convenience wrapper over cub's SortPairs method
+ * @tparam KeyT key type
+ * @tparam ValueT value type
+ * @param workspace workspace buffer which will get resized if not enough space
+ * @param inKeys input keys array
+ * @param outKeys output keys array
+ * @param inVals input values array
+ * @param outVals output values array
+ * @param len array length
+ * @param stream cuda stream
+ */
 template <typename KeyT, typename ValueT>
-void sortPairs(raft::mr::device::buffer<char> &workspace, const KeyT *inKeys,
-               KeyT *outKeys, const ValueT *inVals, ValueT *outVals, int len,
-               cudaStream_t stream) {
+void sortPairs(raft::mr::device::buffer<char>& workspace,
+               const KeyT* inKeys,
+               KeyT* outKeys,
+               const ValueT* inVals,
+               ValueT* outVals,
+               int len,
+               cudaStream_t stream)
+{
   size_t worksize;
-  cub::DeviceRadixSort::SortPairs(nullptr, worksize, inKeys, outKeys, inVals,
-                                  outVals, len, 0, sizeof(KeyT) * 8, stream);
+  cub::DeviceRadixSort::SortPairs(
+    nullptr, worksize, inKeys, outKeys, inVals, outVals, len, 0, sizeof(KeyT) * 8, stream);
   workspace.resize(worksize, stream);
-  cub::DeviceRadixSort::SortPairs(workspace.data(), worksize, inKeys, outKeys,
-                                  inVals, outVals, len, 0, sizeof(KeyT) * 8,
-                                  stream);
+  cub::DeviceRadixSort::SortPairs(
+    workspace.data(), worksize, inKeys, outKeys, inVals, outVals, len, 0, sizeof(KeyT) * 8, stream);
 }
 
 }  // namespace raft
diff --git a/cpp/include/raft/common/device_loads_stores.cuh b/cpp/include/raft/common/device_loads_stores.cuh
index bb2b019ecb..41dc9cab08 100644
--- a/cpp/include/raft/common/device_loads_stores.cuh
+++ b/cpp/include/raft/common/device_loads_stores.cuh
@@ -31,40 +31,43 @@ namespace raft {
  * @param[out] addr shared memory address (should be aligned to vector size)
  * @param[in]  x    data to be stored at this address
  */
-DI void sts(float* addr, const float& x) {
+DI void sts(float* addr, const float& x)
+{
   auto s1 = __cvta_generic_to_shared(reinterpret_cast<float*>(addr));
   asm volatile("st.shared.f32 [%0], {%1};" : : "l"(s1), "f"(x));
 }
-DI void sts(float* addr, const float (&x)[1]) {
+DI void sts(float* addr, const float (&x)[1])
+{
   auto s1 = __cvta_generic_to_shared(reinterpret_cast<float*>(addr));
   asm volatile("st.shared.f32 [%0], {%1};" : : "l"(s1), "f"(x[0]));
 }
-DI void sts(float* addr, const float (&x)[2]) {
+DI void sts(float* addr, const float (&x)[2])
+{
   auto s2 = __cvta_generic_to_shared(reinterpret_cast<float2*>(addr));
-  asm volatile("st.shared.v2.f32 [%0], {%1, %2};"
-               :
-               : "l"(s2), "f"(x[0]), "f"(x[1]));
+  asm volatile("st.shared.v2.f32 [%0], {%1, %2};" : : "l"(s2), "f"(x[0]), "f"(x[1]));
 }
-DI void sts(float* addr, const float (&x)[4]) {
+DI void sts(float* addr, const float (&x)[4])
+{
   auto s4 = __cvta_generic_to_shared(reinterpret_cast<float4*>(addr));
   asm volatile("st.shared.v4.f32 [%0], {%1, %2, %3, %4};"
                :
                : "l"(s4), "f"(x[0]), "f"(x[1]), "f"(x[2]), "f"(x[3]));
 }
 
-DI void sts(double* addr, const double& x) {
+DI void sts(double* addr, const double& x)
+{
   auto s1 = __cvta_generic_to_shared(reinterpret_cast<double*>(addr));
   asm volatile("st.shared.f64 [%0], {%1};" : : "l"(s1), "d"(x));
 }
-DI void sts(double* addr, const double (&x)[1]) {
+DI void sts(double* addr, const double (&x)[1])
+{
   auto s1 = __cvta_generic_to_shared(reinterpret_cast<double*>(addr));
   asm volatile("st.shared.f64 [%0], {%1};" : : "l"(s1), "d"(x[0]));
 }
-DI void sts(double* addr, const double (&x)[2]) {
+DI void sts(double* addr, const double (&x)[2])
+{
   auto s2 = __cvta_generic_to_shared(reinterpret_cast<double2*>(addr));
-  asm volatile("st.shared.v2.f64 [%0], {%1, %2};"
-               :
-               : "l"(s2), "d"(x[0]), "d"(x[1]));
+  asm volatile("st.shared.v2.f64 [%0], {%1, %2};" : : "l"(s2), "d"(x[0]), "d"(x[1]));
 }
 /** @} */
 
@@ -80,39 +83,42 @@ DI void sts(double* addr, const double (&x)[2]) {
  * @param[in]  addr shared memory address from where to load
  *                  (should be aligned to vector size)
  */
-DI void lds(float& x, float* addr) {
+DI void lds(float& x, float* addr)
+{
   auto s1 = __cvta_generic_to_shared(reinterpret_cast<float*>(addr));
   asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x) : "l"(s1));
 }
-DI void lds(float (&x)[1], float* addr) {
+DI void lds(float (&x)[1], float* addr)
+{
   auto s1 = __cvta_generic_to_shared(reinterpret_cast<float*>(addr));
   asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x[0]) : "l"(s1));
 }
-DI void lds(float (&x)[2], float* addr) {
+DI void lds(float (&x)[2], float* addr)
+{
   auto s2 = __cvta_generic_to_shared(reinterpret_cast<float2*>(addr));
-  asm volatile("ld.shared.v2.f32 {%0, %1}, [%2];"
-               : "=f"(x[0]), "=f"(x[1])
-               : "l"(s2));
+  asm volatile("ld.shared.v2.f32 {%0, %1}, [%2];" : "=f"(x[0]), "=f"(x[1]) : "l"(s2));
 }
-DI void lds(float (&x)[4], float* addr) {
+DI void lds(float (&x)[4], float* addr)
+{
   auto s4 = __cvta_generic_to_shared(reinterpret_cast<float4*>(addr));
   asm volatile("ld.shared.v4.f32 {%0, %1, %2, %3}, [%4];"
                : "=f"(x[0]), "=f"(x[1]), "=f"(x[2]), "=f"(x[3])
                : "l"(s4));
 }
-DI void lds(double& x, double* addr) {
+DI void lds(double& x, double* addr)
+{
   auto s1 = __cvta_generic_to_shared(reinterpret_cast<double*>(addr));
   asm volatile("ld.shared.f64 {%0}, [%1];" : "=d"(x) : "l"(s1));
 }
-DI void lds(double (&x)[1], double* addr) {
+DI void lds(double (&x)[1], double* addr)
+{
   auto s1 = __cvta_generic_to_shared(reinterpret_cast<double*>(addr));
   asm volatile("ld.shared.f64 {%0}, [%1];" : "=d"(x[0]) : "l"(s1));
 }
-DI void lds(double (&x)[2], double* addr) {
+DI void lds(double (&x)[2], double* addr)
+{
   auto s2 = __cvta_generic_to_shared(reinterpret_cast<double2*>(addr));
-  asm volatile("ld.shared.v2.f64 {%0, %1}, [%2];"
-               : "=d"(x[0]), "=d"(x[1])
-               : "l"(s2));
+  asm volatile("ld.shared.v2.f64 {%0, %1}, [%2];" : "=d"(x[0]), "=d"(x[1]) : "l"(s2));
 }
 /** @} */
 
@@ -123,32 +129,35 @@ DI void lds(double (&x)[2], double* addr) {
  * @param[out] x    data to be loaded from global memory
  * @param[in]  addr address in global memory from where to load
  */
-DI void ldg(float& x, const float* addr) {
+DI void ldg(float& x, const float* addr)
+{
   asm volatile("ld.global.cg.f32 %0, [%1];" : "=f"(x) : "l"(addr));
 }
-DI void ldg(float (&x)[1], const float* addr) {
+DI void ldg(float (&x)[1], const float* addr)
+{
   asm volatile("ld.global.cg.f32 %0, [%1];" : "=f"(x[0]) : "l"(addr));
 }
-DI void ldg(float (&x)[2], const float* addr) {
-  asm volatile("ld.global.cg.v2.f32 {%0, %1}, [%2];"
-               : "=f"(x[0]), "=f"(x[1])
-               : "l"(addr));
+DI void ldg(float (&x)[2], const float* addr)
+{
+  asm volatile("ld.global.cg.v2.f32 {%0, %1}, [%2];" : "=f"(x[0]), "=f"(x[1]) : "l"(addr));
 }
-DI void ldg(float (&x)[4], const float* addr) {
+DI void ldg(float (&x)[4], const float* addr)
+{
   asm volatile("ld.global.cg.v4.f32 {%0, %1, %2, %3}, [%4];"
                : "=f"(x[0]), "=f"(x[1]), "=f"(x[2]), "=f"(x[3])
                : "l"(addr));
 }
-DI void ldg(double& x, const double* addr) {
+DI void ldg(double& x, const double* addr)
+{
   asm volatile("ld.global.cg.f64 %0, [%1];" : "=d"(x) : "l"(addr));
 }
-DI void ldg(double (&x)[1], const double* addr) {
+DI void ldg(double (&x)[1], const double* addr)
+{
   asm volatile("ld.global.cg.f64 %0, [%1];" : "=d"(x[0]) : "l"(addr));
 }
-DI void ldg(double (&x)[2], const double* addr) {
-  asm volatile("ld.global.cg.v2.f64 {%0, %1}, [%2];"
-               : "=d"(x[0]), "=d"(x[1])
-               : "l"(addr));
+DI void ldg(double (&x)[2], const double* addr)
+{
+  asm volatile("ld.global.cg.v2.f64 {%0, %1}, [%2];" : "=d"(x[0]), "=d"(x[1]) : "l"(addr));
 }
 /** @} */
 
diff --git a/cpp/include/raft/common/scatter.cuh b/cpp/include/raft/common/scatter.cuh
index 785794461e..b228ac5499 100644
--- a/cpp/include/raft/common/scatter.cuh
+++ b/cpp/include/raft/common/scatter.cuh
@@ -22,8 +22,8 @@
 namespace raft {
 
 template <typename DataT, int VecLen, typename Lambda, typename IdxT>
-__global__ void scatterKernel(DataT *out, const DataT *in, const IdxT *idx,
-                              IdxT len, Lambda op) {
+__global__ void scatterKernel(DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op)
+{
   typedef TxN_t<DataT, VecLen> DataVec;
   typedef TxN_t<IdxT, VecLen> IdxVec;
   IdxT tid = threadIdx.x + ((IdxT)blockIdx.x * blockDim.x);
@@ -34,61 +34,60 @@ __global__ void scatterKernel(DataT *out, const DataT *in, const IdxT *idx,
   DataVec dataIn;
 #pragma unroll
   for (int i = 0; i < VecLen; ++i) {
-    auto inPos = idxIn.val.data[i];
+    auto inPos         = idxIn.val.data[i];
     dataIn.val.data[i] = op(in[inPos], tid + i);
   }
   dataIn.store(out, tid);
 }
 
 template <typename DataT, int VecLen, typename Lambda, typename IdxT, int TPB>
-void scatterImpl(DataT *out, const DataT *in, const IdxT *idx, IdxT len,
-                 Lambda op, cudaStream_t stream) {
+void scatterImpl(
+  DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op, cudaStream_t stream)
+{
   const IdxT nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxT)TPB);
-  scatterKernel<DataT, VecLen, Lambda, IdxT>
-    <<<nblks, TPB, 0, stream>>>(out, in, idx, len, op);
+  scatterKernel<DataT, VecLen, Lambda, IdxT><<<nblks, TPB, 0, stream>>>(out, in, idx, len, op);
   CUDA_CHECK(cudaGetLastError());
 }
 
 /**
-     * @brief Performs scatter operation based on the input indexing array
-     * @tparam DataT data type whose array gets scattered
-     * @tparam IdxT indexing type
-     * @tparam TPB threads-per-block in the final kernel launched
-     * @tparam Lambda the device-lambda performing a unary operation on the loaded
-     * data before it gets scattered
-     * @param out the output array
-     * @param in the input array
-     * @param idx the indexing array
-     * @param len number of elements in the input array
-     * @param stream cuda stream where to launch work
-     * @param op the device-lambda with signature `DataT func(DataT, IdxT);`. This
-     * will be applied to every element before scattering it to the right location.
-     * The second param in this method will be the destination index.
-     */
-template <typename DataT, typename IdxT,
-          typename Lambda = raft::Nop<DataT, IdxT>, int TPB = 256>
-void scatter(DataT *out, const DataT *in, const IdxT *idx, IdxT len,
-             cudaStream_t stream, Lambda op = raft::Nop<DataT, IdxT>()) {
+ * @brief Performs scatter operation based on the input indexing array
+ * @tparam DataT data type whose array gets scattered
+ * @tparam IdxT indexing type
+ * @tparam TPB threads-per-block in the final kernel launched
+ * @tparam Lambda the device-lambda performing a unary operation on the loaded
+ * data before it gets scattered
+ * @param out the output array
+ * @param in the input array
+ * @param idx the indexing array
+ * @param len number of elements in the input array
+ * @param stream cuda stream where to launch work
+ * @param op the device-lambda with signature `DataT func(DataT, IdxT);`. This
+ * will be applied to every element before scattering it to the right location.
+ * The second param in this method will be the destination index.
+ */
+template <typename DataT, typename IdxT, typename Lambda = raft::Nop<DataT, IdxT>, int TPB = 256>
+void scatter(DataT* out,
+             const DataT* in,
+             const IdxT* idx,
+             IdxT len,
+             cudaStream_t stream,
+             Lambda op = raft::Nop<DataT, IdxT>())
+{
   if (len <= 0) return;
-  constexpr size_t DataSize = sizeof(DataT);
-  constexpr size_t IdxSize = sizeof(IdxT);
+  constexpr size_t DataSize   = sizeof(DataT);
+  constexpr size_t IdxSize    = sizeof(IdxT);
   constexpr size_t MaxPerElem = DataSize > IdxSize ? DataSize : IdxSize;
-  size_t bytes = len * MaxPerElem;
+  size_t bytes                = len * MaxPerElem;
   if (16 / MaxPerElem && bytes % 16 == 0) {
-    scatterImpl<DataT, 16 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len,
-                                                           op, stream);
+    scatterImpl<DataT, 16 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op, stream);
   } else if (8 / MaxPerElem && bytes % 8 == 0) {
-    scatterImpl<DataT, 8 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op,
-                                                          stream);
+    scatterImpl<DataT, 8 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op, stream);
   } else if (4 / MaxPerElem && bytes % 4 == 0) {
-    scatterImpl<DataT, 4 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op,
-                                                          stream);
+    scatterImpl<DataT, 4 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op, stream);
   } else if (2 / MaxPerElem && bytes % 2 == 0) {
-    scatterImpl<DataT, 2 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op,
-                                                          stream);
+    scatterImpl<DataT, 2 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op, stream);
   } else if (1 / MaxPerElem) {
-    scatterImpl<DataT, 1 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op,
-                                                          stream);
+    scatterImpl<DataT, 1 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op, stream);
   } else {
     scatterImpl<DataT, 1, Lambda, IdxT, TPB>(out, in, idx, len, op, stream);
   }
diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp
index dc172c9503..72c3b3897e 100644
--- a/cpp/include/raft/comms/comms.hpp
+++ b/cpp/include/raft/comms/comms.hpp
@@ -25,16 +25,7 @@ namespace raft {
 namespace comms {
 
 typedef unsigned int request_t;
-enum class datatype_t {
-  CHAR,
-  UINT8,
-  INT32,
-  UINT32,
-  INT64,
-  UINT64,
-  FLOAT32,
-  FLOAT64
-};
+enum class datatype_t { CHAR, UINT8, INT32, UINT32, INT64, UINT64, FLOAT32, FLOAT64 };
 enum class op_t { SUM, PROD, MIN, MAX };
 
 /**
@@ -50,42 +41,50 @@ template <typename value_t>
 constexpr datatype_t get_type();
 
 template <>
-constexpr datatype_t get_type<char>() {
+constexpr datatype_t get_type<char>()
+{
   return datatype_t::CHAR;
 }
 
 template <>
-constexpr datatype_t get_type<uint8_t>() {
+constexpr datatype_t get_type<uint8_t>()
+{
   return datatype_t::UINT8;
 }
 
 template <>
-constexpr datatype_t get_type<int>() {
+constexpr datatype_t get_type<int>()
+{
   return datatype_t::INT32;
 }
 
 template <>
-constexpr datatype_t get_type<uint32_t>() {
+constexpr datatype_t get_type<uint32_t>()
+{
   return datatype_t::UINT32;
 }
 
 template <>
-constexpr datatype_t get_type<int64_t>() {
+constexpr datatype_t get_type<int64_t>()
+{
   return datatype_t::INT64;
 }
 
 template <>
-constexpr datatype_t get_type<uint64_t>() {
+constexpr datatype_t get_type<uint64_t>()
+{
   return datatype_t::UINT64;
 }
 
 template <>
-constexpr datatype_t get_type<float>() {
+constexpr datatype_t get_type<float>()
+{
   return datatype_t::FLOAT32;
 }
 
 template <>
-constexpr datatype_t get_type<double>() {
+constexpr datatype_t get_type<double>()
+{
   return datatype_t::FLOAT64;
 }
 
@@ -95,72 +94,99 @@ class comms_iface {
   virtual int get_rank() const = 0;
 
   virtual std::unique_ptr<comms_iface> comm_split(int color, int key) const = 0;
-  virtual void barrier() const = 0;
+  virtual void barrier() const                                              = 0;
 
   virtual status_t sync_stream(cudaStream_t stream) const = 0;
 
-  virtual void isend(const void* buf, size_t size, int dest, int tag,
-                     request_t* request) const = 0;
+  virtual void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const = 0;
 
-  virtual void irecv(void* buf, size_t size, int source, int tag,
-                     request_t* request) const = 0;
+  virtual void irecv(void* buf, size_t size, int source, int tag, request_t* request) const = 0;
 
   virtual void waitall(int count, request_t array_of_requests[]) const = 0;
 
-  virtual void allreduce(const void* sendbuff, void* recvbuff, size_t count,
-                         datatype_t datatype, op_t op,
+  virtual void allreduce(const void* sendbuff,
+                         void* recvbuff,
+                         size_t count,
+                         datatype_t datatype,
+                         op_t op,
                          cudaStream_t stream) const = 0;
 
-  virtual void bcast(void* buff, size_t count, datatype_t datatype, int root,
-                     cudaStream_t stream) const = 0;
+  virtual void bcast(
+    void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const = 0;
 
-  virtual void reduce(const void* sendbuff, void* recvbuff, size_t count,
-                      datatype_t datatype, op_t op, int root,
+  virtual void reduce(const void* sendbuff,
+                      void* recvbuff,
+                      size_t count,
+                      datatype_t datatype,
+                      op_t op,
+                      int root,
                       cudaStream_t stream) const = 0;
 
-  virtual void allgather(const void* sendbuff, void* recvbuff, size_t sendcount,
-                         datatype_t datatype, cudaStream_t stream) const = 0;
-
-  virtual void allgatherv(const void* sendbuf, void* recvbuf,
-                          const size_t* recvcounts, const size_t* displs,
-                          datatype_t datatype, cudaStream_t stream) const = 0;
+  virtual void allgather(const void* sendbuff,
+                         void* recvbuff,
+                         size_t sendcount,
+                         datatype_t datatype,
+                         cudaStream_t stream) const = 0;
 
-  virtual void gather(const void* sendbuff, void* recvbuff, size_t sendcount,
-                      datatype_t datatype, int root,
+  virtual void allgatherv(const void* sendbuf,
+                          void* recvbuf,
+                          const size_t* recvcounts,
+                          const size_t* displs,
+                          datatype_t datatype,
+                          cudaStream_t stream) const = 0;
+
+  virtual void gather(const void* sendbuff,
+                      void* recvbuff,
+                      size_t sendcount,
+                      datatype_t datatype,
+                      int root,
                       cudaStream_t stream) const = 0;
 
-  virtual void gatherv(const void* sendbuf, void* recvbuf, size_t sendcount,
-                       const size_t* recvcounts, const size_t* displs,
-                       datatype_t datatype, int root,
+  virtual void gatherv(const void* sendbuf,
+                       void* recvbuf,
+                       size_t sendcount,
+                       const size_t* recvcounts,
+                       const size_t* displs,
+                       datatype_t datatype,
+                       int root,
                        cudaStream_t stream) const = 0;
 
-  virtual void reducescatter(const void* sendbuff, void* recvbuff,
-                             size_t recvcount, datatype_t datatype, op_t op,
+  virtual void reducescatter(const void* sendbuff,
+                             void* recvbuff,
+                             size_t recvcount,
+                             datatype_t datatype,
+                             op_t op,
                              cudaStream_t stream) const = 0;
 
   // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock
-  virtual void device_send(const void* buf, size_t size, int dest,
-                           cudaStream_t stream) const = 0;
+  virtual void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const = 0;
 
   // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock
-  virtual void device_recv(void* buf, size_t size, int source,
-                           cudaStream_t stream) const = 0;
-
-  virtual void device_sendrecv(const void* sendbuf, size_t sendsize, int dest,
-                               void* recvbuf, size_t recvsize, int source,
+  virtual void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const = 0;
+
+  virtual void device_sendrecv(const void* sendbuf,
+                               size_t sendsize,
+                               int dest,
+                               void* recvbuf,
+                               size_t recvsize,
+                               int source,
                                cudaStream_t stream) const = 0;
 
-  virtual void device_multicast_sendrecv(
-    const void* sendbuf, std::vector<size_t> const& sendsizes,
-    std::vector<size_t> const& sendoffsets, std::vector<int> const& dests,
-    void* recvbuf, std::vector<size_t> const& recvsizes,
-    std::vector<size_t> const& recvoffsets, std::vector<int> const& sources,
-    cudaStream_t stream) const = 0;
+  virtual void device_multicast_sendrecv(const void* sendbuf,
+                                         std::vector<size_t> const& sendsizes,
+                                         std::vector<size_t> const& sendoffsets,
+                                         std::vector<int> const& dests,
+                                         void* recvbuf,
+                                         std::vector<size_t> const& recvsizes,
+                                         std::vector<size_t> const& recvoffsets,
+                                         std::vector<int> const& sources,
+                                         cudaStream_t stream) const = 0;
 };
 
 class comms_t {
  public:
-  comms_t(std::unique_ptr<comms_iface> impl) : impl_(impl.release()) {
+  comms_t(std::unique_ptr<comms_iface> impl) : impl_(impl.release())
+  {
     ASSERT(nullptr != impl_.get(), "ERROR: Invalid comms_iface used!");
   }
 
@@ -187,7 +213,8 @@ class comms_t {
    * @param color ranks w/ the same color are placed in the same communicator
    * @param key controls rank assignment
    */
-  std::unique_ptr<comms_iface> comm_split(int color, int key) const {
+  std::unique_ptr<comms_iface> comm_split(int color, int key) const
+  {
     return impl_->comm_split(color, key);
   }
 
@@ -204,9 +231,7 @@ class comms_t {
    *
    * @param stream the cuda stream to sync collective operations on
    */
-  status_t sync_stream(cudaStream_t stream) const {
-    return impl_->sync_stream(stream);
-  }
+  status_t sync_stream(cudaStream_t stream) const { return impl_->sync_stream(stream); }
 
   /**
    * Performs an asynchronous point-to-point send
@@ -219,10 +244,9 @@ class comms_t {
    * 		This will be used in `waitall()` to synchronize until the message is delivered (or fails).
    */
   template <typename value_t>
-  void isend(const value_t* buf, size_t size, int dest, int tag,
-             request_t* request) const {
-    impl_->isend(static_cast<const void*>(buf), size * sizeof(value_t), dest,
-                 tag, request);
+  void isend(const value_t* buf, size_t size, int dest, int tag, request_t* request) const
+  {
+    impl_->isend(static_cast<const void*>(buf), size * sizeof(value_t), dest, tag, request);
   }
 
   /**
@@ -236,10 +260,9 @@ class comms_t {
    * 		This will be used in `waitall()` to synchronize until the message is delivered (or fails).
    */
   template <typename value_t>
-  void irecv(value_t* buf, size_t size, int source, int tag,
-             request_t* request) const {
-    impl_->irecv(static_cast<void*>(buf), size * sizeof(value_t), source, tag,
-                 request);
+  void irecv(value_t* buf, size_t size, int source, int tag, request_t* request) const
+  {
+    impl_->irecv(static_cast<void*>(buf), size * sizeof(value_t), source, tag, request);
   }
 
   /**
@@ -247,7 +270,8 @@ class comms_t {
    * @param count number of requests to synchronize on
    * @param array_of_requests an array of request_t objects returned from isend/irecv
    */
-  void waitall(int count, request_t array_of_requests[]) const {
+  void waitall(int count, request_t array_of_requests[]) const
+  {
     impl_->waitall(count, array_of_requests);
   }
 
@@ -261,11 +285,15 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void allreduce(const value_t* sendbuff, value_t* recvbuff, size_t count,
-                 op_t op, cudaStream_t stream) const {
+  void allreduce(
+    const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op, cudaStream_t stream) const
+  {
     impl_->allreduce(static_cast<const void*>(sendbuff),
-                     static_cast<void*>(recvbuff), count, get_type<value_t>(),
-                     op, stream);
+                     static_cast<void*>(recvbuff),
+                     count,
+                     get_type<value_t>(),
+                     op,
+                     stream);
   }
 
   /**
@@ -277,9 +305,9 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const {
-    impl_->bcast(static_cast<void*>(buff), count, get_type<value_t>(), root,
-                 stream);
+  void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const
+  {
+    impl_->bcast(static_cast<void*>(buff), count, get_type<value_t>(), root, stream);
   }
 
   /**
@@ -293,11 +321,20 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void reduce(const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op,
-              int root, cudaStream_t stream) const {
+  void reduce(const value_t* sendbuff,
+              value_t* recvbuff,
+              size_t count,
+              op_t op,
+              int root,
+              cudaStream_t stream) const
+  {
     impl_->reduce(static_cast<const void*>(sendbuff),
-                  static_cast<void*>(recvbuff), count, get_type<value_t>(), op,
-                  root, stream);
+                  static_cast<void*>(recvbuff),
+                  count,
+                  get_type<value_t>(),
+                  op,
+                  root,
+                  stream);
   }
 
   /**
@@ -309,11 +346,16 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void allgather(const value_t* sendbuff, value_t* recvbuff, size_t sendcount,
-                 cudaStream_t stream) const {
+  void allgather(const value_t* sendbuff,
+                 value_t* recvbuff,
+                 size_t sendcount,
+                 cudaStream_t stream) const
+  {
     impl_->allgather(static_cast<const void*>(sendbuff),
-                     static_cast<void*>(recvbuff), sendcount,
-                     get_type<value_t>(), stream);
+                     static_cast<void*>(recvbuff),
+                     sendcount,
+                     get_type<value_t>(),
+                     stream);
   }
 
   /**
@@ -328,12 +370,18 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void allgatherv(const value_t* sendbuf, value_t* recvbuf,
-                  const size_t* recvcounts, const size_t* displs,
-                  cudaStream_t stream) const {
+  void allgatherv(const value_t* sendbuf,
+                  value_t* recvbuf,
+                  const size_t* recvcounts,
+                  const size_t* displs,
+                  cudaStream_t stream) const
+  {
     impl_->allgatherv(static_cast<const void*>(sendbuf),
-                      static_cast<void*>(recvbuf), recvcounts, displs,
-                      get_type<value_t>(), stream);
+                      static_cast<void*>(recvbuf),
+                      recvcounts,
+                      displs,
+                      get_type<value_t>(),
+                      stream);
   }
 
   /**
@@ -346,11 +394,18 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void gather(const value_t* sendbuff, value_t* recvbuff, size_t sendcount,
-              int root, cudaStream_t stream) const {
+  void gather(const value_t* sendbuff,
+              value_t* recvbuff,
+              size_t sendcount,
+              int root,
+              cudaStream_t stream) const
+  {
     impl_->gather(static_cast<const void*>(sendbuff),
-                  static_cast<void*>(recvbuff), sendcount, get_type<value_t>(),
-                  root, stream);
+                  static_cast<void*>(recvbuff),
+                  sendcount,
+                  get_type<value_t>(),
+                  root,
+                  stream);
   }
 
   /**
@@ -367,12 +422,22 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void gatherv(const value_t* sendbuf, value_t* recvbuf, size_t sendcount,
-               const size_t* recvcounts, const size_t* displs, int root,
-               cudaStream_t stream) const {
+  void gatherv(const value_t* sendbuf,
+               value_t* recvbuf,
+               size_t sendcount,
+               const size_t* recvcounts,
+               const size_t* displs,
+               int root,
+               cudaStream_t stream) const
+  {
     impl_->gatherv(static_cast<const void*>(sendbuf),
-                   static_cast<void*>(recvbuf), sendcount, recvcounts, displs,
-                   get_type<value_t>(), root, stream);
+                   static_cast<void*>(recvbuf),
+                   sendcount,
+                   recvcounts,
+                   displs,
+                   get_type<value_t>(),
+                   root,
+                   stream);
   }
 
   /**
@@ -384,11 +449,18 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void reducescatter(const value_t* sendbuff, value_t* recvbuff,
-                     size_t recvcount, op_t op, cudaStream_t stream) const {
+  void reducescatter(const value_t* sendbuff,
+                     value_t* recvbuff,
+                     size_t recvcount,
+                     op_t op,
+                     cudaStream_t stream) const
+  {
     impl_->reducescatter(static_cast<const void*>(sendbuff),
-                         static_cast<void*>(recvbuff), recvcount,
-                         get_type<value_t>(), op, stream);
+                         static_cast<void*>(recvbuff),
+                         recvcount,
+                         get_type<value_t>(),
+                         op,
+                         stream);
   }
 
   /**
@@ -403,10 +475,9 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void device_send(const value_t* buf, size_t size, int dest,
-                   cudaStream_t stream) const {
-    impl_->device_send(static_cast<const void*>(buf), size * sizeof(value_t),
-                       dest, stream);
+  void device_send(const value_t* buf, size_t size, int dest, cudaStream_t stream) const
+  {
+    impl_->device_send(static_cast<const void*>(buf), size * sizeof(value_t), dest, stream);
   }
 
   /**
@@ -421,10 +492,9 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void device_recv(value_t* buf, size_t size, int source,
-                   cudaStream_t stream) const {
-    impl_->device_recv(static_cast<void*>(buf), size * sizeof(value_t), source,
-                       stream);
+  void device_recv(value_t* buf, size_t size, int source, cudaStream_t stream) const
+  {
+    impl_->device_recv(static_cast<void*>(buf), size * sizeof(value_t), source, stream);
   }
 
   /**
@@ -440,12 +510,21 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void device_sendrecv(const value_t* sendbuf, size_t sendsize, int dest,
-                       value_t* recvbuf, size_t recvsize, int source,
-                       cudaStream_t stream) const {
-    impl_->device_sendrecv(
-      static_cast<const void*>(sendbuf), sendsize * sizeof(value_t), dest,
-      static_cast<void*>(recvbuf), recvsize * sizeof(value_t), source, stream);
+  void device_sendrecv(const value_t* sendbuf,
+                       size_t sendsize,
+                       int dest,
+                       value_t* recvbuf,
+                       size_t recvsize,
+                       int source,
+                       cudaStream_t stream) const
+  {
+    impl_->device_sendrecv(static_cast<const void*>(sendbuf),
+                           sendsize * sizeof(value_t),
+                           dest,
+                           static_cast<void*>(recvbuf),
+                           recvsize * sizeof(value_t),
+                           source,
+                           stream);
   }
 
   /**
@@ -463,28 +542,37 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void device_multicast_sendrecv(
-    const value_t* sendbuf, std::vector<size_t> const& sendsizes,
-    std::vector<size_t> const& sendoffsets, std::vector<int> const& dests,
-    value_t* recvbuf, std::vector<size_t> const& recvsizes,
-    std::vector<size_t> const& recvoffsets, std::vector<int> const& sources,
-    cudaStream_t stream) const {
-    auto sendbytesizes = sendsizes;
+  void device_multicast_sendrecv(const value_t* sendbuf,
+                                 std::vector<size_t> const& sendsizes,
+                                 std::vector<size_t> const& sendoffsets,
+                                 std::vector<int> const& dests,
+                                 value_t* recvbuf,
+                                 std::vector<size_t> const& recvsizes,
+                                 std::vector<size_t> const& recvoffsets,
+                                 std::vector<int> const& sources,
+                                 cudaStream_t stream) const
+  {
+    auto sendbytesizes   = sendsizes;
     auto sendbyteoffsets = sendoffsets;
     for (size_t i = 0; i < sendsizes.size(); ++i) {
       sendbytesizes[i] *= sizeof(value_t);
       sendbyteoffsets[i] *= sizeof(value_t);
     }
-    auto recvbytesizes = recvsizes;
+    auto recvbytesizes   = recvsizes;
     auto recvbyteoffsets = recvoffsets;
     for (size_t i = 0; i < recvsizes.size(); ++i) {
       recvbytesizes[i] *= sizeof(value_t);
       recvbyteoffsets[i] *= sizeof(value_t);
     }
     impl_->device_multicast_sendrecv(static_cast<const void*>(sendbuf),
-                                     sendbytesizes, sendbyteoffsets, dests,
-                                     static_cast<void*>(recvbuf), recvbytesizes,
-                                     recvbyteoffsets, sources, stream);
+                                     sendbytesizes,
+                                     sendbyteoffsets,
+                                     dests,
+                                     static_cast<void*>(recvbuf),
+                                     recvbytesizes,
+                                     recvbyteoffsets,
+                                     sources,
+                                     stream);
   }
 
  private:
diff --git a/cpp/include/raft/comms/helper.hpp b/cpp/include/raft/comms/helper.hpp
index 7b24e31bbe..93e31b4d6a 100644
--- a/cpp/include/raft/comms/helper.hpp
+++ b/cpp/include/raft/comms/helper.hpp
@@ -36,9 +36,9 @@ namespace comms {
  * @param num_ranks number of ranks in communicator clique
  * @param rank rank of local instance
  */
-void build_comms_nccl_only(handle_t *handle, ncclComm_t nccl_comm,
-                           int num_ranks, int rank) {
-  auto d_alloc = handle->get_device_allocator();
+void build_comms_nccl_only(handle_t* handle, ncclComm_t nccl_comm, int num_ranks, int rank)
+{
+  auto d_alloc        = handle->get_device_allocator();
   cudaStream_t stream = handle->get_stream();
 
   auto communicator = std::make_shared<comms_t>(std::unique_ptr<comms_iface>(
@@ -61,40 +61,41 @@ void build_comms_nccl_only(handle_t *handle, ncclComm_t nccl_comm,
  * @param num_ranks number of ranks in communicator clique
  * @param rank rank of local instance
  */
-void build_comms_nccl_ucx(handle_t *handle, ncclComm_t nccl_comm,
-                          void *ucp_worker, void *eps, int num_ranks,
-                          int rank) {
-  auto eps_sp = std::make_shared<ucp_ep_h *>(new ucp_ep_h[num_ranks]);
+void build_comms_nccl_ucx(
+  handle_t* handle, ncclComm_t nccl_comm, void* ucp_worker, void* eps, int num_ranks, int rank)
+{
+  auto eps_sp = std::make_shared<ucp_ep_h*>(new ucp_ep_h[num_ranks]);
 
-  auto size_t_ep_arr = reinterpret_cast<size_t *>(eps);
+  auto size_t_ep_arr = reinterpret_cast<size_t*>(eps);
 
   for (int i = 0; i < num_ranks; i++) {
-    size_t ptr = size_t_ep_arr[i];
-    auto ucp_ep_v = reinterpret_cast<ucp_ep_h *>(*eps_sp);
+    size_t ptr    = size_t_ep_arr[i];
+    auto ucp_ep_v = reinterpret_cast<ucp_ep_h*>(*eps_sp);
 
     if (ptr != 0) {
       auto eps_ptr = reinterpret_cast<ucp_ep_h>(size_t_ep_arr[i]);
-      ucp_ep_v[i] = eps_ptr;
+      ucp_ep_v[i]  = eps_ptr;
     } else {
       ucp_ep_v[i] = nullptr;
     }
   }
 
-  auto d_alloc = handle->get_device_allocator();
+  auto d_alloc        = handle->get_device_allocator();
   cudaStream_t stream = handle->get_stream();
 
-  auto communicator = std::make_shared<comms_t>(std::unique_ptr<comms_iface>(
-    new raft::comms::std_comms(nccl_comm, (ucp_worker_h)ucp_worker, eps_sp,
-                               num_ranks, rank, d_alloc, stream)));
+  auto communicator =
+    std::make_shared<comms_t>(std::unique_ptr<comms_iface>(new raft::comms::std_comms(
+      nccl_comm, (ucp_worker_h)ucp_worker, eps_sp, num_ranks, rank, d_alloc, stream)));
   handle->set_comms(communicator);
 }
 
-inline void nccl_unique_id_from_char(ncclUniqueId *id, char *uniqueId,
-                                     int size) {
+inline void nccl_unique_id_from_char(ncclUniqueId* id, char* uniqueId, int size)
+{
   memcpy(id->internal, uniqueId, size);
 }
 
-inline void get_unique_id(char *uid, int size) {
+inline void get_unique_id(char* uid, int size)
+{
   ncclUniqueId id;
   ncclGetUniqueId(&id);
 
diff --git a/cpp/include/raft/comms/mpi_comms.hpp b/cpp/include/raft/comms/mpi_comms.hpp
index 8dda74f0a9..65f38b2625 100644
--- a/cpp/include/raft/comms/mpi_comms.hpp
+++ b/cpp/include/raft/comms/mpi_comms.hpp
@@ -32,16 +32,16 @@
 #include <raft/error.hpp>
 #include <raft/handle.hpp>
 
-#define MPI_TRY(call)                                                          \
-  do {                                                                         \
-    int status = call;                                                         \
-    if (MPI_SUCCESS != status) {                                               \
-      int mpi_error_string_lenght = 0;                                         \
-      char mpi_error_string[MPI_MAX_ERROR_STRING];                             \
-      MPI_Error_string(status, mpi_error_string, &mpi_error_string_lenght);    \
-      RAFT_EXPECTS(MPI_SUCCESS == status, "ERROR: MPI call='%s'. Reason:%s\n", \
-                   #call, mpi_error_string);                                   \
-    }                                                                          \
+#define MPI_TRY(call)                                                                         \
+  do {                                                                                        \
+    int status = call;                                                                        \
+    if (MPI_SUCCESS != status) {                                                              \
+      int mpi_error_string_lenght = 0;                                                        \
+      char mpi_error_string[MPI_MAX_ERROR_STRING];                                            \
+      MPI_Error_string(status, mpi_error_string, &mpi_error_string_lenght);                   \
+      RAFT_EXPECTS(                                                                           \
+        MPI_SUCCESS == status, "ERROR: MPI call='%s'. Reason:%s\n", #call, mpi_error_string); \
+    }                                                                                         \
   } while (0)
 
 #define MPI_TRY_NO_THROW(call)                                              \
@@ -51,48 +51,41 @@
       int mpi_error_string_lenght = 0;                                      \
       char mpi_error_string[MPI_MAX_ERROR_STRING];                          \
       MPI_Error_string(status, mpi_error_string, &mpi_error_string_lenght); \
-      printf("MPI call='%s' at file=%s line=%d failed with %s ", #call,     \
-             __FILE__, __LINE__, mpi_error_string);                         \
+      printf("MPI call='%s' at file=%s line=%d failed with %s ",            \
+             #call,                                                         \
+             __FILE__,                                                      \
+             __LINE__,                                                      \
+             mpi_error_string);                                             \
     }                                                                       \
   } while (0)
 
 namespace raft {
 namespace comms {
 
-constexpr MPI_Datatype get_mpi_datatype(const datatype_t datatype) {
+constexpr MPI_Datatype get_mpi_datatype(const datatype_t datatype)
+{
   switch (datatype) {
-    case datatype_t::CHAR:
-      return MPI_CHAR;
-    case datatype_t::UINT8:
-      return MPI_UNSIGNED_CHAR;
-    case datatype_t::INT32:
-      return MPI_INT;
-    case datatype_t::UINT32:
-      return MPI_UNSIGNED;
-    case datatype_t::INT64:
-      return MPI_LONG_LONG;
-    case datatype_t::UINT64:
-      return MPI_UNSIGNED_LONG_LONG;
-    case datatype_t::FLOAT32:
-      return MPI_FLOAT;
-    case datatype_t::FLOAT64:
-      return MPI_DOUBLE;
+    case datatype_t::CHAR: return MPI_CHAR;
+    case datatype_t::UINT8: return MPI_UNSIGNED_CHAR;
+    case datatype_t::INT32: return MPI_INT;
+    case datatype_t::UINT32: return MPI_UNSIGNED;
+    case datatype_t::INT64: return MPI_LONG_LONG;
+    case datatype_t::UINT64: return MPI_UNSIGNED_LONG_LONG;
+    case datatype_t::FLOAT32: return MPI_FLOAT;
+    case datatype_t::FLOAT64: return MPI_DOUBLE;
     default:
       // Execution should never reach here. This takes care of compiler warning.
       return MPI_DOUBLE;
   }
 }
 
-constexpr MPI_Op get_mpi_op(const op_t op) {
+constexpr MPI_Op get_mpi_op(const op_t op)
+{
   switch (op) {
-    case op_t::SUM:
-      return MPI_SUM;
-    case op_t::PROD:
-      return MPI_PROD;
-    case op_t::MIN:
-      return MPI_MIN;
-    case op_t::MAX:
-      return MPI_MAX;
+    case op_t::SUM: return MPI_SUM;
+    case op_t::PROD: return MPI_PROD;
+    case op_t::MIN: return MPI_MIN;
+    case op_t::MAX: return MPI_MAX;
     default:
       // Execution should never reach here. This takes care of compiler warning.
       return MPI_MAX;
@@ -102,38 +95,35 @@ constexpr MPI_Op get_mpi_op(const op_t op) {
 class mpi_comms : public comms_iface {
  public:
   mpi_comms(MPI_Comm comm, const bool owns_mpi_comm)
-    : owns_mpi_comm_(owns_mpi_comm),
-      mpi_comm_(comm),
-      size_(0),
-      rank_(1),
-      next_request_id_(0) {
+    : owns_mpi_comm_(owns_mpi_comm), mpi_comm_(comm), size_(0), rank_(1), next_request_id_(0)
+  {
     int mpi_is_initialized = 0;
     MPI_TRY(MPI_Initialized(&mpi_is_initialized));
     RAFT_EXPECTS(mpi_is_initialized, "ERROR: MPI is not initialized!");
     MPI_TRY(MPI_Comm_size(mpi_comm_, &size_));
     MPI_TRY(MPI_Comm_rank(mpi_comm_, &rank_));
-    //get NCCL unique ID at rank 0 and broadcast it to all others
+    // get NCCL unique ID at rank 0 and broadcast it to all others
     ncclUniqueId id;
     if (0 == rank_) NCCL_TRY(ncclGetUniqueId(&id));
     MPI_TRY(MPI_Bcast((void*)&id, sizeof(id), MPI_BYTE, 0, mpi_comm_));
 
-    //initializing NCCL
+    // initializing NCCL
     NCCL_TRY(ncclCommInitRank(&nccl_comm_, size_, id, rank_));
   }
 
-  virtual ~mpi_comms() {
-    //finalizing NCCL
+  virtual ~mpi_comms()
+  {
+    // finalizing NCCL
     NCCL_TRY_NO_THROW(ncclCommDestroy(nccl_comm_));
-    if (owns_mpi_comm_) {
-      MPI_TRY_NO_THROW(MPI_Comm_free(&mpi_comm_));
-    }
+    if (owns_mpi_comm_) { MPI_TRY_NO_THROW(MPI_Comm_free(&mpi_comm_)); }
   }
 
   int get_size() const { return size_; }
 
   int get_rank() const { return rank_; }
 
-  std::unique_ptr<comms_iface> comm_split(int color, int key) const {
+  std::unique_ptr<comms_iface> comm_split(int color, int key) const
+  {
     MPI_Comm new_comm;
     MPI_TRY(MPI_Comm_split(mpi_comm_, color, key, &new_comm));
     return std::unique_ptr<comms_iface>(new mpi_comms(new_comm, true));
@@ -141,15 +131,15 @@ class mpi_comms : public comms_iface {
 
   void barrier() const { MPI_TRY(MPI_Barrier(mpi_comm_)); }
 
-  void isend(const void* buf, size_t size, int dest, int tag,
-             request_t* request) const {
+  void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const
+  {
     MPI_Request mpi_req;
     request_t req_id;
     if (free_requests_.empty()) {
       req_id = next_request_id_++;
     } else {
       auto it = free_requests_.begin();
-      req_id = *it;
+      req_id  = *it;
       free_requests_.erase(it);
     }
     MPI_TRY(MPI_Isend(buf, size, MPI_BYTE, dest, tag, mpi_comm_, &mpi_req));
@@ -157,15 +147,15 @@ class mpi_comms : public comms_iface {
     *request = req_id;
   }
 
-  void irecv(void* buf, size_t size, int source, int tag,
-             request_t* request) const {
+  void irecv(void* buf, size_t size, int source, int tag, request_t* request) const
+  {
     MPI_Request mpi_req;
     request_t req_id;
     if (free_requests_.empty()) {
       req_id = next_request_id_++;
     } else {
       auto it = free_requests_.begin();
-      req_id = *it;
+      req_id  = *it;
       free_requests_.erase(it);
     }
 
@@ -174,7 +164,8 @@ class mpi_comms : public comms_iface {
     *request = req_id;
   }
 
-  void waitall(int count, request_t array_of_requests[]) const {
+  void waitall(int count, request_t array_of_requests[]) const
+  {
     std::vector<MPI_Request> requests;
     requests.reserve(count);
     for (int i = 0; i < count; ++i) {
@@ -189,87 +180,138 @@ class mpi_comms : public comms_iface {
     MPI_TRY(MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE));
   }
 
-  void allreduce(const void* sendbuff, void* recvbuff, size_t count,
-                 datatype_t datatype, op_t op, cudaStream_t stream) const {
-    NCCL_TRY(ncclAllReduce(sendbuff, recvbuff, count,
-                           get_nccl_datatype(datatype), get_nccl_op(op),
-                           nccl_comm_, stream));
+  void allreduce(const void* sendbuff,
+                 void* recvbuff,
+                 size_t count,
+                 datatype_t datatype,
+                 op_t op,
+                 cudaStream_t stream) const
+  {
+    NCCL_TRY(ncclAllReduce(
+      sendbuff, recvbuff, count, get_nccl_datatype(datatype), get_nccl_op(op), nccl_comm_, stream));
   }
 
-  void bcast(void* buff, size_t count, datatype_t datatype, int root,
-             cudaStream_t stream) const {
-    NCCL_TRY(ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root,
-                           nccl_comm_, stream));
+  void bcast(void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const
+  {
+    NCCL_TRY(
+      ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, nccl_comm_, stream));
   }
 
-  void reduce(const void* sendbuff, void* recvbuff, size_t count,
-              datatype_t datatype, op_t op, int root,
-              cudaStream_t stream) const {
-    NCCL_TRY(ncclReduce(sendbuff, recvbuff, count, get_nccl_datatype(datatype),
-                        get_nccl_op(op), root, nccl_comm_, stream));
+  void reduce(const void* sendbuff,
+              void* recvbuff,
+              size_t count,
+              datatype_t datatype,
+              op_t op,
+              int root,
+              cudaStream_t stream) const
+  {
+    NCCL_TRY(ncclReduce(sendbuff,
+                        recvbuff,
+                        count,
+                        get_nccl_datatype(datatype),
+                        get_nccl_op(op),
+                        root,
+                        nccl_comm_,
+                        stream));
   }
 
-  void allgather(const void* sendbuff, void* recvbuff, size_t sendcount,
-                 datatype_t datatype, cudaStream_t stream) const {
-    NCCL_TRY(ncclAllGather(sendbuff, recvbuff, sendcount,
-                           get_nccl_datatype(datatype), nccl_comm_, stream));
+  void allgather(const void* sendbuff,
+                 void* recvbuff,
+                 size_t sendcount,
+                 datatype_t datatype,
+                 cudaStream_t stream) const
+  {
+    NCCL_TRY(ncclAllGather(
+      sendbuff, recvbuff, sendcount, get_nccl_datatype(datatype), nccl_comm_, stream));
   }
 
-  void allgatherv(const void* sendbuf, void* recvbuf, const size_t* recvcounts,
-                  const size_t* displs, datatype_t datatype,
-                  cudaStream_t stream) const {
-    //From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - https://arxiv.org/pdf/1812.05964.pdf
-    //Listing 1 on page 4.
+  void allgatherv(const void* sendbuf,
+                  void* recvbuf,
+                  const size_t* recvcounts,
+                  const size_t* displs,
+                  datatype_t datatype,
+                  cudaStream_t stream) const
+  {
+    // From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" -
+    // https://arxiv.org/pdf/1812.05964.pdf Listing 1 on page 4.
     for (int root = 0; root < size_; ++root) {
-      NCCL_TRY(ncclBroadcast(sendbuf,
-                             static_cast<char*>(recvbuf) +
-                               displs[root] * get_datatype_size(datatype),
-                             recvcounts[root], get_nccl_datatype(datatype),
-                             root, nccl_comm_, stream));
+      NCCL_TRY(
+        ncclBroadcast(sendbuf,
+                      static_cast<char*>(recvbuf) + displs[root] * get_datatype_size(datatype),
+                      recvcounts[root],
+                      get_nccl_datatype(datatype),
+                      root,
+                      nccl_comm_,
+                      stream));
     }
   }
 
-  void gather(const void* sendbuff, void* recvbuff, size_t sendcount,
-              datatype_t datatype, int root, cudaStream_t stream) const {
+  void gather(const void* sendbuff,
+              void* recvbuff,
+              size_t sendcount,
+              datatype_t datatype,
+              int root,
+              cudaStream_t stream) const
+  {
     size_t dtype_size = get_datatype_size(datatype);
     NCCL_TRY(ncclGroupStart());
     if (get_rank() == root) {
       for (int r = 0; r < get_size(); ++r) {
-        NCCL_TRY(ncclRecv(
-          static_cast<char*>(recvbuff) + sendcount * r * dtype_size, sendcount,
-          get_nccl_datatype(datatype), r, nccl_comm_, stream));
+        NCCL_TRY(ncclRecv(static_cast<char*>(recvbuff) + sendcount * r * dtype_size,
+                          sendcount,
+                          get_nccl_datatype(datatype),
+                          r,
+                          nccl_comm_,
+                          stream));
       }
     }
-    NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root,
-                      nccl_comm_, stream));
+    NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream));
     NCCL_TRY(ncclGroupEnd());
   }
 
-  void gatherv(const void* sendbuff, void* recvbuff, size_t sendcount,
-               const size_t* recvcounts, const size_t* displs,
-               datatype_t datatype, int root, cudaStream_t stream) const {
+  void gatherv(const void* sendbuff,
+               void* recvbuff,
+               size_t sendcount,
+               const size_t* recvcounts,
+               const size_t* displs,
+               datatype_t datatype,
+               int root,
+               cudaStream_t stream) const
+  {
     size_t dtype_size = get_datatype_size(datatype);
     NCCL_TRY(ncclGroupStart());
     if (get_rank() == root) {
       for (int r = 0; r < get_size(); ++r) {
         NCCL_TRY(ncclRecv(static_cast<char*>(recvbuff) + displs[r] * dtype_size,
-                          recvcounts[r], get_nccl_datatype(datatype), r,
-                          nccl_comm_, stream));
+                          recvcounts[r],
+                          get_nccl_datatype(datatype),
+                          r,
+                          nccl_comm_,
+                          stream));
       }
     }
-    NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root,
-                      nccl_comm_, stream));
+    NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream));
     NCCL_TRY(ncclGroupEnd());
   }
 
-  void reducescatter(const void* sendbuff, void* recvbuff, size_t recvcount,
-                     datatype_t datatype, op_t op, cudaStream_t stream) const {
-    NCCL_TRY(ncclReduceScatter(sendbuff, recvbuff, recvcount,
-                               get_nccl_datatype(datatype), get_nccl_op(op),
-                               nccl_comm_, stream));
+  void reducescatter(const void* sendbuff,
+                     void* recvbuff,
+                     size_t recvcount,
+                     datatype_t datatype,
+                     op_t op,
+                     cudaStream_t stream) const
+  {
+    NCCL_TRY(ncclReduceScatter(sendbuff,
+                               recvbuff,
+                               recvcount,
+                               get_nccl_datatype(datatype),
+                               get_nccl_op(op),
+                               nccl_comm_,
+                               stream));
   }
 
-  status_t sync_stream(cudaStream_t stream) const {
+  status_t sync_stream(cudaStream_t stream) const
+  {
     cudaError_t cudaErr;
     ncclResult_t ncclErr, ncclAsyncErr;
     while (1) {
@@ -302,45 +344,58 @@ class mpi_comms : public comms_iface {
   };
 
   // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock
-  void device_send(const void* buf, size_t size, int dest,
-                   cudaStream_t stream) const {
+  void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const
+  {
     NCCL_TRY(ncclSend(buf, size, ncclUint8, dest, nccl_comm_, stream));
   }
 
   // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock
-  void device_recv(void* buf, size_t size, int source,
-                   cudaStream_t stream) const {
+  void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const
+  {
     NCCL_TRY(ncclRecv(buf, size, ncclUint8, source, nccl_comm_, stream));
   }
 
-  void device_sendrecv(const void* sendbuf, size_t sendsize, int dest,
-                       void* recvbuf, size_t recvsize, int source,
-                       cudaStream_t stream) const {
+  void device_sendrecv(const void* sendbuf,
+                       size_t sendsize,
+                       int dest,
+                       void* recvbuf,
+                       size_t recvsize,
+                       int source,
+                       cudaStream_t stream) const
+  {
     // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock
     NCCL_TRY(ncclGroupStart());
     NCCL_TRY(ncclSend(sendbuf, sendsize, ncclUint8, dest, nccl_comm_, stream));
-    NCCL_TRY(
-      ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream));
+    NCCL_TRY(ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream));
     NCCL_TRY(ncclGroupEnd());
   }
 
   void device_multicast_sendrecv(const void* sendbuf,
                                  std::vector<size_t> const& sendsizes,
                                  std::vector<size_t> const& sendoffsets,
-                                 std::vector<int> const& dests, void* recvbuf,
+                                 std::vector<int> const& dests,
+                                 void* recvbuf,
                                  std::vector<size_t> const& recvsizes,
                                  std::vector<size_t> const& recvoffsets,
                                  std::vector<int> const& sources,
-                                 cudaStream_t stream) const {
+                                 cudaStream_t stream) const
+  {
     // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock
     NCCL_TRY(ncclGroupStart());
     for (size_t i = 0; i < sendsizes.size(); ++i) {
       NCCL_TRY(ncclSend(static_cast<const char*>(sendbuf) + sendoffsets[i],
-                        sendsizes[i], ncclUint8, dests[i], nccl_comm_, stream));
+                        sendsizes[i],
+                        ncclUint8,
+                        dests[i],
+                        nccl_comm_,
+                        stream));
     }
     for (size_t i = 0; i < recvsizes.size(); ++i) {
       NCCL_TRY(ncclRecv(static_cast<char*>(recvbuf) + recvoffsets[i],
-                        recvsizes[i], ncclUint8, sources[i], nccl_comm_,
+                        recvsizes[i],
+                        ncclUint8,
+                        sources[i],
+                        nccl_comm_,
                         stream));
     }
     NCCL_TRY(ncclGroupEnd());
@@ -358,9 +413,10 @@ class mpi_comms : public comms_iface {
   mutable std::unordered_set<request_t> free_requests_;
 };
 
-inline void initialize_mpi_comms(handle_t* handle, MPI_Comm comm) {
-  auto communicator = std::make_shared<comms_t>(
-    std::unique_ptr<comms_iface>(new mpi_comms(comm, true)));
+inline void initialize_mpi_comms(handle_t* handle, MPI_Comm comm)
+{
+  auto communicator =
+    std::make_shared<comms_t>(std::unique_ptr<comms_iface>(new mpi_comms(comm, true)));
   handle->set_comms(communicator);
 };
 
diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp
index 765e8741bb..5f80328d3f 100644
--- a/cpp/include/raft/comms/std_comms.hpp
+++ b/cpp/include/raft/comms/std_comms.hpp
@@ -62,10 +62,14 @@ class std_comms : public comms_iface {
    * @param size size of the cluster
    * @param rank rank of the current worker
    */
-  std_comms(ncclComm_t nccl_comm, ucp_worker_h ucp_worker,
-            std::shared_ptr<ucp_ep_h *> eps, int num_ranks, int rank,
+  std_comms(ncclComm_t nccl_comm,
+            ucp_worker_h ucp_worker,
+            std::shared_ptr<ucp_ep_h*> eps,
+            int num_ranks,
+            int rank,
             const std::shared_ptr<mr::device::allocator> device_allocator,
-            cudaStream_t stream, bool subcomms_ucp = true)
+            cudaStream_t stream,
+            bool subcomms_ucp = true)
     : nccl_comm_(nccl_comm),
       stream_(stream),
       num_ranks_(num_ranks),
@@ -74,7 +78,8 @@ class std_comms : public comms_iface {
       ucp_worker_(ucp_worker),
       ucp_eps_(eps),
       next_request_id_(0),
-      device_allocator_(device_allocator) {
+      device_allocator_(device_allocator)
+  {
     initialize();
   };
 
@@ -84,7 +89,9 @@ class std_comms : public comms_iface {
    * @param size size of the cluster
    * @param rank rank of the current worker
    */
-  std_comms(const ncclComm_t nccl_comm, int num_ranks, int rank,
+  std_comms(const ncclComm_t nccl_comm,
+            int num_ranks,
+            int rank,
             const std::shared_ptr<mr::device::allocator> device_allocator,
             cudaStream_t stream)
     : nccl_comm_(nccl_comm),
@@ -92,37 +99,37 @@ class std_comms : public comms_iface {
       num_ranks_(num_ranks),
       rank_(rank),
       subcomms_ucp_(false),
-      device_allocator_(device_allocator) {
+      device_allocator_(device_allocator)
+  {
     initialize();
   };
 
-  virtual ~std_comms() {
+  virtual ~std_comms()
+  {
     device_allocator_->deallocate(sendbuff_, sizeof(int), stream_);
     device_allocator_->deallocate(recvbuff_, sizeof(int), stream_);
   }
 
-  void initialize() {
-    sendbuff_ = reinterpret_cast<int *>(
-      device_allocator_->allocate(sizeof(int), stream_));
-    recvbuff_ = reinterpret_cast<int *>(
-      device_allocator_->allocate(sizeof(int), stream_));
+  void initialize()
+  {
+    sendbuff_ = reinterpret_cast<int*>(device_allocator_->allocate(sizeof(int), stream_));
+    recvbuff_ = reinterpret_cast<int*>(device_allocator_->allocate(sizeof(int), stream_));
   }
 
   int get_size() const { return num_ranks_; }
 
   int get_rank() const { return rank_; }
 
-  std::unique_ptr<comms_iface> comm_split(int color, int key) const {
+  std::unique_ptr<comms_iface> comm_split(int color, int key) const
+  {
     mr::device::buffer<int> d_colors(device_allocator_, stream_, get_size());
     mr::device::buffer<int> d_keys(device_allocator_, stream_, get_size());
 
     update_device(d_colors.data() + get_rank(), &color, 1, stream_);
     update_device(d_keys.data() + get_rank(), &key, 1, stream_);
 
-    allgather(d_colors.data() + get_rank(), d_colors.data(), 1,
-              datatype_t::INT32, stream_);
-    allgather(d_keys.data() + get_rank(), d_keys.data(), 1, datatype_t::INT32,
-              stream_);
+    allgather(d_colors.data() + get_rank(), d_colors.data(), 1, datatype_t::INT32, stream_);
+    allgather(d_keys.data() + get_rank(), d_keys.data(), 1, datatype_t::INT32, stream_);
     this->sync_stream(stream_);
 
     std::vector<int> h_colors(get_size());
@@ -139,9 +146,7 @@ class std_comms : public comms_iface {
     for (int i = 0; i < get_size(); ++i) {
       if (h_colors[i] == color) {
         subcomm_ranks.push_back(i);
-        if (ucp_worker_ != nullptr && subcomms_ucp_) {
-          new_ucx_ptrs.push_back((*ucp_eps_)[i]);
-        }
+        if (ucp_worker_ != nullptr && subcomms_ucp_) { new_ucx_ptrs.push_back((*ucp_eps_)[i]); }
       }
     }
 
@@ -150,8 +155,7 @@ class std_comms : public comms_iface {
       NCCL_TRY(ncclGetUniqueId(&id));
       std::vector<request_t> requests(subcomm_ranks.size() - 1);
       for (size_t i = 1; i < subcomm_ranks.size(); ++i) {
-        isend(&id, sizeof(ncclUniqueId), subcomm_ranks[i], color,
-              requests.data() + (i - 1));
+        isend(&id, sizeof(ncclUniqueId), subcomm_ranks[i], color, requests.data() + (i - 1));
       }
       waitall(requests.size(), requests.data());
     } else {
@@ -166,17 +170,23 @@ class std_comms : public comms_iface {
     NCCL_TRY(ncclCommInitRank(&nccl_comm, subcomm_ranks.size(), id, key));
 
     if (ucp_worker_ != nullptr && subcomms_ucp_) {
-      auto eps_sp = std::make_shared<ucp_ep_h *>(new_ucx_ptrs.data());
-      return std::unique_ptr<comms_iface>(new std_comms(
-        nccl_comm, (ucp_worker_h)ucp_worker_, eps_sp, subcomm_ranks.size(), key,
-        device_allocator_, stream_, subcomms_ucp_));
+      auto eps_sp = std::make_shared<ucp_ep_h*>(new_ucx_ptrs.data());
+      return std::unique_ptr<comms_iface>(new std_comms(nccl_comm,
+                                                        (ucp_worker_h)ucp_worker_,
+                                                        eps_sp,
+                                                        subcomm_ranks.size(),
+                                                        key,
+                                                        device_allocator_,
+                                                        stream_,
+                                                        subcomms_ucp_));
     } else {
-      return std::unique_ptr<comms_iface>(new std_comms(
-        nccl_comm, subcomm_ranks.size(), key, device_allocator_, stream_));
+      return std::unique_ptr<comms_iface>(
+        new std_comms(nccl_comm, subcomm_ranks.size(), key, device_allocator_, stream_));
     }
   }
 
-  void barrier() const {
+  void barrier() const
+  {
     CUDA_CHECK(cudaMemsetAsync(sendbuff_, 1, sizeof(int), stream_));
     CUDA_CHECK(cudaMemsetAsync(recvbuff_, 1, sizeof(int), stream_));
 
@@ -186,39 +196,37 @@ class std_comms : public comms_iface {
            "ERROR: syncStream failed. This can be caused by a failed rank_.");
   }
 
-  void get_request_id(request_t *req) const {
+  void get_request_id(request_t* req) const
+  {
     request_t req_id;
 
     if (this->free_requests_.empty())
       req_id = this->next_request_id_++;
     else {
       auto it = this->free_requests_.begin();
-      req_id = *it;
+      req_id  = *it;
       this->free_requests_.erase(it);
     }
     *req = req_id;
   }
 
-  void isend(const void *buf, size_t size, int dest, int tag,
-             request_t *request) const {
-    ASSERT(ucp_worker_ != nullptr,
-           "ERROR: UCX comms not initialized on communicator.");
+  void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const
+  {
+    ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator.");
 
     get_request_id(request);
     ucp_ep_h ep_ptr = (*ucp_eps_)[dest];
 
-    ucp_request *ucp_req = (ucp_request *)malloc(sizeof(ucp_request));
+    ucp_request* ucp_req = (ucp_request*)malloc(sizeof(ucp_request));
 
-    this->ucp_handler_.ucp_isend(ucp_req, ep_ptr, buf, size, tag,
-                                 default_tag_mask, get_rank());
+    this->ucp_handler_.ucp_isend(ucp_req, ep_ptr, buf, size, tag, default_tag_mask, get_rank());
 
     requests_in_flight_.insert(std::make_pair(*request, ucp_req));
   }
 
-  void irecv(void *buf, size_t size, int source, int tag,
-             request_t *request) const {
-    ASSERT(ucp_worker_ != nullptr,
-           "ERROR: UCX comms not initialized on communicator.");
+  void irecv(void* buf, size_t size, int source, int tag, request_t* request) const
+  {
+    ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator.");
 
     get_request_id(request);
 
@@ -226,18 +234,17 @@ class std_comms : public comms_iface {
 
     ucp_tag_t tag_mask = default_tag_mask;
 
-    ucp_request *ucp_req = (ucp_request *)malloc(sizeof(ucp_request));
-    ucp_handler_.ucp_irecv(ucp_req, ucp_worker_, ep_ptr, buf, size, tag,
-                           tag_mask, source);
+    ucp_request* ucp_req = (ucp_request*)malloc(sizeof(ucp_request));
+    ucp_handler_.ucp_irecv(ucp_req, ucp_worker_, ep_ptr, buf, size, tag, tag_mask, source);
 
     requests_in_flight_.insert(std::make_pair(*request, ucp_req));
   }
 
-  void waitall(int count, request_t array_of_requests[]) const {
-    ASSERT(ucp_worker_ != nullptr,
-           "ERROR: UCX comms not initialized on communicator.");
+  void waitall(int count, request_t array_of_requests[]) const
+  {
+    ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator.");
 
-    std::vector<ucp_request *> requests;
+    std::vector<ucp_request*> requests;
     requests.reserve(count);
 
     time_t start = time(NULL);
@@ -245,7 +252,8 @@ class std_comms : public comms_iface {
     for (int i = 0; i < count; ++i) {
       auto req_it = requests_in_flight_.find(array_of_requests[i]);
       ASSERT(requests_in_flight_.end() != req_it,
-             "ERROR: waitall on invalid request: %d", array_of_requests[i]);
+             "ERROR: waitall on invalid request: %d",
+             array_of_requests[i]);
       requests.push_back(req_it->second);
       free_requests_.insert(req_it->first);
       requests_in_flight_.erase(req_it);
@@ -258,8 +266,7 @@ class std_comms : public comms_iface {
       // in 10 or more seconds.
       ASSERT(now - start < 10, "Timed out waiting for requests.");
 
-      for (std::vector<ucp_request *>::iterator it = requests.begin();
-           it != requests.end();) {
+      for (std::vector<ucp_request*>::iterator it = requests.begin(); it != requests.end();) {
         bool restart = false;  // resets the timeout when any progress was made
 
         // Causes UCP to progress through the send/recv message queue
@@ -272,10 +279,8 @@ class std_comms : public comms_iface {
         // If the message needs release, we know it will be sent/received
         // asynchronously, so we will need to track and verify its state
         if (req->needs_release) {
-          ASSERT(UCS_PTR_IS_PTR(req->req),
-                 "UCX Request Error. Request is not valid UCX pointer");
-          ASSERT(!UCS_PTR_IS_ERR(req->req), "UCX Request Error: %d\n",
-                 UCS_PTR_STATUS(req->req));
+          ASSERT(UCS_PTR_IS_PTR(req->req), "UCX Request Error. Request is not valid UCX pointer");
+          ASSERT(!UCS_PTR_IS_ERR(req->req), "UCX Request Error: %d\n", UCS_PTR_STATUS(req->req));
           ASSERT(req->req->completed == 1 || req->req->completed == 0,
                  "request->completed not a valid value: %d\n",
                  req->req->completed);
@@ -296,94 +301,143 @@ class std_comms : public comms_iface {
           ++it;
         }
         // if any progress was made, reset the timeout start time
-        if (restart) {
-          start = time(NULL);
-        }
+        if (restart) { start = time(NULL); }
       }
     }
   }
 
-  void allreduce(const void *sendbuff, void *recvbuff, size_t count,
-                 datatype_t datatype, op_t op, cudaStream_t stream) const {
-    NCCL_TRY(ncclAllReduce(sendbuff, recvbuff, count,
-                           get_nccl_datatype(datatype), get_nccl_op(op),
-                           nccl_comm_, stream));
+  void allreduce(const void* sendbuff,
+                 void* recvbuff,
+                 size_t count,
+                 datatype_t datatype,
+                 op_t op,
+                 cudaStream_t stream) const
+  {
+    NCCL_TRY(ncclAllReduce(
+      sendbuff, recvbuff, count, get_nccl_datatype(datatype), get_nccl_op(op), nccl_comm_, stream));
   }
 
-  void bcast(void *buff, size_t count, datatype_t datatype, int root,
-             cudaStream_t stream) const {
-    NCCL_TRY(ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root,
-                           nccl_comm_, stream));
+  void bcast(void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const
+  {
+    NCCL_TRY(
+      ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, nccl_comm_, stream));
   }
 
-  void reduce(const void *sendbuff, void *recvbuff, size_t count,
-              datatype_t datatype, op_t op, int root,
-              cudaStream_t stream) const {
-    NCCL_TRY(ncclReduce(sendbuff, recvbuff, count, get_nccl_datatype(datatype),
-                        get_nccl_op(op), root, nccl_comm_, stream));
+  void reduce(const void* sendbuff,
+              void* recvbuff,
+              size_t count,
+              datatype_t datatype,
+              op_t op,
+              int root,
+              cudaStream_t stream) const
+  {
+    NCCL_TRY(ncclReduce(sendbuff,
+                        recvbuff,
+                        count,
+                        get_nccl_datatype(datatype),
+                        get_nccl_op(op),
+                        root,
+                        nccl_comm_,
+                        stream));
   }
 
-  void allgather(const void *sendbuff, void *recvbuff, size_t sendcount,
-                 datatype_t datatype, cudaStream_t stream) const {
-    NCCL_TRY(ncclAllGather(sendbuff, recvbuff, sendcount,
-                           get_nccl_datatype(datatype), nccl_comm_, stream));
+  void allgather(const void* sendbuff,
+                 void* recvbuff,
+                 size_t sendcount,
+                 datatype_t datatype,
+                 cudaStream_t stream) const
+  {
+    NCCL_TRY(ncclAllGather(
+      sendbuff, recvbuff, sendcount, get_nccl_datatype(datatype), nccl_comm_, stream));
   }
 
-  void allgatherv(const void *sendbuf, void *recvbuf, const size_t *recvcounts,
-                  const size_t *displs, datatype_t datatype,
-                  cudaStream_t stream) const {
-    //From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - https://arxiv.org/pdf/1812.05964.pdf
-    //Listing 1 on page 4.
+  void allgatherv(const void* sendbuf,
+                  void* recvbuf,
+                  const size_t* recvcounts,
+                  const size_t* displs,
+                  datatype_t datatype,
+                  cudaStream_t stream) const
+  {
+    // From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" -
+    // https://arxiv.org/pdf/1812.05964.pdf Listing 1 on page 4.
     for (int root = 0; root < num_ranks_; ++root) {
       size_t dtype_size = get_datatype_size(datatype);
-      NCCL_TRY(ncclBroadcast(
-        sendbuf, static_cast<char *>(recvbuf) + displs[root] * dtype_size,
-        recvcounts[root], get_nccl_datatype(datatype), root, nccl_comm_,
-        stream));
+      NCCL_TRY(ncclBroadcast(sendbuf,
+                             static_cast<char*>(recvbuf) + displs[root] * dtype_size,
+                             recvcounts[root],
+                             get_nccl_datatype(datatype),
+                             root,
+                             nccl_comm_,
+                             stream));
     }
   }
 
-  void gather(const void *sendbuff, void *recvbuff, size_t sendcount,
-              datatype_t datatype, int root, cudaStream_t stream) const {
+  void gather(const void* sendbuff,
+              void* recvbuff,
+              size_t sendcount,
+              datatype_t datatype,
+              int root,
+              cudaStream_t stream) const
+  {
     size_t dtype_size = get_datatype_size(datatype);
     NCCL_TRY(ncclGroupStart());
     if (get_rank() == root) {
       for (int r = 0; r < get_size(); ++r) {
-        NCCL_TRY(ncclRecv(
-          static_cast<char *>(recvbuff) + sendcount * r * dtype_size, sendcount,
-          get_nccl_datatype(datatype), r, nccl_comm_, stream));
+        NCCL_TRY(ncclRecv(static_cast<char*>(recvbuff) + sendcount * r * dtype_size,
+                          sendcount,
+                          get_nccl_datatype(datatype),
+                          r,
+                          nccl_comm_,
+                          stream));
       }
     }
-    NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root,
-                      nccl_comm_, stream));
+    NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream));
     NCCL_TRY(ncclGroupEnd());
   }
 
-  void gatherv(const void *sendbuff, void *recvbuff, size_t sendcount,
-               const size_t *recvcounts, const size_t *displs,
-               datatype_t datatype, int root, cudaStream_t stream) const {
+  void gatherv(const void* sendbuff,
+               void* recvbuff,
+               size_t sendcount,
+               const size_t* recvcounts,
+               const size_t* displs,
+               datatype_t datatype,
+               int root,
+               cudaStream_t stream) const
+  {
     size_t dtype_size = get_datatype_size(datatype);
     NCCL_TRY(ncclGroupStart());
     if (get_rank() == root) {
       for (int r = 0; r < get_size(); ++r) {
-        NCCL_TRY(ncclRecv(
-          static_cast<char *>(recvbuff) + displs[r] * dtype_size, recvcounts[r],
-          get_nccl_datatype(datatype), r, nccl_comm_, stream));
+        NCCL_TRY(ncclRecv(static_cast<char*>(recvbuff) + displs[r] * dtype_size,
+                          recvcounts[r],
+                          get_nccl_datatype(datatype),
+                          r,
+                          nccl_comm_,
+                          stream));
       }
     }
-    NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root,
-                      nccl_comm_, stream));
+    NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream));
     NCCL_TRY(ncclGroupEnd());
   }
 
-  void reducescatter(const void *sendbuff, void *recvbuff, size_t recvcount,
-                     datatype_t datatype, op_t op, cudaStream_t stream) const {
-    NCCL_TRY(ncclReduceScatter(sendbuff, recvbuff, recvcount,
-                               get_nccl_datatype(datatype), get_nccl_op(op),
-                               nccl_comm_, stream));
+  void reducescatter(const void* sendbuff,
+                     void* recvbuff,
+                     size_t recvcount,
+                     datatype_t datatype,
+                     op_t op,
+                     cudaStream_t stream) const
+  {
+    NCCL_TRY(ncclReduceScatter(sendbuff,
+                               recvbuff,
+                               recvcount,
+                               get_nccl_datatype(datatype),
+                               get_nccl_op(op),
+                               nccl_comm_,
+                               stream));
   }
 
-  status_t sync_stream(cudaStream_t stream) const {
+  status_t sync_stream(cudaStream_t stream) const
+  {
     cudaError_t cudaErr;
     ncclResult_t ncclErr, ncclAsyncErr;
     while (1) {
@@ -416,45 +470,58 @@ class std_comms : public comms_iface {
   }
 
   // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock
-  void device_send(const void *buf, size_t size, int dest,
-                   cudaStream_t stream) const {
+  void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const
+  {
     NCCL_TRY(ncclSend(buf, size, ncclUint8, dest, nccl_comm_, stream));
   }
 
   // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock
-  void device_recv(void *buf, size_t size, int source,
-                   cudaStream_t stream) const {
+  void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const
+  {
     NCCL_TRY(ncclRecv(buf, size, ncclUint8, source, nccl_comm_, stream));
   }
 
-  void device_sendrecv(const void *sendbuf, size_t sendsize, int dest,
-                       void *recvbuf, size_t recvsize, int source,
-                       cudaStream_t stream) const {
+  void device_sendrecv(const void* sendbuf,
+                       size_t sendsize,
+                       int dest,
+                       void* recvbuf,
+                       size_t recvsize,
+                       int source,
+                       cudaStream_t stream) const
+  {
     // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock
     NCCL_TRY(ncclGroupStart());
     NCCL_TRY(ncclSend(sendbuf, sendsize, ncclUint8, dest, nccl_comm_, stream));
-    NCCL_TRY(
-      ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream));
+    NCCL_TRY(ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream));
     NCCL_TRY(ncclGroupEnd());
   }
 
-  void device_multicast_sendrecv(const void *sendbuf,
-                                 std::vector<size_t> const &sendsizes,
-                                 std::vector<size_t> const &sendoffsets,
-                                 std::vector<int> const &dests, void *recvbuf,
-                                 std::vector<size_t> const &recvsizes,
-                                 std::vector<size_t> const &recvoffsets,
-                                 std::vector<int> const &sources,
-                                 cudaStream_t stream) const {
+  void device_multicast_sendrecv(const void* sendbuf,
+                                 std::vector<size_t> const& sendsizes,
+                                 std::vector<size_t> const& sendoffsets,
+                                 std::vector<int> const& dests,
+                                 void* recvbuf,
+                                 std::vector<size_t> const& recvsizes,
+                                 std::vector<size_t> const& recvoffsets,
+                                 std::vector<int> const& sources,
+                                 cudaStream_t stream) const
+  {
     // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock
     NCCL_TRY(ncclGroupStart());
     for (size_t i = 0; i < sendsizes.size(); ++i) {
-      NCCL_TRY(ncclSend(static_cast<const char *>(sendbuf) + sendoffsets[i],
-                        sendsizes[i], ncclUint8, dests[i], nccl_comm_, stream));
+      NCCL_TRY(ncclSend(static_cast<const char*>(sendbuf) + sendoffsets[i],
+                        sendsizes[i],
+                        ncclUint8,
+                        dests[i],
+                        nccl_comm_,
+                        stream));
     }
     for (size_t i = 0; i < recvsizes.size(); ++i) {
-      NCCL_TRY(ncclRecv(static_cast<char *>(recvbuf) + recvoffsets[i],
-                        recvsizes[i], ncclUint8, sources[i], nccl_comm_,
+      NCCL_TRY(ncclRecv(static_cast<char*>(recvbuf) + recvoffsets[i],
+                        recvsizes[i],
+                        ncclUint8,
+                        sources[i],
+                        nccl_comm_,
                         stream));
     }
     NCCL_TRY(ncclGroupEnd());
@@ -473,10 +540,9 @@ class std_comms : public comms_iface {
 
   comms_ucp_handler ucp_handler_;
   ucp_worker_h ucp_worker_;
-  std::shared_ptr<ucp_ep_h *> ucp_eps_;
+  std::shared_ptr<ucp_ep_h*> ucp_eps_;
   mutable request_t next_request_id_;
-  mutable std::unordered_map<request_t, struct ucp_request *>
-    requests_in_flight_;
+  mutable std::unordered_map<request_t, struct ucp_request*> requests_in_flight_;
   mutable std::unordered_set<request_t> free_requests_;
 
   std::shared_ptr<mr::device::allocator> device_allocator_;
diff --git a/cpp/include/raft/comms/test.hpp b/cpp/include/raft/comms/test.hpp
index 4e95c4eef0..86827a294e 100644
--- a/cpp/include/raft/comms/test.hpp
+++ b/cpp/include/raft/comms/test.hpp
@@ -37,8 +37,9 @@ namespace comms {
  * @param the raft handle to use. This is expected to already have an
  *        initialized comms instance.
  */
-bool test_collective_allreduce(const handle_t &handle, int root) {
-  comms_t const &communicator = handle.get_comms();
+bool test_collective_allreduce(const handle_t& handle, int root)
+{
+  comms_t const& communicator = handle.get_comms();
 
   int const send = 1;
 
@@ -46,14 +47,12 @@ bool test_collective_allreduce(const handle_t &handle, int root) {
 
   raft::mr::device::buffer<int> temp_d(handle.get_device_allocator(), stream);
   temp_d.resize(1, stream);
-  CUDA_CHECK(
-    cudaMemcpyAsync(temp_d.data(), &send, 1, cudaMemcpyHostToDevice, stream));
+  CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, 1, cudaMemcpyHostToDevice, stream));
 
   communicator.allreduce(temp_d.data(), temp_d.data(), 1, op_t::SUM, stream);
 
   int temp_h = 0;
-  CUDA_CHECK(
-    cudaMemcpyAsync(&temp_h, temp_d.data(), 1, cudaMemcpyDeviceToHost, stream));
+  CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), 1, cudaMemcpyDeviceToHost, stream));
   CUDA_CHECK(cudaStreamSynchronize(stream));
   communicator.barrier();
 
@@ -69,8 +68,9 @@ bool test_collective_allreduce(const handle_t &handle, int root) {
  * @param the raft handle to use. This is expected to already have an
  *        initialized comms instance.
  */
-bool test_collective_broadcast(const handle_t &handle, int root) {
-  comms_t const &communicator = handle.get_comms();
+bool test_collective_broadcast(const handle_t& handle, int root)
+{
+  comms_t const& communicator = handle.get_comms();
 
   int const send = root;
 
@@ -80,14 +80,12 @@ bool test_collective_broadcast(const handle_t &handle, int root) {
   temp_d.resize(1, stream);
 
   if (communicator.get_rank() == root)
-    CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int),
-                               cudaMemcpyHostToDevice, stream));
+    CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream));
 
   communicator.bcast(temp_d.data(), 1, root, stream);
   communicator.sync_stream(stream);
   int temp_h = -1;  // Verify more than one byte is being sent
-  CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int),
-                             cudaMemcpyDeviceToHost, stream));
+  CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream));
   CUDA_CHECK(cudaStreamSynchronize(stream));
   communicator.barrier();
 
@@ -97,8 +95,9 @@ bool test_collective_broadcast(const handle_t &handle, int root) {
   return temp_h == root;
 }
 
-bool test_collective_reduce(const handle_t &handle, int root) {
-  comms_t const &communicator = handle.get_comms();
+bool test_collective_reduce(const handle_t& handle, int root)
+{
+  comms_t const& communicator = handle.get_comms();
 
   int const send = root;
 
@@ -107,14 +106,12 @@ bool test_collective_reduce(const handle_t &handle, int root) {
   raft::mr::device::buffer<int> temp_d(handle.get_device_allocator(), stream);
   temp_d.resize(1, stream);
 
-  CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int),
-                             cudaMemcpyHostToDevice, stream));
+  CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream));
 
   communicator.reduce(temp_d.data(), temp_d.data(), 1, op_t::SUM, root, stream);
   communicator.sync_stream(stream);
   int temp_h = -1;  // Verify more than one byte is being sent
-  CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int),
-                             cudaMemcpyDeviceToHost, stream));
+  CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream));
   CUDA_CHECK(cudaStreamSynchronize(stream));
   communicator.barrier();
 
@@ -127,8 +124,9 @@ bool test_collective_reduce(const handle_t &handle, int root) {
     return true;
 }
 
-bool test_collective_allgather(const handle_t &handle, int root) {
-  comms_t const &communicator = handle.get_comms();
+bool test_collective_allgather(const handle_t& handle, int root)
+{
+  comms_t const& communicator = handle.get_comms();
 
   int const send = communicator.get_rank();
 
@@ -137,19 +135,16 @@ bool test_collective_allgather(const handle_t &handle, int root) {
   raft::mr::device::buffer<int> temp_d(handle.get_device_allocator(), stream);
   temp_d.resize(1, stream);
 
-  raft::mr::device::buffer<int> recv_d(handle.get_device_allocator(), stream,
-                                       communicator.get_size());
+  raft::mr::device::buffer<int> recv_d(
+    handle.get_device_allocator(), stream, communicator.get_size());
 
-  CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int),
-                             cudaMemcpyHostToDevice, stream));
+  CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream));
 
   communicator.allgather(temp_d.data(), recv_d.data(), 1, stream);
   communicator.sync_stream(stream);
-  int
-    temp_h[communicator.get_size()];  // Verify more than one byte is being sent
-  CUDA_CHECK(cudaMemcpyAsync(&temp_h, recv_d.data(),
-                             sizeof(int) * communicator.get_size(),
-                             cudaMemcpyDeviceToHost, stream));
+  int temp_h[communicator.get_size()];  // Verify more than one byte is being sent
+  CUDA_CHECK(cudaMemcpyAsync(
+    &temp_h, recv_d.data(), sizeof(int) * communicator.get_size(), cudaMemcpyDeviceToHost, stream));
   CUDA_CHECK(cudaStreamSynchronize(stream));
   communicator.barrier();
 
@@ -162,8 +157,9 @@ bool test_collective_allgather(const handle_t &handle, int root) {
   return true;
 }
 
-bool test_collective_gather(const handle_t &handle, int root) {
-  comms_t const &communicator = handle.get_comms();
+bool test_collective_gather(const handle_t& handle, int root)
+{
+  comms_t const& communicator = handle.get_comms();
 
   int const send = communicator.get_rank();
 
@@ -173,20 +169,19 @@ bool test_collective_gather(const handle_t &handle, int root) {
   temp_d.resize(1, stream);
 
   raft::mr::device::buffer<int> recv_d(
-    handle.get_device_allocator(), stream,
+    handle.get_device_allocator(),
+    stream,
     communicator.get_rank() == root ? communicator.get_size() : 0);
 
-  CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int),
-                             cudaMemcpyHostToDevice, stream));
+  CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream));
 
   communicator.gather(temp_d.data(), recv_d.data(), 1, root, stream);
   communicator.sync_stream(stream);
 
   if (communicator.get_rank() == root) {
     std::vector<int> temp_h(communicator.get_size(), 0);
-    CUDA_CHECK(cudaMemcpyAsync(temp_h.data(), recv_d.data(),
-                               sizeof(int) * temp_h.size(),
-                               cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaMemcpyAsync(
+      temp_h.data(), recv_d.data(), sizeof(int) * temp_h.size(), cudaMemcpyDeviceToHost, stream));
     CUDA_CHECK(cudaStreamSynchronize(stream));
 
     for (int i = 0; i < communicator.get_size(); i++) {
@@ -196,46 +191,48 @@ bool test_collective_gather(const handle_t &handle, int root) {
   return true;
 }
 
-bool test_collective_gatherv(const handle_t &handle, int root) {
-  comms_t const &communicator = handle.get_comms();
+bool test_collective_gatherv(const handle_t& handle, int root)
+{
+  comms_t const& communicator = handle.get_comms();
 
   std::vector<size_t> sendcounts(communicator.get_size());
   std::iota(sendcounts.begin(), sendcounts.end(), size_t{1});
   std::vector<size_t> displacements(communicator.get_size() + 1, 0);
-  std::partial_sum(sendcounts.begin(), sendcounts.end(),
-                   displacements.begin() + 1);
+  std::partial_sum(sendcounts.begin(), sendcounts.end(), displacements.begin() + 1);
 
-  std::vector<int> sends(displacements[communicator.get_rank() + 1] -
-                           displacements[communicator.get_rank()],
-                         communicator.get_rank());
+  std::vector<int> sends(
+    displacements[communicator.get_rank() + 1] - displacements[communicator.get_rank()],
+    communicator.get_rank());
 
   cudaStream_t stream = handle.get_stream();
 
   raft::mr::device::buffer<int> temp_d(handle.get_device_allocator(), stream);
   temp_d.resize(sends.size(), stream);
 
-  raft::mr::device::buffer<int> recv_d(
-    handle.get_device_allocator(), stream,
-    communicator.get_rank() == root ? displacements.back() : 0);
+  raft::mr::device::buffer<int> recv_d(handle.get_device_allocator(),
+                                       stream,
+                                       communicator.get_rank() == root ? displacements.back() : 0);
 
-  CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(),
-                             sends.size() * sizeof(int), cudaMemcpyHostToDevice,
-                             stream));
+  CUDA_CHECK(cudaMemcpyAsync(
+    temp_d.data(), sends.data(), sends.size() * sizeof(int), cudaMemcpyHostToDevice, stream));
 
   communicator.gatherv(
-    temp_d.data(), recv_d.data(), temp_d.size(),
-    communicator.get_rank() == root ? sendcounts.data()
-                                    : static_cast<size_t *>(nullptr),
-    communicator.get_rank() == root ? displacements.data()
-                                    : static_cast<size_t *>(nullptr),
-    root, stream);
+    temp_d.data(),
+    recv_d.data(),
+    temp_d.size(),
+    communicator.get_rank() == root ? sendcounts.data() : static_cast<size_t*>(nullptr),
+    communicator.get_rank() == root ? displacements.data() : static_cast<size_t*>(nullptr),
+    root,
+    stream);
   communicator.sync_stream(stream);
 
   if (communicator.get_rank() == root) {
     std::vector<int> temp_h(displacements.back(), 0);
-    CUDA_CHECK(cudaMemcpyAsync(temp_h.data(), recv_d.data(),
+    CUDA_CHECK(cudaMemcpyAsync(temp_h.data(),
+                               recv_d.data(),
                                sizeof(int) * displacements.back(),
-                               cudaMemcpyDeviceToHost, stream));
+                               cudaMemcpyDeviceToHost,
+                               stream));
     CUDA_CHECK(cudaStreamSynchronize(stream));
 
     for (int i = 0; i < communicator.get_size(); i++) {
@@ -249,28 +246,24 @@ bool test_collective_gatherv(const handle_t &handle, int root) {
   return true;
 }
 
-bool test_collective_reducescatter(const handle_t &handle, int root) {
-  comms_t const &communicator = handle.get_comms();
+bool test_collective_reducescatter(const handle_t& handle, int root)
+{
+  comms_t const& communicator = handle.get_comms();
 
   std::vector<int> sends(communicator.get_size(), 1);
 
   cudaStream_t stream = handle.get_stream();
 
-  raft::mr::device::buffer<int> temp_d(handle.get_device_allocator(), stream,
-                                       sends.size());
-  raft::mr::device::buffer<int> recv_d(handle.get_device_allocator(), stream,
-                                       1);
+  raft::mr::device::buffer<int> temp_d(handle.get_device_allocator(), stream, sends.size());
+  raft::mr::device::buffer<int> recv_d(handle.get_device_allocator(), stream, 1);
 
-  CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(),
-                             sends.size() * sizeof(int), cudaMemcpyHostToDevice,
-                             stream));
+  CUDA_CHECK(cudaMemcpyAsync(
+    temp_d.data(), sends.data(), sends.size() * sizeof(int), cudaMemcpyHostToDevice, stream));
 
-  communicator.reducescatter(temp_d.data(), recv_d.data(), 1, op_t::SUM,
-                             stream);
+  communicator.reducescatter(temp_d.data(), recv_d.data(), 1, op_t::SUM, stream);
   communicator.sync_stream(stream);
   int temp_h = -1;  // Verify more than one byte is being sent
-  CUDA_CHECK(cudaMemcpyAsync(&temp_h, recv_d.data(), sizeof(int),
-                             cudaMemcpyDeviceToHost, stream));
+  CUDA_CHECK(cudaMemcpyAsync(&temp_h, recv_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream));
   CUDA_CHECK(cudaStreamSynchronize(stream));
   communicator.barrier();
 
@@ -287,9 +280,10 @@ bool test_collective_reducescatter(const handle_t &handle, int root) {
  *        initialized comms instance.
  * @param number of iterations of all-to-all messaging to perform
  */
-bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) {
-  comms_t const &communicator = h.get_comms();
-  int const rank = communicator.get_rank();
+bool test_pointToPoint_simple_send_recv(const handle_t& h, int numTrials)
+{
+  comms_t const& communicator = h.get_comms();
+  int const rank              = communicator.get_rank();
 
   bool ret = true;
   for (int i = 0; i < numTrials; i++) {
@@ -298,11 +292,11 @@ bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) {
     std::vector<request_t> requests;
     requests.resize(2 * (communicator.get_size() - 1));
     int request_idx = 0;
-    //post receives
+    // post receives
     for (int r = 0; r < communicator.get_size(); ++r) {
       if (r != rank) {
-        communicator.irecv(received_data.data() + request_idx, 1, r, 0,
-                           requests.data() + request_idx);
+        communicator.irecv(
+          received_data.data() + request_idx, 1, r, 0, requests.data() + request_idx);
         ++request_idx;
       }
     }
@@ -338,8 +332,7 @@ bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) {
       communicator.barrier();
     }
 
-    if (communicator.get_rank() == 0)
-      std::cout << "=========================" << std::endl;
+    if (communicator.get_rank() == 0) std::cout << "=========================" << std::endl;
   }
 
   return ret;
@@ -352,10 +345,11 @@ bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) {
  *        initialized comms instance.
  * @param number of iterations of send or receive messaging to perform
  */
-bool test_pointToPoint_device_send_or_recv(const handle_t &h, int numTrials) {
-  comms_t const &communicator = h.get_comms();
-  int const rank = communicator.get_rank();
-  cudaStream_t stream = h.get_stream();
+bool test_pointToPoint_device_send_or_recv(const handle_t& h, int numTrials)
+{
+  comms_t const& communicator = h.get_comms();
+  int const rank              = communicator.get_rank();
+  cudaStream_t stream         = h.get_stream();
 
   bool ret = true;
   for (int i = 0; i < numTrials; i++) {
@@ -378,13 +372,9 @@ bool test_pointToPoint_device_send_or_recv(const handle_t &h, int numTrials) {
 
     communicator.sync_stream(stream);
 
-    if (!sender && received_data.value(stream) != rank - 1) {
-      ret = false;
-    }
+    if (!sender && received_data.value(stream) != rank - 1) { ret = false; }
 
-    if (communicator.get_rank() == 0) {
-      std::cout << "=========================" << std::endl;
-    }
+    if (communicator.get_rank() == 0) { std::cout << "=========================" << std::endl; }
   }
 
   return ret;
@@ -397,10 +387,11 @@ bool test_pointToPoint_device_send_or_recv(const handle_t &h, int numTrials) {
  *        initialized comms instance.
  * @param number of iterations of send or receive messaging to perform
  */
-bool test_pointToPoint_device_sendrecv(const handle_t &h, int numTrials) {
-  comms_t const &communicator = h.get_comms();
-  int const rank = communicator.get_rank();
-  cudaStream_t stream = h.get_stream();
+bool test_pointToPoint_device_sendrecv(const handle_t& h, int numTrials)
+{
+  comms_t const& communicator = h.get_comms();
+  int const rank              = communicator.get_rank();
+  cudaStream_t stream         = h.get_stream();
 
   bool ret = true;
   for (int i = 0; i < numTrials; i++) {
@@ -414,12 +405,12 @@ bool test_pointToPoint_device_sendrecv(const handle_t &h, int numTrials) {
 
     if (rank % 2 == 0) {
       if (rank + 1 < communicator.get_size()) {
-        communicator.device_sendrecv(sent_data.data(), 1, rank + 1,
-                                     received_data.data(), 1, rank + 1, stream);
+        communicator.device_sendrecv(
+          sent_data.data(), 1, rank + 1, received_data.data(), 1, rank + 1, stream);
       }
     } else {
-      communicator.device_sendrecv(sent_data.data(), 1, rank - 1,
-                                   received_data.data(), 1, rank - 1, stream);
+      communicator.device_sendrecv(
+        sent_data.data(), 1, rank - 1, received_data.data(), 1, rank - 1, stream);
     }
 
     communicator.sync_stream(stream);
@@ -429,9 +420,7 @@ bool test_pointToPoint_device_sendrecv(const handle_t &h, int numTrials) {
       ret = false;
     }
 
-    if (communicator.get_rank() == 0) {
-      std::cout << "=========================" << std::endl;
-    }
+    if (communicator.get_rank() == 0) { std::cout << "=========================" << std::endl; }
   }
 
   return ret;
@@ -444,11 +433,11 @@ bool test_pointToPoint_device_sendrecv(const handle_t &h, int numTrials) {
  *        initialized comms instance.
  * @param number of iterations of send or receive messaging to perform
  */
-bool test_pointToPoint_device_multicast_sendrecv(const handle_t &h,
-                                                 int numTrials) {
-  comms_t const &communicator = h.get_comms();
-  int const rank = communicator.get_rank();
-  cudaStream_t stream = h.get_stream();
+bool test_pointToPoint_device_multicast_sendrecv(const handle_t& h, int numTrials)
+{
+  comms_t const& communicator = h.get_comms();
+  int const rank              = communicator.get_rank();
+  cudaStream_t stream         = h.get_stream();
 
   bool ret = true;
   for (int i = 0; i < numTrials; i++) {
@@ -471,25 +460,26 @@ bool test_pointToPoint_device_multicast_sendrecv(const handle_t &h,
     std::vector<int> srcs(communicator.get_size());
     std::iota(srcs.begin(), srcs.end(), int{0});
 
-    communicator.device_multicast_sendrecv(
-      sent_data.data(), sendsizes, sendoffsets, dests, received_data.data(),
-      recvsizes, recvoffsets, srcs, stream);
+    communicator.device_multicast_sendrecv(sent_data.data(),
+                                           sendsizes,
+                                           sendoffsets,
+                                           dests,
+                                           received_data.data(),
+                                           recvsizes,
+                                           recvoffsets,
+                                           srcs,
+                                           stream);
 
     communicator.sync_stream(stream);
 
     std::vector<int> h_received_data(communicator.get_size());
-    raft::update_host(h_received_data.data(), received_data.data(),
-                      received_data.size(), stream);
+    raft::update_host(h_received_data.data(), received_data.data(), received_data.size(), stream);
     CUDA_TRY(cudaStreamSynchronize(stream));
     for (int i = 0; i < communicator.get_size(); ++i) {
-      if (h_received_data[i] != i) {
-        ret = false;
-      }
+      if (h_received_data[i] != i) { ret = false; }
     }
 
-    if (communicator.get_rank() == 0) {
-      std::cout << "=========================" << std::endl;
-    }
+    if (communicator.get_rank() == 0) { std::cout << "=========================" << std::endl; }
   }
 
   return ret;
@@ -502,20 +492,20 @@ bool test_pointToPoint_device_multicast_sendrecv(const handle_t &h,
  *        initialized comms instance.
  * @param n_colors number of different colors to test
  */
-bool test_commsplit(const handle_t &h, int n_colors) {
-  comms_t const &communicator = h.get_comms();
-  int const rank = communicator.get_rank();
-  int const size = communicator.get_size();
+bool test_commsplit(const handle_t& h, int n_colors)
+{
+  comms_t const& communicator = h.get_comms();
+  int const rank              = communicator.get_rank();
+  int const size              = communicator.get_size();
 
   if (n_colors > size) n_colors = size;
 
   // first we need to assign to a color, then assign the rank within the color
   int color = rank % n_colors;
-  int key = rank / n_colors;
+  int key   = rank / n_colors;
 
   handle_t new_handle(1);
-  auto shared_comm =
-    std::make_shared<comms_t>(communicator.comm_split(color, key));
+  auto shared_comm = std::make_shared<comms_t>(communicator.comm_split(color, key));
   new_handle.set_comms(shared_comm);
 
   return test_collective_allreduce(new_handle, 0);
diff --git a/cpp/include/raft/comms/ucp_helper.hpp b/cpp/include/raft/comms/ucp_helper.hpp
index 226b6f0527..89c7b25630 100644
--- a/cpp/include/raft/comms/ucp_helper.hpp
+++ b/cpp/include/raft/comms/ucp_helper.hpp
@@ -25,16 +25,19 @@
 namespace raft {
 namespace comms {
 
-typedef void (*dlsym_print_info)(ucp_ep_h, FILE *);
-typedef void (*dlsym_rec_free)(void *);
+typedef void (*dlsym_print_info)(ucp_ep_h, FILE*);
+typedef void (*dlsym_rec_free)(void*);
 typedef int (*dlsym_worker_progress)(ucp_worker_h);
 
-typedef ucs_status_ptr_t (*dlsym_send)(ucp_ep_h, const void *, size_t,
-                                       ucp_datatype_t, ucp_tag_t,
-                                       ucp_send_callback_t);
-typedef ucs_status_ptr_t (*dlsym_recv)(ucp_worker_h, void *, size_t count,
-                                       ucp_datatype_t datatype, ucp_tag_t,
-                                       ucp_tag_t, ucp_tag_recv_callback_t);
+typedef ucs_status_ptr_t (*dlsym_send)(
+  ucp_ep_h, const void*, size_t, ucp_datatype_t, ucp_tag_t, ucp_send_callback_t);
+typedef ucs_status_ptr_t (*dlsym_recv)(ucp_worker_h,
+                                       void*,
+                                       size_t count,
+                                       ucp_datatype_t datatype,
+                                       ucp_tag_t,
+                                       ucp_tag_t,
+                                       ucp_tag_recv_callback_t);
 
 /**
  * Standard UCX request object that will be passed
@@ -55,9 +58,9 @@ struct ucx_context {
  */
 class ucp_request {
  public:
-  struct ucx_context *req;
-  bool needs_release = true;
-  int other_rank = -1;
+  struct ucx_context* req;
+  bool needs_release   = true;
+  int other_rank       = -1;
   bool is_send_request = false;
 };
 
@@ -67,18 +70,19 @@ static const ucp_tag_t default_tag_mask = -1;
 /**
  * @brief Asynchronous send callback sets request to completed
  */
-static void send_callback(void *request, ucs_status_t status) {
-  struct ucx_context *context = (struct ucx_context *)request;
-  context->completed = 1;
+static void send_callback(void* request, ucs_status_t status)
+{
+  struct ucx_context* context = (struct ucx_context*)request;
+  context->completed          = 1;
 }
 
 /**
  * @brief Asynchronous recv callback sets request to completed
  */
-static void recv_callback(void *request, ucs_status_t status,
-                          ucp_tag_recv_info_t *info) {
-  struct ucx_context *context = (struct ucx_context *)request;
-  context->completed = 1;
+static void recv_callback(void* request, ucs_status_t status, ucp_tag_recv_info_t* info)
+{
+  struct ucx_context* context = (struct ucx_context*)request;
+  context->completed          = 1;
 }
 
 /**
@@ -87,7 +91,8 @@ static void recv_callback(void *request, ucs_status_t status,
  */
 class comms_ucp_handler {
  public:
-  comms_ucp_handler() {
+  comms_ucp_handler()
+  {
     load_ucp_handle();
     load_send_func();
     load_recv_func();
@@ -99,7 +104,7 @@ class comms_ucp_handler {
   ~comms_ucp_handler() { dlclose(ucp_handle); }
 
  private:
-  void *ucp_handle;
+  void* ucp_handle;
 
   dlsym_print_info print_info_func;
   dlsym_rec_free req_free_func;
@@ -107,7 +112,8 @@ class comms_ucp_handler {
   dlsym_send send_func;
   dlsym_recv recv_func;
 
-  void load_ucp_handle() {
+  void load_ucp_handle()
+  {
     ucp_handle = dlopen("libucp.so", RTLD_LAZY | RTLD_NOLOAD | RTLD_NODELETE);
     if (!ucp_handle) {
       ucp_handle = dlopen("libucp.so", RTLD_LAZY | RTLD_NODELETE);
@@ -117,51 +123,56 @@ class comms_ucp_handler {
     dlerror();
   }
 
-  void assert_dlerror() {
-    char *error = dlerror();
+  void assert_dlerror()
+  {
+    char* error = dlerror();
     ASSERT(error == NULL, "Error loading function symbol: %s\n", error);
   }
 
-  void load_send_func() {
+  void load_send_func()
+  {
     send_func = (dlsym_send)dlsym(ucp_handle, "ucp_tag_send_nb");
     assert_dlerror();
   }
 
-  void load_free_req_func() {
+  void load_free_req_func()
+  {
     req_free_func = (dlsym_rec_free)dlsym(ucp_handle, "ucp_request_free");
     assert_dlerror();
   }
 
-  void load_print_info_func() {
+  void load_print_info_func()
+  {
     print_info_func = (dlsym_print_info)dlsym(ucp_handle, "ucp_ep_print_info");
     assert_dlerror();
   }
 
-  void load_worker_progress_func() {
-    worker_progress_func =
-      (dlsym_worker_progress)dlsym(ucp_handle, "ucp_worker_progress");
+  void load_worker_progress_func()
+  {
+    worker_progress_func = (dlsym_worker_progress)dlsym(ucp_handle, "ucp_worker_progress");
     assert_dlerror();
   }
 
-  void load_recv_func() {
+  void load_recv_func()
+  {
     recv_func = (dlsym_recv)dlsym(ucp_handle, "ucp_tag_recv_nb");
     assert_dlerror();
   }
 
-  ucp_tag_t build_message_tag(int rank, int tag) const {
+  ucp_tag_t build_message_tag(int rank, int tag) const
+  {
     // keeping the rank in the lower bits enables debugging.
     return ((uint32_t)tag << 31) | (uint32_t)rank;
   }
 
  public:
-  int ucp_progress(ucp_worker_h worker) const {
-    return (*(worker_progress_func))(worker);
-  }
+  int ucp_progress(ucp_worker_h worker) const { return (*(worker_progress_func))(worker); }
 
   /**
    * @brief Frees any memory underlying the given ucp request object
    */
-  void free_ucp_request(ucp_request *request) const {
+  void free_ucp_request(ucp_request* request) const
+  {
     if (request->needs_release) {
       request->req->completed = 0;
       (*(req_free_func))(request->req);
@@ -172,56 +183,67 @@ class comms_ucp_handler {
   /**
    * @brief Asynchronously send data to the given endpoint using the given tag
    */
-  void ucp_isend(ucp_request *req, ucp_ep_h ep_ptr, const void *buf,
-                 size_t size, int tag, ucp_tag_t tag_mask, int rank) const {
+  void ucp_isend(ucp_request* req,
+                 ucp_ep_h ep_ptr,
+                 const void* buf,
+                 size_t size,
+                 int tag,
+                 ucp_tag_t tag_mask,
+                 int rank) const
+  {
     ucp_tag_t ucp_tag = build_message_tag(rank, tag);
 
-    ucs_status_ptr_t send_result = (*(send_func))(
-      ep_ptr, buf, size, ucp_dt_make_contig(1), ucp_tag, send_callback);
-    struct ucx_context *ucp_req = (struct ucx_context *)send_result;
+    ucs_status_ptr_t send_result =
+      (*(send_func))(ep_ptr, buf, size, ucp_dt_make_contig(1), ucp_tag, send_callback);
+    struct ucx_context* ucp_req = (struct ucx_context*)send_result;
 
     if (UCS_PTR_IS_ERR(send_result)) {
       ASSERT(!UCS_PTR_IS_ERR(send_result),
              "unable to send UCX data message (%d)\n",
              UCS_PTR_STATUS(send_result));
       /**
-     * If the request didn't fail, but it's not OK, it is in flight.
-     * Expect the handler to be invoked
-     */
+       * If the request didn't fail, but it's not OK, it is in flight.
+       * Expect the handler to be invoked
+       */
     } else if (UCS_PTR_STATUS(send_result) != UCS_OK) {
       /**
-      * If the request is OK, it's already been completed and we don't need to wait on it.
-      * The request will be a nullptr, however, so we need to create a new request
-      * and set it to completed to make the "waitall()" function work properly.
-      */
+       * If the request is OK, it's already been completed and we don't need to wait on it.
+       * The request will be a nullptr, however, so we need to create a new request
+       * and set it to completed to make the "waitall()" function work properly.
+       */
       req->needs_release = true;
     } else {
       req->needs_release = false;
     }
 
-    req->other_rank = rank;
+    req->other_rank      = rank;
     req->is_send_request = true;
-    req->req = ucp_req;
+    req->req             = ucp_req;
   }
 
   /**
    * @brief Asynchronously receive data from given endpoint with the given tag.
    */
-  void ucp_irecv(ucp_request *req, ucp_worker_h worker, ucp_ep_h ep_ptr,
-                 void *buf, size_t size, int tag, ucp_tag_t tag_mask,
-                 int sender_rank) const {
+  void ucp_irecv(ucp_request* req,
+                 ucp_worker_h worker,
+                 ucp_ep_h ep_ptr,
+                 void* buf,
+                 size_t size,
+                 int tag,
+                 ucp_tag_t tag_mask,
+                 int sender_rank) const
+  {
     ucp_tag_t ucp_tag = build_message_tag(sender_rank, tag);
 
     ucs_status_ptr_t recv_result =
-      (*(recv_func))(worker, buf, size, ucp_dt_make_contig(1), ucp_tag,
-                     tag_mask, recv_callback);
+      (*(recv_func))(worker, buf, size, ucp_dt_make_contig(1), ucp_tag, tag_mask, recv_callback);
 
-    struct ucx_context *ucp_req = (struct ucx_context *)recv_result;
+    struct ucx_context* ucp_req = (struct ucx_context*)recv_result;
 
-    req->req = ucp_req;
-    req->needs_release = true;
+    req->req             = ucp_req;
+    req->needs_release   = true;
     req->is_send_request = false;
-    req->other_rank = sender_rank;
+    req->other_rank      = sender_rank;
 
     ASSERT(!UCS_PTR_IS_ERR(recv_result),
            "unable to receive UCX data message (%d)\n",
diff --git a/cpp/include/raft/comms/util.hpp b/cpp/include/raft/comms/util.hpp
index f3216abc37..1b0548fc00 100644
--- a/cpp/include/raft/comms/util.hpp
+++ b/cpp/include/raft/comms/util.hpp
@@ -26,88 +26,70 @@
  * Invokes a NCCL runtime API function call, if the call does not return ncclSuccess, throws an
  * exception detailing the NCCL error that occurred
  */
-#define NCCL_TRY(call)                                                        \
-  do {                                                                        \
-    ncclResult_t const status = (call);                                       \
-    if (ncclSuccess != status) {                                              \
-      std::string msg{};                                                      \
-      SET_ERROR_MSG(msg,                                                      \
-                    "NCCL error encountered at: ", "call='%s', Reason=%d:%s", \
-                    #call, status, ncclGetErrorString(status));               \
-      throw raft::logic_error(msg);                                           \
-    }                                                                         \
+#define NCCL_TRY(call)                             \
+  do {                                             \
+    ncclResult_t const status = (call);            \
+    if (ncclSuccess != status) {                   \
+      std::string msg{};                           \
+      SET_ERROR_MSG(msg,                           \
+                    "NCCL error encountered at: ", \
+                    "call='%s', Reason=%d:%s",     \
+                    #call,                         \
+                    status,                        \
+                    ncclGetErrorString(status));   \
+      throw raft::logic_error(msg);                \
+    }                                              \
   } while (0);
 
-#define NCCL_TRY_NO_THROW(call)                           \
-  do {                                                    \
-    ncclResult_t status = call;                           \
-    if (ncclSuccess != status) {                          \
-      printf("NCCL call='%s' failed. Reason:%s\n", #call, \
-             ncclGetErrorString(status));                 \
-    }                                                     \
+#define NCCL_TRY_NO_THROW(call)                                                        \
+  do {                                                                                 \
+    ncclResult_t status = call;                                                        \
+    if (ncclSuccess != status) {                                                       \
+      printf("NCCL call='%s' failed. Reason:%s\n", #call, ncclGetErrorString(status)); \
+    }                                                                                  \
   } while (0)
 
 namespace raft {
 namespace comms {
 
-constexpr size_t get_datatype_size(const datatype_t datatype) {
+constexpr size_t get_datatype_size(const datatype_t datatype)
+{
   switch (datatype) {
-    case datatype_t::CHAR:
-      return sizeof(char);
-    case datatype_t::UINT8:
-      return sizeof(uint8_t);
-    case datatype_t::INT32:
-      return sizeof(int);
-    case datatype_t::UINT32:
-      return sizeof(unsigned int);
-    case datatype_t::INT64:
-      return sizeof(int64_t);
-    case datatype_t::UINT64:
-      return sizeof(uint64_t);
-    case datatype_t::FLOAT32:
-      return sizeof(float);
-    case datatype_t::FLOAT64:
-      return sizeof(double);
-    default:
-      throw "Unsupported datatype";
+    case datatype_t::CHAR: return sizeof(char);
+    case datatype_t::UINT8: return sizeof(uint8_t);
+    case datatype_t::INT32: return sizeof(int);
+    case datatype_t::UINT32: return sizeof(unsigned int);
+    case datatype_t::INT64: return sizeof(int64_t);
+    case datatype_t::UINT64: return sizeof(uint64_t);
+    case datatype_t::FLOAT32: return sizeof(float);
+    case datatype_t::FLOAT64: return sizeof(double);
+    default: throw "Unsupported datatype";
   }
 }
 
-constexpr ncclDataType_t get_nccl_datatype(const datatype_t datatype) {
+constexpr ncclDataType_t get_nccl_datatype(const datatype_t datatype)
+{
   switch (datatype) {
-    case datatype_t::CHAR:
-      return ncclChar;
-    case datatype_t::UINT8:
-      return ncclUint8;
-    case datatype_t::INT32:
-      return ncclInt;
-    case datatype_t::UINT32:
-      return ncclUint32;
-    case datatype_t::INT64:
-      return ncclInt64;
-    case datatype_t::UINT64:
-      return ncclUint64;
-    case datatype_t::FLOAT32:
-      return ncclFloat;
-    case datatype_t::FLOAT64:
-      return ncclDouble;
-    default:
-      throw "Unsupported datatype";
+    case datatype_t::CHAR: return ncclChar;
+    case datatype_t::UINT8: return ncclUint8;
+    case datatype_t::INT32: return ncclInt;
+    case datatype_t::UINT32: return ncclUint32;
+    case datatype_t::INT64: return ncclInt64;
+    case datatype_t::UINT64: return ncclUint64;
+    case datatype_t::FLOAT32: return ncclFloat;
+    case datatype_t::FLOAT64: return ncclDouble;
+    default: throw "Unsupported datatype";
   }
 }
 
-constexpr ncclRedOp_t get_nccl_op(const op_t op) {
+constexpr ncclRedOp_t get_nccl_op(const op_t op)
+{
   switch (op) {
-    case op_t::SUM:
-      return ncclSum;
-    case op_t::PROD:
-      return ncclProd;
-    case op_t::MIN:
-      return ncclMin;
-    case op_t::MAX:
-      return ncclMax;
-    default:
-      throw "Unsupported datatype";
+    case op_t::SUM: return ncclSum;
+    case op_t::PROD: return ncclProd;
+    case op_t::MIN: return ncclMin;
+    case op_t::MAX: return ncclMax;
+    default: throw "Unsupported datatype";
   }
 }
 };  // namespace comms
diff --git a/cpp/include/raft/cuda_utils.cuh b/cpp/include/raft/cuda_utils.cuh
index 14274043f5..8a66eff242 100644
--- a/cpp/include/raft/cuda_utils.cuh
+++ b/cpp/include/raft/cuda_utils.cuh
@@ -36,16 +36,17 @@
 namespace raft {
 
 /** helper macro for device inlined functions */
-#define DI inline __device__
+#define DI  inline __device__
 #define HDI inline __host__ __device__
-#define HD __host__ __device__
+#define HD  __host__ __device__
 
 /**
  * @brief Provide a ceiling division operation ie. ceil(a / b)
  * @tparam IntType supposed to be only integers for now!
  */
 template <typename IntType>
-constexpr HDI IntType ceildiv(IntType a, IntType b) {
+constexpr HDI IntType ceildiv(IntType a, IntType b)
+{
   return (a + b - 1) / b;
 }
 
@@ -54,7 +55,8 @@ constexpr HDI IntType ceildiv(IntType a, IntType b) {
  * @tparam IntType supposed to be only integers for now!
  */
 template <typename IntType>
-constexpr HDI IntType alignTo(IntType a, IntType b) {
+constexpr HDI IntType alignTo(IntType a, IntType b)
+{
   return ceildiv(a, b) * b;
 }
 
@@ -63,7 +65,8 @@ constexpr HDI IntType alignTo(IntType a, IntType b) {
  * @tparam IntType supposed to be only integers for now!
  */
 template <typename IntType>
-constexpr HDI IntType alignDown(IntType a, IntType b) {
+constexpr HDI IntType alignDown(IntType a, IntType b)
+{
   return (a / b) * b;
 }
 
@@ -72,7 +75,8 @@ constexpr HDI IntType alignDown(IntType a, IntType b) {
  * @tparam IntType data type (checked only for integers)
  */
 template <typename IntType>
-constexpr HDI bool isPo2(IntType num) {
+constexpr HDI bool isPo2(IntType num)
+{
   return (num && !(num & (num - 1)));
 }
 
@@ -81,14 +85,16 @@ constexpr HDI bool isPo2(IntType num) {
  * @tparam IntType data type (checked only for integers)
  */
 template <typename IntType>
-constexpr HDI IntType log2(IntType num, IntType ret = IntType(0)) {
+constexpr HDI IntType log2(IntType num, IntType ret = IntType(0))
+{
   return num <= IntType(1) ? ret : log2(num >> IntType(1), ++ret);
 }
 
 /** Device function to apply the input lambda across threads in the grid */
 template <int ItemsPerThread, typename L>
-DI void forEach(int num, L lambda) {
-  int idx = (blockDim.x * blockIdx.x) + threadIdx.x;
+DI void forEach(int num, L lambda)
+{
+  int idx              = (blockDim.x * blockIdx.x) + threadIdx.x;
   const int numThreads = blockDim.x * gridDim.x;
 #pragma unroll
   for (int itr = 0; itr < ItemsPerThread; ++itr, idx += numThreads) {
@@ -100,7 +106,8 @@ DI void forEach(int num, L lambda) {
 static const int WarpSize = 32;
 
 /** get the laneId of the current thread */
-DI int laneId() {
+DI int laneId()
+{
   int id;
   asm("mov.s32 %0, %laneid;" : "=r"(id));
   return id;
@@ -113,15 +120,17 @@ DI int laneId() {
  * @param b second input
  */
 template <typename T>
-HDI void swapVals(T &a, T &b) {
+HDI void swapVals(T& a, T& b)
+{
   T tmp = a;
-  a = b;
-  b = tmp;
+  a     = b;
+  b     = tmp;
 }
 
 /** Device function to have atomic add support for older archs */
 template <typename Type>
-DI void myAtomicAdd(Type *address, Type val) {
+DI void myAtomicAdd(Type* address, Type val)
+{
   atomicAdd(address, val);
 }
 
@@ -129,105 +138,114 @@ DI void myAtomicAdd(Type *address, Type val) {
 // Ref:
 // http://on-demand.gputechconf.com/gtc/2013/presentations/S3101-Atomic-Memory-Operations.pdf
 template <>
-DI void myAtomicAdd(double *address, double val) {
-  unsigned long long int *address_as_ull = (unsigned long long int *)address;
-  unsigned long long int old = *address_as_ull, assumed;
+DI void myAtomicAdd(double* address, double val)
+{
+  unsigned long long int* address_as_ull = (unsigned long long int*)address;
+  unsigned long long int old             = *address_as_ull, assumed;
   do {
     assumed = old;
-    old = atomicCAS(address_as_ull, assumed,
-                    __double_as_longlong(val + __longlong_as_double(assumed)));
+    old =
+      atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
   } while (assumed != old);
 }
 #endif
 
 template <typename T, typename ReduceLambda>
-DI void myAtomicReduce(T *address, T val, ReduceLambda op);
+DI void myAtomicReduce(T* address, T val, ReduceLambda op);
 
 template <typename ReduceLambda>
-DI void myAtomicReduce(double *address, double val, ReduceLambda op) {
-  unsigned long long int *address_as_ull = (unsigned long long int *)address;
-  unsigned long long int old = *address_as_ull, assumed;
+DI void myAtomicReduce(double* address, double val, ReduceLambda op)
+{
+  unsigned long long int* address_as_ull = (unsigned long long int*)address;
+  unsigned long long int old             = *address_as_ull, assumed;
   do {
     assumed = old;
-    old =
-      atomicCAS(address_as_ull, assumed,
-                __double_as_longlong(op(val, __longlong_as_double(assumed))));
+    old     = atomicCAS(
+      address_as_ull, assumed, __double_as_longlong(op(val, __longlong_as_double(assumed))));
   } while (assumed != old);
 }
 
 template <typename ReduceLambda>
-DI void myAtomicReduce(float *address, float val, ReduceLambda op) {
-  unsigned int *address_as_uint = (unsigned int *)address;
-  unsigned int old = *address_as_uint, assumed;
+DI void myAtomicReduce(float* address, float val, ReduceLambda op)
+{
+  unsigned int* address_as_uint = (unsigned int*)address;
+  unsigned int old              = *address_as_uint, assumed;
   do {
     assumed = old;
-    old = atomicCAS(address_as_uint, assumed,
-                    __float_as_uint(op(val, __uint_as_float(assumed))));
+    old = atomicCAS(address_as_uint, assumed, __float_as_uint(op(val, __uint_as_float(assumed))));
   } while (assumed != old);
 }
 
 template <typename ReduceLambda>
-DI void myAtomicReduce(int *address, int val, ReduceLambda op) {
+DI void myAtomicReduce(int* address, int val, ReduceLambda op)
+{
   int old = *address, assumed;
   do {
     assumed = old;
-    old = atomicCAS(address, assumed, op(val, assumed));
+    old     = atomicCAS(address, assumed, op(val, assumed));
   } while (assumed != old);
 }
 
 template <typename ReduceLambda>
-DI void myAtomicReduce(long long *address, long long val, ReduceLambda op) {
+DI void myAtomicReduce(long long* address, long long val, ReduceLambda op)
+{
   long long old = *address, assumed;
   do {
     assumed = old;
-    old = atomicCAS(address, assumed, op(val, assumed));
+    old     = atomicCAS(address, assumed, op(val, assumed));
   } while (assumed != old);
 }
 
 template <typename ReduceLambda>
-DI void myAtomicReduce(unsigned long long *address, unsigned long long val,
-                       ReduceLambda op) {
+DI void myAtomicReduce(unsigned long long* address, unsigned long long val, ReduceLambda op)
+{
   unsigned long long old = *address, assumed;
   do {
     assumed = old;
-    old = atomicCAS(address, assumed, op(val, assumed));
+    old     = atomicCAS(address, assumed, op(val, assumed));
   } while (assumed != old);
 }
 
 /**
  * @brief Provide atomic min operation.
  * @tparam T: data type for input data (float or double).
- * @param[in] address: address to read old value from, and to atomically update w/ min(old value, val)
+ * @param[in] address: address to read old value from, and to atomically update w/ min(old value,
+ * val)
  * @param[in] val: new value to compare with old
  */
 template <typename T>
-DI T myAtomicMin(T *address, T val);
+DI T myAtomicMin(T* address, T val);
 
 /**
  * @brief Provide atomic max operation.
  * @tparam T: data type for input data (float or double).
- * @param[in] address: address to read old value from, and to atomically update w/ max(old value, val)
+ * @param[in] address: address to read old value from, and to atomically update w/ max(old value,
+ * val)
  * @param[in] val: new value to compare with old
  */
 template <typename T>
-DI T myAtomicMax(T *address, T val);
+DI T myAtomicMax(T* address, T val);
 
-DI float myAtomicMin(float *address, float val) {
+DI float myAtomicMin(float* address, float val)
+{
   myAtomicReduce(address, val, fminf);
   return *address;
 }
 
-DI float myAtomicMax(float *address, float val) {
+DI float myAtomicMax(float* address, float val)
+{
   myAtomicReduce(address, val, fmaxf);
   return *address;
 }
 
-DI double myAtomicMin(double *address, double val) {
+DI double myAtomicMin(double* address, double val)
+{
   myAtomicReduce<double(double, double)>(address, val, fmin);
   return *address;
 }
 
-DI double myAtomicMax(double *address, double val) {
+DI double myAtomicMax(double* address, double val)
+{
   myAtomicReduce<double(double, double)>(address, val, fmax);
   return *address;
 }
@@ -239,11 +257,13 @@ DI double myAtomicMax(double *address, double val) {
 template <typename T>
 HDI T myMax(T x, T y);
 template <>
-HDI float myMax<float>(float x, float y) {
+HDI float myMax<float>(float x, float y)
+{
   return fmaxf(x, y);
 }
 template <>
-HDI double myMax<double>(double x, double y) {
+HDI double myMax<double>(double x, double y)
+{
   return fmax(x, y);
 }
 /** @} */
@@ -255,11 +275,13 @@ HDI double myMax<double>(double x, double y) {
 template <typename T>
 HDI T myMin(T x, T y);
 template <>
-HDI float myMin<float>(float x, float y) {
+HDI float myMin<float>(float x, float y)
+{
   return fminf(x, y);
 }
 template <>
-HDI double myMin<double>(double x, double y) {
+HDI double myMin<double>(double x, double y)
+{
   return fmin(x, y);
 }
 /** @} */
@@ -267,11 +289,13 @@ HDI double myMin<double>(double x, double y) {
 /**
  * @brief Provide atomic min operation.
  * @tparam T: data type for input data (float or double).
- * @param[in] address: address to read old value from, and to atomically update w/ min(old value, val)
+ * @param[in] address: address to read old value from, and to atomically update w/ min(old value,
+ * val)
  * @param[in] val: new value to compare with old
  */
 template <typename T>
-DI T myAtomicMin(T *address, T val) {
+DI T myAtomicMin(T* address, T val)
+{
   myAtomicReduce(address, val, myMin<T>);
   return *address;
 }
@@ -279,11 +303,13 @@ DI T myAtomicMin(T *address, T val) {
 /**
  * @brief Provide atomic max operation.
  * @tparam T: data type for input data (float or double).
- * @param[in] address: address to read old value from, and to atomically update w/ max(old value, val)
+ * @param[in] address: address to read old value from, and to atomically update w/ max(old value,
+ * val)
  * @param[in] val: new value to compare with old
  */
 template <typename T>
-DI T myAtomicMax(T *address, T val) {
+DI T myAtomicMax(T* address, T val)
+{
   myAtomicReduce(address, val, myMax<T>);
   return *address;
 }
@@ -292,7 +318,8 @@ DI T myAtomicMax(T *address, T val) {
  * Sign function
  */
 template <typename T>
-HDI int sgn(const T val) {
+HDI int sgn(const T val)
+{
   return (T(0) < val) - (val < T(0));
 }
 
@@ -303,11 +330,13 @@ HDI int sgn(const T val) {
 template <typename T>
 HDI T myExp(T x);
 template <>
-HDI float myExp(float x) {
+HDI float myExp(float x)
+{
   return expf(x);
 }
 template <>
-HDI double myExp(double x) {
+HDI double myExp(double x)
+{
   return exp(x);
 }
 /** @} */
@@ -319,11 +348,13 @@ HDI double myExp(double x) {
 template <typename T>
 inline __device__ T myInf();
 template <>
-inline __device__ float myInf<float>() {
+inline __device__ float myInf<float>()
+{
   return CUDART_INF_F;
 }
 template <>
-inline __device__ double myInf<double>() {
+inline __device__ double myInf<double>()
+{
   return CUDART_INF;
 }
 /** @} */
@@ -335,11 +366,13 @@ inline __device__ double myInf<double>() {
 template <typename T>
 HDI T myLog(T x);
 template <>
-HDI float myLog(float x) {
+HDI float myLog(float x)
+{
   return logf(x);
 }
 template <>
-HDI double myLog(double x) {
+HDI double myLog(double x)
+{
   return log(x);
 }
 /** @} */
@@ -351,11 +384,13 @@ HDI double myLog(double x) {
 template <typename T>
 HDI T mySqrt(T x);
 template <>
-HDI float mySqrt(float x) {
+HDI float mySqrt(float x)
+{
   return sqrtf(x);
 }
 template <>
-HDI double mySqrt(double x) {
+HDI double mySqrt(double x)
+{
   return sqrt(x);
 }
 /** @} */
@@ -365,13 +400,15 @@ HDI double mySqrt(double x) {
  * @{
  */
 template <typename T>
-DI void mySinCos(T x, T &s, T &c);
+DI void mySinCos(T x, T& s, T& c);
 template <>
-DI void mySinCos(float x, float &s, float &c) {
+DI void mySinCos(float x, float& s, float& c)
+{
   sincosf(x, &s, &c);
 }
 template <>
-DI void mySinCos(double x, double &s, double &c) {
+DI void mySinCos(double x, double& s, double& c)
+{
   sincos(x, &s, &c);
 }
 /** @} */
@@ -383,11 +420,13 @@ DI void mySinCos(double x, double &s, double &c) {
 template <typename T>
 DI T mySin(T x);
 template <>
-DI float mySin(float x) {
+DI float mySin(float x)
+{
   return sinf(x);
 }
 template <>
-DI double mySin(double x) {
+DI double mySin(double x)
+{
   return sin(x);
 }
 /** @} */
@@ -397,15 +436,18 @@ DI double mySin(double x) {
  * @{
  */
 template <typename T>
-DI T myAbs(T x) {
+DI T myAbs(T x)
+{
   return x < 0 ? -x : x;
 }
 template <>
-DI float myAbs(float x) {
+DI float myAbs(float x)
+{
   return fabsf(x);
 }
 template <>
-DI double myAbs(double x) {
+DI double myAbs(double x)
+{
   return fabs(x);
 }
 /** @} */
@@ -417,11 +459,13 @@ DI double myAbs(double x) {
 template <typename T>
 HDI T myPow(T x, T power);
 template <>
-HDI float myPow(float x, float power) {
+HDI float myPow(float x, float power)
+{
   return powf(x, power);
 }
 template <>
-HDI double myPow(double x, double power) {
+HDI double myPow(double x, double power)
+{
   return pow(x, power);
 }
 /** @} */
@@ -433,11 +477,13 @@ HDI double myPow(double x, double power) {
 template <typename T>
 HDI T myTanh(T x);
 template <>
-HDI float myTanh(float x) {
+HDI float myTanh(float x)
+{
   return tanhf(x);
 }
 template <>
-HDI double myTanh(double x) {
+HDI double myTanh(double x)
+{
   return tanh(x);
 }
 /** @} */
@@ -449,11 +495,13 @@ HDI double myTanh(double x) {
 template <typename T>
 HDI T myATanh(T x);
 template <>
-HDI float myATanh(float x) {
+HDI float myATanh(float x)
+{
   return atanhf(x);
 }
 template <>
-HDI double myATanh(double x) {
+HDI double myATanh(double x)
+{
   return atanh(x);
 }
 /** @} */
@@ -492,15 +540,18 @@ struct Sum {
  * @{
  */
 template <typename T>
-DI T signPrim(T x) {
+DI T signPrim(T x)
+{
   return x < 0 ? -1 : +1;
 }
 template <>
-DI float signPrim(float x) {
+DI float signPrim(float x)
+{
   return signbit(x) == true ? -1.0f : +1.0f;
 }
 template <>
-DI double signPrim(double x) {
+DI double signPrim(double x)
+{
   return signbit(x) == true ? -1.0 : +1.0;
 }
 /** @} */
@@ -514,28 +565,33 @@ DI double signPrim(double x) {
  * @{
  */
 template <typename T>
-DI T maxPrim(T x, T y) {
+DI T maxPrim(T x, T y)
+{
   return x > y ? x : y;
 }
 template <>
-DI float maxPrim(float x, float y) {
+DI float maxPrim(float x, float y)
+{
   return fmaxf(x, y);
 }
 template <>
-DI double maxPrim(double x, double y) {
+DI double maxPrim(double x, double y)
+{
   return fmax(x, y);
 }
 /** @} */
 
 /** apply a warp-wide fence (useful from Volta+ archs) */
-DI void warpFence() {
+DI void warpFence()
+{
 #if __CUDA_ARCH__ >= 700
   __syncwarp();
 #endif
 }
 
 /** warp-wide any boolean aggregator */
-DI bool any(bool inFlag, uint32_t mask = 0xffffffffu) {
+DI bool any(bool inFlag, uint32_t mask = 0xffffffffu)
+{
 #if CUDART_VERSION >= 9000
   inFlag = __any_sync(mask, inFlag);
 #else
@@ -545,7 +601,8 @@ DI bool any(bool inFlag, uint32_t mask = 0xffffffffu) {
 }
 
 /** warp-wide all boolean aggregator */
-DI bool all(bool inFlag, uint32_t mask = 0xffffffffu) {
+DI bool all(bool inFlag, uint32_t mask = 0xffffffffu)
+{
 #if CUDART_VERSION >= 9000
   inFlag = __all_sync(mask, inFlag);
 #else
@@ -564,8 +621,8 @@ DI bool all(bool inFlag, uint32_t mask = 0xffffffffu) {
  * @return the shuffled data
  */
 template <typename T>
-DI T shfl(T val, int srcLane, int width = WarpSize,
-          uint32_t mask = 0xffffffffu) {
+DI T shfl(T val, int srcLane, int width = WarpSize, uint32_t mask = 0xffffffffu)
+{
 #if CUDART_VERSION >= 9000
   return __shfl_sync(mask, val, srcLane, width);
 #else
@@ -583,8 +640,8 @@ DI T shfl(T val, int srcLane, int width = WarpSize,
  * @return the shuffled data
  */
 template <typename T>
-DI T shfl_xor(T val, int laneMask, int width = WarpSize,
-              uint32_t mask = 0xffffffffu) {
+DI T shfl_xor(T val, int laneMask, int width = WarpSize, uint32_t mask = 0xffffffffu)
+{
 #if CUDART_VERSION >= 9000
   return __shfl_xor_sync(mask, val, laneMask, width);
 #else
@@ -602,7 +659,8 @@ DI T shfl_xor(T val, int laneMask, int width = WarpSize,
  * @todo Expand this to support arbitrary reduction ops
  */
 template <typename T>
-DI T warpReduce(T val) {
+DI T warpReduce(T val)
+{
 #pragma unroll
   for (int i = WarpSize / 2; i > 0; i >>= 1) {
     T tmp = shfl(val, laneId() + i);
@@ -623,12 +681,13 @@ DI T warpReduce(T val) {
  * @todo Expand this to support arbitrary reduction ops
  */
 template <typename T>
-DI T blockReduce(T val, char *smem) {
-  auto *sTemp = reinterpret_cast<T *>(smem);
-  int nWarps = (blockDim.x + WarpSize - 1) / WarpSize;
-  int lid = laneId();
-  int wid = threadIdx.x / WarpSize;
-  val = warpReduce(val);
+DI T blockReduce(T val, char* smem)
+{
+  auto* sTemp = reinterpret_cast<T*>(smem);
+  int nWarps  = (blockDim.x + WarpSize - 1) / WarpSize;
+  int lid     = laneId();
+  int wid     = threadIdx.x / WarpSize;
+  val         = warpReduce(val);
   if (lid == 0) sTemp[wid] = val;
   __syncthreads();
   val = lid < nWarps ? sTemp[lid] : T(0);
@@ -644,8 +703,10 @@ DI T blockReduce(T val, char *smem) {
  * @param idx the index for which to query the stream
  */
 inline cudaStream_t select_stream(cudaStream_t user_stream,
-                                  cudaStream_t *int_streams, int n_int_streams,
-                                  int idx) {
+                                  cudaStream_t* int_streams,
+                                  int n_int_streams,
+                                  int idx)
+{
   return n_int_streams > 0 ? int_streams[idx % n_int_streams] : user_stream;
 }
 
diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h
index 86c60addf2..872dab7d82 100644
--- a/cpp/include/raft/cudart_utils.h
+++ b/cpp/include/raft/cudart_utils.h
@@ -49,17 +49,20 @@ struct cuda_error : public raft::exception {
  * exception detailing the CUDA error that occurred
  *
  */
-#define CUDA_TRY(call)                                                        \
-  do {                                                                        \
-    cudaError_t const status = call;                                          \
-    if (status != cudaSuccess) {                                              \
-      cudaGetLastError();                                                     \
-      std::string msg{};                                                      \
-      SET_ERROR_MSG(                                                          \
-        msg, "CUDA error encountered at: ", "call='%s', Reason=%s:%s", #call, \
-        cudaGetErrorName(status), cudaGetErrorString(status));                \
-      throw raft::cuda_error(msg);                                            \
-    }                                                                         \
+#define CUDA_TRY(call)                             \
+  do {                                             \
+    cudaError_t const status = call;               \
+    if (status != cudaSuccess) {                   \
+      cudaGetLastError();                          \
+      std::string msg{};                           \
+      SET_ERROR_MSG(msg,                           \
+                    "CUDA error encountered at: ", \
+                    "call='%s', Reason=%s:%s",     \
+                    #call,                         \
+                    cudaGetErrorName(status),      \
+                    cudaGetErrorString(status));   \
+      throw raft::cuda_error(msg);                 \
+    }                                              \
   } while (0)
 
 /**
@@ -89,13 +92,16 @@ struct cuda_error : public raft::exception {
 //  * @brief check for cuda runtime API errors but log error instead of raising
 //  *        exception.
 //  */
-#define CUDA_CHECK_NO_THROW(call)                                         \
-  do {                                                                    \
-    cudaError_t const status = call;                                      \
-    if (cudaSuccess != status) {                                          \
-      printf("CUDA call='%s' at file=%s line=%d failed with %s\n", #call, \
-             __FILE__, __LINE__, cudaGetErrorString(status));             \
-    }                                                                     \
+#define CUDA_CHECK_NO_THROW(call)                                  \
+  do {                                                             \
+    cudaError_t const status = call;                               \
+    if (cudaSuccess != status) {                                   \
+      printf("CUDA call='%s' at file=%s line=%d failed with %s\n", \
+             #call,                                                \
+             __FILE__,                                             \
+             __LINE__,                                             \
+             cudaGetErrorString(status));                          \
+    }                                                              \
   } while (0)
 
 namespace raft {
@@ -103,9 +109,7 @@ namespace raft {
 /** Helper method to get to know warp size in device code */
 __host__ __device__ constexpr inline int warp_size() { return 32; }
 
-__host__ __device__ constexpr inline unsigned int warp_full_mask() {
-  return 0xffffffff;
-}
+__host__ __device__ constexpr inline unsigned int warp_full_mask() { return 0xffffffff; }
 
 /**
  * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping
@@ -124,13 +128,16 @@ class grid_1d_thread_t {
    * @param elements_per_thread Typically, a single kernel thread processes more than a single
    * element; this affects the number of threads the grid must contain
    */
-  grid_1d_thread_t(size_t overall_num_elements, size_t num_threads_per_block,
-                   size_t max_num_blocks_1d, size_t elements_per_thread = 1)
+  grid_1d_thread_t(size_t overall_num_elements,
+                   size_t num_threads_per_block,
+                   size_t max_num_blocks_1d,
+                   size_t elements_per_thread = 1)
     : block_size(num_threads_per_block),
-      num_blocks(std::min((overall_num_elements +
-                           (elements_per_thread * num_threads_per_block) - 1) /
-                            (elements_per_thread * num_threads_per_block),
-                          max_num_blocks_1d)) {
+      num_blocks(
+        std::min((overall_num_elements + (elements_per_thread * num_threads_per_block) - 1) /
+                   (elements_per_thread * num_threads_per_block),
+                 max_num_blocks_1d))
+  {
     RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0");
     RAFT_EXPECTS(num_threads_per_block / warp_size() > 0,
                  "num_threads_per_block / warp_size() must be > 0");
@@ -153,13 +160,14 @@ class grid_1d_warp_t {
    * specific features (amount of shared memory necessary, SM functional units use pattern etc.);
    * this can't be determined generically/automatically (as opposed to the number of blocks)
    */
-  grid_1d_warp_t(size_t overall_num_elements, size_t num_threads_per_block,
+  grid_1d_warp_t(size_t overall_num_elements,
+                 size_t num_threads_per_block,
                  size_t max_num_blocks_1d)
     : block_size(num_threads_per_block),
-      num_blocks(std::min(
-        (overall_num_elements + (num_threads_per_block / warp_size()) - 1) /
-          (num_threads_per_block / warp_size()),
-        max_num_blocks_1d)) {
+      num_blocks(std::min((overall_num_elements + (num_threads_per_block / warp_size()) - 1) /
+                            (num_threads_per_block / warp_size()),
+                          max_num_blocks_1d))
+  {
     RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0");
     RAFT_EXPECTS(num_threads_per_block / warp_size() > 0,
                  "num_threads_per_block / warp_size() must be > 0");
@@ -181,10 +189,12 @@ class grid_1d_block_t {
    * specific features (amount of shared memory necessary, SM functional units use pattern etc.);
    * this can't be determined generically/automatically (as opposed to the number of blocks)
    */
-  grid_1d_block_t(size_t overall_num_elements, size_t num_threads_per_block,
+  grid_1d_block_t(size_t overall_num_elements,
+                  size_t num_threads_per_block,
                   size_t max_num_blocks_1d)
     : block_size(num_threads_per_block),
-      num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) {
+      num_blocks(std::min(overall_num_elements, max_num_blocks_1d))
+  {
     RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0");
     RAFT_EXPECTS(num_threads_per_block / warp_size() > 0,
                  "num_threads_per_block / warp_size() must be > 0");
@@ -200,9 +210,9 @@ class grid_1d_block_t {
  * @param stream cuda stream
  */
 template <typename Type>
-void copy(Type* dst, const Type* src, size_t len, cudaStream_t stream) {
-  CUDA_CHECK(
-    cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream));
+void copy(Type* dst, const Type* src, size_t len, cudaStream_t stream)
+{
+  CUDA_CHECK(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream));
 }
 
 /**
@@ -213,23 +223,22 @@ void copy(Type* dst, const Type* src, size_t len, cudaStream_t stream) {
  */
 /** performs a host to device copy */
 template <typename Type>
-void update_device(Type* d_ptr, const Type* h_ptr, size_t len,
-                   cudaStream_t stream) {
+void update_device(Type* d_ptr, const Type* h_ptr, size_t len, cudaStream_t stream)
+{
   copy(d_ptr, h_ptr, len, stream);
 }
 
 /** performs a device to host copy */
 template <typename Type>
-void update_host(Type* h_ptr, const Type* d_ptr, size_t len,
-                 cudaStream_t stream) {
+void update_host(Type* h_ptr, const Type* d_ptr, size_t len, cudaStream_t stream)
+{
   copy(h_ptr, d_ptr, len, stream);
 }
 
 template <typename Type>
-void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len,
-                cudaStream_t stream) {
-  CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type),
-                             cudaMemcpyDeviceToDevice, stream));
+void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, cudaStream_t stream)
+{
+  CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream));
 }
 /** @} */
 
@@ -238,8 +247,11 @@ void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len,
  * @{
  */
 template <class T, class OutStream>
-void print_host_vector(const char* variable_name, const T* host_mem,
-                       size_t componentsCount, OutStream& out) {
+void print_host_vector(const char* variable_name,
+                       const T* host_mem,
+                       size_t componentsCount,
+                       OutStream& out)
+{
   out << variable_name << "=[";
   for (size_t i = 0; i < componentsCount; ++i) {
     if (i != 0) out << ",";
@@ -249,11 +261,13 @@ void print_host_vector(const char* variable_name, const T* host_mem,
 }
 
 template <class T, class OutStream>
-void print_device_vector(const char* variable_name, const T* devMem,
-                         size_t componentsCount, OutStream& out) {
+void print_device_vector(const char* variable_name,
+                         const T* devMem,
+                         size_t componentsCount,
+                         OutStream& out)
+{
   T* host_mem = new T[componentsCount];
-  CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T),
-                        cudaMemcpyDeviceToHost));
+  CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost));
   print_host_vector(variable_name, host_mem, componentsCount, out);
   delete[] host_mem;
 }
@@ -261,35 +275,36 @@ void print_device_vector(const char* variable_name, const T* devMem,
 
 /** cuda malloc */
 template <typename Type>
-void allocate(Type*& ptr, size_t len, bool setZero = false) {
+void allocate(Type*& ptr, size_t len, bool setZero = false)
+{
   CUDA_CHECK(cudaMalloc((void**)&ptr, sizeof(Type) * len));
   if (setZero) CUDA_CHECK(cudaMemset(ptr, 0, sizeof(Type) * len));
 }
 
 /** helper method to get max usable shared mem per block parameter */
-inline int getSharedMemPerBlock() {
+inline int getSharedMemPerBlock()
+{
   int devId;
   CUDA_CHECK(cudaGetDevice(&devId));
   int smemPerBlk;
-  CUDA_CHECK(cudaDeviceGetAttribute(&smemPerBlk,
-                                    cudaDevAttrMaxSharedMemoryPerBlock, devId));
+  CUDA_CHECK(cudaDeviceGetAttribute(&smemPerBlk, cudaDevAttrMaxSharedMemoryPerBlock, devId));
   return smemPerBlk;
 }
 
 /** helper method to get multi-processor count parameter */
-inline int getMultiProcessorCount() {
+inline int getMultiProcessorCount()
+{
   int devId;
   CUDA_CHECK(cudaGetDevice(&devId));
   int mpCount;
-  CUDA_CHECK(
-    cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId));
+  CUDA_CHECK(cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId));
   return mpCount;
 }
 
 /** helper method to convert an array on device to a string on host */
 template <typename T>
-std::string arr2Str(const T* arr, int size, std::string name,
-                    cudaStream_t stream, int width = 4) {
+std::string arr2Str(const T* arr, int size, std::string name, cudaStream_t stream, int width = 4)
+{
   std::stringstream ss;
 
   T* arr_h = (T*)malloc(size * sizeof(T));
@@ -311,53 +326,54 @@ std::string arr2Str(const T* arr, int size, std::string name,
 
 /** this seems to be unused, but may be useful in the future */
 template <typename T>
-void ASSERT_DEVICE_MEM(T* ptr, std::string name) {
+void ASSERT_DEVICE_MEM(T* ptr, std::string name)
+{
   cudaPointerAttributes s_att;
   cudaError_t s_err = cudaPointerGetAttributes(&s_att, ptr);
 
   if (s_err != 0 || s_att.device == -1)
-    std::cout << "Invalid device pointer encountered in " << name
-              << ". device=" << s_att.device << ", err=" << s_err << std::endl;
+    std::cout << "Invalid device pointer encountered in " << name << ". device=" << s_att.device
+              << ", err=" << s_err << std::endl;
 }
 
-inline uint32_t curTimeMillis() {
-  auto now = std::chrono::high_resolution_clock::now();
+inline uint32_t curTimeMillis()
+{
+  auto now      = std::chrono::high_resolution_clock::now();
   auto duration = now.time_since_epoch();
-  return std::chrono::duration_cast<std::chrono::milliseconds>(duration)
-    .count();
+  return std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
 }
 
 /** Helper function to calculate need memory for allocate to store dense matrix.
-    * @param rows number of rows in matrix
-    * @param columns number of columns in matrix
-    * @return need number of items to allocate via allocate()
-    * @sa allocate()
-    */
-inline size_t allocLengthForMatrix(size_t rows, size_t columns) {
-  return rows * columns;
-}
+ * @param rows number of rows in matrix
+ * @param columns number of columns in matrix
+ * @return number of items that need to be allocated via allocate()
+ * @sa allocate()
+ */
+inline size_t allocLengthForMatrix(size_t rows, size_t columns) { return rows * columns; }
 
 /** Helper function to check alignment of pointer.
-    * @param ptr the pointer to check
-    * @param alignment to be checked for
-    * @return true if address in bytes is a multiple of alignment
-    */
+ * @param ptr the pointer to check
+ * @param alignment to be checked for
+ * @return true if address in bytes is a multiple of alignment
+ */
 template <typename Type>
-bool is_aligned(Type* ptr, size_t alignment) {
+bool is_aligned(Type* ptr, size_t alignment)
+{
   return reinterpret_cast<uintptr_t>(ptr) % alignment == 0;
 }
 
 /** calculate greatest common divisor of two numbers
-* @a integer
-* @b integer
-* @ return gcd of a and b
-*/
+ * @param a integer
+ * @param b integer
+ * @return gcd of a and b
+ */
 template <typename IntType>
-IntType gcd(IntType a, IntType b) {
+IntType gcd(IntType a, IntType b)
+{
   while (b != 0) {
     IntType tmp = b;
-    b = a % b;
-    a = tmp;
+    b           = a % b;
+    a           = tmp;
   }
   return a;
 }
diff --git a/cpp/include/raft/device_atomics.cuh b/cpp/include/raft/device_atomics.cuh
index dc8093ca1d..e113ca92eb 100644
--- a/cpp/include/raft/device_atomics.cuh
+++ b/cpp/include/raft/device_atomics.cuh
@@ -39,9 +39,9 @@ namespace detail {
 
 /* @brief binary `sum` operator */
 struct DeviceSum {
-  template <typename T,
-            typename std::enable_if_t<std::is_arithmetic<T>::value>* = nullptr>
-  __device__ T operator()(const T& lhs, const T& rhs) {
+  template <typename T, typename std::enable_if_t<std::is_arithmetic<T>::value>* = nullptr>
+  __device__ T operator()(const T& lhs, const T& rhs)
+  {
     return lhs + rhs;
   }
 };
@@ -49,7 +49,8 @@ struct DeviceSum {
 /* @brief binary `min` operator */
 struct DeviceMin {
   template <typename T>
-  __device__ T operator()(const T& lhs, const T& rhs) {
+  __device__ T operator()(const T& lhs, const T& rhs)
+  {
     return lhs < rhs ? lhs : rhs;
   }
 };
@@ -57,43 +58,44 @@ struct DeviceMin {
 /* @brief binary `max` operator */
 struct DeviceMax {
   template <typename T>
-  __device__ T operator()(const T& lhs, const T& rhs) {
+  __device__ T operator()(const T& lhs, const T& rhs)
+  {
     return lhs > rhs ? lhs : rhs;
   }
 };
 
 /* @brief binary `product` operator */
 struct DeviceProduct {
-  template <typename T,
-            typename std::enable_if_t<std::is_arithmetic<T>::value>* = nullptr>
-  __device__ T operator()(const T& lhs, const T& rhs) {
+  template <typename T, typename std::enable_if_t<std::is_arithmetic<T>::value>* = nullptr>
+  __device__ T operator()(const T& lhs, const T& rhs)
+  {
     return lhs * rhs;
   }
 };
 
 /* @brief binary `and` operator */
 struct DeviceAnd {
-  template <typename T,
-            typename std::enable_if_t<std::is_integral<T>::value>* = nullptr>
-  __device__ T operator()(const T& lhs, const T& rhs) {
+  template <typename T, typename std::enable_if_t<std::is_integral<T>::value>* = nullptr>
+  __device__ T operator()(const T& lhs, const T& rhs)
+  {
     return (lhs & rhs);
   }
 };
 
 /* @brief binary `or` operator */
 struct DeviceOr {
-  template <typename T,
-            typename std::enable_if_t<std::is_integral<T>::value>* = nullptr>
-  __device__ T operator()(const T& lhs, const T& rhs) {
+  template <typename T, typename std::enable_if_t<std::is_integral<T>::value>* = nullptr>
+  __device__ T operator()(const T& lhs, const T& rhs)
+  {
     return (lhs | rhs);
   }
 };
 
 /* @brief binary `xor` operator */
 struct DeviceXor {
-  template <typename T,
-            typename std::enable_if_t<std::is_integral<T>::value>* = nullptr>
-  __device__ T operator()(const T& lhs, const T& rhs) {
+  template <typename T, typename std::enable_if_t<std::is_integral<T>::value>* = nullptr>
+  __device__ T operator()(const T& lhs, const T& rhs)
+  {
     return (lhs ^ rhs);
   }
 };
@@ -103,9 +105,9 @@ struct DeviceXor {
 #define errmsg_cast "size mismatch."
 
 template <typename T_output, typename T_input>
-__forceinline__ __device__ T_output type_reinterpret(T_input value) {
-  static_assert(sizeof(T_output) == sizeof(T_input),
-                "type_reinterpret for different size");
+__forceinline__ __device__ T_output type_reinterpret(T_input value)
+{
+  static_assert(sizeof(T_output) == sizeof(T_input), "type_reinterpret for different size");
   return *(reinterpret_cast<T_output*>(&value));
 }
 
@@ -118,25 +120,22 @@ struct genericAtomicOperationImpl;
 // single byte atomic operation
 template <typename T, typename Op>
 struct genericAtomicOperationImpl<T, Op, 1> {
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
-                                          Op op) {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op)
+  {
     using T_int = unsigned int;
 
-    T_int* address_uint32 =
-      reinterpret_cast<T_int*>(addr - (reinterpret_cast<size_t>(addr) & 3));
-    T_int shift = ((reinterpret_cast<size_t>(addr) & 3) * 8);
+    T_int* address_uint32 = reinterpret_cast<T_int*>(addr - (reinterpret_cast<size_t>(addr) & 3));
+    T_int shift           = ((reinterpret_cast<size_t>(addr) & 3) * 8);
 
     T_int old = *address_uint32;
     T_int assumed;
 
     do {
-      assumed = old;
-      T target_value = T((old >> shift) & 0xff);
-      uint8_t updating_value =
-        type_reinterpret<uint8_t, T>(op(target_value, update_value));
-      T_int new_value =
-        (old & ~(0x000000ff << shift)) | (T_int(updating_value) << shift);
-      old = atomicCAS(address_uint32, assumed, new_value);
+      assumed                = old;
+      T target_value         = T((old >> shift) & 0xff);
+      uint8_t updating_value = type_reinterpret<uint8_t, T>(op(target_value, update_value));
+      T_int new_value        = (old & ~(0x000000ff << shift)) | (T_int(updating_value) << shift);
+      old                    = atomicCAS(address_uint32, assumed, new_value);
     } while (assumed != old);
 
     return T((old >> shift) & 0xff);
@@ -146,26 +145,24 @@ struct genericAtomicOperationImpl<T, Op, 1> {
 // 2 bytes atomic operation
 template <typename T, typename Op>
 struct genericAtomicOperationImpl<T, Op, 2> {
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
-                                          Op op) {
-    using T_int = unsigned int;
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op)
+  {
+    using T_int      = unsigned int;
     bool is_32_align = (reinterpret_cast<size_t>(addr) & 2) ? false : true;
-    T_int* address_uint32 = reinterpret_cast<T_int*>(
-      reinterpret_cast<size_t>(addr) - (is_32_align ? 0 : 2));
+    T_int* address_uint32 =
+      reinterpret_cast<T_int*>(reinterpret_cast<size_t>(addr) - (is_32_align ? 0 : 2));
 
     T_int old = *address_uint32;
     T_int assumed;
 
     do {
-      assumed = old;
-      T target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16);
-      uint16_t updating_value =
-        type_reinterpret<uint16_t, T>(op(target_value, update_value));
-
-      T_int new_value = (is_32_align)
-                          ? (old & 0xffff0000) | updating_value
-                          : (old & 0xffff) | (T_int(updating_value) << 16);
-      old = atomicCAS(address_uint32, assumed, new_value);
+      assumed                 = old;
+      T target_value          = (is_32_align) ? T(old & 0xffff) : T(old >> 16);
+      uint16_t updating_value = type_reinterpret<uint16_t, T>(op(target_value, update_value));
+
+      T_int new_value = (is_32_align) ? (old & 0xffff0000) | updating_value
+                                      : (old & 0xffff) | (T_int(updating_value) << 16);
+      old             = atomicCAS(address_uint32, assumed, new_value);
     } while (assumed != old);
 
     return (is_32_align) ? T(old & 0xffff) : T(old >> 16);
@@ -176,15 +173,15 @@ struct genericAtomicOperationImpl<T, Op, 2> {
 // 4 bytes atomic operation
 template <typename T, typename Op>
 struct genericAtomicOperationImpl<T, Op, 4> {
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
-                                          Op op) {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op)
+  {
     using T_int = unsigned int;
 
     T old_value = *addr;
     T assumed{old_value};
 
     do {
-      assumed = old_value;
+      assumed           = old_value;
       const T new_value = op(old_value, update_value);
 
       T_int ret = atomicCAS(reinterpret_cast<T_int*>(addr),
@@ -201,8 +198,8 @@ struct genericAtomicOperationImpl<T, Op, 4> {
 // 8 bytes atomic operation
 template <typename T, typename Op>
 struct genericAtomicOperationImpl<T, Op, 8> {
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
-                                          Op op) {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op)
+  {
     using T_int = unsigned long long int;
     static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
 
@@ -210,7 +207,7 @@ struct genericAtomicOperationImpl<T, Op, 8> {
     T assumed{old_value};
 
     do {
-      assumed = old_value;
+      assumed           = old_value;
       const T new_value = op(old_value, update_value);
 
       T_int ret = atomicCAS(reinterpret_cast<T_int*>(addr),
@@ -226,8 +223,8 @@ struct genericAtomicOperationImpl<T, Op, 8> {
 
 // -------------------------------------------------------------------------------------------------
 // specialized functions for operators
-// `atomicAdd` supports int, unsigned int, unsigend long long int, float, double (long long int is not supproted.)
-// `atomicMin`, `atomicMax` support int, unsigned int, unsigned long long int
+// `atomicAdd` supports int, unsigned int, unsigned long long int, float, double (long long int is
+// not supported.) `atomicMin`, `atomicMax` support int, unsigned int, unsigned long long int
 // `atomicAnd`, `atomicOr`, `atomicXor` support int, unsigned int, unsigned long long int
 
 // CUDA natively supports `unsigned long long int` for `atomicAdd`,
@@ -240,12 +237,11 @@ struct genericAtomicOperationImpl<T, Op, 8> {
 template <>
 struct genericAtomicOperationImpl<long int, DeviceSum, 8> {
   using T = long int;
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
-                                          DeviceSum op) {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op)
+  {
     using T_int = unsigned long long int;
     static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
-    T_int ret = atomicAdd(reinterpret_cast<T_int*>(addr),
-                          type_reinterpret<T_int, T>(update_value));
+    T_int ret = atomicAdd(reinterpret_cast<T_int*>(addr), type_reinterpret<T_int, T>(update_value));
     return type_reinterpret<T, T_int>(ret);
   }
 };
@@ -253,12 +249,11 @@ struct genericAtomicOperationImpl<long int, DeviceSum, 8> {
 template <>
 struct genericAtomicOperationImpl<unsigned long int, DeviceSum, 8> {
   using T = unsigned long int;
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
-                                          DeviceSum op) {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op)
+  {
     using T_int = unsigned long long int;
     static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
-    T_int ret = atomicAdd(reinterpret_cast<T_int*>(addr),
-                          type_reinterpret<T_int, T>(update_value));
+    T_int ret = atomicAdd(reinterpret_cast<T_int*>(addr), type_reinterpret<T_int, T>(update_value));
     return type_reinterpret<T, T_int>(ret);
   }
 };
@@ -273,12 +268,11 @@ struct genericAtomicOperationImpl<unsigned long int, DeviceSum, 8> {
 template <>
 struct genericAtomicOperationImpl<long long int, DeviceSum, 8> {
   using T = long long int;
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
-                                          DeviceSum op) {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op)
+  {
     using T_int = unsigned long long int;
     static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
-    T_int ret = atomicAdd(reinterpret_cast<T_int*>(addr),
-                          type_reinterpret<T_int, T>(update_value));
+    T_int ret = atomicAdd(reinterpret_cast<T_int*>(addr), type_reinterpret<T_int, T>(update_value));
     return type_reinterpret<T, T_int>(ret);
   }
 };
@@ -286,12 +280,11 @@ struct genericAtomicOperationImpl<long long int, DeviceSum, 8> {
 template <>
 struct genericAtomicOperationImpl<unsigned long int, DeviceMin, 8> {
   using T = unsigned long int;
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
-                                          DeviceMin op) {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMin op)
+  {
     using T_int = unsigned long long int;
     static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
-    T ret = atomicMin(reinterpret_cast<T_int*>(addr),
-                      type_reinterpret<T_int, T>(update_value));
+    T ret = atomicMin(reinterpret_cast<T_int*>(addr), type_reinterpret<T_int, T>(update_value));
     return type_reinterpret<T, T_int>(ret);
   }
 };
@@ -299,48 +292,44 @@ struct genericAtomicOperationImpl<unsigned long int, DeviceMin, 8> {
 template <>
 struct genericAtomicOperationImpl<unsigned long int, DeviceMax, 8> {
   using T = unsigned long int;
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
-                                          DeviceMax op) {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMax op)
+  {
     using T_int = unsigned long long int;
     static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
-    T ret = atomicMax(reinterpret_cast<T_int*>(addr),
-                      type_reinterpret<T_int, T>(update_value));
+    T ret = atomicMax(reinterpret_cast<T_int*>(addr), type_reinterpret<T_int, T>(update_value));
     return type_reinterpret<T, T_int>(ret);
   }
 };
 
 template <typename T>
 struct genericAtomicOperationImpl<T, DeviceAnd, 8> {
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
-                                          DeviceAnd op) {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceAnd op)
+  {
     using T_int = unsigned long long int;
     static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
-    T_int ret = atomicAnd(reinterpret_cast<T_int*>(addr),
-                          type_reinterpret<T_int, T>(update_value));
+    T_int ret = atomicAnd(reinterpret_cast<T_int*>(addr), type_reinterpret<T_int, T>(update_value));
     return type_reinterpret<T, T_int>(ret);
   }
 };
 
 template <typename T>
 struct genericAtomicOperationImpl<T, DeviceOr, 8> {
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
-                                          DeviceOr op) {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceOr op)
+  {
     using T_int = unsigned long long int;
     static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
-    T_int ret = atomicOr(reinterpret_cast<T_int*>(addr),
-                         type_reinterpret<T_int, T>(update_value));
+    T_int ret = atomicOr(reinterpret_cast<T_int*>(addr), type_reinterpret<T_int, T>(update_value));
     return type_reinterpret<T, T_int>(ret);
   }
 };
 
 template <typename T>
 struct genericAtomicOperationImpl<T, DeviceXor, 8> {
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
-                                          DeviceXor op) {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceXor op)
+  {
     using T_int = unsigned long long int;
     static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
-    T_int ret = atomicXor(reinterpret_cast<T_int*>(addr),
-                          type_reinterpret<T_int, T>(update_value));
+    T_int ret = atomicXor(reinterpret_cast<T_int*>(addr), type_reinterpret<T_int, T>(update_value));
     return type_reinterpret<T, T_int>(ret);
   }
 };
@@ -353,13 +342,12 @@ struct typesAtomicCASImpl;
 
 template <typename T>
 struct typesAtomicCASImpl<T, 1> {
-  __forceinline__ __device__ T operator()(T* addr, T const& compare,
-                                          T const& update_value) {
+  __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value)
+  {
     using T_int = unsigned int;
 
-    T_int shift = ((reinterpret_cast<size_t>(addr) & 3) * 8);
-    T_int* address_uint32 =
-      reinterpret_cast<T_int*>(addr - (reinterpret_cast<size_t>(addr) & 3));
+    T_int shift           = ((reinterpret_cast<size_t>(addr) & 3) * 8);
+    T_int* address_uint32 = reinterpret_cast<T_int*>(addr - (reinterpret_cast<size_t>(addr) & 3));
 
     // the 'target_value' in `old` can be different from `compare`
     // because other thread may update the value
@@ -370,15 +358,14 @@ struct typesAtomicCASImpl<T, 1> {
     uint8_t u_val = type_reinterpret<uint8_t, T>(update_value);
 
     do {
-      assumed = old;
+      assumed      = old;
       target_value = T((old >> shift) & 0xff);
       // have to compare `target_value` and `compare` before calling atomicCAS
       // the `target_value` in `old` can be different with `compare`
       if (target_value != compare) break;
 
-      T_int new_value =
-        (old & ~(0x000000ff << shift)) | (T_int(u_val) << shift);
-      old = atomicCAS(address_uint32, assumed, new_value);
+      T_int new_value = (old & ~(0x000000ff << shift)) | (T_int(u_val) << shift);
+      old             = atomicCAS(address_uint32, assumed, new_value);
     } while (assumed != old);
 
     return target_value;
@@ -387,13 +374,13 @@ struct typesAtomicCASImpl<T, 1> {
 
 template <typename T>
 struct typesAtomicCASImpl<T, 2> {
-  __forceinline__ __device__ T operator()(T* addr, T const& compare,
-                                          T const& update_value) {
+  __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value)
+  {
     using T_int = unsigned int;
 
     bool is_32_align = (reinterpret_cast<size_t>(addr) & 2) ? false : true;
-    T_int* address_uint32 = reinterpret_cast<T_int*>(
-      reinterpret_cast<size_t>(addr) - (is_32_align ? 0 : 2));
+    T_int* address_uint32 =
+      reinterpret_cast<T_int*>(reinterpret_cast<size_t>(addr) - (is_32_align ? 0 : 2));
 
     T_int old = *address_uint32;
     T_int assumed;
@@ -401,12 +388,12 @@ struct typesAtomicCASImpl<T, 2> {
     uint16_t u_val = type_reinterpret<uint16_t, T>(update_value);
 
     do {
-      assumed = old;
+      assumed      = old;
       target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16);
       if (target_value != compare) break;
 
-      T_int new_value = (is_32_align) ? (old & 0xffff0000) | u_val
-                                      : (old & 0xffff) | (T_int(u_val) << 16);
+      T_int new_value =
+        (is_32_align) ? (old & 0xffff0000) | u_val : (old & 0xffff) | (T_int(u_val) << 16);
       old = atomicCAS(address_uint32, assumed, new_value);
     } while (assumed != old);
 
@@ -416,8 +403,8 @@ struct typesAtomicCASImpl<T, 2> {
 
 template <typename T>
 struct typesAtomicCASImpl<T, 4> {
-  __forceinline__ __device__ T operator()(T* addr, T const& compare,
-                                          T const& update_value) {
+  __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value)
+  {
     using T_int = unsigned int;
 
     T_int ret = atomicCAS(reinterpret_cast<T_int*>(addr),
@@ -431,8 +418,8 @@ struct typesAtomicCASImpl<T, 4> {
 // 8 bytes atomic operation
 template <typename T>
 struct typesAtomicCASImpl<T, 8> {
-  __forceinline__ __device__ T operator()(T* addr, T const& compare,
-                                          T const& update_value) {
+  __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value)
+  {
     using T_int = unsigned long long int;
     static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
 
@@ -464,11 +451,10 @@ struct typesAtomicCASImpl<T, 8> {
  * @returns The old value at `address`
  * -------------------------------------------------------------------------**/
 template <typename T, typename BinaryOp>
-typename std::enable_if_t<std::is_arithmetic<T>::value, T> __forceinline__
-  __device__
-  genericAtomicOperation(T* address, T const& update_value, BinaryOp op) {
-  auto fun =
-    raft::device_atomics::detail::genericAtomicOperationImpl<T, BinaryOp>{};
+typename std::enable_if_t<std::is_arithmetic<T>::value, T> __forceinline__ __device__
+genericAtomicOperation(T* address, T const& update_value, BinaryOp op)
+{
+  auto fun = raft::device_atomics::detail::genericAtomicOperationImpl<T, BinaryOp>{};
   return T(fun(address, update_value, op));
 }
 
@@ -476,11 +462,11 @@ typename std::enable_if_t<std::is_arithmetic<T>::value, T> __forceinline__
 template <typename BinaryOp>
 __forceinline__ __device__ bool genericAtomicOperation(bool* address,
                                                        bool const& update_value,
-                                                       BinaryOp op) {
+                                                       BinaryOp op)
+{
   using T = bool;
   // don't use underlying type to apply operation for bool
-  auto fun =
-    raft::device_atomics::detail::genericAtomicOperationImpl<T, BinaryOp>{};
+  auto fun = raft::device_atomics::detail::genericAtomicOperationImpl<T, BinaryOp>{};
   return T(fun(address, update_value, op));
 }
 
@@ -502,9 +488,9 @@ __forceinline__ __device__ bool genericAtomicOperation(bool* address,
  * @returns The old value at `address`
  */
 template <typename T>
-__forceinline__ __device__ T atomicAdd(T* address, T val) {
-  return raft::genericAtomicOperation(
-    address, val, raft::device_atomics::detail::DeviceSum{});
+__forceinline__ __device__ T atomicAdd(T* address, T val)
+{
+  return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceSum{});
 }
 
 /**
@@ -523,9 +509,9 @@ __forceinline__ __device__ T atomicAdd(T* address, T val) {
  * @returns The old value at `address`
  */
 template <typename T>
-__forceinline__ __device__ T atomicMin(T* address, T val) {
-  return raft::genericAtomicOperation(
-    address, val, raft::device_atomics::detail::DeviceMin{});
+__forceinline__ __device__ T atomicMin(T* address, T val)
+{
+  return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceMin{});
 }
 
 /**
@@ -544,9 +530,9 @@ __forceinline__ __device__ T atomicMin(T* address, T val) {
  * @returns The old value at `address`
  */
 template <typename T>
-__forceinline__ __device__ T atomicMax(T* address, T val) {
-  return raft::genericAtomicOperation(
-    address, val, raft::device_atomics::detail::DeviceMax{});
+__forceinline__ __device__ T atomicMax(T* address, T val)
+{
+  return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceMax{});
 }
 
 /**
@@ -566,9 +552,9 @@ __forceinline__ __device__ T atomicMax(T* address, T val) {
  * @returns The old value at `address`
  */
 template <typename T>
-__forceinline__ __device__ T atomicCAS(T* address, T compare, T val) {
-  return raft::device_atomics::detail::typesAtomicCASImpl<T>()(address, compare,
-                                                               val);
+__forceinline__ __device__ T atomicCAS(T* address, T compare, T val)
+{
+  return raft::device_atomics::detail::typesAtomicCASImpl<T>()(address, compare, val);
 }
 
 /**
@@ -586,11 +572,10 @@ __forceinline__ __device__ T atomicCAS(T* address, T compare, T val) {
  *
  * @returns The old value at `address`
  */
-template <typename T,
-          typename std::enable_if_t<std::is_integral<T>::value, T>* = nullptr>
-__forceinline__ __device__ T atomicAnd(T* address, T val) {
-  return raft::genericAtomicOperation(
-    address, val, raft::device_atomics::detail::DeviceAnd{});
+template <typename T, typename std::enable_if_t<std::is_integral<T>::value, T>* = nullptr>
+__forceinline__ __device__ T atomicAnd(T* address, T val)
+{
+  return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceAnd{});
 }
 
 /**
@@ -608,11 +593,10 @@ __forceinline__ __device__ T atomicAnd(T* address, T val) {
  *
  * @returns The old value at `address`
  */
-template <typename T,
-          typename std::enable_if_t<std::is_integral<T>::value, T>* = nullptr>
-__forceinline__ __device__ T atomicOr(T* address, T val) {
-  return raft::genericAtomicOperation(address, val,
-                                      raft::device_atomics::detail::DeviceOr{});
+template <typename T, typename std::enable_if_t<std::is_integral<T>::value, T>* = nullptr>
+__forceinline__ __device__ T atomicOr(T* address, T val)
+{
+  return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceOr{});
 }
 
 /**
@@ -630,9 +614,8 @@ __forceinline__ __device__ T atomicOr(T* address, T val) {
  *
  * @returns The old value at `address`
  */
-template <typename T,
-          typename std::enable_if_t<std::is_integral<T>::value, T>* = nullptr>
-__forceinline__ __device__ T atomicXor(T* address, T val) {
-  return raft::genericAtomicOperation(
-    address, val, raft::device_atomics::detail::DeviceXor{});
+template <typename T, typename std::enable_if_t<std::is_integral<T>::value, T>* = nullptr>
+__forceinline__ __device__ T atomicXor(T* address, T val)
+{
+  return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceXor{});
 }
diff --git a/cpp/include/raft/distance/canberra.cuh b/cpp/include/raft/distance/canberra.cuh
index b87c295eb0..61622d7c87 100644
--- a/cpp/include/raft/distance/canberra.cuh
+++ b/cpp/include/raft/distance/canberra.cuh
@@ -44,75 +44,108 @@ namespace distance {
  * @param fin_op    the final gemm epilogue lambda
  * @param stream    cuda stream to launch work
  */
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          int VecLen, typename FinalLambda, bool isRowMajor>
-static void canberraImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
-                         IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput,
-                         FinalLambda fin_op, cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          int VecLen,
+          typename FinalLambda,
+          bool isRowMajor>
+static void canberraImpl(const DataT* x,
+                         const DataT* y,
+                         IdxT m,
+                         IdxT n,
+                         IdxT k,
+                         IdxT lda,
+                         IdxT ldb,
+                         IdxT ldd,
+                         OutT* dOutput,
+                         FinalLambda fin_op,
+                         cudaStream_t stream)
+{
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
 
-  typedef
-    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
 
   dim3 blk(KPolicy::Nthreads);
 
   // Accumulation operation lambda
   auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
     const auto diff = raft::L1Op<AccT, IdxT>()(x - y);
-    const auto add = raft::myAbs(x) + raft::myAbs(y);
+    const auto add  = raft::myAbs(x) + raft::myAbs(y);
     // deal with potential for 0 in denominator by
     // forcing 1/0 instead
     acc += ((add != 0) * diff / (add + (add == 0)));
   };
 
   // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [] __device__(
-                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
-                         IdxT gridStrideY) { return; };
+  auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                                     DataT * regxn,
+                                     DataT * regyn,
+                                     IdxT gridStrideX,
+                                     IdxT gridStrideY) { return; };
 
   if (isRowMajor) {
-    auto canberraRowMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, true>;
-    dim3 grid =
-      launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, canberraRowMajor);
+    auto canberraRowMajor = pairwiseDistanceMatKernel<false,
+                                                      DataT,
+                                                      AccT,
+                                                      OutT,
+                                                      IdxT,
+                                                      KPolicy,
+                                                      decltype(core_lambda),
+                                                      decltype(epilog_lambda),
+                                                      FinalLambda,
+                                                      true>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, canberraRowMajor);
 
     canberraRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-      epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   } else {
-    auto canberraColMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, false>;
-    dim3 grid =
-      launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, canberraColMajor);
+    auto canberraColMajor = pairwiseDistanceMatKernel<false,
+                                                      DataT,
+                                                      AccT,
+                                                      OutT,
+                                                      IdxT,
+                                                      KPolicy,
+                                                      decltype(core_lambda),
+                                                      decltype(epilog_lambda),
+                                                      FinalLambda,
+                                                      false>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, canberraColMajor);
     canberraColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-      epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   }
 
   CUDA_CHECK(cudaGetLastError());
 }
 
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          typename FinalLambda, bool isRowMajor>
-void canberra(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
-              const DataT *x, const DataT *y, OutT *dOutput, FinalLambda fin_op,
-              cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          typename FinalLambda,
+          bool isRowMajor>
+void canberra(IdxT m,
+              IdxT n,
+              IdxT k,
+              IdxT lda,
+              IdxT ldb,
+              IdxT ldd,
+              const DataT* x,
+              const DataT* y,
+              OutT* dOutput,
+              FinalLambda fin_op,
+              cudaStream_t stream)
+{
   size_t bytesA = sizeof(DataT) * lda;
   size_t bytesB = sizeof(DataT) * ldb;
   if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    canberraImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
-                 isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
-                             stream);
+    canberraImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
   } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    canberraImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda,
-                 isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
-                             stream);
+    canberraImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
   } else {
     canberraImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
       x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
@@ -137,16 +170,25 @@ void canberra(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
  * @param[in] stream cuda stream to launch work
  * @param[in] isRowMajor whether the input and output matrices are row major
  */
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_ = int>
-void canberraImpl(int m, int n, int k, const InType *pA, const InType *pB,
-                  OutType *pD, FinalLambda fin_op, cudaStream_t stream,
-                  bool isRowMajor) {
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_ = int>
+void canberraImpl(int m,
+                  int n,
+                  int k,
+                  const InType* pA,
+                  const InType* pB,
+                  OutType* pD,
+                  FinalLambda fin_op,
+                  cudaStream_t stream,
+                  bool isRowMajor)
+{
   typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type
-    canberraOutType;
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type canberraOutType;
   Index_ lda, ldb, ldd;
-  canberraOutType *pDcast = reinterpret_cast<canberraOutType *>(pD);
+  canberraOutType* pDcast = reinterpret_cast<canberraOutType*>(pD);
   if (isRowMajor) {
     lda = k, ldb = k, ldd = n;
     canberra<InType, AccType, canberraOutType, Index_, FinalLambda, true>(
diff --git a/cpp/include/raft/distance/chebyshev.cuh b/cpp/include/raft/distance/chebyshev.cuh
index 8d53408cf8..b7ecdb945b 100644
--- a/cpp/include/raft/distance/chebyshev.cuh
+++ b/cpp/include/raft/distance/chebyshev.cuh
@@ -44,72 +44,105 @@ namespace distance {
  * @param[in]       fin_op the final gemm epilogue lambda
  * @param[in]       stream cuda stream to launch work
  */
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          int VecLen, typename FinalLambda, bool isRowMajor>
-static void chebyshevImpl(const DataT *x, const DataT *y, IdxT m, IdxT n,
-                          IdxT k, IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput,
-                          FinalLambda fin_op, cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          int VecLen,
+          typename FinalLambda,
+          bool isRowMajor>
+static void chebyshevImpl(const DataT* x,
+                          const DataT* y,
+                          IdxT m,
+                          IdxT n,
+                          IdxT k,
+                          IdxT lda,
+                          IdxT ldb,
+                          IdxT ldd,
+                          OutT* dOutput,
+                          FinalLambda fin_op,
+                          cudaStream_t stream)
+{
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
 
-  typedef
-    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
 
   dim3 blk(KPolicy::Nthreads);
 
   // Accumulation operation lambda
   auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
     const auto diff = raft::L1Op<AccT, IdxT>()(x - y);
-    acc = raft::myMax(acc, diff);
+    acc             = raft::myMax(acc, diff);
   };
 
   // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [] __device__(
-                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
-                         IdxT gridStrideY) { return; };
+  auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                                     DataT * regxn,
+                                     DataT * regyn,
+                                     IdxT gridStrideX,
+                                     IdxT gridStrideY) { return; };
 
   if (isRowMajor) {
-    auto chebyshevRowMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, true>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
-                                               chebyshevRowMajor);
+    auto chebyshevRowMajor = pairwiseDistanceMatKernel<false,
+                                                       DataT,
+                                                       AccT,
+                                                       OutT,
+                                                       IdxT,
+                                                       KPolicy,
+                                                       decltype(core_lambda),
+                                                       decltype(epilog_lambda),
+                                                       FinalLambda,
+                                                       true>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, chebyshevRowMajor);
 
     chebyshevRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-      epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   } else {
-    auto chebyshevColMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, false>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
-                                               chebyshevColMajor);
+    auto chebyshevColMajor = pairwiseDistanceMatKernel<false,
+                                                       DataT,
+                                                       AccT,
+                                                       OutT,
+                                                       IdxT,
+                                                       KPolicy,
+                                                       decltype(core_lambda),
+                                                       decltype(epilog_lambda),
+                                                       FinalLambda,
+                                                       false>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, chebyshevColMajor);
     chebyshevColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-      epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   }
 
   CUDA_CHECK(cudaGetLastError());
 }
 
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          typename FinalLambda, bool isRowMajor>
-void chebyshev(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
-               const DataT *x, const DataT *y, OutT *dOutput,
-               FinalLambda fin_op, cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          typename FinalLambda,
+          bool isRowMajor>
+void chebyshev(IdxT m,
+               IdxT n,
+               IdxT k,
+               IdxT lda,
+               IdxT ldb,
+               IdxT ldd,
+               const DataT* x,
+               const DataT* y,
+               OutT* dOutput,
+               FinalLambda fin_op,
+               cudaStream_t stream)
+{
   size_t bytesA = sizeof(DataT) * lda;
   size_t bytesB = sizeof(DataT) * ldb;
   if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    chebyshevImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
-                  isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
-                              stream);
+    chebyshevImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
   } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    chebyshevImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda,
-                  isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
-                              stream);
+    chebyshevImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
   } else {
     chebyshevImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
       x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
@@ -134,16 +167,25 @@ void chebyshev(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
  * @param[in] stream cuda stream to launch work
  * @param[in] isRowMajor whether the input and output matrices are row major
  */
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_ = int>
-void chebyshevImpl(int m, int n, int k, const InType *pA, const InType *pB,
-                   OutType *pD, FinalLambda fin_op, cudaStream_t stream,
-                   bool isRowMajor) {
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_ = int>
+void chebyshevImpl(int m,
+                   int n,
+                   int k,
+                   const InType* pA,
+                   const InType* pB,
+                   OutType* pD,
+                   FinalLambda fin_op,
+                   cudaStream_t stream,
+                   bool isRowMajor)
+{
   typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type
-    chebyshevOutType;
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type chebyshevOutType;
   Index_ lda, ldb, ldd;
-  chebyshevOutType *pDcast = reinterpret_cast<chebyshevOutType *>(pD);
+  chebyshevOutType* pDcast = reinterpret_cast<chebyshevOutType*>(pD);
   if (isRowMajor) {
     lda = k, ldb = k, ldd = n;
     chebyshev<InType, AccType, chebyshevOutType, Index_, FinalLambda, true>(
diff --git a/cpp/include/raft/distance/cosine.cuh b/cpp/include/raft/distance/cosine.cuh
index ed9bd28b7f..3e034e15d2 100644
--- a/cpp/include/raft/distance/cosine.cuh
+++ b/cpp/include/raft/distance/cosine.cuh
@@ -24,7 +24,7 @@ namespace distance {
 
 /**
  * @brief the cosine distance matrix calculation implementer
- *  It computes the following equation: 
+ *  It computes the following equation:
  *    C = 1 - op(A * B / sqrt(A^2) * sqrt(B^2)))
  * @tparam DataT input data-type (for A and B matrices)
  * @tparam AccT   accumulation data-type
@@ -49,30 +49,43 @@ namespace distance {
  * @param fin_op  the final gemm epilogue lambda
 *  @param stream  cuda stream to launch cuda operations.
  */
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          int VecLen, typename FinalLambda, bool isRowMajor>
-void cosineImpl(const DataT *x, const DataT *y, const DataT *xn,
-                const DataT *yn, IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb,
-                IdxT ldd, OutT *dOutput, FinalLambda fin_op,
-                cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          int VecLen,
+          typename FinalLambda,
+          bool isRowMajor>
+void cosineImpl(const DataT* x,
+                const DataT* y,
+                const DataT* xn,
+                const DataT* yn,
+                IdxT m,
+                IdxT n,
+                IdxT k,
+                IdxT lda,
+                IdxT ldb,
+                IdxT ldd,
+                OutT* dOutput,
+                FinalLambda fin_op,
+                cudaStream_t stream)
+{
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
 
-  typedef
-    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
 
   dim3 blk(KPolicy::Nthreads);
 
   // Accumulation operation lambda
-  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
-    acc += x * y;
-  };
+  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; };
 
   // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [] __device__(
-                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
-                         IdxT gridStrideY) {
+  auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                                     DataT * regxn,
+                                     DataT * regyn,
+                                     IdxT gridStrideX,
+                                     IdxT gridStrideY) {
 #pragma unroll
     for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
 #pragma unroll
@@ -85,43 +98,66 @@ void cosineImpl(const DataT *x, const DataT *y, const DataT *xn,
   constexpr size_t shmemSize =
     KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT));
   if (isRowMajor) {
-    auto cosineRowMajor =
-      pairwiseDistanceMatKernel<true, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, true>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, shmemSize, cosineRowMajor);
+    auto cosineRowMajor = pairwiseDistanceMatKernel<true,
+                                                    DataT,
+                                                    AccT,
+                                                    OutT,
+                                                    IdxT,
+                                                    KPolicy,
+                                                    decltype(core_lambda),
+                                                    decltype(epilog_lambda),
+                                                    FinalLambda,
+                                                    true>;
+    dim3 grid           = launchConfigGenerator<KPolicy>(m, n, shmemSize, cosineRowMajor);
     cosineRowMajor<<<grid, blk, shmemSize, stream>>>(
-      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda,
-      fin_op);
+      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   } else {
-    auto cosineColMajor =
-      pairwiseDistanceMatKernel<true, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, false>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, shmemSize, cosineColMajor);
+    auto cosineColMajor = pairwiseDistanceMatKernel<true,
+                                                    DataT,
+                                                    AccT,
+                                                    OutT,
+                                                    IdxT,
+                                                    KPolicy,
+                                                    decltype(core_lambda),
+                                                    decltype(epilog_lambda),
+                                                    FinalLambda,
+                                                    false>;
+    dim3 grid           = launchConfigGenerator<KPolicy>(m, n, shmemSize, cosineColMajor);
     cosineColMajor<<<grid, blk, shmemSize, stream>>>(
-      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda,
-      fin_op);
+      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   }
 
   CUDA_CHECK(cudaGetLastError());
 }
 
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          typename FinalLambda, bool isRowMajor>
-void cosine(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
-            const DataT *x, const DataT *y, const DataT *xn, const DataT *yn,
-            OutT *dOutput, FinalLambda fin_op, cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          typename FinalLambda,
+          bool isRowMajor>
+void cosine(IdxT m,
+            IdxT n,
+            IdxT k,
+            IdxT lda,
+            IdxT ldb,
+            IdxT ldd,
+            const DataT* x,
+            const DataT* y,
+            const DataT* xn,
+            const DataT* yn,
+            OutT* dOutput,
+            FinalLambda fin_op,
+            cudaStream_t stream)
+{
   size_t bytesA = sizeof(DataT) * lda;
   size_t bytesB = sizeof(DataT) * ldb;
   if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    cosineImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
-               isRowMajor>(x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput,
-                           fin_op, stream);
+    cosineImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
   } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    cosineImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda,
-               isRowMajor>(x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput,
-                           fin_op, stream);
+    cosineImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
   } else {
     cosineImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
       x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
@@ -130,7 +166,7 @@ void cosine(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
 
 /**
  * @brief the expanded cosine distance matrix calculation
- *  It computes the following equation: 
+ *  It computes the following equation:
  *              C = 1 - op(A * B / sqrt(A^2) * sqrt(B^2)))
  * @tparam IType input data-type (for A and B matrices)
  * @tparam AccType accumulation data-type
@@ -151,12 +187,23 @@ void cosine(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
  * @param stream cuda stream where to launch work
  * @param isRowMajor whether the input and output matrices are row major
  */
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_ = int>
-void cosineAlgo1(Index_ m, Index_ n, Index_ k, const InType *pA,
-                 const InType *pB, OutType *pD, AccType *workspace,
-                 size_t worksize, FinalLambda fin_op, cudaStream_t stream,
-                 bool isRowMajor) {
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_ = int>
+void cosineAlgo1(Index_ m,
+                 Index_ n,
+                 Index_ k,
+                 const InType* pA,
+                 const InType* pB,
+                 OutType* pD,
+                 AccType* workspace,
+                 size_t worksize,
+                 FinalLambda fin_op,
+                 cudaStream_t stream,
+                 bool isRowMajor)
+{
   auto norm_op = [] __device__(AccType in) { return raft::mySqrt(in); };
 
   // Wrap fin_op to allow computing 1 - pA before calling fin_op
@@ -165,39 +212,33 @@ void cosineAlgo1(Index_ m, Index_ n, Index_ k, const InType *pA,
   };
 
   typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type
-    CosOutType;
-  CosOutType *pDcast = reinterpret_cast<CosOutType *>(pD);
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type CosOutType;
+  CosOutType* pDcast = reinterpret_cast<CosOutType*>(pD);
 
-  ASSERT(!(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) ||
-           (worksize < m * sizeof(AccType))),
-         "workspace size error");
+  ASSERT(
+    !(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || (worksize < m * sizeof(AccType))),
+    "workspace size error");
   ASSERT(workspace != nullptr, "workspace is null");
 
   Index_ lda, ldb, ldd;
-  InType *col_vec = workspace;
-  InType *row_vec = workspace;
+  InType* col_vec = workspace;
+  InType* row_vec = workspace;
   if (pA != pB) {
     row_vec += m;
-    raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor,
-                          stream, norm_op);
-    raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor,
-                          stream, norm_op);
+    raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op);
+    raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream, norm_op);
   } else {
-    raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor,
-                          stream, norm_op);
+    raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op);
   }
 
   if (isRowMajor) {
     lda = k, ldb = k, ldd = n;
     cosine<InType, AccType, CosOutType, Index_, decltype(wrapped_fin_op), true>(
-      m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, pDcast, wrapped_fin_op,
-      stream);
+      m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, pDcast, wrapped_fin_op, stream);
   } else {
     lda = n, ldb = m, ldd = m;
-    cosine<InType, AccType, CosOutType, Index_, decltype(wrapped_fin_op),
-           false>(n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, pDcast,
-                  wrapped_fin_op, stream);
+    cosine<InType, AccType, CosOutType, Index_, decltype(wrapped_fin_op), false>(
+      n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, pDcast, wrapped_fin_op, stream);
   }
 }
 
diff --git a/cpp/include/raft/distance/distance.cuh b/cpp/include/raft/distance/distance.cuh
index 1b39a6ec18..1627753b43 100644
--- a/cpp/include/raft/distance/distance.cuh
+++ b/cpp/include/raft/distance/distance.cuh
@@ -32,140 +32,314 @@ namespace raft {
 namespace distance {
 
 namespace {
-template <raft::distance::DistanceType distanceType, typename InType,
-          typename AccType, typename OutType, typename FinalLambda,
+template <raft::distance::DistanceType distanceType,
+          typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
           typename Index_>
 struct DistanceImpl {
-  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
-           Index_ k, void *workspace, size_t worksize, FinalLambda fin_op,
-           cudaStream_t stream, bool isRowMajor, InType metric_arg = 2.0f) {}
+  void run(const InType* x,
+           const InType* y,
+           OutType* dist,
+           Index_ m,
+           Index_ n,
+           Index_ k,
+           void* workspace,
+           size_t worksize,
+           FinalLambda fin_op,
+           cudaStream_t stream,
+           bool isRowMajor,
+           InType metric_arg = 2.0f)
+  {
+  }
 };
 
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::L2Expanded, InType, AccType,
-                    OutType, FinalLambda, Index_> {
-  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
-           Index_ k, void *workspace, size_t worksize, FinalLambda fin_op,
-           cudaStream_t stream, bool isRowMajor, InType metric_arg) {
-    raft::distance::euclideanAlgo1<InType, AccType, OutType, FinalLambda,
-                                   Index_>(m, n, k, x, y, dist, false,
-                                           (AccType *)workspace, worksize,
-                                           fin_op, stream, isRowMajor);
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::L2Expanded,
+                    InType,
+                    AccType,
+                    OutType,
+                    FinalLambda,
+                    Index_> {
+  void run(const InType* x,
+           const InType* y,
+           OutType* dist,
+           Index_ m,
+           Index_ n,
+           Index_ k,
+           void* workspace,
+           size_t worksize,
+           FinalLambda fin_op,
+           cudaStream_t stream,
+           bool isRowMajor,
+           InType metric_arg)
+  {
+    raft::distance::euclideanAlgo1<InType, AccType, OutType, FinalLambda, Index_>(
+      m, n, k, x, y, dist, false, (AccType*)workspace, worksize, fin_op, stream, isRowMajor);
   }
 };
 
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::L2SqrtExpanded, InType,
-                    AccType, OutType, FinalLambda, Index_> {
-  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
-           Index_ k, void *workspace, size_t worksize, FinalLambda fin_op,
-           cudaStream_t stream, bool isRowMajor, InType metric_arg) {
-    raft::distance::euclideanAlgo1<InType, AccType, OutType, FinalLambda,
-                                   Index_>(m, n, k, x, y, dist, true,
-                                           (AccType *)workspace, worksize,
-                                           fin_op, stream, isRowMajor);
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::L2SqrtExpanded,
+                    InType,
+                    AccType,
+                    OutType,
+                    FinalLambda,
+                    Index_> {
+  void run(const InType* x,
+           const InType* y,
+           OutType* dist,
+           Index_ m,
+           Index_ n,
+           Index_ k,
+           void* workspace,
+           size_t worksize,
+           FinalLambda fin_op,
+           cudaStream_t stream,
+           bool isRowMajor,
+           InType metric_arg)
+  {
+    raft::distance::euclideanAlgo1<InType, AccType, OutType, FinalLambda, Index_>(
+      m, n, k, x, y, dist, true, (AccType*)workspace, worksize, fin_op, stream, isRowMajor);
   }
 };
 
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::CosineExpanded, InType,
-                    AccType, OutType, FinalLambda, Index_> {
-  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
-           Index_ k, void *workspace, size_t worksize, FinalLambda fin_op,
-           cudaStream_t stream, bool isRowMajor, InType metric_arg) {
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::CosineExpanded,
+                    InType,
+                    AccType,
+                    OutType,
+                    FinalLambda,
+                    Index_> {
+  void run(const InType* x,
+           const InType* y,
+           OutType* dist,
+           Index_ m,
+           Index_ n,
+           Index_ k,
+           void* workspace,
+           size_t worksize,
+           FinalLambda fin_op,
+           cudaStream_t stream,
+           bool isRowMajor,
+           InType metric_arg)
+  {
     raft::distance::cosineAlgo1<InType, AccType, OutType, FinalLambda, Index_>(
-      m, n, k, x, y, dist, (AccType *)workspace, worksize, fin_op, stream,
-      isRowMajor);
+      m, n, k, x, y, dist, (AccType*)workspace, worksize, fin_op, stream, isRowMajor);
   }
 };
 
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::L2Unexpanded, InType, AccType,
-                    OutType, FinalLambda, Index_> {
-  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
-           Index_ k, void *workspace, size_t worksize, FinalLambda fin_op,
-           cudaStream_t stream, bool isRowMajor, InType metric_arg) {
-    raft::distance::euclideanAlgo2<InType, AccType, OutType, FinalLambda,
-                                   Index_>(m, n, k, x, y, dist, false, fin_op,
-                                           stream, isRowMajor);
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::L2Unexpanded,
+                    InType,
+                    AccType,
+                    OutType,
+                    FinalLambda,
+                    Index_> {
+  void run(const InType* x,
+           const InType* y,
+           OutType* dist,
+           Index_ m,
+           Index_ n,
+           Index_ k,
+           void* workspace,
+           size_t worksize,
+           FinalLambda fin_op,
+           cudaStream_t stream,
+           bool isRowMajor,
+           InType metric_arg)
+  {
+    raft::distance::euclideanAlgo2<InType, AccType, OutType, FinalLambda, Index_>(
+      m, n, k, x, y, dist, false, fin_op, stream, isRowMajor);
   }
 };
 
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::L2SqrtUnexpanded, InType,
-                    AccType, OutType, FinalLambda, Index_> {
-  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
-           Index_ k, void *workspace, size_t worksize, FinalLambda fin_op,
-           cudaStream_t stream, bool isRowMajor, InType metric_arg) {
-    raft::distance::euclideanAlgo2<InType, AccType, OutType, FinalLambda,
-                                   Index_>(m, n, k, x, y, dist, true, fin_op,
-                                           stream, isRowMajor);
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::L2SqrtUnexpanded,
+                    InType,
+                    AccType,
+                    OutType,
+                    FinalLambda,
+                    Index_> {
+  void run(const InType* x,
+           const InType* y,
+           OutType* dist,
+           Index_ m,
+           Index_ n,
+           Index_ k,
+           void* workspace,
+           size_t worksize,
+           FinalLambda fin_op,
+           cudaStream_t stream,
+           bool isRowMajor,
+           InType metric_arg)
+  {
+    raft::distance::euclideanAlgo2<InType, AccType, OutType, FinalLambda, Index_>(
+      m, n, k, x, y, dist, true, fin_op, stream, isRowMajor);
   }
 };
 
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::L1, InType, AccType, OutType,
-                    FinalLambda, Index_> {
-  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
-           Index_ k, void *workspace, size_t worksize, FinalLambda fin_op,
-           cudaStream_t stream, bool isRowMajor, InType metric_arg) {
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::L1,
+                    InType,
+                    AccType,
+                    OutType,
+                    FinalLambda,
+                    Index_> {
+  void run(const InType* x,
+           const InType* y,
+           OutType* dist,
+           Index_ m,
+           Index_ n,
+           Index_ k,
+           void* workspace,
+           size_t worksize,
+           FinalLambda fin_op,
+           cudaStream_t stream,
+           bool isRowMajor,
+           InType metric_arg)
+  {
     raft::distance::l1Impl<InType, AccType, OutType, FinalLambda, Index_>(
       m, n, k, x, y, dist, fin_op, stream, isRowMajor);
   }
 };
 
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::Linf, InType, AccType,
-                    OutType, FinalLambda, Index_> {
-  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
-           Index_ k, void *workspace, size_t worksize, FinalLambda fin_op,
-           cudaStream_t stream, bool isRowMajor, InType metric_arg) {
-    raft::distance::chebyshevImpl<InType, AccType, OutType, FinalLambda,
-                                  Index_>(m, n, k, x, y, dist, fin_op, stream,
-                                          isRowMajor);
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::Linf,
+                    InType,
+                    AccType,
+                    OutType,
+                    FinalLambda,
+                    Index_> {
+  void run(const InType* x,
+           const InType* y,
+           OutType* dist,
+           Index_ m,
+           Index_ n,
+           Index_ k,
+           void* workspace,
+           size_t worksize,
+           FinalLambda fin_op,
+           cudaStream_t stream,
+           bool isRowMajor,
+           InType metric_arg)
+  {
+    raft::distance::chebyshevImpl<InType, AccType, OutType, FinalLambda, Index_>(
+      m, n, k, x, y, dist, fin_op, stream, isRowMajor);
   }
 };
 
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::HellingerExpanded, InType,
-                    AccType, OutType, FinalLambda, Index_> {
-  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
-           Index_ k, void *workspace, size_t worksize, FinalLambda fin_op,
-           cudaStream_t stream, bool isRowMajor, InType metric_arg) {
-    raft::distance::hellingerImpl<InType, AccType, OutType, FinalLambda,
-                                  Index_>(m, n, k, x, y, dist, fin_op, stream,
-                                          isRowMajor);
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::HellingerExpanded,
+                    InType,
+                    AccType,
+                    OutType,
+                    FinalLambda,
+                    Index_> {
+  void run(const InType* x,
+           const InType* y,
+           OutType* dist,
+           Index_ m,
+           Index_ n,
+           Index_ k,
+           void* workspace,
+           size_t worksize,
+           FinalLambda fin_op,
+           cudaStream_t stream,
+           bool isRowMajor,
+           InType metric_arg)
+  {
+    raft::distance::hellingerImpl<InType, AccType, OutType, FinalLambda, Index_>(
+      m, n, k, x, y, dist, fin_op, stream, isRowMajor);
   }
 };
 
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::LpUnexpanded, InType, AccType,
-                    OutType, FinalLambda, Index_> {
-  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
-           Index_ k, void *workspace, size_t worksize, FinalLambda fin_op,
-           cudaStream_t stream, bool isRowMajor, InType metric_arg) {
-    raft::distance::minkowskiImpl<InType, AccType, OutType, FinalLambda,
-                                  Index_>(m, n, k, x, y, dist, fin_op, stream,
-                                          isRowMajor, metric_arg);
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::LpUnexpanded,
+                    InType,
+                    AccType,
+                    OutType,
+                    FinalLambda,
+                    Index_> {
+  void run(const InType* x,
+           const InType* y,
+           OutType* dist,
+           Index_ m,
+           Index_ n,
+           Index_ k,
+           void* workspace,
+           size_t worksize,
+           FinalLambda fin_op,
+           cudaStream_t stream,
+           bool isRowMajor,
+           InType metric_arg)
+  {
+    raft::distance::minkowskiImpl<InType, AccType, OutType, FinalLambda, Index_>(
+      m, n, k, x, y, dist, fin_op, stream, isRowMajor, metric_arg);
   }
 };
 
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::Canberra, InType, AccType,
-                    OutType, FinalLambda, Index_> {
-  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
-           Index_ k, void *workspace, size_t worksize, FinalLambda fin_op,
-           cudaStream_t stream, bool isRowMajor, InType metric_arg) {
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::Canberra,
+                    InType,
+                    AccType,
+                    OutType,
+                    FinalLambda,
+                    Index_> {
+  void run(const InType* x,
+           const InType* y,
+           OutType* dist,
+           Index_ m,
+           Index_ n,
+           Index_ k,
+           void* workspace,
+           size_t worksize,
+           FinalLambda fin_op,
+           cudaStream_t stream,
+           bool isRowMajor,
+           InType metric_arg)
+  {
     raft::distance::canberraImpl<InType, AccType, OutType, FinalLambda, Index_>(
       m, n, k, x, y, dist, fin_op, stream, isRowMajor);
   }
@@ -189,13 +363,15 @@ struct DistanceImpl<raft::distance::DistanceType::Canberra, InType, AccType,
  * @note If the specifed distanceType doesn't need the workspace at all, it
  * returns 0.
  */
-template <raft::distance::DistanceType distanceType, typename InType,
-          typename AccType, typename OutType, typename Index_ = int>
-size_t getWorkspaceSize(const InType *x, const InType *y, Index_ m, Index_ n,
-                        Index_ k) {
-  size_t worksize = 0;
-  constexpr bool is_allocated =
-    distanceType <= raft::distance::DistanceType::CosineExpanded;
+template <raft::distance::DistanceType distanceType,
+          typename InType,
+          typename AccType,
+          typename OutType,
+          typename Index_ = int>
+size_t getWorkspaceSize(const InType* x, const InType* y, Index_ m, Index_ n, Index_ k)
+{
+  size_t worksize             = 0;
+  constexpr bool is_allocated = distanceType <= raft::distance::DistanceType::CosineExpanded;
   if (is_allocated) {
     worksize += m * sizeof(AccType);
     if (x != y) worksize += n * sizeof(AccType);
@@ -228,17 +404,27 @@ size_t getWorkspaceSize(const InType *x, const InType *y, Index_ m, Index_ n,
  * as follows:  <pre>OutType fin_op(AccType in, int g_idx);</pre>. If one needs
  * any other parameters, feel free to pass them via closure.
  */
-template <raft::distance::DistanceType distanceType, typename InType,
-          typename AccType, typename OutType, typename FinalLambda,
+template <raft::distance::DistanceType distanceType,
+          typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
           typename Index_ = int>
-void distance(const InType *x, const InType *y, OutType *dist, Index_ m,
-              Index_ n, Index_ k, void *workspace, size_t worksize,
-              FinalLambda fin_op, cudaStream_t stream, bool isRowMajor = true,
-              InType metric_arg = 2.0f) {
-  DistanceImpl<distanceType, InType, AccType, OutType, FinalLambda, Index_>
-    distImpl;
-  distImpl.run(x, y, dist, m, n, k, workspace, worksize, fin_op, stream,
-               isRowMajor, metric_arg);
+void distance(const InType* x,
+              const InType* y,
+              OutType* dist,
+              Index_ m,
+              Index_ n,
+              Index_ k,
+              void* workspace,
+              size_t worksize,
+              FinalLambda fin_op,
+              cudaStream_t stream,
+              bool isRowMajor   = true,
+              InType metric_arg = 2.0f)
+{
+  DistanceImpl<distanceType, InType, AccType, OutType, FinalLambda, Index_> distImpl;
+  distImpl.run(x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, metric_arg);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -263,18 +449,26 @@ void distance(const InType *x, const InType *y, OutType *dist, Index_ m,
  * @note if workspace is passed as nullptr, this will return in
  *  worksize, the number of bytes of workspace required
  */
-template <raft::distance::DistanceType distanceType, typename InType,
-          typename AccType, typename OutType, typename Index_ = int>
-void distance(const InType *x, const InType *y, OutType *dist, Index_ m,
-              Index_ n, Index_ k, void *workspace, size_t worksize,
-              cudaStream_t stream, bool isRowMajor = true,
-              InType metric_arg = 2.0f) {
-  auto default_fin_op = [] __device__(AccType d_val, Index_ g_d_idx) {
-    return d_val;
-  };
-  distance<distanceType, InType, AccType, OutType, decltype(default_fin_op),
-           Index_>(x, y, dist, m, n, k, workspace, worksize, default_fin_op,
-                   stream, isRowMajor, metric_arg);
+template <raft::distance::DistanceType distanceType,
+          typename InType,
+          typename AccType,
+          typename OutType,
+          typename Index_ = int>
+void distance(const InType* x,
+              const InType* y,
+              OutType* dist,
+              Index_ m,
+              Index_ n,
+              Index_ k,
+              void* workspace,
+              size_t worksize,
+              cudaStream_t stream,
+              bool isRowMajor   = true,
+              InType metric_arg = 2.0f)
+{
+  auto default_fin_op = [] __device__(AccType d_val, Index_ g_d_idx) { return d_val; };
+  distance<distanceType, InType, AccType, OutType, decltype(default_fin_op), Index_>(
+    x, y, dist, m, n, k, workspace, worksize, default_fin_op, stream, isRowMajor, metric_arg);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -298,39 +492,47 @@ void distance(const InType *x, const InType *y, OutType *dist, Index_ m,
  * @param isRowMajor whether the matrices are row-major or col-major
  */
 template <typename Type, typename Index_, raft::distance::DistanceType DistType>
-void pairwise_distance_impl(const Type *x, const Type *y, Type *dist, Index_ m,
-                            Index_ n, Index_ k,
-                            raft::mr::device::buffer<char> &workspace,
-                            cudaStream_t stream, bool isRowMajor,
-                            Type metric_arg = 2.0f) {
-  auto worksize =
-    getWorkspaceSize<DistType, Type, Type, Type, Index_>(x, y, m, n, k);
+void pairwise_distance_impl(const Type* x,
+                            const Type* y,
+                            Type* dist,
+                            Index_ m,
+                            Index_ n,
+                            Index_ k,
+                            raft::mr::device::buffer<char>& workspace,
+                            cudaStream_t stream,
+                            bool isRowMajor,
+                            Type metric_arg = 2.0f)
+{
+  auto worksize = getWorkspaceSize<DistType, Type, Type, Type, Index_>(x, y, m, n, k);
   workspace.resize(worksize, stream);
-  distance<DistType, Type, Type, Type, Index_>(x, y, dist, m, n, k,
-                                               workspace.data(), worksize,
-                                               stream, isRowMajor, metric_arg);
+  distance<DistType, Type, Type, Type, Index_>(
+    x, y, dist, m, n, k, workspace.data(), worksize, stream, isRowMajor, metric_arg);
 }
 
 template <typename Type, typename Index_ = int>
-void pairwise_distance(const Type *x, const Type *y, Type *dist, Index_ m,
-                       Index_ n, Index_ k,
-                       raft::mr::device::buffer<char> &workspace,
-                       raft::distance::DistanceType metric, cudaStream_t stream,
-                       bool isRowMajor = true, Type metric_arg = 2.0f) {
+void pairwise_distance(const Type* x,
+                       const Type* y,
+                       Type* dist,
+                       Index_ m,
+                       Index_ n,
+                       Index_ k,
+                       raft::mr::device::buffer<char>& workspace,
+                       raft::distance::DistanceType metric,
+                       cudaStream_t stream,
+                       bool isRowMajor = true,
+                       Type metric_arg = 2.0f)
+{
   switch (metric) {
     case raft::distance::DistanceType::L2Expanded:
-      pairwise_distance_impl<Type, Index_,
-                             raft::distance::DistanceType::L2Expanded>(
+      pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::L2Expanded>(
         x, y, dist, m, n, k, workspace, stream, isRowMajor);
       break;
     case raft::distance::DistanceType::L2SqrtExpanded:
-      pairwise_distance_impl<Type, Index_,
-                             raft::distance::DistanceType::L2SqrtExpanded>(
+      pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::L2SqrtExpanded>(
         x, y, dist, m, n, k, workspace, stream, isRowMajor);
       break;
     case raft::distance::DistanceType::CosineExpanded:
-      pairwise_distance_impl<Type, Index_,
-                             raft::distance::DistanceType::CosineExpanded>(
+      pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::CosineExpanded>(
         x, y, dist, m, n, k, workspace, stream, isRowMajor);
       break;
     case raft::distance::DistanceType::L1:
@@ -338,13 +540,11 @@ void pairwise_distance(const Type *x, const Type *y, Type *dist, Index_ m,
         x, y, dist, m, n, k, workspace, stream, isRowMajor);
       break;
     case raft::distance::DistanceType::L2Unexpanded:
-      pairwise_distance_impl<Type, Index_,
-                             raft::distance::DistanceType::L2Unexpanded>(
+      pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::L2Unexpanded>(
         x, y, dist, m, n, k, workspace, stream, isRowMajor);
       break;
     case raft::distance::DistanceType::L2SqrtUnexpanded:
-      pairwise_distance_impl<Type, Index_,
-                             raft::distance::DistanceType::L2SqrtUnexpanded>(
+      pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::L2SqrtUnexpanded>(
         x, y, dist, m, n, k, workspace, stream, isRowMajor);
       break;
     case raft::distance::DistanceType::Linf:
@@ -352,22 +552,18 @@ void pairwise_distance(const Type *x, const Type *y, Type *dist, Index_ m,
         x, y, dist, m, n, k, workspace, stream, isRowMajor);
       break;
     case raft::distance::DistanceType::HellingerExpanded:
-      pairwise_distance_impl<Type, Index_,
-                             raft::distance::DistanceType::HellingerExpanded>(
+      pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::HellingerExpanded>(
         x, y, dist, m, n, k, workspace, stream, isRowMajor);
       break;
     case raft::distance::DistanceType::LpUnexpanded:
-      pairwise_distance_impl<Type, Index_,
-                             raft::distance::DistanceType::LpUnexpanded>(
+      pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::LpUnexpanded>(
         x, y, dist, m, n, k, workspace, stream, isRowMajor, metric_arg);
       break;
     case raft::distance::DistanceType::Canberra:
-      pairwise_distance_impl<Type, Index_,
-                             raft::distance::DistanceType::Canberra>(
+      pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::Canberra>(
         x, y, dist, m, n, k, workspace, stream, isRowMajor);
       break;
-    default:
-      THROW("Unknown or unsupported distance metric '%d'!", (int)metric);
+    default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric);
   };
 }
 /** @} */
diff --git a/cpp/include/raft/distance/euclidean.cuh b/cpp/include/raft/distance/euclidean.cuh
index 484da0e5bf..46d0a1a4a9 100644
--- a/cpp/include/raft/distance/euclidean.cuh
+++ b/cpp/include/raft/distance/euclidean.cuh
@@ -48,30 +48,44 @@ namespace distance {
  * @param fin_op  the final gemm epilogue lambda
 *  @param stream  cuda stream to launch cuda operations.
  */
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          int VecLen, typename FinalLambda, bool isRowMajor>
-void euclideanExpImpl(const DataT *x, const DataT *y, const DataT *xn,
-                      const DataT *yn, IdxT m, IdxT n, IdxT k, IdxT lda,
-                      IdxT ldb, IdxT ldd, bool sqrt, OutT *dOutput,
-                      FinalLambda fin_op, cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          int VecLen,
+          typename FinalLambda,
+          bool isRowMajor>
+void euclideanExpImpl(const DataT* x,
+                      const DataT* y,
+                      const DataT* xn,
+                      const DataT* yn,
+                      IdxT m,
+                      IdxT n,
+                      IdxT k,
+                      IdxT lda,
+                      IdxT ldb,
+                      IdxT ldd,
+                      bool sqrt,
+                      OutT* dOutput,
+                      FinalLambda fin_op,
+                      cudaStream_t stream)
+{
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
 
-  typedef
-    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
 
   dim3 blk(KPolicy::Nthreads);
 
   // Accumulation operation lambda
-  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
-    acc += x * y;
-  };
+  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; };
 
   // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [sqrt] __device__(
-                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
-                         IdxT gridStrideY) {
+  auto epilog_lambda = [sqrt] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                                         DataT * regxn,
+                                         DataT * regyn,
+                                         IdxT gridStrideX,
+                                         IdxT gridStrideY) {
 #pragma unroll
     for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
 #pragma unroll
@@ -93,47 +107,68 @@ void euclideanExpImpl(const DataT *x, const DataT *y, const DataT *xn,
   constexpr size_t shmemSize =
     KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT));
   if (isRowMajor) {
-    auto euclideanExpRowMajor =
-      pairwiseDistanceMatKernel<true, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, true>;
-    dim3 grid =
-      launchConfigGenerator<KPolicy>(m, n, shmemSize, euclideanExpRowMajor);
+    auto euclideanExpRowMajor = pairwiseDistanceMatKernel<true,
+                                                          DataT,
+                                                          AccT,
+                                                          OutT,
+                                                          IdxT,
+                                                          KPolicy,
+                                                          decltype(core_lambda),
+                                                          decltype(epilog_lambda),
+                                                          FinalLambda,
+                                                          true>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, shmemSize, euclideanExpRowMajor);
 
     euclideanExpRowMajor<<<grid, blk, shmemSize, stream>>>(
-      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda,
-      fin_op);
+      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   } else {
-    auto euclideanExpColMajor =
-      pairwiseDistanceMatKernel<true, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, false>;
-    dim3 grid =
-      launchConfigGenerator<KPolicy>(m, n, shmemSize, euclideanExpColMajor);
+    auto euclideanExpColMajor = pairwiseDistanceMatKernel<true,
+                                                          DataT,
+                                                          AccT,
+                                                          OutT,
+                                                          IdxT,
+                                                          KPolicy,
+                                                          decltype(core_lambda),
+                                                          decltype(epilog_lambda),
+                                                          FinalLambda,
+                                                          false>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, shmemSize, euclideanExpColMajor);
     euclideanExpColMajor<<<grid, blk, shmemSize, stream>>>(
-      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda,
-      fin_op);
+      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   }
 
   CUDA_CHECK(cudaGetLastError());
 }
 
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          typename FinalLambda, bool isRowMajor>
-void euclideanExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
-                  const DataT *x, const DataT *y, const DataT *xn,
-                  const DataT *yn, bool sqrt, OutT *dOutput, FinalLambda fin_op,
-                  cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          typename FinalLambda,
+          bool isRowMajor>
+void euclideanExp(IdxT m,
+                  IdxT n,
+                  IdxT k,
+                  IdxT lda,
+                  IdxT ldb,
+                  IdxT ldd,
+                  const DataT* x,
+                  const DataT* y,
+                  const DataT* xn,
+                  const DataT* yn,
+                  bool sqrt,
+                  OutT* dOutput,
+                  FinalLambda fin_op,
+                  cudaStream_t stream)
+{
   size_t bytesA = sizeof(DataT) * lda;
   size_t bytesB = sizeof(DataT) * ldb;
   if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    euclideanExpImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
-                     isRowMajor>(x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt,
-                                 dOutput, fin_op, stream);
+    euclideanExpImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream);
   } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    euclideanExpImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda,
-                     isRowMajor>(x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt,
-                                 dOutput, fin_op, stream);
+    euclideanExpImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream);
   } else {
     euclideanExpImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
       x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream);
@@ -161,53 +196,59 @@ void euclideanExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
  * @param stream cuda stream where to launch work
  * @param isRowMajor whether the input and output matrices are row major
  */
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_ = int>
-void euclideanAlgo1(Index_ m, Index_ n, Index_ k, const InType *pA,
-                    const InType *pB, OutType *pD, bool enable_sqrt,
-                    AccType *workspace, size_t &worksize, FinalLambda fin_op,
-                    cudaStream_t stream, bool isRowMajor) {
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_ = int>
+void euclideanAlgo1(Index_ m,
+                    Index_ n,
+                    Index_ k,
+                    const InType* pA,
+                    const InType* pB,
+                    OutType* pD,
+                    bool enable_sqrt,
+                    AccType* workspace,
+                    size_t& worksize,
+                    FinalLambda fin_op,
+                    cudaStream_t stream,
+                    bool isRowMajor)
+{
   auto norm_op = [] __device__(InType in) { return in; };
 
   typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type
-    ExpOutType;
-  ExpOutType *pDcast = reinterpret_cast<ExpOutType *>(pD);
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type ExpOutType;
+  ExpOutType* pDcast = reinterpret_cast<ExpOutType*>(pD);
 
-  ASSERT(!(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) ||
-           (worksize < m * sizeof(AccType))),
-         "workspace size error");
+  ASSERT(
+    !(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || (worksize < m * sizeof(AccType))),
+    "workspace size error");
   ASSERT(workspace != nullptr, "workspace is null");
 
   Index_ lda, ldb, ldd;
-  InType *col_vec = workspace;
-  InType *row_vec = workspace;
+  InType* col_vec = workspace;
+  InType* row_vec = workspace;
   if (pA != pB) {
     row_vec += m;
-    raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor,
-                          stream, norm_op);
-    raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor,
-                          stream, norm_op);
+    raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op);
+    raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream, norm_op);
   } else {
-    raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor,
-                          stream, norm_op);
+    raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op);
   }
 
   if (isRowMajor) {
     lda = k, ldb = k, ldd = n;
     euclideanExp<InType, AccType, ExpOutType, Index_, FinalLambda, true>(
-      m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, enable_sqrt, pDcast,
-      fin_op, stream);
+      m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, enable_sqrt, pDcast, fin_op, stream);
   } else {
     lda = n, ldb = m, ldd = m;
     euclideanExp<InType, AccType, ExpOutType, Index_, FinalLambda, false>(
-      n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, enable_sqrt, pDcast,
-      fin_op, stream);
+      n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, enable_sqrt, pDcast, fin_op, stream);
   }
 }
 
 /**
- * @brief the unexpanded euclidean distance matrix calculation 
+ * @brief the unexpanded euclidean distance matrix calculation
  *  It computes the following equation: cij = op((ai-bj)^2)
  * @tparam DataT          input data-type (for A and B matrices)
  * @tparam AccT           accumulation data-type
@@ -227,16 +268,30 @@ void euclideanAlgo1(Index_ m, Index_ n, Index_ k, const InType *pA,
  * @param[output]   pD output matrix
  * @param fin_op    the final gemm epilogue lambda
  */
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          int VecLen, typename FinalLambda, bool isRowMajor>
-void euclideanUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
-                        IdxT lda, IdxT ldb, IdxT ldd, bool sqrt, OutT *dOutput,
-                        FinalLambda fin_op, cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          int VecLen,
+          typename FinalLambda,
+          bool isRowMajor>
+void euclideanUnExpImpl(const DataT* x,
+                        const DataT* y,
+                        IdxT m,
+                        IdxT n,
+                        IdxT k,
+                        IdxT lda,
+                        IdxT ldb,
+                        IdxT ldd,
+                        bool sqrt,
+                        OutT* dOutput,
+                        FinalLambda fin_op,
+                        cudaStream_t stream)
+{
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
 
-  typedef
-    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
 
   dim3 blk(KPolicy::Nthreads);
 
@@ -247,10 +302,11 @@ void euclideanUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
   };
 
   // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [sqrt] __device__(
-                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
-                         IdxT gridStrideY) {
+  auto epilog_lambda = [sqrt] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                                         DataT * regxn,
+                                         DataT * regyn,
+                                         IdxT gridStrideX,
+                                         IdxT gridStrideY) {
     if (sqrt) {
 #pragma unroll
       for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
@@ -263,48 +319,68 @@ void euclideanUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
   };
 
   if (isRowMajor) {
-    auto euclideanUnExpRowMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, true>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
-                                               euclideanUnExpRowMajor);
+    auto euclideanUnExpRowMajor = pairwiseDistanceMatKernel<false,
+                                                            DataT,
+                                                            AccT,
+                                                            OutT,
+                                                            IdxT,
+                                                            KPolicy,
+                                                            decltype(core_lambda),
+                                                            decltype(epilog_lambda),
+                                                            FinalLambda,
+                                                            true>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, euclideanUnExpRowMajor);
 
     euclideanUnExpRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-      epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
 
   } else {
-    auto euclideanUnExpColMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, false>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
-                                               euclideanUnExpColMajor);
+    auto euclideanUnExpColMajor = pairwiseDistanceMatKernel<false,
+                                                            DataT,
+                                                            AccT,
+                                                            OutT,
+                                                            IdxT,
+                                                            KPolicy,
+                                                            decltype(core_lambda),
+                                                            decltype(epilog_lambda),
+                                                            FinalLambda,
+                                                            false>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, euclideanUnExpColMajor);
 
     euclideanUnExpColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-      epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   }
 
   CUDA_CHECK(cudaGetLastError());
 }
 
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          typename FinalLambda, bool isRowMajor>
-void euclideanUnExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
-                    const DataT *x, const DataT *y, bool sqrt, OutT *dOutput,
-                    FinalLambda fin_op, cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          typename FinalLambda,
+          bool isRowMajor>
+void euclideanUnExp(IdxT m,
+                    IdxT n,
+                    IdxT k,
+                    IdxT lda,
+                    IdxT ldb,
+                    IdxT ldd,
+                    const DataT* x,
+                    const DataT* y,
+                    bool sqrt,
+                    OutT* dOutput,
+                    FinalLambda fin_op,
+                    cudaStream_t stream)
+{
   size_t bytesA = sizeof(DataT) * lda;
   size_t bytesB = sizeof(DataT) * ldb;
   if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    euclideanUnExpImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
-                       isRowMajor>(x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput,
-                                   fin_op, stream);
+    euclideanUnExpImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream);
   } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    euclideanUnExpImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda,
-                       isRowMajor>(x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput,
-                                   fin_op, stream);
+    euclideanUnExpImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream);
   } else {
     euclideanUnExpImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
       x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream);
@@ -330,15 +406,25 @@ void euclideanUnExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
  * @param stream cuda stream where to launch work
  * @param isRowMajor whether the input and output matrices are row major
  */
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_ = int>
-void euclideanAlgo2(Index_ m, Index_ n, Index_ k, const InType *pA,
-                    const InType *pB, OutType *pD, bool enable_sqrt,
-                    FinalLambda fin_op, cudaStream_t stream, bool isRowMajor) {
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_ = int>
+void euclideanAlgo2(Index_ m,
+                    Index_ n,
+                    Index_ k,
+                    const InType* pA,
+                    const InType* pB,
+                    OutType* pD,
+                    bool enable_sqrt,
+                    FinalLambda fin_op,
+                    cudaStream_t stream,
+                    bool isRowMajor)
+{
   typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type
-    UnExpOutType;
-  UnExpOutType *pDcast = reinterpret_cast<UnExpOutType *>(pD);
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type UnExpOutType;
+  UnExpOutType* pDcast = reinterpret_cast<UnExpOutType*>(pD);
   Index_ lda, ldb, ldd;
 
   if (isRowMajor) {
diff --git a/cpp/include/raft/distance/fused_l2_nn.cuh b/cpp/include/raft/distance/fused_l2_nn.cuh
index b96a536e38..f80b4eb8f7 100644
--- a/cpp/include/raft/distance/fused_l2_nn.cuh
+++ b/cpp/include/raft/distance/fused_l2_nn.cuh
@@ -35,24 +35,24 @@ template <typename LabelT, typename DataT>
 struct KVPMinReduce {
   typedef cub::KeyValuePair<LabelT, DataT> KVP;
 
-  DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) {
-    return b.value < a.value ? b : a;
-  }
+  DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { return b.value < a.value ? b : a; }
 
 };  // KVPMinReduce
 
 template <typename LabelT, typename DataT>
 struct MinAndDistanceReduceOp {
   typedef typename cub::KeyValuePair<LabelT, DataT> KVP;
-  DI void operator()(LabelT rid, KVP* out, const KVP& other) {
+  DI void operator()(LabelT rid, KVP* out, const KVP& other)
+  {
     if (other.value < out->value) {
-      out->key = other.key;
+      out->key   = other.key;
       out->value = other.value;
     }
   }
 
-  DI void init(KVP* out, DataT maxVal) {
-    out->key = -1;
+  DI void init(KVP* out, DataT maxVal)
+  {
+    out->key   = -1;
     out->value = maxVal;
   }
 };
@@ -60,30 +60,28 @@ struct MinAndDistanceReduceOp {
 template <typename LabelT, typename DataT>
 struct MinReduceOp {
   typedef typename cub::KeyValuePair<LabelT, DataT> KVP;
-  DI void operator()(LabelT rid, DataT* out, const KVP& other) {
-    if (other.value < *out) {
-      *out = other.value;
-    }
+  DI void operator()(LabelT rid, DataT* out, const KVP& other)
+  {
+    if (other.value < *out) { *out = other.value; }
   }
 
   DI void init(DataT* out, DataT maxVal) { *out = maxVal; }
 };
 
 template <typename DataT, typename OutT, typename IdxT, typename ReduceOpT>
-__global__ void initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) {
+__global__ void initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp)
+{
   auto tid = IdxT(blockIdx.x) * blockDim.x + threadIdx.x;
-  if (tid < m) {
-    redOp.init(min + tid, maxVal);
-  }
+  if (tid < m) { redOp.init(min + tid, maxVal); }
 }
 
 // TODO: specialize this function for MinAndDistanceReduceOp<int, float>
 // with atomicCAS of 64 bit which will eliminate mutex and shfls
-template <typename P, typename OutT, typename IdxT, typename KVPair,
-          typename ReduceOpT>
-DI void updateReducedVal(int* mutex, OutT* min, KVPair* val, ReduceOpT red_op,
-                         IdxT m, IdxT gridStrideY) {
-  const auto lid = threadIdx.x % raft::WarpSize;
+template <typename P, typename OutT, typename IdxT, typename KVPair, typename ReduceOpT>
+DI void updateReducedVal(
+  int* mutex, OutT* min, KVPair* val, ReduceOpT red_op, IdxT m, IdxT gridStrideY)
+{
+  const auto lid      = threadIdx.x % raft::WarpSize;
   const auto accrowid = threadIdx.x / P::AccThCols;
 
   // for now have first lane from each warp update a unique output row. This
@@ -108,21 +106,38 @@ DI void updateReducedVal(int* mutex, OutT* min, KVPair* val, ReduceOpT red_op,
     if (j < (raft::WarpSize / P::AccThCols) - 1) {
 #pragma unroll
       for (int i = 0; i < P::AccRowsPerTh; ++i) {
-        auto tmpkey = raft::shfl(val[i].key, (j + 1) * P::AccThCols);
+        auto tmpkey   = raft::shfl(val[i].key, (j + 1) * P::AccThCols);
         auto tmpvalue = raft::shfl(val[i].value, (j + 1) * P::AccThCols);
-        val[i] = {tmpkey, tmpvalue};
+        val[i]        = {tmpkey, tmpvalue};
       }
     }
   }
 }
 
-template <typename DataT, typename OutT, typename IdxT, bool Sqrt, typename P,
-          typename ReduceOpT, typename KVPReduceOpT, typename CoreLambda,
+template <typename DataT,
+          typename OutT,
+          typename IdxT,
+          bool Sqrt,
+          typename P,
+          typename ReduceOpT,
+          typename KVPReduceOpT,
+          typename CoreLambda,
           typename FinalLambda>
-__global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(
-  OutT* min, const DataT* x, const DataT* y, const DataT* xn, const DataT* yn,
-  IdxT m, IdxT n, IdxT k, DataT maxVal, int* mutex, ReduceOpT redOp,
-  KVPReduceOpT pairRedOp, CoreLambda core_op, FinalLambda fin_op) {
+__global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min,
+                                                                  const DataT* x,
+                                                                  const DataT* y,
+                                                                  const DataT* xn,
+                                                                  const DataT* yn,
+                                                                  IdxT m,
+                                                                  IdxT n,
+                                                                  IdxT k,
+                                                                  DataT maxVal,
+                                                                  int* mutex,
+                                                                  ReduceOpT redOp,
+                                                                  KVPReduceOpT pairRedOp,
+                                                                  CoreLambda core_op,
+                                                                  FinalLambda fin_op)
+{
   extern __shared__ char smem[];
 
   typedef cub::KeyValuePair<IdxT, DataT> KVPair;
@@ -135,7 +150,9 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(
   // epilogue operation lambda for final value calculation
   auto epilog_lambda = [n, pairRedOp, &val, maxVal] __device__(
                          DataT acc[P::AccRowsPerTh][P::AccColsPerTh],
-                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
+                         DataT * regxn,
+                         DataT * regyn,
+                         IdxT gridStrideX,
                          IdxT gridStrideY) {
     KVPReduceOpT pairRed_op(pairRedOp);
 
@@ -164,72 +181,105 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(
 #pragma unroll
       for (int j = 0; j < P::AccColsPerTh; ++j) {
         auto tmpkey = acccolid + j * P::AccThCols + gridStrideX;
-        KVPair tmp = {tmpkey, acc[i][j]};
+        KVPair tmp  = {tmpkey, acc[i][j]};
         if (tmpkey < n) {
-          val[i] =
-            pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]);
+          val[i] = pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]);
         }
       }
     }
   };
 
-  auto rowEpilog_lambda = [m, mutex, min, pairRedOp, redOp, &val,
-                           maxVal] __device__(IdxT gridStrideY) {
-    KVPReduceOpT pairRed_op(pairRedOp);
-    ReduceOpT red_op(redOp);
+  auto rowEpilog_lambda =
+    [m, mutex, min, pairRedOp, redOp, &val, maxVal] __device__(IdxT gridStrideY) {
+      KVPReduceOpT pairRed_op(pairRedOp);
+      ReduceOpT red_op(redOp);
 
-    const auto accrowid = threadIdx.x / P::AccThCols;
-    const auto lid = raft::laneId();
+      const auto accrowid = threadIdx.x / P::AccThCols;
+      const auto lid      = raft::laneId();
 
     // reduce
 #pragma unroll
-    for (int i = 0; i < P::AccRowsPerTh; ++i) {
+      for (int i = 0; i < P::AccRowsPerTh; ++i) {
 #pragma unroll
-      for (int j = P::AccThCols / 2; j > 0; j >>= 1) {
-        auto tmpkey = raft::shfl(val[i].key, lid + j);
-        auto tmpvalue = raft::shfl(val[i].value, lid + j);
-        KVPair tmp = {tmpkey, tmpvalue};
-        val[i] =
-          pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]);
+        for (int j = P::AccThCols / 2; j > 0; j >>= 1) {
+          auto tmpkey   = raft::shfl(val[i].key, lid + j);
+          auto tmpvalue = raft::shfl(val[i].value, lid + j);
+          KVPair tmp    = {tmpkey, tmpvalue};
+          val[i]        = pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]);
+        }
       }
-    }
 
-    updateReducedVal<P, OutT, IdxT, KVPair, ReduceOpT>(mutex, min, val, red_op,
-                                                       m, gridStrideY);
+      updateReducedVal<P, OutT, IdxT, KVPair, ReduceOpT>(mutex, min, val, red_op, m, gridStrideY);
 
     // reset the val array.
 #pragma unroll
-    for (int i = 0; i < P::AccRowsPerTh; ++i) {
-      val[i] = {-1, maxVal};
-    }
-  };
+      for (int i = 0; i < P::AccRowsPerTh; ++i) {
+        val[i] = {-1, maxVal};
+      }
+    };
 
   IdxT lda = k, ldb = k, ldd = n;
-  PairwiseDistances<true, DataT, DataT, DataT, IdxT, P, CoreLambda,
-                    decltype(epilog_lambda), FinalLambda,
-                    decltype(rowEpilog_lambda), true, false>
-    obj(x, y, m, n, k, lda, ldb, ldd, xn, yn, nullptr, smem, core_op,
-        epilog_lambda, fin_op, rowEpilog_lambda);
+  PairwiseDistances<true,
+                    DataT,
+                    DataT,
+                    DataT,
+                    IdxT,
+                    P,
+                    CoreLambda,
+                    decltype(epilog_lambda),
+                    FinalLambda,
+                    decltype(rowEpilog_lambda),
+                    true,
+                    false>
+    obj(x,
+        y,
+        m,
+        n,
+        k,
+        lda,
+        ldb,
+        ldd,
+        xn,
+        yn,
+        nullptr,
+        smem,
+        core_op,
+        epilog_lambda,
+        fin_op,
+        rowEpilog_lambda);
   obj.run();
 }
 
-template <typename DataT, typename OutT, typename IdxT, int VecLen,
-          typename ReduceOpT, typename KVPReduceOpT>
-void fusedL2NNImpl(OutT* min, const DataT* x, const DataT* y, const DataT* xn,
-                   const DataT* yn, IdxT m, IdxT n, IdxT k, int* workspace,
-                   ReduceOpT redOp, KVPReduceOpT pairRedOp, bool sqrt,
-                   bool initOutBuffer, cudaStream_t stream) {
+template <typename DataT,
+          typename OutT,
+          typename IdxT,
+          int VecLen,
+          typename ReduceOpT,
+          typename KVPReduceOpT>
+void fusedL2NNImpl(OutT* min,
+                   const DataT* x,
+                   const DataT* y,
+                   const DataT* xn,
+                   const DataT* yn,
+                   IdxT m,
+                   IdxT n,
+                   IdxT k,
+                   int* workspace,
+                   ReduceOpT redOp,
+                   KVPReduceOpT pairRedOp,
+                   bool sqrt,
+                   bool initOutBuffer,
+                   cudaStream_t stream)
+{
   typedef typename linalg::Policy4x4<DataT, VecLen>::Policy P;
 
   dim3 blk(P::Nthreads);
-  auto nblks = raft::ceildiv<int>(m, P::Nthreads);
+  auto nblks            = raft::ceildiv<int>(m, P::Nthreads);
   constexpr auto maxVal = std::numeric_limits<DataT>::max();
   typedef cub::KeyValuePair<IdxT, DataT> KVPair;
 
   // Accumulation operation lambda
-  auto core_lambda = [] __device__(DataT & acc, DataT & x, DataT & y) {
-    acc += x * y;
-  };
+  auto core_lambda = [] __device__(DataT & acc, DataT & x, DataT & y) { acc += x * y; };
 
   CUDA_CHECK(cudaMemsetAsync(workspace, 0, sizeof(int) * m, stream));
   if (initOutBuffer) {
@@ -240,25 +290,34 @@ void fusedL2NNImpl(OutT* min, const DataT* x, const DataT* y, const DataT* xn,
 
   auto fin_op = [] __device__(DataT d_val, int g_d_idx) { return d_val; };
 
-  constexpr size_t shmemSize =
-    P::SmemSize + ((P::Mblk + P::Nblk) * sizeof(DataT));
+  constexpr size_t shmemSize = P::SmemSize + ((P::Mblk + P::Nblk) * sizeof(DataT));
   if (sqrt) {
-    auto fusedL2NNSqrt =
-      fusedL2NNkernel<DataT, OutT, IdxT, true, P, ReduceOpT, KVPReduceOpT,
-                      decltype(core_lambda), decltype(fin_op)>;
-    dim3 grid = launchConfigGenerator<P>(m, n, shmemSize, fusedL2NNSqrt);
+    auto fusedL2NNSqrt = fusedL2NNkernel<DataT,
+                                         OutT,
+                                         IdxT,
+                                         true,
+                                         P,
+                                         ReduceOpT,
+                                         KVPReduceOpT,
+                                         decltype(core_lambda),
+                                         decltype(fin_op)>;
+    dim3 grid          = launchConfigGenerator<P>(m, n, shmemSize, fusedL2NNSqrt);
 
     fusedL2NNSqrt<<<grid, blk, shmemSize, stream>>>(
-      min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp,
-      core_lambda, fin_op);
+      min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, core_lambda, fin_op);
   } else {
-    auto fusedL2NN =
-      fusedL2NNkernel<DataT, OutT, IdxT, false, P, ReduceOpT, KVPReduceOpT,
-                      decltype(core_lambda), decltype(fin_op)>;
-    dim3 grid = launchConfigGenerator<P>(m, n, shmemSize, fusedL2NN);
-    fusedL2NN<<<grid, blk, shmemSize, stream>>>(min, x, y, xn, yn, m, n, k,
-                                                maxVal, workspace, redOp,
-                                                pairRedOp, core_lambda, fin_op);
+    auto fusedL2NN = fusedL2NNkernel<DataT,
+                                     OutT,
+                                     IdxT,
+                                     false,
+                                     P,
+                                     ReduceOpT,
+                                     KVPReduceOpT,
+                                     decltype(core_lambda),
+                                     decltype(fin_op)>;
+    dim3 grid      = launchConfigGenerator<P>(m, n, shmemSize, fusedL2NN);
+    fusedL2NN<<<grid, blk, shmemSize, stream>>>(
+      min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, core_lambda, fin_op);
   }
 
   CUDA_CHECK(cudaGetLastError());
@@ -299,25 +358,32 @@ void fusedL2NNImpl(OutT* min, const DataT* x, const DataT* y, const DataT* xn,
  *                           main kernel launch
  * @param[in]  stream        cuda stream
  */
-template <typename DataT, typename OutT, typename IdxT, typename ReduceOpT,
-          typename KVPReduceOpT>
-void fusedL2NN(OutT* min, const DataT* x, const DataT* y, const DataT* xn,
-               const DataT* yn, IdxT m, IdxT n, IdxT k, void* workspace,
-               ReduceOpT redOp, KVPReduceOpT pairRedOp, bool sqrt,
-               bool initOutBuffer, cudaStream_t stream) {
+template <typename DataT, typename OutT, typename IdxT, typename ReduceOpT, typename KVPReduceOpT>
+void fusedL2NN(OutT* min,
+               const DataT* x,
+               const DataT* y,
+               const DataT* xn,
+               const DataT* yn,
+               IdxT m,
+               IdxT n,
+               IdxT k,
+               void* workspace,
+               ReduceOpT redOp,
+               KVPReduceOpT pairRedOp,
+               bool sqrt,
+               bool initOutBuffer,
+               cudaStream_t stream)
+{
   size_t bytes = sizeof(DataT) * k;
   if (16 % sizeof(DataT) == 0 && bytes % 16 == 0) {
     fusedL2NNImpl<DataT, OutT, IdxT, 16 / sizeof(DataT), ReduceOpT>(
-      min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt,
-      initOutBuffer, stream);
+      min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream);
   } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0) {
     fusedL2NNImpl<DataT, OutT, IdxT, 8 / sizeof(DataT), ReduceOpT>(
-      min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt,
-      initOutBuffer, stream);
+      min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream);
   } else {
     fusedL2NNImpl<DataT, OutT, IdxT, 1, ReduceOpT>(
-      min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt,
-      initOutBuffer, stream);
+      min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream);
   }
 }
 
diff --git a/cpp/include/raft/distance/hellinger.cuh b/cpp/include/raft/distance/hellinger.cuh
index f7ad3ed1ba..c8c7dad7d4 100644
--- a/cpp/include/raft/distance/hellinger.cuh
+++ b/cpp/include/raft/distance/hellinger.cuh
@@ -23,7 +23,7 @@ namespace distance {
 
 /**
  * @brief the Hellinger distance matrix using the expanded form:
- *  It computes the following equation: 
+ *  It computes the following equation:
     cij = sqrt(1 - sum(sqrt(x_k * y_k)))
  * This distance computation modifies A and B by computing a sqrt
  * and then performing a `pow(x, 2)` to convert it back. Because of this,
@@ -51,29 +51,40 @@ namespace distance {
  * @param[in]       fin_op the final gemm epilogue lambda
  * @param[in]       stream cuda stream to launch work
  */
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          int VecLen, typename FinalLambda, bool isRowMajor>
-static void hellingerImpl(const DataT *x, const DataT *y, IdxT m, IdxT n,
-                          IdxT k, IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput,
-                          FinalLambda fin_op, cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          int VecLen,
+          typename FinalLambda,
+          bool isRowMajor>
+static void hellingerImpl(const DataT* x,
+                          const DataT* y,
+                          IdxT m,
+                          IdxT n,
+                          IdxT k,
+                          IdxT lda,
+                          IdxT ldb,
+                          IdxT ldd,
+                          OutT* dOutput,
+                          FinalLambda fin_op,
+                          cudaStream_t stream)
+{
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
 
-  typedef
-    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
 
   dim3 blk(KPolicy::Nthreads);
 
-  auto unaryOp_lambda = [] __device__(DataT input) {
-    return raft::mySqrt(input);
-  };
+  auto unaryOp_lambda = [] __device__(DataT input) { return raft::mySqrt(input); };
   // First sqrt x and y
   raft::linalg::unaryOp<DataT, decltype(unaryOp_lambda), IdxT>(
-    (DataT *)x, x, m * k, unaryOp_lambda, stream);
+    (DataT*)x, x, m * k, unaryOp_lambda, stream);
 
   if (x != y) {
     raft::linalg::unaryOp<DataT, decltype(unaryOp_lambda), IdxT>(
-      (DataT *)y, y, n * k, unaryOp_lambda, stream);
+      (DataT*)y, y, n * k, unaryOp_lambda, stream);
   }
 
   // Accumulation operation lambda
@@ -84,71 +95,91 @@ static void hellingerImpl(const DataT *x, const DataT *y, IdxT m, IdxT n,
   };
 
   // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [] __device__(
-                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
-                         IdxT gridStrideY) {
+  auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                                     DataT * regxn,
+                                     DataT * regyn,
+                                     IdxT gridStrideX,
+                                     IdxT gridStrideY) {
 #pragma unroll
     for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
 #pragma unroll
       for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
         // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative
-        const auto finalVal = (1 - acc[i][j]);
+        const auto finalVal  = (1 - acc[i][j]);
         const auto rectifier = (!signbit(finalVal));
-        acc[i][j] = raft::mySqrt(rectifier * finalVal);
+        acc[i][j]            = raft::mySqrt(rectifier * finalVal);
       }
     }
   };
 
   if (isRowMajor) {
-    auto hellingerRowMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, true>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
-                                               hellingerRowMajor);
+    auto hellingerRowMajor = pairwiseDistanceMatKernel<false,
+                                                       DataT,
+                                                       AccT,
+                                                       OutT,
+                                                       IdxT,
+                                                       KPolicy,
+                                                       decltype(core_lambda),
+                                                       decltype(epilog_lambda),
+                                                       FinalLambda,
+                                                       true>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, hellingerRowMajor);
 
     hellingerRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-      epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   } else {
-    auto hellingerColMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, false>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
-                                               hellingerColMajor);
+    auto hellingerColMajor = pairwiseDistanceMatKernel<false,
+                                                       DataT,
+                                                       AccT,
+                                                       OutT,
+                                                       IdxT,
+                                                       KPolicy,
+                                                       decltype(core_lambda),
+                                                       decltype(epilog_lambda),
+                                                       FinalLambda,
+                                                       false>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, hellingerColMajor);
     hellingerColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-      epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   }
 
   // Revert sqrt of x and y
   raft::linalg::unaryOp<DataT, decltype(unaryOp_lambda), IdxT>(
-    (DataT *)x, x, m * k, unaryOp_lambda, stream);
+    (DataT*)x, x, m * k, unaryOp_lambda, stream);
   if (x != y) {
     raft::linalg::unaryOp<DataT, decltype(unaryOp_lambda), IdxT>(
-      (DataT *)y, y, n * k, unaryOp_lambda, stream);
+      (DataT*)y, y, n * k, unaryOp_lambda, stream);
   }
 
   CUDA_CHECK(cudaGetLastError());
 }
 
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          typename FinalLambda, bool isRowMajor>
-void hellinger(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
-               const DataT *x, const DataT *y, OutT *dOutput,
-               FinalLambda fin_op, cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          typename FinalLambda,
+          bool isRowMajor>
+void hellinger(IdxT m,
+               IdxT n,
+               IdxT k,
+               IdxT lda,
+               IdxT ldb,
+               IdxT ldd,
+               const DataT* x,
+               const DataT* y,
+               OutT* dOutput,
+               FinalLambda fin_op,
+               cudaStream_t stream)
+{
   size_t bytesA = sizeof(DataT) * lda;
   size_t bytesB = sizeof(DataT) * ldb;
   if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    hellingerImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
-                  isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
-                              stream);
+    hellingerImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
   } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    hellingerImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda,
-                  isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
-                              stream);
+    hellingerImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
   } else {
     hellingerImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
       x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
@@ -157,7 +188,7 @@ void hellinger(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
 
 /**
  * @brief the Hellinger distance matrix calculation
- *  It computes the following equation: 
+ *  It computes the following equation:
     sqrt(1 - sum(sqrt(x_k * y_k))
  * This distance computation modifies A and B by computing a sqrt
  * and then performing a `pow(x, 2)` to convert it back. Because of this,
@@ -179,16 +210,25 @@ void hellinger(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
  * @param stream cuda stream where to launch work
  * @param isRowMajor whether the input and output matrices are row major
  */
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_ = int>
-void hellingerImpl(int m, int n, int k, const InType *pA, const InType *pB,
-                   OutType *pD, FinalLambda fin_op, cudaStream_t stream,
-                   bool isRowMajor) {
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_ = int>
+void hellingerImpl(int m,
+                   int n,
+                   int k,
+                   const InType* pA,
+                   const InType* pB,
+                   OutType* pD,
+                   FinalLambda fin_op,
+                   cudaStream_t stream,
+                   bool isRowMajor)
+{
   typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type
-    hellingerOutType;
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type hellingerOutType;
   Index_ lda, ldb, ldd;
-  hellingerOutType *pDcast = reinterpret_cast<hellingerOutType *>(pD);
+  hellingerOutType* pDcast = reinterpret_cast<hellingerOutType*>(pD);
   if (isRowMajor) {
     lda = k, ldb = k, ldd = n;
     hellinger<InType, AccType, hellingerOutType, Index_, FinalLambda, true>(
diff --git a/cpp/include/raft/distance/l1.cuh b/cpp/include/raft/distance/l1.cuh
index 6ab084f041..268e269391 100644
--- a/cpp/include/raft/distance/l1.cuh
+++ b/cpp/include/raft/distance/l1.cuh
@@ -42,16 +42,29 @@ namespace distance {
  * @param[output]   pD output matrix
  * @param fin_op    the final gemm epilogue lambda
  */
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          int VecLen, typename FinalLambda, bool isRowMajor>
-static void l1Impl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
-                   IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput,
-                   FinalLambda fin_op, cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          int VecLen,
+          typename FinalLambda,
+          bool isRowMajor>
+static void l1Impl(const DataT* x,
+                   const DataT* y,
+                   IdxT m,
+                   IdxT n,
+                   IdxT k,
+                   IdxT lda,
+                   IdxT ldb,
+                   IdxT ldd,
+                   OutT* dOutput,
+                   FinalLambda fin_op,
+                   cudaStream_t stream)
+{
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
 
-  typedef
-    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
 
   dim3 blk(KPolicy::Nthreads);
 
@@ -62,47 +75,69 @@ static void l1Impl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
   };
 
   // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [] __device__(
-                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
-                         IdxT gridStrideY) { return; };
+  auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                                     DataT * regxn,
+                                     DataT * regyn,
+                                     IdxT gridStrideX,
+                                     IdxT gridStrideY) { return; };
 
   if (isRowMajor) {
-    auto l1RowMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, true>;
-    dim3 grid =
-      launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, l1RowMajor);
+    auto l1RowMajor = pairwiseDistanceMatKernel<false,
+                                                DataT,
+                                                AccT,
+                                                OutT,
+                                                IdxT,
+                                                KPolicy,
+                                                decltype(core_lambda),
+                                                decltype(epilog_lambda),
+                                                FinalLambda,
+                                                true>;
+    dim3 grid       = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, l1RowMajor);
 
     l1RowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-      epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   } else {
-    auto l1ColMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, false>;
-    dim3 grid =
-      launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, l1ColMajor);
+    auto l1ColMajor = pairwiseDistanceMatKernel<false,
+                                                DataT,
+                                                AccT,
+                                                OutT,
+                                                IdxT,
+                                                KPolicy,
+                                                decltype(core_lambda),
+                                                decltype(epilog_lambda),
+                                                FinalLambda,
+                                                false>;
+    dim3 grid       = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, l1ColMajor);
     l1ColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-      epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   }
 
   CUDA_CHECK(cudaGetLastError());
 }
 
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          typename FinalLambda, bool isRowMajor>
-void l1(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, const DataT *x,
-        const DataT *y, OutT *dOutput, FinalLambda fin_op,
-        cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          typename FinalLambda,
+          bool isRowMajor>
+void l1(IdxT m,
+        IdxT n,
+        IdxT k,
+        IdxT lda,
+        IdxT ldb,
+        IdxT ldd,
+        const DataT* x,
+        const DataT* y,
+        OutT* dOutput,
+        FinalLambda fin_op,
+        cudaStream_t stream)
+{
   size_t bytesA = sizeof(DataT) * lda;
   size_t bytesB = sizeof(DataT) * ldb;
   if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    l1Impl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
-           isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
+    l1Impl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
   } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
     l1Impl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
       x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
@@ -130,16 +165,25 @@ void l1(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, const DataT *x,
  * @param stream cuda stream where to launch work
  * @param isRowMajor whether the input and output matrices are row major
  */
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_ = int>
-void l1Impl(int m, int n, int k, const InType *pA, const InType *pB,
-            OutType *pD, FinalLambda fin_op, cudaStream_t stream,
-            bool isRowMajor) {
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_ = int>
+void l1Impl(int m,
+            int n,
+            int k,
+            const InType* pA,
+            const InType* pB,
+            OutType* pD,
+            FinalLambda fin_op,
+            cudaStream_t stream,
+            bool isRowMajor)
+{
   typedef std::is_same<OutType, bool> is_bool;
-  typedef
-    typename std::conditional<is_bool::value, OutType, AccType>::type L1OutType;
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type L1OutType;
   Index_ lda, ldb, ldd;
-  L1OutType *pDcast = reinterpret_cast<L1OutType *>(pD);
+  L1OutType* pDcast = reinterpret_cast<L1OutType*>(pD);
   if (isRowMajor) {
     lda = k, ldb = k, ldd = n;
     l1<InType, AccType, L1OutType, Index_, FinalLambda, true>(
diff --git a/cpp/include/raft/distance/minkowski.cuh b/cpp/include/raft/distance/minkowski.cuh
index 803f5fc78a..c021954f32 100644
--- a/cpp/include/raft/distance/minkowski.cuh
+++ b/cpp/include/raft/distance/minkowski.cuh
@@ -21,7 +21,7 @@ namespace raft {
 namespace distance {
 
 /**
- * @brief the unexpanded Minkowski distance matrix calculation 
+ * @brief the unexpanded Minkowski distance matrix calculation
  *  It computes the following equation: cij = sum(|x - y|^p)^(1/p)
  * @tparam DataT          input data-type (for A and B matrices)
  * @tparam AccT           accumulation data-type
@@ -44,16 +44,30 @@ namespace distance {
  * @param[in]       stream cuda stream to launch work
  * @param[in]       the value of `p` for Minkowski (l-p) distances.
  */
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          int VecLen, typename FinalLambda, bool isRowMajor>
-void minkowskiUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
-                        IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput,
-                        FinalLambda fin_op, cudaStream_t stream, DataT p) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          int VecLen,
+          typename FinalLambda,
+          bool isRowMajor>
+void minkowskiUnExpImpl(const DataT* x,
+                        const DataT* y,
+                        IdxT m,
+                        IdxT n,
+                        IdxT k,
+                        IdxT lda,
+                        IdxT ldb,
+                        IdxT ldd,
+                        OutT* dOutput,
+                        FinalLambda fin_op,
+                        cudaStream_t stream,
+                        DataT p)
+{
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
 
-  typedef
-    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
 
   dim3 blk(KPolicy::Nthreads);
 
@@ -64,10 +78,11 @@ void minkowskiUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
   };
 
   // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [p] __device__(
-                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
-                         IdxT gridStrideY) {
+  auto epilog_lambda = [p] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                                      DataT * regxn,
+                                      DataT * regyn,
+                                      IdxT gridStrideX,
+                                      IdxT gridStrideY) {
     const auto one_over_p = 1.0f / p;
 #pragma unroll
     for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
@@ -79,48 +94,68 @@ void minkowskiUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
   };
 
   if (isRowMajor) {
-    auto minkowskiUnExpRowMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, true>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
-                                               minkowskiUnExpRowMajor);
+    auto minkowskiUnExpRowMajor = pairwiseDistanceMatKernel<false,
+                                                            DataT,
+                                                            AccT,
+                                                            OutT,
+                                                            IdxT,
+                                                            KPolicy,
+                                                            decltype(core_lambda),
+                                                            decltype(epilog_lambda),
+                                                            FinalLambda,
+                                                            true>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, minkowskiUnExpRowMajor);
 
     minkowskiUnExpRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-      epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
 
   } else {
-    auto minkowskiUnExpColMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, false>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
-                                               minkowskiUnExpColMajor);
+    auto minkowskiUnExpColMajor = pairwiseDistanceMatKernel<false,
+                                                            DataT,
+                                                            AccT,
+                                                            OutT,
+                                                            IdxT,
+                                                            KPolicy,
+                                                            decltype(core_lambda),
+                                                            decltype(epilog_lambda),
+                                                            FinalLambda,
+                                                            false>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, minkowskiUnExpColMajor);
 
     minkowskiUnExpColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-      epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   }
 
   CUDA_CHECK(cudaGetLastError());
 }
 
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          typename FinalLambda, bool isRowMajor>
-void minkowskiUnExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
-                    const DataT *x, const DataT *y, OutT *dOutput,
-                    FinalLambda fin_op, cudaStream_t stream, DataT metric_arg) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          typename FinalLambda,
+          bool isRowMajor>
+void minkowskiUnExp(IdxT m,
+                    IdxT n,
+                    IdxT k,
+                    IdxT lda,
+                    IdxT ldb,
+                    IdxT ldd,
+                    const DataT* x,
+                    const DataT* y,
+                    OutT* dOutput,
+                    FinalLambda fin_op,
+                    cudaStream_t stream,
+                    DataT metric_arg)
+{
   size_t bytesA = sizeof(DataT) * lda;
   size_t bytesB = sizeof(DataT) * ldb;
   if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    minkowskiUnExpImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
-                       isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput,
-                                   fin_op, stream, metric_arg);
+    minkowskiUnExpImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg);
   } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    minkowskiUnExpImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda,
-                       isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput,
-                                   fin_op, stream, metric_arg);
+    minkowskiUnExpImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg);
   } else {
     minkowskiUnExpImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
       x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg);
@@ -146,15 +181,25 @@ void minkowskiUnExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
  * @param[in] isRowMajor whether the input and output matrices are row major
  * @param[in] metric_arg the value of `p` for Minkowski (l-p) distances.
  */
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_ = int>
-void minkowskiImpl(Index_ m, Index_ n, Index_ k, const InType *pA,
-                   const InType *pB, OutType *pD, FinalLambda fin_op,
-                   cudaStream_t stream, bool isRowMajor, InType metric_arg) {
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_ = int>
+void minkowskiImpl(Index_ m,
+                   Index_ n,
+                   Index_ k,
+                   const InType* pA,
+                   const InType* pB,
+                   OutType* pD,
+                   FinalLambda fin_op,
+                   cudaStream_t stream,
+                   bool isRowMajor,
+                   InType metric_arg)
+{
   typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type
-    LpUnexpOutType;
-  LpUnexpOutType *pDcast = reinterpret_cast<LpUnexpOutType *>(pD);
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type LpUnexpOutType;
+  LpUnexpOutType* pDcast = reinterpret_cast<LpUnexpOutType*>(pD);
   Index_ lda, ldb, ldd;
 
   if (isRowMajor) {
diff --git a/cpp/include/raft/distance/pairwise_distance_base.cuh b/cpp/include/raft/distance/pairwise_distance_base.cuh
index 43abc9eb65..3db4dc0131 100644
--- a/cpp/include/raft/distance/pairwise_distance_base.cuh
+++ b/cpp/include/raft/distance/pairwise_distance_base.cuh
@@ -31,11 +31,11 @@ namespace distance {
  * @tparam OutT           output data-type (for C and D matrices)
  * @tparam IdxT           index data-type
  * @tparam Policy         struct which tunes the Contraction kernel
- * @tparam CoreLambda     tells how to accumulate an x and y into 
+ * @tparam CoreLambda     tells how to accumulate an x and y into
                           acc. its signature:
     template <typename AccT, typename DataT> void core_lambda(AccT& acc,
       const DataT& x, const DataT& y)
- * @tparam EpilogueLambda applies an elementwise function to compute final 
+ * @tparam EpilogueLambda applies an elementwise function to compute final
     values. Its signature is:
     template <typename AccT, typename DataT> void epilogue_lambda
     (AccT acc[][], DataT* regxn, DataT* regyn);
@@ -57,13 +57,19 @@ namespace distance {
  * @param fin_op the final gemm epilogue lambda
  */
 
-template <bool useNorms, typename DataT, typename AccT, typename OutT,
-          typename IdxT, typename Policy, typename CoreLambda,
-          typename EpilogueLambda, typename FinalLambda,
-          typename rowEpilogueLambda, bool isRowMajor = true,
-          bool writeOut = true,
-          typename BaseClass =
-            raft::linalg::Contractions_NT<DataT, IdxT, Policy, isRowMajor>>
+template <bool useNorms,
+          typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          typename Policy,
+          typename CoreLambda,
+          typename EpilogueLambda,
+          typename FinalLambda,
+          typename rowEpilogueLambda,
+          bool isRowMajor    = true,
+          bool writeOut      = true,
+          typename BaseClass = raft::linalg::Contractions_NT<DataT, IdxT, Policy, isRowMajor>>
 struct PairwiseDistances : public BaseClass {
  private:
   typedef Policy P;
@@ -81,11 +87,21 @@ struct PairwiseDistances : public BaseClass {
 
  public:
   // Constructor
-  DI PairwiseDistances(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n,
-                       IdxT _k, IdxT _lda, IdxT _ldb, IdxT _ldd,
-                       const DataT* _xn, const DataT* _yn, OutT* _dOutput,
-                       char* _smem, CoreLambda _core_op,
-                       EpilogueLambda _epilog_op, FinalLambda _fin_op,
+  DI PairwiseDistances(const DataT* _x,
+                       const DataT* _y,
+                       IdxT _m,
+                       IdxT _n,
+                       IdxT _k,
+                       IdxT _lda,
+                       IdxT _ldb,
+                       IdxT _ldd,
+                       const DataT* _xn,
+                       const DataT* _yn,
+                       OutT* _dOutput,
+                       char* _smem,
+                       CoreLambda _core_op,
+                       EpilogueLambda _epilog_op,
+                       FinalLambda _fin_op,
                        rowEpilogueLambda _rowEpilog_op)
     : BaseClass(_x, _y, _m, _n, _k, _lda, _ldb, _ldd, _smem),
       xn(_xn),
@@ -96,9 +112,12 @@ struct PairwiseDistances : public BaseClass {
       core_op(_core_op),
       epilog_op(_epilog_op),
       fin_op(_fin_op),
-      rowEpilog_op(_rowEpilog_op) {}
+      rowEpilog_op(_rowEpilog_op)
+  {
+  }
 
-  DI void run() {
+  DI void run()
+  {
     for (auto gridStrideY = blockIdx.y * P::Mblk; gridStrideY < this->m;
          gridStrideY += P::Mblk * gridDim.y) {
       for (auto gridStrideX = blockIdx.x * P::Nblk; gridStrideX < this->n;
@@ -112,7 +131,8 @@ struct PairwiseDistances : public BaseClass {
   }
 
  private:
-  DI void updateIndicesY() {
+  DI void updateIndicesY()
+  {
     const auto stride = P::Nblk * gridDim.x;
     if (isRowMajor) {
       this->y += stride * this->ldb;
@@ -122,21 +142,23 @@ struct PairwiseDistances : public BaseClass {
     this->yrowid += stride;
   }
 
-  DI void updateIndicesXY() {
+  DI void updateIndicesXY()
+  {
     const auto stride = P::Mblk * gridDim.y;
     if (isRowMajor) {
       this->x += stride * this->lda;
       this->yrowid = IdxT(blockIdx.x) * P::Nblk + this->srowid;
-      this->y = yBase + this->yrowid * this->ldb;
+      this->y      = yBase + this->yrowid * this->ldb;
     } else {
       this->x += stride;
       this->yrowid = IdxT(blockIdx.x) * P::Nblk;
-      this->y = yBase + this->yrowid + this->srowid * this->ldb;
+      this->y      = yBase + this->yrowid + this->srowid * this->ldb;
     }
     this->xrowid += stride;
   }
 
-  DI void ldgNextGridStride(IdxT gridStrideX, IdxT gridStrideY) {
+  DI void ldgNextGridStride(IdxT gridStrideX, IdxT gridStrideY)
+  {
     // Fetch next grid stride ldg if within range
     if ((gridStrideX + gridDim.x * P::Nblk) < this->n) {
       updateIndicesY();
@@ -147,10 +169,9 @@ struct PairwiseDistances : public BaseClass {
     }
   }
 
-  DI void prolog(IdxT gridStrideX, IdxT gridStrideY) {
-    if (gridStrideX == blockIdx.x * P::Nblk) {
-      this->ldgXY(0);
-    }
+  DI void prolog(IdxT gridStrideX, IdxT gridStrideY)
+  {
+    if (gridStrideX == blockIdx.x * P::Nblk) { this->ldgXY(0); }
 
 #pragma unroll
     for (int i = 0; i < P::AccRowsPerTh; ++i) {
@@ -165,7 +186,8 @@ struct PairwiseDistances : public BaseClass {
     this->pageWr ^= 1;
   }
 
-  DI void loop() {
+  DI void loop()
+  {
     for (int kidx = P::Kblk; kidx < this->k; kidx += P::Kblk) {
       this->ldgXY(kidx);
       accumulate();  // on the previous k-block
@@ -182,7 +204,8 @@ struct PairwiseDistances : public BaseClass {
     this->pageRd ^= 1;
   }
 
-  DI void accumulate() {
+  DI void accumulate()
+  {
 #pragma unroll
     for (int ki = 0; ki < P::Kblk; ki += P::Veclen) {
       this->ldsXY(ki);
@@ -199,7 +222,8 @@ struct PairwiseDistances : public BaseClass {
     }
   }
 
-  DI void epilog(IdxT gridStrideX, IdxT gridStrideY) {
+  DI void epilog(IdxT gridStrideX, IdxT gridStrideY)
+  {
     if (useNorms) {
       DataT* sxNorm = (DataT*)(&smem[P::SmemSize]);
       DataT* syNorm = (&sxNorm[P::Mblk]);
@@ -207,13 +231,13 @@ struct PairwiseDistances : public BaseClass {
       // Load x & y norms required by this threadblock in shmem buffer
       if (gridStrideX == blockIdx.x * P::Nblk) {
         for (int i = threadIdx.x; i < P::Mblk; i += P::Nthreads) {
-          auto idx = gridStrideY + i;
+          auto idx  = gridStrideY + i;
           sxNorm[i] = idx < this->m ? xn[idx] : 0;
         }
       }
 
       for (int i = threadIdx.x; i < P::Nblk; i += P::Nthreads) {
-        auto idx = gridStrideX + i;
+        auto idx  = gridStrideX + i;
         syNorm[i] = idx < this->n ? yn[idx] : 0;
       }
 
@@ -288,42 +312,67 @@ struct PairwiseDistances : public BaseClass {
  * @param fin_op    the final gemm epilogue lambda
  */
 
-template <bool useNorms, typename DataT, typename AccT, typename OutT,
-          typename IdxT, typename Policy, typename CoreLambda,
-          typename EpilogueLambda, typename FinalLambda, bool isRowMajor = true,
-          bool writeOut = true>
-__global__ __launch_bounds__(
-  Policy::Nthreads,
-  2) void pairwiseDistanceMatKernel(const DataT* x, const DataT* y,
-                                    const DataT* _xn, const DataT* _yn, IdxT m,
-                                    IdxT n, IdxT k, IdxT lda, IdxT ldb,
-                                    IdxT ldd, OutT* dOutput, CoreLambda core_op,
-                                    EpilogueLambda epilog_op,
-                                    FinalLambda fin_op) {
+template <bool useNorms,
+          typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          typename Policy,
+          typename CoreLambda,
+          typename EpilogueLambda,
+          typename FinalLambda,
+          bool isRowMajor = true,
+          bool writeOut   = true>
+__global__ __launch_bounds__(Policy::Nthreads,
+                             2) void pairwiseDistanceMatKernel(const DataT* x,
+                                                               const DataT* y,
+                                                               const DataT* _xn,
+                                                               const DataT* _yn,
+                                                               IdxT m,
+                                                               IdxT n,
+                                                               IdxT k,
+                                                               IdxT lda,
+                                                               IdxT ldb,
+                                                               IdxT ldd,
+                                                               OutT* dOutput,
+                                                               CoreLambda core_op,
+                                                               EpilogueLambda epilog_op,
+                                                               FinalLambda fin_op)
+{
   extern __shared__ char smem[];
   auto rowEpilog = [] __device__(IdxT starty) { return; };
 
-  PairwiseDistances<useNorms, DataT, AccT, OutT, IdxT, Policy, CoreLambda,
-                    EpilogueLambda, FinalLambda, decltype(rowEpilog),
-                    isRowMajor, writeOut>
-    obj(x, y, m, n, k, lda, ldb, ldd, _xn, _yn, dOutput, smem, core_op,
-        epilog_op, fin_op, rowEpilog);
+  PairwiseDistances<useNorms,
+                    DataT,
+                    AccT,
+                    OutT,
+                    IdxT,
+                    Policy,
+                    CoreLambda,
+                    EpilogueLambda,
+                    FinalLambda,
+                    decltype(rowEpilog),
+                    isRowMajor,
+                    writeOut>
+    obj(
+      x, y, m, n, k, lda, ldb, ldd, _xn, _yn, dOutput, smem, core_op, epilog_op, fin_op, rowEpilog);
   obj.run();
 }
 
 template <typename P, typename IdxT, typename T>
-dim3 launchConfigGenerator(IdxT m, IdxT n, size_t sMemSize, T func) {
-  const auto numSMs = raft::getMultiProcessorCount();
+dim3 launchConfigGenerator(IdxT m, IdxT n, size_t sMemSize, T func)
+{
+  const auto numSMs  = raft::getMultiProcessorCount();
   int numBlocksPerSm = 0;
   dim3 grid;
 
-  CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-    &numBlocksPerSm, func, P::Nthreads, sMemSize));
+  CUDA_CHECK(
+    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, func, P::Nthreads, sMemSize));
   int minGridSize = numSMs * numBlocksPerSm;
-  int yChunks = raft::ceildiv<int>(m, P::Mblk);
-  int xChunks = raft::ceildiv<int>(n, P::Nblk);
-  grid.y = yChunks > minGridSize ? minGridSize : yChunks;
-  grid.x = (minGridSize - grid.y) <= 0 ? 1 : xChunks;
+  int yChunks     = raft::ceildiv<int>(m, P::Mblk);
+  int xChunks     = raft::ceildiv<int>(n, P::Nblk);
+  grid.y          = yChunks > minGridSize ? minGridSize : yChunks;
+  grid.x          = (minGridSize - grid.y) <= 0 ? 1 : xChunks;
   if (grid.x != 1) {
     int i = 1;
     while (grid.y * i < minGridSize) {
diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp
index c62f2e5f79..773b83ab13 100644
--- a/cpp/include/raft/error.hpp
+++ b/cpp/include/raft/error.hpp
@@ -31,14 +31,14 @@ class exception : public std::exception {
   explicit exception() noexcept : std::exception(), msg_() {}
 
   /** copy ctor */
-  exception(exception const& src) noexcept
-    : std::exception(), msg_(src.what()) {
+  exception(exception const& src) noexcept : std::exception(), msg_(src.what())
+  {
     collect_call_stack();
   }
 
   /** ctor from an input message */
-  explicit exception(std::string const msg) noexcept
-    : std::exception(), msg_(std::move(msg)) {
+  explicit exception(std::string const msg) noexcept : std::exception(), msg_(std::move(msg))
+  {
     collect_call_stack();
   }
 
@@ -51,7 +51,8 @@ class exception : public std::exception {
 
   /** append call stack info to this exception's message for ease of debug */
   // Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html
-  void collect_call_stack() noexcept {
+  void collect_call_stack() noexcept
+  {
 #ifdef __GNUC__
     constexpr int kMaxStackDepth = 64;
     void* stack[kMaxStackDepth];  // NOLINT
@@ -90,16 +91,16 @@ struct logic_error : public raft::exception {
 
 // FIXME: Need to be replaced with RAFT_FAIL
 /** macro to throw a runtime error */
-#define THROW(fmt, ...)                                                        \
-  do {                                                                         \
-    std::string msg;                                                           \
-    char errMsg[2048]; /* NOLINT */                                            \
-    std::snprintf(errMsg, sizeof(errMsg),                                      \
-                  "exception occured! file=%s line=%d: ", __FILE__, __LINE__); \
-    msg += errMsg;                                                             \
-    std::snprintf(errMsg, sizeof(errMsg), fmt, ##__VA_ARGS__);                 \
-    msg += errMsg;                                                             \
-    throw raft::exception(msg);                                                \
+#define THROW(fmt, ...)                                                                    \
+  do {                                                                                     \
+    std::string msg;                                                                       \
+    char errMsg[2048]; /* NOLINT */                                                        \
+    std::snprintf(                                                                         \
+      errMsg, sizeof(errMsg), "exception occured! file=%s line=%d: ", __FILE__, __LINE__); \
+    msg += errMsg;                                                                         \
+    std::snprintf(errMsg, sizeof(errMsg), fmt, ##__VA_ARGS__);                             \
+    msg += errMsg;                                                                         \
+    throw raft::exception(msg);                                                            \
   } while (0)
 
 // FIXME: Need to be replaced with RAFT_EXPECTS
@@ -109,16 +110,15 @@ struct logic_error : public raft::exception {
     if (!(check)) THROW(fmt, ##__VA_ARGS__); \
   } while (0)
 
-#define SET_ERROR_MSG(msg, location_prefix, fmt, ...)                      \
-  do {                                                                     \
-    char err_msg[2048]; /* NOLINT */                                       \
-    std::snprintf(err_msg, sizeof(err_msg), location_prefix);              \
-    msg += err_msg;                                                        \
-    std::snprintf(err_msg, sizeof(err_msg), "file=%s line=%d: ", __FILE__, \
-                  __LINE__);                                               \
-    msg += err_msg;                                                        \
-    std::snprintf(err_msg, sizeof(err_msg), fmt, ##__VA_ARGS__);           \
-    msg += err_msg;                                                        \
+#define SET_ERROR_MSG(msg, location_prefix, fmt, ...)                                 \
+  do {                                                                                \
+    char err_msg[2048]; /* NOLINT */                                                  \
+    std::snprintf(err_msg, sizeof(err_msg), location_prefix);                         \
+    msg += err_msg;                                                                   \
+    std::snprintf(err_msg, sizeof(err_msg), "file=%s line=%d: ", __FILE__, __LINE__); \
+    msg += err_msg;                                                                   \
+    std::snprintf(err_msg, sizeof(err_msg), fmt, ##__VA_ARGS__);                      \
+    msg += err_msg;                                                                   \
   } while (0)
 
 /**
diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp
index dbe7e83189..bb7d22e079 100644
--- a/cpp/include/raft/handle.hpp
+++ b/cpp/include/raft/handle.hpp
@@ -65,29 +65,29 @@ class handle_t {
       }()),
       streams_(n_streams),
       device_allocator_(std::make_shared<mr::device::default_allocator>()),
-      host_allocator_(std::make_shared<mr::host::default_allocator>()) {
+      host_allocator_(std::make_shared<mr::host::default_allocator>())
+  {
     create_resources();
   }
 
   /**
-   * @brief Construct a light handle copy from another 
+   * @brief Construct a light handle copy from another
    * user stream, cuda handles, comms and worker pool are not copied
-   * The user_stream of the returned handle is set to the specified stream 
-   * of the other handle worker pool 
-   * @param[in] stream_id stream id in `other` worker streams 
+   * The user_stream of the returned handle is set to the specified stream
+   * of the other handle worker pool
+   * @param[in] stream_id stream id in `other` worker streams
    * to be set as user stream in the constructed handle
    * @param[in] n_streams number worker streams to be created
    */
-  handle_t(const handle_t& other, int stream_id,
-           int n_streams = kNumDefaultWorkerStreams)
-    : dev_id_(other.get_device()), streams_(n_streams) {
-    RAFT_EXPECTS(
-      other.get_num_internal_streams() > 0,
-      "ERROR: the main handle must have at least one worker stream\n");
-    prop_ = other.get_device_properties();
+  handle_t(const handle_t& other, int stream_id, int n_streams = kNumDefaultWorkerStreams)
+    : dev_id_(other.get_device()), streams_(n_streams)
+  {
+    RAFT_EXPECTS(other.get_num_internal_streams() > 0,
+                 "ERROR: the main handle must have at least one worker stream\n");
+    prop_                    = other.get_device_properties();
     device_prop_initialized_ = true;
-    device_allocator_ = other.get_device_allocator();
-    host_allocator_ = other.get_host_allocator();
+    device_allocator_        = other.get_device_allocator();
+    host_allocator_          = other.get_host_allocator();
     create_resources();
     set_stream(other.get_internal_stream(stream_id));
   }
@@ -99,25 +99,22 @@ class handle_t {
 
   void set_stream(cudaStream_t stream) { user_stream_ = stream; }
   cudaStream_t get_stream() const { return user_stream_; }
-  rmm::cuda_stream_view get_stream_view() const {
-    return rmm::cuda_stream_view(user_stream_);
-  }
+  rmm::cuda_stream_view get_stream_view() const { return rmm::cuda_stream_view(user_stream_); }
 
-  void set_device_allocator(std::shared_ptr<mr::device::allocator> allocator) {
+  void set_device_allocator(std::shared_ptr<mr::device::allocator> allocator)
+  {
     device_allocator_ = allocator;
   }
-  std::shared_ptr<mr::device::allocator> get_device_allocator() const {
-    return device_allocator_;
-  }
+  std::shared_ptr<mr::device::allocator> get_device_allocator() const { return device_allocator_; }
 
-  void set_host_allocator(std::shared_ptr<mr::host::allocator> allocator) {
+  void set_host_allocator(std::shared_ptr<mr::host::allocator> allocator)
+  {
     host_allocator_ = allocator;
   }
-  std::shared_ptr<mr::host::allocator> get_host_allocator() const {
-    return host_allocator_;
-  }
+  std::shared_ptr<mr::host::allocator> get_host_allocator() const { return host_allocator_; }
 
-  cublasHandle_t get_cublas_handle() const {
+  cublasHandle_t get_cublas_handle() const
+  {
     std::lock_guard<std::mutex> _(mutex_);
     if (!cublas_initialized_) {
       CUBLAS_CHECK(cublasCreate(&cublas_handle_));
@@ -126,7 +123,8 @@ class handle_t {
     return cublas_handle_;
   }
 
-  cusolverDnHandle_t get_cusolver_dn_handle() const {
+  cusolverDnHandle_t get_cusolver_dn_handle() const
+  {
     std::lock_guard<std::mutex> _(mutex_);
     if (!cusolver_dn_initialized_) {
       CUSOLVER_CHECK(cusolverDnCreate(&cusolver_dn_handle_));
@@ -135,7 +133,8 @@ class handle_t {
     return cusolver_dn_handle_;
   }
 
-  cusolverSpHandle_t get_cusolver_sp_handle() const {
+  cusolverSpHandle_t get_cusolver_sp_handle() const
+  {
     std::lock_guard<std::mutex> _(mutex_);
     if (!cusolver_sp_initialized_) {
       CUSOLVER_CHECK(cusolverSpCreate(&cusolver_sp_handle_));
@@ -144,7 +143,8 @@ class handle_t {
     return cusolver_sp_handle_;
   }
 
-  cusparseHandle_t get_cusparse_handle() const {
+  cusparseHandle_t get_cusparse_handle() const
+  {
     std::lock_guard<std::mutex> _(mutex_);
     if (!cusparse_initialized_) {
       CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_));
@@ -154,16 +154,13 @@ class handle_t {
   }
 
   // legacy compatibility for cuML
-  cudaStream_t get_internal_stream(int sid) const {
-    return streams_.get_stream(sid).value();
-  }
+  cudaStream_t get_internal_stream(int sid) const { return streams_.get_stream(sid).value(); }
   // new accessor return rmm::cuda_stream_view
-  rmm::cuda_stream_view get_internal_stream_view(int sid) const {
-    return streams_.get_stream(sid);
-  }
+  rmm::cuda_stream_view get_internal_stream_view(int sid) const { return streams_.get_stream(sid); }
 
   int get_num_internal_streams() const { return streams_.get_pool_size(); }
-  std::vector<cudaStream_t> get_internal_streams() const {
+  std::vector<cudaStream_t> get_internal_streams() const
+  {
     std::vector<cudaStream_t> int_streams_vec;
     for (int i = 0; i < get_num_internal_streams(); i++) {
       int_streams_vec.push_back(get_internal_stream(i));
@@ -171,49 +168,51 @@ class handle_t {
     return int_streams_vec;
   }
 
-  void wait_on_user_stream() const {
+  void wait_on_user_stream() const
+  {
     CUDA_CHECK(cudaEventRecord(event_, user_stream_));
     for (int i = 0; i < get_num_internal_streams(); i++) {
       CUDA_CHECK(cudaStreamWaitEvent(get_internal_stream(i), event_, 0));
     }
   }
 
-  void wait_on_internal_streams() const {
+  void wait_on_internal_streams() const
+  {
     for (int i = 0; i < get_num_internal_streams(); i++) {
       CUDA_CHECK(cudaEventRecord(event_, get_internal_stream(i)));
       CUDA_CHECK(cudaStreamWaitEvent(user_stream_, event_, 0));
     }
   }
 
-  void set_comms(std::shared_ptr<comms::comms_t> communicator) {
-    communicator_ = communicator;
-  }
+  void set_comms(std::shared_ptr<comms::comms_t> communicator) { communicator_ = communicator; }
 
-  const comms::comms_t& get_comms() const {
-    RAFT_EXPECTS(this->comms_initialized(),
-                 "ERROR: Communicator was not initialized\n");
+  const comms::comms_t& get_comms() const
+  {
+    RAFT_EXPECTS(this->comms_initialized(), "ERROR: Communicator was not initialized\n");
     return *communicator_;
   }
 
-  void set_subcomm(std::string key, std::shared_ptr<comms::comms_t> subcomm) {
+  void set_subcomm(std::string key, std::shared_ptr<comms::comms_t> subcomm)
+  {
     subcomms_[key] = subcomm;
   }
 
-  const comms::comms_t& get_subcomm(std::string key) const {
-    RAFT_EXPECTS(subcomms_.find(key) != subcomms_.end(),
-                 "%s was not found in subcommunicators.", key.c_str());
+  const comms::comms_t& get_subcomm(std::string key) const
+  {
+    RAFT_EXPECTS(
+      subcomms_.find(key) != subcomms_.end(), "%s was not found in subcommunicators.", key.c_str());
 
     auto subcomm = subcomms_.at(key);
 
-    RAFT_EXPECTS(nullptr != subcomm.get(),
-                 "ERROR: Subcommunicator was not initialized");
+    RAFT_EXPECTS(nullptr != subcomm.get(), "ERROR: Subcommunicator was not initialized");
 
     return *subcomm;
   }
 
   bool comms_initialized() const { return (nullptr != communicator_.get()); }
 
-  const cudaDeviceProp& get_device_properties() const {
+  const cudaDeviceProp& get_device_properties() const
+  {
     std::lock_guard<std::mutex> _(mutex_);
     if (!device_prop_initialized_) {
       CUDA_CHECK(cudaGetDeviceProperties(&prop_, dev_id_));
@@ -244,29 +243,28 @@ class handle_t {
   mutable bool device_prop_initialized_{false};
   mutable std::mutex mutex_;
 
-  void create_resources() {
-    CUDA_CHECK(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
-  }
+  void create_resources() { CUDA_CHECK(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); }
 
-  void destroy_resources() {
+  void destroy_resources()
+  {
     ///@todo: enable *_NO_THROW variants once we have enabled logging
     if (cusparse_initialized_) {
-      //CUSPARSE_CHECK_NO_THROW(cusparseDestroy(cusparse_handle_));
+      // CUSPARSE_CHECK_NO_THROW(cusparseDestroy(cusparse_handle_));
       CUSPARSE_CHECK(cusparseDestroy(cusparse_handle_));
     }
     if (cusolver_dn_initialized_) {
-      //CUSOLVER_CHECK_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_));
+      // CUSOLVER_CHECK_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_));
       CUSOLVER_CHECK(cusolverDnDestroy(cusolver_dn_handle_));
     }
     if (cusolver_sp_initialized_) {
-      //CUSOLVER_CHECK_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_));
+      // CUSOLVER_CHECK_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_));
       CUSOLVER_CHECK(cusolverSpDestroy(cusolver_sp_handle_));
     }
     if (cublas_initialized_) {
-      //CUBLAS_CHECK_NO_THROW(cublasDestroy(cublas_handle_));
+      // CUBLAS_CHECK_NO_THROW(cublasDestroy(cublas_handle_));
       CUBLAS_CHECK(cublasDestroy(cublas_handle_));
     }
-    //CUDA_CHECK_NO_THROW(cudaEventDestroy(event_));
+    // CUDA_CHECK_NO_THROW(cudaEventDestroy(event_));
     CUDA_CHECK(cudaEventDestroy(event_));
   }
 };  // class handle_t
@@ -276,7 +274,8 @@ class handle_t {
  */
 class stream_syncer {
  public:
-  explicit stream_syncer(const handle_t& handle) : handle_(handle) {
+  explicit stream_syncer(const handle_t& handle) : handle_(handle)
+  {
     handle_.wait_on_user_stream();
   }
   ~stream_syncer() { handle_.wait_on_internal_streams(); }
diff --git a/cpp/include/raft/integer_utils.h b/cpp/include/raft/integer_utils.h
index a7cfb9287b..5fc56de14b 100644
--- a/cpp/include/raft/integer_utils.h
+++ b/cpp/include/raft/integer_utils.h
@@ -34,15 +34,13 @@ namespace raft {
  * `modulus` is positive.
  */
 template <typename S>
-inline S round_up_safe(S number_to_round, S modulus) {
+inline S round_up_safe(S number_to_round, S modulus)
+{
   auto remainder = number_to_round % modulus;
-  if (remainder == 0) {
-    return number_to_round;
-  }
+  if (remainder == 0) { return number_to_round; }
   auto rounded_up = number_to_round - remainder + modulus;
   if (rounded_up < number_to_round) {
-    throw std::invalid_argument(
-      "Attempt to round up beyond the type's maximum value");
+    throw std::invalid_argument("Attempt to round up beyond the type's maximum value");
   }
   return rounded_up;
 }
@@ -53,8 +51,9 @@ inline S round_up_safe(S number_to_round, S modulus) {
  * `modulus` is positive.
  */
 template <typename S>
-inline S round_down_safe(S number_to_round, S modulus) {
-  auto remainder = number_to_round % modulus;
+inline S round_down_safe(S number_to_round, S modulus)
+{
+  auto remainder    = number_to_round % modulus;
   auto rounded_down = number_to_round - remainder;
   return rounded_down;
 }
@@ -72,25 +71,28 @@ inline S round_down_safe(S number_to_round, S modulus) {
  * the result will be incorrect
  */
 template <typename S, typename T>
-constexpr inline S div_rounding_up_unsafe(const S& dividend,
-                                          const T& divisor) noexcept {
+constexpr inline S div_rounding_up_unsafe(const S& dividend, const T& divisor) noexcept
+{
   return (dividend + divisor - 1) / divisor;
 }
 
 namespace detail {
 template <typename I>
 constexpr inline I div_rounding_up_safe(std::integral_constant<bool, false>,
-                                        I dividend, I divisor) noexcept {
+                                        I dividend,
+                                        I divisor) noexcept
+{
   // TODO: This could probably be implemented faster
-  return (dividend > divisor)
-           ? 1 + div_rounding_up_unsafe(dividend - divisor, divisor)
-           : (dividend > 0);
+  return (dividend > divisor) ? 1 + div_rounding_up_unsafe(dividend - divisor, divisor)
+                              : (dividend > 0);
 }
 
 template <typename I>
 constexpr inline I div_rounding_up_safe(std::integral_constant<bool, true>,
-                                        I dividend, I divisor) noexcept {
-  auto quotient = dividend / divisor;
+                                        I dividend,
+                                        I divisor) noexcept
+{
+  auto quotient  = dividend / divisor;
   auto remainder = dividend % divisor;
   return quotient + (remainder != 0);
 }
@@ -110,16 +112,17 @@ constexpr inline I div_rounding_up_safe(std::integral_constant<bool, true>,
  * approach of using (dividend + divisor - 1) / divisor
  */
 template <typename I>
-constexpr inline std::enable_if_t<std::is_integral<I>::value, I>
-div_rounding_up_safe(I dividend, I divisor) noexcept {
-  using i_is_a_signed_type =
-    std::integral_constant<bool, std::is_signed<I>::value>;
+constexpr inline std::enable_if_t<std::is_integral<I>::value, I> div_rounding_up_safe(
+  I dividend, I divisor) noexcept
+{
+  using i_is_a_signed_type = std::integral_constant<bool, std::is_signed<I>::value>;
   return detail::div_rounding_up_safe(i_is_a_signed_type{}, dividend, divisor);
 }
 
 template <typename I>
-constexpr inline std::enable_if_t<std::is_integral<I>::value, bool>
-is_a_power_of_two(I val) noexcept {
+constexpr inline std::enable_if_t<std::is_integral<I>::value, bool> is_a_power_of_two(
+  I val) noexcept
+{
   return ((val - 1) & val) == 0;
 }
 
@@ -147,14 +150,14 @@ is_a_power_of_two(I val) noexcept {
  * @return Absolute value if value type is signed.
  */
 template <typename T>
-std::enable_if_t<std::is_signed<T>::value, T> constexpr inline absolute_value(
-  T value) {
+std::enable_if_t<std::is_signed<T>::value, T> constexpr inline absolute_value(T value)
+{
   return std::abs(value);
 }
 // Unsigned type just returns itself.
 template <typename T>
-std::enable_if_t<!std::is_signed<T>::value, T> constexpr inline absolute_value(
-  T value) {
+std::enable_if_t<!std::is_signed<T>::value, T> constexpr inline absolute_value(T value)
+{
   return value;
 }
 
diff --git a/cpp/include/raft/label/classlabels.cuh b/cpp/include/raft/label/classlabels.cuh
index 0da7da2eb6..0bbfa2bb3c 100644
--- a/cpp/include/raft/label/classlabels.cuh
+++ b/cpp/include/raft/label/classlabels.cuh
@@ -43,33 +43,35 @@ namespace label {
  * \param [in] allocator device allocator
  */
 template <typename value_t>
-void getUniquelabels(value_t *y, size_t n, value_t **y_unique, int *n_unique,
+void getUniquelabels(value_t* y,
+                     size_t n,
+                     value_t** y_unique,
+                     int* n_unique,
                      cudaStream_t stream,
-                     std::shared_ptr<raft::mr::device::allocator> allocator) {
+                     std::shared_ptr<raft::mr::device::allocator> allocator)
+{
   raft::mr::device::buffer<value_t> y2(allocator, stream, n);
   raft::mr::device::buffer<value_t> y3(allocator, stream, n);
   raft::mr::device::buffer<int> d_num_selected(allocator, stream, 1);
-  size_t bytes = 0;
+  size_t bytes  = 0;
   size_t bytes2 = 0;
 
   // Query how much temporary storage we will need for cub operations
   // and allocate it
   cub::DeviceRadixSort::SortKeys(NULL, bytes, y, y2.data(), n);
-  cub::DeviceSelect::Unique(NULL, bytes2, y2.data(), y3.data(),
-                            d_num_selected.data(), n);
+  cub::DeviceSelect::Unique(NULL, bytes2, y2.data(), y3.data(), d_num_selected.data(), n);
   bytes = max(bytes, bytes2);
   raft::mr::device::buffer<char> cub_storage(allocator, stream, bytes);
 
   // Select Unique classes
   cub::DeviceRadixSort::SortKeys(cub_storage.data(), bytes, y, y2.data(), n);
-  cub::DeviceSelect::Unique(cub_storage.data(), bytes, y2.data(), y3.data(),
-                            d_num_selected.data(), n);
+  cub::DeviceSelect::Unique(
+    cub_storage.data(), bytes, y2.data(), y3.data(), d_num_selected.data(), n);
   raft::update_host(n_unique, d_num_selected.data(), 1, stream);
   CUDA_CHECK(cudaStreamSynchronize(stream));
 
   // Copy unique classes to output
-  *y_unique =
-    (value_t *)allocator->allocate(*n_unique * sizeof(value_t), stream);
+  *y_unique = (value_t*)allocator->allocate(*n_unique * sizeof(value_t), stream);
   raft::copy(*y_unique, y3.data(), *n_unique, stream);
 }
 
@@ -92,16 +94,17 @@ void getUniquelabels(value_t *y, size_t n, value_t **y_unique, int *n_unique,
  * \param [in] stream cuda stream
  */
 template <typename value_t>
-void getOvrlabels(value_t *y, int n, value_t *y_unique, int n_classes,
-                  value_t *y_out, int idx, cudaStream_t stream) {
+void getOvrlabels(
+  value_t* y, int n, value_t* y_unique, int n_classes, value_t* y_out, int idx, cudaStream_t stream)
+{
   ASSERT(idx < n_classes,
          "Parameter idx should not be larger than the number "
          "of classes");
   raft::linalg::unaryOp(
-    y_out, y, n,
-    [idx, y_unique] __device__(value_t y) {
-      return y == y_unique[idx] ? +1 : -1;
-    },
+    y_out,
+    y,
+    n,
+    [idx, y_unique] __device__(value_t y) { return y == y_unique[idx] ? +1 : -1; },
     stream);
   CUDA_CHECK(cudaPeekAtLastError());
 }
@@ -110,9 +113,14 @@ void getOvrlabels(value_t *y, int n, value_t *y_unique, int n_classes,
 // +/-1, return array with the new class labels and corresponding indices.
 
 template <typename Type, int TPB_X, typename Lambda>
-__global__ void map_label_kernel(Type *map_ids, size_t N_labels, Type *in,
-                                 Type *out, size_t N, Lambda filter_op,
-                                 bool zero_based = false) {
+__global__ void map_label_kernel(Type* map_ids,
+                                 size_t N_labels,
+                                 Type* in,
+                                 Type* out,
+                                 size_t N,
+                                 Lambda filter_op,
+                                 bool zero_based = false)
+{
   int tid = threadIdx.x + blockIdx.x * TPB_X;
   if (tid < N) {
     if (!filter_op(in[tid])) {
@@ -127,68 +135,75 @@ __global__ void map_label_kernel(Type *map_ids, size_t N_labels, Type *in,
 }
 
 /**
-   * Maps an input array containing a series of numbers into a new array
-   * where numbers have been mapped to a monotonically increasing set
-   * of labels. This can be useful in machine learning algorithms, for instance,
-   * where a given set of labels is not taken from a monotonically increasing
-   * set. This can happen if they are filtered or if only a subset of the
-   * total labels are used in a dataset. This is also useful in graph algorithms
-   * where a set of vertices need to be labeled in a monotonically increasing
-   * order.
-   * @tparam Type the numeric type of the input and output arrays
-   * @tparam Lambda the type of an optional filter function, which determines
-   * which items in the array to map.
-   * @param out the output monotonic array
-   * @param in input label array
-   * @param N number of elements in the input array
-   * @param stream cuda stream to use
-   * @param filter_op an optional function for specifying which values
-   * should have monotonically increasing labels applied to them.
-   */
+ * Maps an input array containing a series of numbers into a new array
+ * where numbers have been mapped to a monotonically increasing set
+ * of labels. This can be useful in machine learning algorithms, for instance,
+ * where a given set of labels is not taken from a monotonically increasing
+ * set. This can happen if they are filtered or if only a subset of the
+ * total labels is used in a dataset. This is also useful in graph algorithms
+ * where a set of vertices needs to be labeled in a monotonically increasing
+ * order.
+ * @tparam Type the numeric type of the input and output arrays
+ * @tparam Lambda the type of an optional filter function, which determines
+ * which items in the array to map.
+ * @param out the output monotonic array
+ * @param in input label array
+ * @param N number of elements in the input array
+ * @param stream cuda stream to use
+ * @param filter_op an optional function for specifying which values
+ * should have monotonically increasing labels applied to them.
+ */
 template <typename Type, typename Lambda>
-void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream,
+void make_monotonic(Type* out,
+                    Type* in,
+                    size_t N,
+                    cudaStream_t stream,
                     Lambda filter_op,
                     std::shared_ptr<raft::mr::device::allocator> allocator,
-                    bool zero_based = false) {
+                    bool zero_based = false)
+{
   static const size_t TPB_X = 256;
 
   dim3 blocks(raft::ceildiv(N, TPB_X));
   dim3 threads(TPB_X);
 
-  Type *map_ids;
+  Type* map_ids;
   int num_clusters;
   getUniquelabels(in, N, &map_ids, &num_clusters, stream, allocator);
 
-  map_label_kernel<Type, TPB_X><<<blocks, threads, 0, stream>>>(
-    map_ids, num_clusters, in, out, N, filter_op, zero_based);
+  map_label_kernel<Type, TPB_X>
+    <<<blocks, threads, 0, stream>>>(map_ids, num_clusters, in, out, N, filter_op, zero_based);
 
   allocator->deallocate(map_ids, num_clusters * sizeof(Type), stream);
 }
 
 /**
-   * Maps an input array containing a series of numbers into a new array
-   * where numbers have been mapped to a monotonically increasing set
-   * of labels. This can be useful in machine learning algorithms, for instance,
-   * where a given set of labels is not taken from a monotonically increasing
-   * set. This can happen if they are filtered or if only a subset of the
-   * total labels are used in a dataset. This is also useful in graph algorithms
-   * where a set of vertices need to be labeled in a monotonically increasing
-   * order.
-   * @tparam Type the numeric type of the input and output arrays
-   * @tparam Lambda the type of an optional filter function, which determines
-   * which items in the array to map.
-   * @param out output label array with labels assigned monotonically
-   * @param in input label array
-   * @param N number of elements in the input array
-   * @param stream cuda stream to use
-   */
+ * Maps an input array containing a series of numbers into a new array
+ * where numbers have been mapped to a monotonically increasing set
+ * of labels. This can be useful in machine learning algorithms, for instance,
+ * where a given set of labels is not taken from a monotonically increasing
+ * set. This can happen if they are filtered or if only a subset of the
+ * total labels is used in a dataset. This is also useful in graph algorithms
+ * where a set of vertices needs to be labeled in a monotonically increasing
+ * order.
+ * @tparam Type the numeric type of the input and output arrays
+ * @tparam Lambda the type of an optional filter function, which determines
+ * which items in the array to map.
+ * @param out output label array with labels assigned monotonically
+ * @param in input label array
+ * @param N number of elements in the input array
+ * @param stream cuda stream to use
+ */
 template <typename Type>
-void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream,
+void make_monotonic(Type* out,
+                    Type* in,
+                    size_t N,
+                    cudaStream_t stream,
                     std::shared_ptr<raft::mr::device::allocator> allocator,
-                    bool zero_based = false) {
+                    bool zero_based = false)
+{
   make_monotonic<Type>(
-    out, in, N, stream, [] __device__(Type val) { return false; }, allocator,
-    zero_based);
+    out, in, N, stream, [] __device__(Type val) { return false; }, allocator, zero_based);
 }
 };  // namespace label
 };  // end namespace raft
diff --git a/cpp/include/raft/label/merge_labels.cuh b/cpp/include/raft/label/merge_labels.cuh
index bed74581a2..1ee0659b0d 100644
--- a/cpp/include/raft/label/merge_labels.cuh
+++ b/cpp/include/raft/label/merge_labels.cuh
@@ -35,8 +35,10 @@ __global__ void __launch_bounds__(TPB_X)
   propagate_label_kernel(const value_idx* __restrict__ labels_a,
                          const value_idx* __restrict__ labels_b,
                          value_idx* __restrict__ R,
-                         const bool* __restrict__ mask, bool* __restrict__ m,
-                         value_idx N) {
+                         const bool* __restrict__ mask,
+                         bool* __restrict__ m,
+                         value_idx N)
+{
   value_idx tid = threadIdx.x + blockIdx.x * TPB_X;
   if (tid < N) {
     if (__ldg((char*)mask + tid)) {
@@ -65,15 +67,17 @@ template <typename value_idx, int TPB_X = 256>
 __global__ void __launch_bounds__(TPB_X)
   reassign_label_kernel(value_idx* __restrict__ labels_a,
                         const value_idx* __restrict__ labels_b,
-                        const value_idx* __restrict__ R, value_idx N,
-                        value_idx MAX_LABEL) {
+                        const value_idx* __restrict__ R,
+                        value_idx N,
+                        value_idx MAX_LABEL)
+{
   value_idx tid = threadIdx.x + blockIdx.x * TPB_X;
   if (tid < N) {
     // Note: labels are from 1 to N
-    value_idx la = labels_a[tid];
-    value_idx lb = __ldg(labels_b + tid);
-    value_idx ra = (la == MAX_LABEL) ? MAX_LABEL : __ldg(R + (la - 1)) + 1;
-    value_idx rb = (lb == MAX_LABEL) ? MAX_LABEL : __ldg(R + (lb - 1)) + 1;
+    value_idx la  = labels_a[tid];
+    value_idx lb  = __ldg(labels_b + tid);
+    value_idx ra  = (la == MAX_LABEL) ? MAX_LABEL : __ldg(R + (la - 1)) + 1;
+    value_idx rb  = (lb == MAX_LABEL) ? MAX_LABEL : __ldg(R + (lb - 1)) + 1;
     labels_a[tid] = min(ra, rb);
   }
 }
@@ -108,9 +112,14 @@ __global__ void __launch_bounds__(TPB_X)
  * @param[in]    stream      CUDA stream
  */
 template <typename value_idx = int, int TPB_X = 256>
-void merge_labels(value_idx* labels_a, const value_idx* labels_b,
-                  const bool* mask, value_idx* R, bool* m, value_idx N,
-                  cudaStream_t stream) {
+void merge_labels(value_idx* labels_a,
+                  const value_idx* labels_b,
+                  const bool* mask,
+                  value_idx* R,
+                  bool* m,
+                  value_idx N,
+                  cudaStream_t stream)
+{
   dim3 blocks(raft::ceildiv(N, value_idx(TPB_X)));
   dim3 threads(TPB_X);
   value_idx MAX_LABEL = std::numeric_limits<value_idx>::max();
diff --git a/cpp/include/raft/lap/d_structs.h b/cpp/include/raft/lap/d_structs.h
index ed545b7198..e488dc528f 100644
--- a/cpp/include/raft/lap/d_structs.h
+++ b/cpp/include/raft/lap/d_structs.h
@@ -26,18 +26,18 @@
 
 template <typename vertex_t, typename weight_t>
 struct Vertices {
-  vertex_t *row_assignments;
-  vertex_t *col_assignments;
-  int *row_covers;
-  int *col_covers;
-  weight_t *row_duals;
-  weight_t *col_duals;
-  weight_t *col_slacks;
+  vertex_t* row_assignments;
+  vertex_t* col_assignments;
+  int* row_covers;
+  int* col_covers;
+  weight_t* row_duals;
+  weight_t* col_duals;
+  weight_t* col_slacks;
 };
 
 template <typename vertex_t>
 struct VertexData {
-  vertex_t *parents;
-  vertex_t *children;
-  int *is_visited;
+  vertex_t* parents;
+  vertex_t* children;
+  int* is_visited;
 };
diff --git a/cpp/include/raft/lap/lap.cuh b/cpp/include/raft/lap/lap.cuh
index 6bc1c08029..64b6a31efb 100644
--- a/cpp/include/raft/lap/lap.cuh
+++ b/cpp/include/raft/lap/lap.cuh
@@ -38,12 +38,12 @@ class LinearAssignmentProblem {
   vertex_t batchsize_;
   weight_t epsilon_;
 
-  weight_t const *d_costs_;
+  weight_t const* d_costs_;
 
   Vertices<vertex_t, weight_t> d_vertices_dev;
   VertexData<vertex_t> d_row_data_dev, d_col_data_dev;
 
-  raft::handle_t const &handle_;
+  raft::handle_t const& handle_;
   raft::mr::device::buffer<int> row_covers_v;
   raft::mr::device::buffer<int> col_covers_v;
   raft::mr::device::buffer<weight_t> row_duals_v;
@@ -59,8 +59,10 @@ class LinearAssignmentProblem {
   raft::mr::device::buffer<weight_t> obj_val_dual_v;
 
  public:
-  LinearAssignmentProblem(raft::handle_t const &handle, vertex_t size,
-                          vertex_t batchsize, weight_t epsilon)
+  LinearAssignmentProblem(raft::handle_t const& handle,
+                          vertex_t size,
+                          vertex_t batchsize,
+                          weight_t epsilon)
     : handle_(handle),
       size_(size),
       batchsize_(batchsize),
@@ -78,11 +80,13 @@ class LinearAssignmentProblem {
       row_children_v(handle_.get_device_allocator(), handle_.get_stream(), 0),
       col_children_v(handle_.get_device_allocator(), handle_.get_stream(), 0),
       obj_val_primal_v(handle_.get_device_allocator(), handle_.get_stream(), 0),
-      obj_val_dual_v(handle_.get_device_allocator(), handle_.get_stream(), 0) {}
+      obj_val_dual_v(handle_.get_device_allocator(), handle_.get_stream(), 0)
+  {
+  }
 
   // Executes Hungarian algorithm on the input cost matrix.
-  void solve(weight_t const *d_cost_matrix, vertex_t *d_row_assignment,
-             vertex_t *d_col_assignment) {
+  void solve(weight_t const* d_cost_matrix, vertex_t* d_row_assignment, vertex_t* d_col_assignment)
+  {
     initializeDevice();
 
     d_vertices_dev.row_assignments = d_row_assignment;
@@ -94,27 +98,13 @@ class LinearAssignmentProblem {
 
     while (step != 100) {
       switch (step) {
-        case 0:
-          step = hungarianStep0();
-          break;
-        case 1:
-          step = hungarianStep1();
-          break;
-        case 2:
-          step = hungarianStep2();
-          break;
-        case 3:
-          step = hungarianStep3();
-          break;
-        case 4:
-          step = hungarianStep4();
-          break;
-        case 5:
-          step = hungarianStep5();
-          break;
-        case 6:
-          step = hungarianStep6();
-          break;
+        case 0: step = hungarianStep0(); break;
+        case 1: step = hungarianStep1(); break;
+        case 2: step = hungarianStep2(); break;
+        case 3: step = hungarianStep3(); break;
+        case 4: step = hungarianStep4(); break;
+        case 5: step = hungarianStep5(); break;
+        case 6: step = hungarianStep6(); break;
       }
     }
 
@@ -122,36 +112,39 @@ class LinearAssignmentProblem {
   }
 
   // Function for getting optimal row dual vector for subproblem spId.
-  std::pair<const weight_t *, vertex_t> getRowDualVector(int spId) const {
+  std::pair<const weight_t*, vertex_t> getRowDualVector(int spId) const
+  {
     return std::make_pair(row_duals_v.data() + spId * size_, size_);
   }
 
   // Function for getting optimal col dual vector for subproblem spId.
-  std::pair<const weight_t *, vertex_t> getColDualVector(int spId) {
+  std::pair<const weight_t*, vertex_t> getColDualVector(int spId)
+  {
     return std::make_pair(col_duals_v.data() + spId * size_, size_);
   }
 
   // Function for getting optimal primal objective value for subproblem spId.
-  weight_t getPrimalObjectiveValue(int spId) {
+  weight_t getPrimalObjectiveValue(int spId)
+  {
     weight_t result;
-    raft::update_host(&result, obj_val_primal_v.data() + spId, 1,
-                      handle_.get_stream());
+    raft::update_host(&result, obj_val_primal_v.data() + spId, 1, handle_.get_stream());
     CHECK_CUDA(handle_.get_stream());
     return result;
   }
 
   // Function for getting optimal dual objective value for subproblem spId.
-  weight_t getDualObjectiveValue(int spId) {
+  weight_t getDualObjectiveValue(int spId)
+  {
     weight_t result;
-    raft::update_host(&result, obj_val_dual_v.data() + spId, 1,
-                      handle_.get_stream());
+    raft::update_host(&result, obj_val_dual_v.data() + spId, 1, handle_.get_stream());
     CHECK_CUDA(handle_.get_stream());
     return result;
   }
 
  private:
   // Helper function for initializing global variables and arrays on a single host.
-  void initializeDevice() {
+  void initializeDevice()
+  {
     row_covers_v.resize(batchsize_ * size_);
     col_covers_v.resize(batchsize_ * size_);
     row_duals_v.resize(batchsize_ * size_);
@@ -169,39 +162,36 @@ class LinearAssignmentProblem {
     d_vertices_dev.row_covers = row_covers_v.data();
     d_vertices_dev.col_covers = col_covers_v.data();
 
-    d_vertices_dev.row_duals = row_duals_v.data();
-    d_vertices_dev.col_duals = col_duals_v.data();
+    d_vertices_dev.row_duals  = row_duals_v.data();
+    d_vertices_dev.col_duals  = col_duals_v.data();
     d_vertices_dev.col_slacks = col_slacks_v.data();
 
     d_row_data_dev.is_visited = row_is_visited_v.data();
     d_col_data_dev.is_visited = col_is_visited_v.data();
-    d_row_data_dev.parents = row_parents_v.data();
-    d_row_data_dev.children = row_children_v.data();
-    d_col_data_dev.parents = col_parents_v.data();
-    d_col_data_dev.children = col_children_v.data();
-
-    thrust::fill(thrust::device, row_covers_v.begin(), row_covers_v.end(),
-                 int{0});
-    thrust::fill(thrust::device, col_covers_v.begin(), col_covers_v.end(),
-                 int{0});
-    thrust::fill(thrust::device, row_duals_v.begin(), row_duals_v.end(),
-                 weight_t{0});
-    thrust::fill(thrust::device, col_duals_v.begin(), col_duals_v.end(),
-                 weight_t{0});
+    d_row_data_dev.parents    = row_parents_v.data();
+    d_row_data_dev.children   = row_children_v.data();
+    d_col_data_dev.parents    = col_parents_v.data();
+    d_col_data_dev.children   = col_children_v.data();
+
+    thrust::fill(thrust::device, row_covers_v.begin(), row_covers_v.end(), int{0});
+    thrust::fill(thrust::device, col_covers_v.begin(), col_covers_v.end(), int{0});
+    thrust::fill(thrust::device, row_duals_v.begin(), row_duals_v.end(), weight_t{0});
+    thrust::fill(thrust::device, col_duals_v.begin(), col_duals_v.end(), weight_t{0});
   }
 
   // Function for calculating initial zeros by subtracting row and column minima from each element.
-  int hungarianStep0() {
-    detail::initialReduction(handle_, d_costs_, d_vertices_dev, batchsize_,
-                             size_);
+  int hungarianStep0()
+  {
+    detail::initialReduction(handle_, d_costs_, d_vertices_dev, batchsize_, size_);
 
     return 1;
   }
 
   // Function for calculating initial zeros by subtracting row and column minima from each element.
-  int hungarianStep1() {
-    detail::computeInitialAssignments(handle_, d_costs_, d_vertices_dev,
-                                      batchsize_, size_, epsilon_);
+  int hungarianStep1()
+  {
+    detail::computeInitialAssignments(
+      handle_, d_costs_, d_vertices_dev, batchsize_, size_, epsilon_);
 
     int next = 2;
 
@@ -217,10 +207,10 @@ class LinearAssignmentProblem {
   }
 
   // Function for checking optimality and constructing predicates and covers.
-  int hungarianStep2() {
-    int cover_count =
-      detail::computeRowCovers(handle_, d_vertices_dev, d_row_data_dev,
-                               d_col_data_dev, batchsize_, size_);
+  int hungarianStep2()
+  {
+    int cover_count = detail::computeRowCovers(
+      handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_);
 
     int next = (cover_count == batchsize_ * size_) ? 6 : 3;
 
@@ -228,17 +218,23 @@ class LinearAssignmentProblem {
   }
 
   // Function for building alternating tree rooted at unassigned rows.
-  int hungarianStep3() {
+  int hungarianStep3()
+  {
     int next;
 
-    raft::mr::device::buffer<bool> flag_v(handle_.get_device_allocator(),
-                                          handle_.get_stream(), 1);
+    raft::mr::device::buffer<bool> flag_v(handle_.get_device_allocator(), handle_.get_stream(), 1);
 
     bool h_flag = false;
     raft::update_device(flag_v.data(), &h_flag, 1, handle_.get_stream());
 
-    detail::executeZeroCover(handle_, d_costs_, d_vertices_dev, d_row_data_dev,
-                             d_col_data_dev, flag_v.data(), batchsize_, size_,
+    detail::executeZeroCover(handle_,
+                             d_costs_,
+                             d_vertices_dev,
+                             d_row_data_dev,
+                             d_col_data_dev,
+                             flag_v.data(),
+                             batchsize_,
+                             size_,
                              epsilon_);
 
     raft::update_host(&h_flag, flag_v.data(), 1, handle_.get_stream());
@@ -249,31 +245,36 @@ class LinearAssignmentProblem {
   }
 
   // Function for augmenting the solution along multiple node-disjoint alternating trees.
-  int hungarianStep4() {
-    detail::reversePass(handle_, d_row_data_dev, d_col_data_dev, batchsize_,
-                        size_);
+  int hungarianStep4()
+  {
+    detail::reversePass(handle_, d_row_data_dev, d_col_data_dev, batchsize_, size_);
 
-    detail::augmentationPass(handle_, d_vertices_dev, d_row_data_dev,
-                             d_col_data_dev, batchsize_, size_);
+    detail::augmentationPass(
+      handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_);
 
     return 2;
   }
 
   // Function for updating dual solution to introduce new zero-cost arcs.
-  int hungarianStep5() {
-    detail::dualUpdate(handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev,
-                       batchsize_, size_, epsilon_);
+  int hungarianStep5()
+  {
+    detail::dualUpdate(
+      handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_, epsilon_);
 
     return 3;
   }
 
   // Function for calculating primal and dual objective values at optimality.
-  int hungarianStep6() {
-    detail::calcObjValPrimal(handle_, obj_val_primal_v.data(), d_costs_,
-                             d_vertices_dev.row_assignments, batchsize_, size_);
+  int hungarianStep6()
+  {
+    detail::calcObjValPrimal(handle_,
+                             obj_val_primal_v.data(),
+                             d_costs_,
+                             d_vertices_dev.row_assignments,
+                             batchsize_,
+                             size_);
 
-    detail::calcObjValDual(handle_, obj_val_dual_v.data(), d_vertices_dev,
-                           batchsize_, size_);
+    detail::calcObjValDual(handle_, obj_val_dual_v.data(), d_vertices_dev, batchsize_, size_);
 
     return 100;
   }
diff --git a/cpp/include/raft/lap/lap_functions.cuh b/cpp/include/raft/lap/lap_functions.cuh
index 0079f50e82..9bbd44bf09 100644
--- a/cpp/include/raft/lap/lap_functions.cuh
+++ b/cpp/include/raft/lap/lap_functions.cuh
@@ -46,20 +46,26 @@ const int BLOCKDIMX{64};
 const int BLOCKDIMY{1};
 
 // Function for calculating grid and block dimensions from the given input size.
-inline void calculateLinearDims(dim3 &blocks_per_grid, dim3 &threads_per_block,
-                                int &total_blocks, int size) {
+inline void calculateLinearDims(dim3& blocks_per_grid,
+                                dim3& threads_per_block,
+                                int& total_blocks,
+                                int size)
+{
   threads_per_block.x = BLOCKDIMX * BLOCKDIMY;
 
   int value = size / threads_per_block.x;
   if (size % threads_per_block.x > 0) value++;
 
-  total_blocks = value;
+  total_blocks      = value;
   blocks_per_grid.x = value;
 }
 
 // Function for calculating grid and block dimensions from the given input size for square grid.
-inline void calculateSquareDims(dim3 &blocks_per_grid, dim3 &threads_per_block,
-                                int &total_blocks, int size) {
+inline void calculateSquareDims(dim3& blocks_per_grid,
+                                dim3& threads_per_block,
+                                int& total_blocks,
+                                int size)
+{
   threads_per_block.x = BLOCKDIMX;
   threads_per_block.y = BLOCKDIMY;
 
@@ -68,15 +74,16 @@ inline void calculateSquareDims(dim3 &blocks_per_grid, dim3 &threads_per_block,
   int valuex = (int)ceil((float)(sq_size) / BLOCKDIMX);
   int valuey = (int)ceil((float)(sq_size) / BLOCKDIMY);
 
-  total_blocks = valuex * valuey;
+  total_blocks      = valuex * valuey;
   blocks_per_grid.x = valuex;
   blocks_per_grid.y = valuey;
 }
 
-// Function for calculating grid and block dimensions from the given input size for rectangular grid.
-inline void calculateRectangularDims(dim3 &blocks_per_grid,
-                                     dim3 &threads_per_block, int &total_blocks,
-                                     int xsize, int ysize) {
+// Function for calculating grid and block dimensions from the given input size for rectangular
+// grid.
+inline void calculateRectangularDims(
+  dim3& blocks_per_grid, dim3& threads_per_block, int& total_blocks, int xsize, int ysize)
+{
   threads_per_block.x = BLOCKDIMX;
   threads_per_block.y = BLOCKDIMY;
 
@@ -86,16 +93,18 @@ inline void calculateRectangularDims(dim3 &blocks_per_grid,
   int valuey = ysize / threads_per_block.y;
   if (ysize % threads_per_block.y > 0) valuey++;
 
-  total_blocks = valuex * valuey;
+  total_blocks      = valuex * valuey;
   blocks_per_grid.x = valuex;
   blocks_per_grid.y = valuey;
 }
 
 template <typename vertex_t, typename weight_t>
-inline void initialReduction(raft::handle_t const &handle,
-                             weight_t const *d_costs,
-                             Vertices<vertex_t, weight_t> &d_vertices_dev,
-                             int SP, vertex_t N) {
+inline void initialReduction(raft::handle_t const& handle,
+                             weight_t const* d_costs,
+                             Vertices<vertex_t, weight_t>& d_vertices_dev,
+                             int SP,
+                             vertex_t N)
+{
   dim3 blocks_per_grid;
   dim3 threads_per_block;
   int total_blocks = 0;
@@ -103,34 +112,38 @@ inline void initialReduction(raft::handle_t const &handle,
   raft::lap::detail::calculateRectangularDims(
     blocks_per_grid, threads_per_block, total_blocks, N, SP);
 
-  kernel_rowReduction<<<blocks_per_grid, threads_per_block, 0,
-                        handle.get_stream()>>>(
-    d_costs, d_vertices_dev.row_duals, SP, N,
-    std::numeric_limits<weight_t>::max());
+  kernel_rowReduction<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
+    d_costs, d_vertices_dev.row_duals, SP, N, std::numeric_limits<weight_t>::max());
 
   CHECK_CUDA(handle.get_stream());
-  kernel_columnReduction<<<blocks_per_grid, threads_per_block, 0,
-                           handle.get_stream()>>>(
-    d_costs, d_vertices_dev.row_duals, d_vertices_dev.col_duals, SP, N,
+  kernel_columnReduction<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
+    d_costs,
+    d_vertices_dev.row_duals,
+    d_vertices_dev.col_duals,
+    SP,
+    N,
     std::numeric_limits<weight_t>::max());
   CHECK_CUDA(handle.get_stream());
 }
 
 template <typename vertex_t, typename weight_t>
-inline void computeInitialAssignments(raft::handle_t const &handle,
-                                      weight_t const *d_costs,
-                                      Vertices<vertex_t, weight_t> &d_vertices,
-                                      int SP, vertex_t N, weight_t epsilon) {
+inline void computeInitialAssignments(raft::handle_t const& handle,
+                                      weight_t const* d_costs,
+                                      Vertices<vertex_t, weight_t>& d_vertices,
+                                      int SP,
+                                      vertex_t N,
+                                      weight_t epsilon)
+{
   dim3 blocks_per_grid;
   dim3 threads_per_block;
   int total_blocks = 0;
 
   std::size_t size = SP * N;
 
-  raft::mr::device::buffer<int> row_lock_v(handle.get_device_allocator(),
-                                           handle.get_stream(), size);
-  raft::mr::device::buffer<int> col_lock_v(handle.get_device_allocator(),
-                                           handle.get_stream(), size);
+  raft::mr::device::buffer<int> row_lock_v(
+    handle.get_device_allocator(), handle.get_stream(), size);
+  raft::mr::device::buffer<int> col_lock_v(
+    handle.get_device_allocator(), handle.get_stream(), size);
 
   thrust::fill_n(thrust::device, d_vertices.row_assignments, size, -1);
   thrust::fill_n(thrust::device, d_vertices.col_assignments, size, -1);
@@ -140,21 +153,29 @@ inline void computeInitialAssignments(raft::handle_t const &handle,
   raft::lap::detail::calculateRectangularDims(
     blocks_per_grid, threads_per_block, total_blocks, N, SP);
 
-  kernel_computeInitialAssignments<<<blocks_per_grid, threads_per_block, 0,
-                                     handle.get_stream()>>>(
-    d_costs, d_vertices.row_duals, d_vertices.col_duals,
-    d_vertices.row_assignments, d_vertices.col_assignments, row_lock_v.data(),
-    col_lock_v.data(), SP, N, epsilon);
+  kernel_computeInitialAssignments<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
+    d_costs,
+    d_vertices.row_duals,
+    d_vertices.col_duals,
+    d_vertices.row_assignments,
+    d_vertices.col_assignments,
+    row_lock_v.data(),
+    col_lock_v.data(),
+    SP,
+    N,
+    epsilon);
   CHECK_CUDA(handle.get_stream());
 }
 
 // Function for finding row cover on individual devices.
 template <typename vertex_t, typename weight_t>
-inline int computeRowCovers(raft::handle_t const &handle,
-                            Vertices<vertex_t, weight_t> &d_vertices,
-                            VertexData<vertex_t> &d_row_data,
-                            VertexData<vertex_t> &d_col_data, int SP,
-                            vertex_t N) {
+inline int computeRowCovers(raft::handle_t const& handle,
+                            Vertices<vertex_t, weight_t>& d_vertices,
+                            VertexData<vertex_t>& d_row_data,
+                            VertexData<vertex_t>& d_col_data,
+                            int SP,
+                            vertex_t N)
+{
   dim3 blocks_per_grid;
   dim3 threads_per_block;
   int total_blocks = 0;
@@ -163,8 +184,7 @@ inline int computeRowCovers(raft::handle_t const &handle,
 
   thrust::fill_n(thrust::device, d_vertices.row_covers, size, int{0});
   thrust::fill_n(thrust::device, d_vertices.col_covers, size, int{0});
-  thrust::fill_n(thrust::device, d_vertices.col_slacks, size,
-                 std::numeric_limits<weight_t>::max());
+  thrust::fill_n(thrust::device, d_vertices.col_slacks, size, std::numeric_limits<weight_t>::max());
   thrust::fill_n(thrust::device, d_row_data.is_visited, size, DORMANT);
   thrust::fill_n(thrust::device, d_col_data.is_visited, size, DORMANT);
   thrust::fill_n(thrust::device, d_row_data.parents, size, vertex_t{-1});
@@ -174,25 +194,28 @@ inline int computeRowCovers(raft::handle_t const &handle,
 
   raft::lap::detail::calculateRectangularDims(
     blocks_per_grid, threads_per_block, total_blocks, N, SP);
-  kernel_computeRowCovers<<<blocks_per_grid, threads_per_block, 0,
-                            handle.get_stream()>>>(
-    d_vertices.row_assignments, d_vertices.row_covers, d_row_data.is_visited,
-    SP, N);
+  kernel_computeRowCovers<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
+    d_vertices.row_assignments, d_vertices.row_covers, d_row_data.is_visited, SP, N);
 
   CHECK_CUDA(handle.get_stream());
 
-  return thrust::reduce(thrust::device, d_vertices.row_covers,
-                        d_vertices.row_covers + size);
+  return thrust::reduce(thrust::device, d_vertices.row_covers, d_vertices.row_covers + size);
 }
 
 // Function for covering the zeros in uncovered rows and expanding the frontier.
 template <typename vertex_t, typename weight_t>
-inline void coverZeroAndExpand(
-  raft::handle_t const &handle, weight_t const *d_costs_dev,
-  vertex_t const *d_rows_csr_neighbors, vertex_t const *d_rows_csr_ptrs,
-  Vertices<vertex_t, weight_t> &d_vertices_dev,
-  VertexData<vertex_t> &d_row_data_dev, VertexData<vertex_t> &d_col_data_dev,
-  bool *d_flag, int SP, vertex_t N, weight_t epsilon) {
+inline void coverZeroAndExpand(raft::handle_t const& handle,
+                               weight_t const* d_costs_dev,
+                               vertex_t const* d_rows_csr_neighbors,
+                               vertex_t const* d_rows_csr_ptrs,
+                               Vertices<vertex_t, weight_t>& d_vertices_dev,
+                               VertexData<vertex_t>& d_row_data_dev,
+                               VertexData<vertex_t>& d_col_data_dev,
+                               bool* d_flag,
+                               int SP,
+                               vertex_t N,
+                               weight_t epsilon)
+{
   int total_blocks = 0;
   dim3 blocks_per_grid;
   dim3 threads_per_block;
@@ -200,24 +223,34 @@ inline void coverZeroAndExpand(
   raft::lap::detail::calculateRectangularDims(
     blocks_per_grid, threads_per_block, total_blocks, N, SP);
 
-  kernel_coverAndExpand<<<blocks_per_grid, threads_per_block, 0,
-                          handle.get_stream()>>>(
-    d_flag, d_rows_csr_ptrs, d_rows_csr_neighbors, d_costs_dev, d_vertices_dev,
-    d_row_data_dev, d_col_data_dev, SP, N, epsilon);
+  kernel_coverAndExpand<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
+    d_flag,
+    d_rows_csr_ptrs,
+    d_rows_csr_neighbors,
+    d_costs_dev,
+    d_vertices_dev,
+    d_row_data_dev,
+    d_col_data_dev,
+    SP,
+    N,
+    epsilon);
 }
 
 template <typename vertex_t, typename weight_t>
-inline vertex_t zeroCoverIteration(raft::handle_t const &handle,
-                                   weight_t const *d_costs_dev,
-                                   Vertices<vertex_t, weight_t> &d_vertices_dev,
-                                   VertexData<vertex_t> &d_row_data_dev,
-                                   VertexData<vertex_t> &d_col_data_dev,
-                                   bool *d_flag, int SP, vertex_t N,
-                                   weight_t epsilon) {
+inline vertex_t zeroCoverIteration(raft::handle_t const& handle,
+                                   weight_t const* d_costs_dev,
+                                   Vertices<vertex_t, weight_t>& d_vertices_dev,
+                                   VertexData<vertex_t>& d_row_data_dev,
+                                   VertexData<vertex_t>& d_col_data_dev,
+                                   bool* d_flag,
+                                   int SP,
+                                   vertex_t N,
+                                   weight_t epsilon)
+{
   vertex_t M;
 
-  raft::mr::device::buffer<vertex_t> csr_ptrs_v(handle.get_device_allocator(),
-                                                handle.get_stream(), 0);
+  raft::mr::device::buffer<vertex_t> csr_ptrs_v(
+    handle.get_device_allocator(), handle.get_stream(), 0);
   raft::mr::device::buffer<vertex_t> csr_neighbors_v(
     handle.get_device_allocator(), handle.get_stream(), 0);
 
@@ -226,8 +259,8 @@ inline vertex_t zeroCoverIteration(raft::handle_t const &handle,
     dim3 threads_per_block;
     int total_blocks = 0;
 
-    raft::mr::device::buffer<bool> predicates_v(handle.get_device_allocator(),
-                                                handle.get_stream(), SP * N);
+    raft::mr::device::buffer<bool> predicates_v(
+      handle.get_device_allocator(), handle.get_stream(), SP * N);
     raft::mr::device::buffer<vertex_t> addresses_v(
       handle.get_device_allocator(), handle.get_stream(), SP * N);
 
@@ -242,87 +275,108 @@ inline vertex_t zeroCoverIteration(raft::handle_t const &handle,
       blocks_per_grid, threads_per_block, total_blocks, N, SP);
 
     // construct predicate matrix for edges.
-    kernel_rowPredicateConstructionCSR<<<blocks_per_grid, threads_per_block, 0,
+    kernel_rowPredicateConstructionCSR<<<blocks_per_grid,
+                                         threads_per_block,
+                                         0,
                                          handle.get_stream()>>>(
-      predicates_v.data(), addresses_v.data(), d_row_data_dev.is_visited, SP,
-      N);
+      predicates_v.data(), addresses_v.data(), d_row_data_dev.is_visited, SP, N);
     CHECK_CUDA(handle.get_stream());
 
     M = thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end());
-    thrust::exclusive_scan(thrust::device, addresses_v.begin(),
-                           addresses_v.end(), addresses_v.begin());
+    thrust::exclusive_scan(
+      thrust::device, addresses_v.begin(), addresses_v.end(), addresses_v.begin());
 
     if (M > 0) {
       csr_neighbors_v.resize(M);
 
-      kernel_rowScatterCSR<<<blocks_per_grid, threads_per_block, 0,
-                             handle.get_stream()>>>(
-        predicates_v.data(), addresses_v.data(), csr_neighbors_v.data(),
-        csr_ptrs_v.data(), M, SP, N);
+      kernel_rowScatterCSR<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
+        predicates_v.data(),
+        addresses_v.data(),
+        csr_neighbors_v.data(),
+        csr_ptrs_v.data(),
+        M,
+        SP,
+        N);
 
       CHECK_CUDA(handle.get_stream());
     }
   }
 
   if (M > 0) {
-    coverZeroAndExpand(handle, d_costs_dev, csr_neighbors_v.data(),
-                       csr_ptrs_v.data(), d_vertices_dev, d_row_data_dev,
-                       d_col_data_dev, d_flag, SP, N, epsilon);
+    coverZeroAndExpand(handle,
+                       d_costs_dev,
+                       csr_neighbors_v.data(),
+                       csr_ptrs_v.data(),
+                       d_vertices_dev,
+                       d_row_data_dev,
+                       d_col_data_dev,
+                       d_flag,
+                       SP,
+                       N,
+                       epsilon);
   }
 
   return M;
 }
 
-// Function for executing recursive zero cover. Returns the next step (Step 4 or Step 5) depending on the presence of uncovered zeros.
+// Function for executing recursive zero cover. Returns the next step (Step 4 or Step 5) depending
+// on the presence of uncovered zeros.
 template <typename vertex_t, typename weight_t>
-inline void executeZeroCover(raft::handle_t const &handle,
-                             weight_t const *d_costs_dev,
-                             Vertices<vertex_t, weight_t> &d_vertices_dev,
-                             VertexData<vertex_t> &d_row_data_dev,
-                             VertexData<vertex_t> &d_col_data_dev, bool *d_flag,
-                             int SP, vertex_t N, weight_t epsilon) {
+inline void executeZeroCover(raft::handle_t const& handle,
+                             weight_t const* d_costs_dev,
+                             Vertices<vertex_t, weight_t>& d_vertices_dev,
+                             VertexData<vertex_t>& d_row_data_dev,
+                             VertexData<vertex_t>& d_col_data_dev,
+                             bool* d_flag,
+                             int SP,
+                             vertex_t N,
+                             weight_t epsilon)
+{
   vertex_t M = 1;
   while (M > 0) {
-    M = zeroCoverIteration(handle, d_costs_dev, d_vertices_dev, d_row_data_dev,
-                           d_col_data_dev, d_flag, SP, N, epsilon);
+    M = zeroCoverIteration(
+      handle, d_costs_dev, d_vertices_dev, d_row_data_dev, d_col_data_dev, d_flag, SP, N, epsilon);
   }
 }
 
 // Function for executing reverse pass of the maximum matching.
 template <typename vertex_t>
-inline void reversePass(raft::handle_t const &handle,
-                        VertexData<vertex_t> &d_row_data_dev,
-                        VertexData<vertex_t> &d_col_data_dev, int SP, int N) {
+inline void reversePass(raft::handle_t const& handle,
+                        VertexData<vertex_t>& d_row_data_dev,
+                        VertexData<vertex_t>& d_col_data_dev,
+                        int SP,
+                        int N)
+{
   int total_blocks = 0;
   dim3 blocks_per_grid;
   dim3 threads_per_block;
 
   std::size_t size = SP * N;
 
-  raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block,
-                                         total_blocks, size);
+  raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, size);
 
-  raft::mr::device::buffer<bool> predicates_v(handle.get_device_allocator(),
-                                              handle.get_stream(), size);
-  raft::mr::device::buffer<vertex_t> addresses_v(handle.get_device_allocator(),
-                                                 handle.get_stream(), size);
+  raft::mr::device::buffer<bool> predicates_v(
+    handle.get_device_allocator(), handle.get_stream(), size);
+  raft::mr::device::buffer<vertex_t> addresses_v(
+    handle.get_device_allocator(), handle.get_stream(), size);
 
   thrust::fill_n(thrust::device, predicates_v.data(), size, false);
   thrust::fill_n(thrust::device, addresses_v.data(), size, vertex_t{0});
 
   // compact the reverse pass row vertices.
-  kernel_augmentPredicateConstruction<<<blocks_per_grid, threads_per_block, 0,
+  kernel_augmentPredicateConstruction<<<blocks_per_grid,
+                                        threads_per_block,
+                                        0,
                                         handle.get_stream()>>>(
     predicates_v.data(), addresses_v.data(), d_col_data_dev.is_visited, size);
 
   CHECK_CUDA(handle.get_stream());
 
   // calculate total number of vertices.
-  std::size_t csr_size =
-    thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end());
+  std::size_t csr_size = thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end());
   // exclusive scan for calculating the scatter addresses.
-  thrust::exclusive_scan(thrust::device, addresses_v.begin(), addresses_v.end(),
-                         addresses_v.begin());
+  thrust::exclusive_scan(
+    thrust::device, addresses_v.begin(), addresses_v.end(), addresses_v.begin());
 
   if (csr_size > 0) {
     int total_blocks_1 = 0;
@@ -334,14 +388,12 @@ inline void reversePass(raft::handle_t const &handle,
     raft::mr::device::buffer<vertex_t> elements_v(
       handle.get_device_allocator(), handle.get_stream(), csr_size);
 
-    kernel_augmentScatter<<<blocks_per_grid, threads_per_block, 0,
-                            handle.get_stream()>>>(
+    kernel_augmentScatter<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
       elements_v.data(), predicates_v.data(), addresses_v.data(), size);
 
     CHECK_CUDA(handle.get_stream());
 
-    kernel_reverseTraversal<<<blocks_per_grid_1, threads_per_block_1, 0,
-                              handle.get_stream()>>>(
+    kernel_reverseTraversal<<<blocks_per_grid_1, threads_per_block_1, 0, handle.get_stream()>>>(
       elements_v.data(), d_row_data_dev, d_col_data_dev, csr_size);
     CHECK_CUDA(handle.get_stream());
   }
@@ -349,27 +401,30 @@ inline void reversePass(raft::handle_t const &handle,
 
 // Function for executing augmentation pass of the maximum matching.
 template <typename vertex_t, typename weight_t>
-inline void augmentationPass(raft::handle_t const &handle,
-                             Vertices<vertex_t, weight_t> &d_vertices_dev,
-                             VertexData<vertex_t> &d_row_data_dev,
-                             VertexData<vertex_t> &d_col_data_dev, int SP,
-                             int N) {
+inline void augmentationPass(raft::handle_t const& handle,
+                             Vertices<vertex_t, weight_t>& d_vertices_dev,
+                             VertexData<vertex_t>& d_row_data_dev,
+                             VertexData<vertex_t>& d_col_data_dev,
+                             int SP,
+                             int N)
+{
   int total_blocks = 0;
   dim3 blocks_per_grid;
   dim3 threads_per_block;
-  raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block,
-                                         total_blocks, SP * N);
+  raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP * N);
 
-  raft::mr::device::buffer<bool> predicates_v(handle.get_device_allocator(),
-                                              handle.get_stream(), SP * N);
-  raft::mr::device::buffer<vertex_t> addresses_v(handle.get_device_allocator(),
-                                                 handle.get_stream(), SP * N);
+  raft::mr::device::buffer<bool> predicates_v(
+    handle.get_device_allocator(), handle.get_stream(), SP * N);
+  raft::mr::device::buffer<vertex_t> addresses_v(
+    handle.get_device_allocator(), handle.get_stream(), SP * N);
 
   thrust::fill_n(thrust::device, predicates_v.data(), SP * N, false);
   thrust::fill_n(thrust::device, addresses_v.data(), SP * N, vertex_t{0});
 
   // compact the reverse pass row vertices.
-  kernel_augmentPredicateConstruction<<<blocks_per_grid, threads_per_block, 0,
+  kernel_augmentPredicateConstruction<<<blocks_per_grid,
+                                        threads_per_block,
+                                        0,
                                         handle.get_stream()>>>(
     predicates_v.data(), addresses_v.data(), d_row_data_dev.is_visited, SP * N);
 
@@ -380,8 +435,8 @@ inline void augmentationPass(raft::handle_t const &handle,
   vertex_t row_ids_csr_size =
     thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end());
   // exclusive scan for calculating the scatter addresses.
-  thrust::exclusive_scan(thrust::device, addresses_v.begin(), addresses_v.end(),
-                         addresses_v.begin());
+  thrust::exclusive_scan(
+    thrust::device, addresses_v.begin(), addresses_v.end(), addresses_v.begin());
 
   if (row_ids_csr_size > 0) {
     int total_blocks_1 = 0;
@@ -393,17 +448,18 @@ inline void augmentationPass(raft::handle_t const &handle,
     raft::mr::device::buffer<vertex_t> elements_v(
       handle.get_device_allocator(), handle.get_stream(), row_ids_csr_size);
 
-    kernel_augmentScatter<<<blocks_per_grid, threads_per_block, 0,
-                            handle.get_stream()>>>(
-      elements_v.data(), predicates_v.data(), addresses_v.data(),
-      vertex_t{SP * N});
+    kernel_augmentScatter<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
+      elements_v.data(), predicates_v.data(), addresses_v.data(), vertex_t{SP * N});
 
     CHECK_CUDA(handle.get_stream());
 
-    kernel_augmentation<<<blocks_per_grid_1, threads_per_block_1, 0,
-                          handle.get_stream()>>>(
-      d_vertices_dev.row_assignments, d_vertices_dev.col_assignments,
-      elements_v.data(), d_row_data_dev, d_col_data_dev, vertex_t{N},
+    kernel_augmentation<<<blocks_per_grid_1, threads_per_block_1, 0, handle.get_stream()>>>(
+      d_vertices_dev.row_assignments,
+      d_vertices_dev.col_assignments,
+      elements_v.data(),
+      d_row_data_dev,
+      d_col_data_dev,
+      vertex_t{N},
       row_ids_csr_size);
 
     CHECK_CUDA(handle.get_stream());
@@ -411,35 +467,46 @@ inline void augmentationPass(raft::handle_t const &handle,
 }
 
 template <typename vertex_t, typename weight_t>
-inline void dualUpdate(raft::handle_t const &handle,
-                       Vertices<vertex_t, weight_t> &d_vertices_dev,
-                       VertexData<vertex_t> &d_row_data_dev,
-                       VertexData<vertex_t> &d_col_data_dev, int SP, vertex_t N,
-                       weight_t epsilon) {
+inline void dualUpdate(raft::handle_t const& handle,
+                       Vertices<vertex_t, weight_t>& d_vertices_dev,
+                       VertexData<vertex_t>& d_row_data_dev,
+                       VertexData<vertex_t>& d_col_data_dev,
+                       int SP,
+                       vertex_t N,
+                       weight_t epsilon)
+{
   dim3 blocks_per_grid;
   dim3 threads_per_block;
   int total_blocks;
 
-  raft::mr::device::buffer<weight_t> sp_min_v(handle.get_device_allocator(),
-                                              handle.get_stream(), 1);
+  raft::mr::device::buffer<weight_t> sp_min_v(
+    handle.get_device_allocator(), handle.get_stream(), 1);
 
-  raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block,
-                                         total_blocks, SP);
-  kernel_dualUpdate_1<<<blocks_per_grid, threads_per_block, 0,
-                        handle.get_stream()>>>(
-    sp_min_v.data(), d_vertices_dev.col_slacks, d_vertices_dev.col_covers, SP,
-    N, std::numeric_limits<weight_t>::max());
+  raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP);
+  kernel_dualUpdate_1<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
+    sp_min_v.data(),
+    d_vertices_dev.col_slacks,
+    d_vertices_dev.col_covers,
+    SP,
+    N,
+    std::numeric_limits<weight_t>::max());
 
   CHECK_CUDA(handle.get_stream());
 
   raft::lap::detail::calculateRectangularDims(
     blocks_per_grid, threads_per_block, total_blocks, N, SP);
-  kernel_dualUpdate_2<<<blocks_per_grid, threads_per_block, 0,
-                        handle.get_stream()>>>(
-    sp_min_v.data(), d_vertices_dev.row_duals, d_vertices_dev.col_duals,
-    d_vertices_dev.col_slacks, d_vertices_dev.row_covers,
-    d_vertices_dev.col_covers, d_row_data_dev.is_visited,
-    d_col_data_dev.parents, SP, N, std::numeric_limits<weight_t>::max(),
+  kernel_dualUpdate_2<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
+    sp_min_v.data(),
+    d_vertices_dev.row_duals,
+    d_vertices_dev.col_duals,
+    d_vertices_dev.col_slacks,
+    d_vertices_dev.row_covers,
+    d_vertices_dev.col_covers,
+    d_row_data_dev.is_visited,
+    d_col_data_dev.parents,
+    SP,
+    N,
+    std::numeric_limits<weight_t>::max(),
     epsilon);
 
   CHECK_CUDA(handle.get_stream());
@@ -447,18 +514,19 @@ inline void dualUpdate(raft::handle_t const &handle,
 
 // Function for calculating optimal objective function value using dual variables.
 template <typename vertex_t, typename weight_t>
-inline void calcObjValDual(raft::handle_t const &handle, weight_t *d_obj_val,
-                           Vertices<vertex_t, weight_t> &d_vertices_dev, int SP,
-                           int N) {
+inline void calcObjValDual(raft::handle_t const& handle,
+                           weight_t* d_obj_val,
+                           Vertices<vertex_t, weight_t>& d_vertices_dev,
+                           int SP,
+                           int N)
+{
   dim3 blocks_per_grid;
   dim3 threads_per_block;
   int total_blocks = 0;
 
-  raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block,
-                                         total_blocks, SP);
+  raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP);
 
-  kernel_calcObjValDual<<<blocks_per_grid, threads_per_block, 0,
-                          handle.get_stream()>>>(
+  kernel_calcObjValDual<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
     d_obj_val, d_vertices_dev.row_duals, d_vertices_dev.col_duals, SP, N);
 
   CHECK_CUDA(handle.get_stream());
@@ -466,20 +534,21 @@ inline void calcObjValDual(raft::handle_t const &handle, weight_t *d_obj_val,
 
 // Function for calculating optimal objective function value using dual variables.
 template <typename vertex_t, typename weight_t>
-inline void calcObjValPrimal(raft::handle_t const &handle, weight_t *d_obj_val,
-                             weight_t const *d_costs,
-                             vertex_t const *d_row_assignments, int SP,
-                             vertex_t N) {
+inline void calcObjValPrimal(raft::handle_t const& handle,
+                             weight_t* d_obj_val,
+                             weight_t const* d_costs,
+                             vertex_t const* d_row_assignments,
+                             int SP,
+                             vertex_t N)
+{
   dim3 blocks_per_grid;
   dim3 threads_per_block;
   int total_blocks = 0;
 
-  raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block,
-                                         total_blocks, SP);
+  raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP);
 
-  kernel_calcObjValPrimal<<<blocks_per_grid, threads_per_block, 0,
-                            handle.get_stream()>>>(d_obj_val, d_costs,
-                                                   d_row_assignments, SP, N);
+  kernel_calcObjValPrimal<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
+    d_obj_val, d_costs, d_row_assignments, SP, N);
 
   CHECK_CUDA(handle.get_stream());
 }
diff --git a/cpp/include/raft/lap/lap_kernels.cuh b/cpp/include/raft/lap/lap_kernels.cuh
index 8c9012ed72..45ad23afd1 100644
--- a/cpp/include/raft/lap/lap_kernels.cuh
+++ b/cpp/include/raft/lap/lap_kernels.cuh
@@ -48,42 +48,57 @@ const int AUGMENT{4};
 const int MODIFIED{5};
 
 template <typename weight_t>
-bool __device__ near_zero(weight_t w, weight_t epsilon) {
+bool __device__ near_zero(weight_t w, weight_t epsilon)
+{
   return ((w > -epsilon) && (w < epsilon));
 }
 
 template <>
-bool __device__ near_zero<int32_t>(int32_t w, int32_t epsilon) {
+bool __device__ near_zero<int32_t>(int32_t w, int32_t epsilon)
+{
   return (w == 0);
 }
 
 template <>
-bool __device__ near_zero<int64_t>(int64_t w, int64_t epsilon) {
+bool __device__ near_zero<int64_t>(int64_t w, int64_t epsilon)
+{
   return (w == 0);
 }
 
-// Device function for traversing the neighbors from start pointer to end pointer and updating the covers.
-// The function sets d_next to 4 if there are uncovered zeros, indicating the requirement of Step 4 execution.
+// Device function for traversing the neighbors from start pointer to end pointer and updating the
+// covers. The function sets d_next to 4 if there are uncovered zeros, indicating the requirement of
+// Step 4 execution.
 template <typename vertex_t, typename weight_t>
-__device__ void cover_and_expand_row(
-  weight_t const *d_elements, weight_t const *d_row_duals,
-  weight_t const *d_col_duals, weight_t *d_col_slacks, int *d_row_covers,
-  int *d_col_covers, vertex_t const *d_col_assignments, bool *d_flag,
-  vertex_t *d_row_parents, vertex_t *d_col_parents, int *d_row_visited,
-  int *d_col_visited, vertex_t rowid, int spid, int colid, vertex_t N,
-  weight_t epsilon) {
+__device__ void cover_and_expand_row(weight_t const* d_elements,
+                                     weight_t const* d_row_duals,
+                                     weight_t const* d_col_duals,
+                                     weight_t* d_col_slacks,
+                                     int* d_row_covers,
+                                     int* d_col_covers,
+                                     vertex_t const* d_col_assignments,
+                                     bool* d_flag,
+                                     vertex_t* d_row_parents,
+                                     vertex_t* d_col_parents,
+                                     int* d_row_visited,
+                                     int* d_col_visited,
+                                     vertex_t rowid,
+                                     int spid,
+                                     int colid,
+                                     vertex_t N,
+                                     weight_t epsilon)
+{
   int ROWID = spid * N + rowid;
   int COLID = spid * N + colid;
 
-  weight_t slack = d_elements[spid * N * N + rowid * N + colid] -
-                   d_row_duals[ROWID] - d_col_duals[COLID];
+  weight_t slack =
+    d_elements[spid * N * N + rowid * N + colid] - d_row_duals[ROWID] - d_col_duals[COLID];
 
   int nxt_rowid = d_col_assignments[COLID];
   int NXT_ROWID = spid * N + nxt_rowid;
 
   if (rowid != nxt_rowid && d_col_covers[COLID] == 0) {
     if (slack < d_col_slacks[COLID]) {
-      d_col_slacks[COLID] = slack;
+      d_col_slacks[COLID]  = slack;
       d_col_parents[COLID] = ROWID;
     }
 
@@ -92,13 +107,12 @@ __device__ void cover_and_expand_row(
         d_row_parents[NXT_ROWID] = COLID;  // update parent info
 
         d_row_covers[NXT_ROWID] = 0;
-        d_col_covers[COLID] = 1;
+        d_col_covers[COLID]     = 1;
 
-        if (d_row_visited[NXT_ROWID] != VISITED)
-          d_row_visited[NXT_ROWID] = ACTIVE;
+        if (d_row_visited[NXT_ROWID] != VISITED) d_row_visited[NXT_ROWID] = ACTIVE;
       } else {
         d_col_visited[COLID] = REVERSE;
-        *d_flag = true;
+        *d_flag              = true;
       }
     }
   }
@@ -107,28 +121,34 @@ __device__ void cover_and_expand_row(
 
 // Device function for traversing an alternating path from unassigned row to unassigned column.
 template <typename vertex_t>
-__device__ void __reverse_traversal(
-  int *d_row_visited, vertex_t *d_row_children, vertex_t *d_col_children,
-  vertex_t const *d_row_parents, vertex_t const *d_col_parents, int cur_colid) {
+__device__ void __reverse_traversal(int* d_row_visited,
+                                    vertex_t* d_row_children,
+                                    vertex_t* d_col_children,
+                                    vertex_t const* d_row_parents,
+                                    vertex_t const* d_col_parents,
+                                    int cur_colid)
+{
   int cur_rowid = -1;
 
   while (cur_colid != -1) {
     d_col_children[cur_colid] = cur_rowid;
-    cur_rowid = d_col_parents[cur_colid];
+    cur_rowid                 = d_col_parents[cur_colid];
 
     d_row_children[cur_rowid] = cur_colid;
-    cur_colid = d_row_parents[cur_rowid];
+    cur_colid                 = d_row_parents[cur_rowid];
   }
   d_row_visited[cur_rowid] = AUGMENT;
 }
 
 // Device function for augmenting the alternating path from unassigned column to unassigned row.
 template <typename vertex_t>
-__device__ void __augment(vertex_t *d_row_assignments,
-                          vertex_t *d_col_assignments,
-                          vertex_t const *d_row_children,
-                          vertex_t const *d_col_children, vertex_t cur_rowid,
-                          vertex_t N) {
+__device__ void __augment(vertex_t* d_row_assignments,
+                          vertex_t* d_col_assignments,
+                          vertex_t const* d_row_children,
+                          vertex_t const* d_col_children,
+                          vertex_t cur_rowid,
+                          vertex_t N)
+{
   int cur_colid = -1;
 
   while (cur_rowid != -1) {
@@ -145,20 +165,18 @@ __device__ void __augment(vertex_t *d_row_assignments,
 //  FIXME:  Once cuda 10.2 is the standard should replace passing infinity
 //          here with using cuda::std::numeric_limits<weight_t>::max()
 template <typename vertex_t, typename weight_t>
-__global__ void kernel_rowReduction(weight_t const *d_costs,
-                                    weight_t *d_row_duals, int SP, vertex_t N,
-                                    weight_t infinity) {
-  int spid = blockIdx.y * blockDim.y + threadIdx.y;
-  int rowid = blockIdx.x * blockDim.x + threadIdx.x;
+__global__ void kernel_rowReduction(
+  weight_t const* d_costs, weight_t* d_row_duals, int SP, vertex_t N, weight_t infinity)
+{
+  int spid     = blockIdx.y * blockDim.y + threadIdx.y;
+  int rowid    = blockIdx.x * blockDim.x + threadIdx.x;
   weight_t min = infinity;
 
   if (spid < SP && rowid < N) {
     for (int colid = 0; colid < N; colid++) {
       weight_t slack = d_costs[spid * N * N + rowid * N + colid];
 
-      if (slack < min) {
-        min = slack;
-      }
+      if (slack < min) { min = slack; }
     }
 
     d_row_duals[spid * N + rowid] = min;
@@ -169,25 +187,26 @@ __global__ void kernel_rowReduction(weight_t const *d_costs,
 //  FIXME:  Once cuda 10.2 is the standard should replace passing infinity
 //          here with using cuda::std::numeric_limits<weight_t>::max()
 template <typename vertex_t, typename weight_t>
-__global__ void kernel_columnReduction(weight_t const *d_costs,
-                                       weight_t const *d_row_duals,
-                                       weight_t *d_col_duals, int SP,
-                                       vertex_t N, weight_t infinity) {
-  int spid = blockIdx.y * blockDim.y + threadIdx.y;
+__global__ void kernel_columnReduction(weight_t const* d_costs,
+                                       weight_t const* d_row_duals,
+                                       weight_t* d_col_duals,
+                                       int SP,
+                                       vertex_t N,
+                                       weight_t infinity)
+{
+  int spid  = blockIdx.y * blockDim.y + threadIdx.y;
   int colid = blockIdx.x * blockDim.x + threadIdx.x;
 
   weight_t min = infinity;
 
   if (spid < SP && colid < N) {
     for (int rowid = 0; rowid < N; rowid++) {
-      weight_t cost = d_costs[spid * N * N + rowid * N + colid];
+      weight_t cost     = d_costs[spid * N * N + rowid * N + colid];
       weight_t row_dual = d_row_duals[spid * N + rowid];
 
       weight_t slack = cost - row_dual;
 
-      if (slack < min) {
-        min = slack;
-      }
+      if (slack < min) { min = slack; }
     }
 
     d_col_duals[spid * N + colid] = min;
@@ -196,12 +215,18 @@ __global__ void kernel_columnReduction(weight_t const *d_costs,
 
 // Kernel for calculating initial assignments.
 template <typename vertex_t, typename weight_t>
-__global__ void kernel_computeInitialAssignments(
-  weight_t const *d_costs, weight_t const *d_row_duals,
-  weight_t const *d_col_duals, vertex_t *d_row_assignments,
-  vertex_t *d_col_assignments, int *d_row_lock, int *d_col_lock, int SP,
-  vertex_t N, weight_t epsilon) {
-  int spid = blockIdx.y * blockDim.y + threadIdx.y;
+__global__ void kernel_computeInitialAssignments(weight_t const* d_costs,
+                                                 weight_t const* d_row_duals,
+                                                 weight_t const* d_col_duals,
+                                                 vertex_t* d_row_assignments,
+                                                 vertex_t* d_col_assignments,
+                                                 int* d_row_lock,
+                                                 int* d_col_lock,
+                                                 int SP,
+                                                 vertex_t N,
+                                                 weight_t epsilon)
+{
+  int spid  = blockIdx.y * blockDim.y + threadIdx.y;
   int colid = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (spid < SP && colid < N) {
@@ -213,15 +238,15 @@ __global__ void kernel_computeInitialAssignments(
 
       if (d_col_lock[overall_colid] == 1) break;
 
-      weight_t cost = d_costs[spid * N * N + rowid * N + colid];
+      weight_t cost     = d_costs[spid * N * N + rowid * N + colid];
       weight_t row_dual = d_row_duals[overall_rowid];
-      weight_t slack = cost - row_dual - col_dual;
+      weight_t slack    = cost - row_dual - col_dual;
 
       if (near_zero(slack, epsilon)) {
         if (atomicCAS(&d_row_lock[overall_rowid], 0, 1) == 0) {
           d_row_assignments[overall_rowid] = colid;
           d_col_assignments[overall_colid] = rowid;
-          d_col_lock[overall_colid] = 1;
+          d_col_lock[overall_colid]        = 1;
         }
       }
     }
@@ -230,10 +255,10 @@ __global__ void kernel_computeInitialAssignments(
 
 // Kernel for populating the cover arrays and initializing alternating tree.
 template <typename vertex_t>
-__global__ void kernel_computeRowCovers(vertex_t *d_row_assignments,
-                                        int *d_row_covers, int *d_row_visited,
-                                        int SP, vertex_t N) {
-  int spid = blockIdx.y * blockDim.y + threadIdx.y;
+__global__ void kernel_computeRowCovers(
+  vertex_t* d_row_assignments, int* d_row_covers, int* d_row_visited, int SP, vertex_t N)
+{
+  int spid  = blockIdx.y * blockDim.y + threadIdx.y;
   int rowid = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (spid < SP && rowid < N) {
@@ -249,11 +274,10 @@ __global__ void kernel_computeRowCovers(vertex_t *d_row_assignments,
 
 // Kernel for populating the predicate matrix for edges in row major format.
 template <typename vertex_t>
-__global__ void kernel_rowPredicateConstructionCSR(bool *d_predicates,
-                                                   vertex_t *d_addresses,
-                                                   int *d_row_visited, int SP,
-                                                   vertex_t N) {
-  int spid = blockIdx.y * blockDim.y + threadIdx.y;
+__global__ void kernel_rowPredicateConstructionCSR(
+  bool* d_predicates, vertex_t* d_addresses, int* d_row_visited, int SP, vertex_t N)
+{
+  int spid  = blockIdx.y * blockDim.y + threadIdx.y;
   int rowid = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (spid < SP && rowid < N) {
@@ -261,130 +285,160 @@ __global__ void kernel_rowPredicateConstructionCSR(bool *d_predicates,
 
     if (d_row_visited[index] == ACTIVE) {
       d_predicates[index] = true;
-      d_addresses[index] = 1;
+      d_addresses[index]  = 1;
     } else {
       d_predicates[index] = false;
-      d_addresses[index] = 0;
+      d_addresses[index]  = 0;
     }
   }
 }
 
 // Kernel for scattering the edges based on the scatter addresses.
 template <typename vertex_t>
-__global__ void kernel_rowScatterCSR(bool const *d_predicates,
-                                     vertex_t const *d_addresses,
-                                     vertex_t *d_neighbors, vertex_t *d_ptrs,
-                                     vertex_t M, int SP, vertex_t N) {
-  int spid = blockIdx.y * blockDim.y + threadIdx.y;
+__global__ void kernel_rowScatterCSR(bool const* d_predicates,
+                                     vertex_t const* d_addresses,
+                                     vertex_t* d_neighbors,
+                                     vertex_t* d_ptrs,
+                                     vertex_t M,
+                                     int SP,
+                                     vertex_t N)
+{
+  int spid  = blockIdx.y * blockDim.y + threadIdx.y;
   int rowid = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (spid < SP && rowid < N) {
     int index = spid * N + rowid;
 
-    bool predicate = d_predicates[index];
+    bool predicate  = d_predicates[index];
     vertex_t compid = d_addresses[index];
 
-    if (predicate) {
-      d_neighbors[compid] = rowid;
-    }
+    if (predicate) { d_neighbors[compid] = rowid; }
     if (rowid == 0) {
       d_ptrs[spid] = compid;
-      d_ptrs[SP] = M;
+      d_ptrs[SP]   = M;
     }
   }
 }
 
 // Kernel for finding the minimum zero cover.
 template <typename vertex_t, typename weight_t>
-__global__ void kernel_coverAndExpand(bool *d_flag, vertex_t const *d_ptrs,
-                                      vertex_t const *d_neighbors,
-                                      weight_t const *d_elements,
+__global__ void kernel_coverAndExpand(bool* d_flag,
+                                      vertex_t const* d_ptrs,
+                                      vertex_t const* d_neighbors,
+                                      weight_t const* d_elements,
                                       Vertices<vertex_t, weight_t> d_vertices,
                                       VertexData<vertex_t> d_row_data,
-                                      VertexData<vertex_t> d_col_data, int SP,
-                                      vertex_t N, weight_t epsilon) {
-  int spid = blockIdx.y * blockDim.y + threadIdx.y;
+                                      VertexData<vertex_t> d_col_data,
+                                      int SP,
+                                      vertex_t N,
+                                      weight_t epsilon)
+{
+  int spid  = blockIdx.y * blockDim.y + threadIdx.y;
   int colid = blockIdx.x * blockDim.x + threadIdx.x;
 
   // Load values into local memory
 
   if (spid < SP && colid < N) {
     thrust::for_each(
-      thrust::seq, d_neighbors + d_ptrs[spid], d_neighbors + d_ptrs[spid + 1],
-      [d_elements, d_vertices, d_flag, d_row_data, d_col_data, spid, colid, N,
-       epsilon] __device__(vertex_t rowid) {
-        cover_and_expand_row(
-          d_elements, d_vertices.row_duals, d_vertices.col_duals,
-          d_vertices.col_slacks, d_vertices.row_covers, d_vertices.col_covers,
-          d_vertices.col_assignments, d_flag, d_row_data.parents,
-          d_col_data.parents, d_row_data.is_visited, d_col_data.is_visited,
-          rowid, spid, colid, N, epsilon);
+      thrust::seq,
+      d_neighbors + d_ptrs[spid],
+      d_neighbors + d_ptrs[spid + 1],
+      [d_elements, d_vertices, d_flag, d_row_data, d_col_data, spid, colid, N, epsilon] __device__(
+        vertex_t rowid) {
+        cover_and_expand_row(d_elements,
+                             d_vertices.row_duals,
+                             d_vertices.col_duals,
+                             d_vertices.col_slacks,
+                             d_vertices.row_covers,
+                             d_vertices.col_covers,
+                             d_vertices.col_assignments,
+                             d_flag,
+                             d_row_data.parents,
+                             d_col_data.parents,
+                             d_row_data.is_visited,
+                             d_col_data.is_visited,
+                             rowid,
+                             spid,
+                             colid,
+                             N,
+                             epsilon);
       });
   }
 }
 
 // Kernel for constructing the predicates for reverse pass or augmentation candidates.
 template <typename vertex_t>
-__global__ void kernel_augmentPredicateConstruction(bool *d_predicates,
-                                                    vertex_t *d_addresses,
-                                                    int *d_visited, int size) {
+__global__ void kernel_augmentPredicateConstruction(bool* d_predicates,
+                                                    vertex_t* d_addresses,
+                                                    int* d_visited,
+                                                    int size)
+{
   int id = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (id < size) {
     int visited = d_visited[id];
     if ((visited == REVERSE) || (visited == AUGMENT)) {
       d_predicates[id] = true;
-      d_addresses[id] = 1;
+      d_addresses[id]  = 1;
     } else {
       d_predicates[id] = false;
-      d_addresses[id] = 0;
+      d_addresses[id]  = 0;
     }
   }
 }
 
 // Kernel for scattering the vertices based on the scatter addresses.
 template <typename vertex_t>
-__global__ void kernel_augmentScatter(vertex_t *d_elements,
-                                      bool const *d_predicates,
-                                      vertex_t const *d_addresses,
-                                      std::size_t size) {
+__global__ void kernel_augmentScatter(vertex_t* d_elements,
+                                      bool const* d_predicates,
+                                      vertex_t const* d_addresses,
+                                      std::size_t size)
+{
   int id = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (id < size) {
-    if (d_predicates[id]) {
-      d_elements[d_addresses[id]] = id;
-    }
+    if (d_predicates[id]) { d_elements[d_addresses[id]] = id; }
   }
 }
 
 // Kernel for executing the reverse pass of the maximum matching algorithm.
 template <typename vertex_t>
-__global__ void kernel_reverseTraversal(vertex_t *d_elements,
+__global__ void kernel_reverseTraversal(vertex_t* d_elements,
                                         VertexData<vertex_t> d_row_data,
                                         VertexData<vertex_t> d_col_data,
-                                        int size) {
+                                        int size)
+{
   int id = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (id < size) {
-    __reverse_traversal(d_row_data.is_visited, d_row_data.children,
-                        d_col_data.children, d_row_data.parents,
-                        d_col_data.parents, d_elements[id]);
+    __reverse_traversal(d_row_data.is_visited,
+                        d_row_data.children,
+                        d_col_data.children,
+                        d_row_data.parents,
+                        d_col_data.parents,
+                        d_elements[id]);
   }
 }
 
 // Kernel for executing the augmentation pass of the maximum matching algorithm.
 template <typename vertex_t>
-__global__ void kernel_augmentation(vertex_t *d_row_assignments,
-                                    vertex_t *d_col_assignments,
-                                    vertex_t const *d_row_elements,
+__global__ void kernel_augmentation(vertex_t* d_row_assignments,
+                                    vertex_t* d_col_assignments,
+                                    vertex_t const* d_row_elements,
                                     VertexData<vertex_t> d_row_data,
-                                    VertexData<vertex_t> d_col_data, vertex_t N,
-                                    vertex_t size) {
+                                    VertexData<vertex_t> d_col_data,
+                                    vertex_t N,
+                                    vertex_t size)
+{
   int id = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (id < size) {
-    __augment(d_row_assignments, d_col_assignments, d_row_data.children,
-              d_col_data.children, d_row_elements[id], N);
+    __augment(d_row_assignments,
+              d_col_assignments,
+              d_row_data.children,
+              d_col_data.children,
+              d_row_elements[id],
+              N);
   }
 }
 
@@ -392,18 +446,21 @@ __global__ void kernel_augmentation(vertex_t *d_row_assignments,
 //  FIXME:  Once cuda 10.2 is the standard should replace passing infinity
 //          here with using cuda::std::numeric_limits<weight_t>::max()
 template <typename vertex_t, typename weight_t>
-__global__ void kernel_dualUpdate_1(weight_t *d_sp_min,
-                                    weight_t const *d_col_slacks,
-                                    int const *d_col_covers, int SP, vertex_t N,
-                                    weight_t infinity) {
+__global__ void kernel_dualUpdate_1(weight_t* d_sp_min,
+                                    weight_t const* d_col_slacks,
+                                    int const* d_col_covers,
+                                    int SP,
+                                    vertex_t N,
+                                    weight_t infinity)
+{
   int spid = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (spid < SP) {
     weight_t min = infinity;
     for (int colid = 0; colid < N; colid++) {
-      int index = spid * N + colid;
+      int index      = spid * N + colid;
       weight_t slack = d_col_slacks[index];
-      int col_cover = d_col_covers[index];
+      int col_cover  = d_col_covers[index];
 
       if (col_cover == 0)
         if (slack < min) min = slack;
@@ -417,21 +474,29 @@ __global__ void kernel_dualUpdate_1(weight_t *d_sp_min,
 //  FIXME:  Once cuda 10.2 is the standard should replace passing infinity
 //          here with using cuda::std::numeric_limits<weight_t>::max()
 template <typename vertex_t, typename weight_t>
-__global__ void kernel_dualUpdate_2(
-  weight_t const *d_sp_min, weight_t *d_row_duals, weight_t *d_col_duals,
-  weight_t *d_col_slacks, int const *d_row_covers, int const *d_col_covers,
-  int *d_row_visited, vertex_t *d_col_parents, int SP, vertex_t N,
-  weight_t infinity, weight_t epsilon) {
+__global__ void kernel_dualUpdate_2(weight_t const* d_sp_min,
+                                    weight_t* d_row_duals,
+                                    weight_t* d_col_duals,
+                                    weight_t* d_col_slacks,
+                                    int const* d_row_covers,
+                                    int const* d_col_covers,
+                                    int* d_row_visited,
+                                    vertex_t* d_col_parents,
+                                    int SP,
+                                    vertex_t N,
+                                    weight_t infinity,
+                                    weight_t epsilon)
+{
   int spid = blockIdx.y * blockDim.y + threadIdx.y;
-  int id = blockIdx.x * blockDim.x + threadIdx.x;
+  int id   = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (spid < SP && id < N) {
     int index = spid * N + id;
 
     if (d_sp_min[spid] < infinity) {
       weight_t theta = d_sp_min[spid];
-      int row_cover = d_row_covers[index];
-      int col_cover = d_col_covers[index];
+      int row_cover  = d_row_covers[index];
+      int col_cover  = d_col_covers[index];
 
       if (row_cover == 0)  // Row vertex is reachable from source.
         d_row_duals[index] += theta;
@@ -453,10 +518,12 @@ __global__ void kernel_dualUpdate_2(
 
 // Kernel for calculating optimal objective function value using dual variables.
 template <typename vertex_t, typename weight_t>
-__global__ void kernel_calcObjValDual(weight_t *d_obj_val_dual,
-                                      weight_t const *d_row_duals,
-                                      weight_t const *d_col_duals, int SP,
-                                      vertex_t N) {
+__global__ void kernel_calcObjValDual(weight_t* d_obj_val_dual,
+                                      weight_t const* d_row_duals,
+                                      weight_t const* d_col_duals,
+                                      int SP,
+                                      vertex_t N)
+{
   int spid = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (spid < SP) {
@@ -471,10 +538,12 @@ __global__ void kernel_calcObjValDual(weight_t *d_obj_val_dual,
 
 // Kernel for calculating optimal objective function value using dual variables.
 template <typename vertex_t, typename weight_t>
-__global__ void kernel_calcObjValPrimal(weight_t *d_obj_val_primal,
-                                        weight_t const *d_costs,
-                                        vertex_t const *d_row_assignments,
-                                        int SP, vertex_t N) {
+__global__ void kernel_calcObjValPrimal(weight_t* d_obj_val_primal,
+                                        weight_t const* d_costs,
+                                        vertex_t const* d_row_assignments,
+                                        int SP,
+                                        vertex_t N)
+{
   int spid = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (spid < SP) {
diff --git a/cpp/include/raft/linalg/add.cuh b/cpp/include/raft/linalg/add.cuh
index 7a454f64e2..11d3174951 100644
--- a/cpp/include/raft/linalg/add.cuh
+++ b/cpp/include/raft/linalg/add.cuh
@@ -37,8 +37,8 @@ namespace linalg {
  * @param stream cuda stream where to launch work
  */
 template <typename InT, typename OutT = InT, typename IdxType = int>
-void addScalar(OutT *out, const InT *in, InT scalar, IdxType len,
-               cudaStream_t stream) {
+void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream)
+{
   auto op = [scalar] __device__(InT in) { return OutT(in + scalar); };
   unaryOp<InT, decltype(op), IdxType, OutT>(out, in, len, op, stream);
 }
@@ -57,23 +57,24 @@ void addScalar(OutT *out, const InT *in, InT scalar, IdxType len,
  * @param stream cuda stream where to launch work
  */
 template <typename InT, typename OutT = InT, typename IdxType = int>
-void add(OutT *out, const InT *in1, const InT *in2, IdxType len,
-         cudaStream_t stream) {
+void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream)
+{
   auto op = [] __device__(InT a, InT b) { return OutT(a + b); };
   binaryOp<InT, decltype(op), OutT, IdxType>(out, in1, in2, len, op, stream);
 }
 
 template <class math_t, typename IdxType>
-__global__ void add_dev_scalar_kernel(math_t *outDev, const math_t *inDev,
-                                      const math_t *singleScalarDev,
-                                      IdxType len) {
+__global__ void add_dev_scalar_kernel(math_t* outDev,
+                                      const math_t* inDev,
+                                      const math_t* singleScalarDev,
+                                      IdxType len)
+{
   IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x;
-  if (i < len) {
-    outDev[i] = inDev[i] + *singleScalarDev;
-  }
+  if (i < len) { outDev[i] = inDev[i] + *singleScalarDev; }
 }
 
-/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and write result to outDev[i]
+/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and
+ * write result to outDev[i]
  * @tparam math_t data-type upon which the math operation will be performed
  * @tparam IdxType Integer type used to for addressing
  * @param outDev the output buffer
@@ -83,14 +84,16 @@ __global__ void add_dev_scalar_kernel(math_t *outDev, const math_t *inDev,
  * @param stream cuda stream
  */
 template <typename math_t, typename IdxType = int>
-void addDevScalar(math_t *outDev, const math_t *inDev,
-                  const math_t *singleScalarDev, IdxType len,
-                  cudaStream_t stream) {
+void addDevScalar(math_t* outDev,
+                  const math_t* inDev,
+                  const math_t* singleScalarDev,
+                  IdxType len,
+                  cudaStream_t stream)
+{
   // TODO: block dimension has not been tuned
   dim3 block(256);
   dim3 grid(raft::ceildiv(len, (IdxType)block.x));
-  add_dev_scalar_kernel<math_t>
-    <<<grid, block, 0, stream>>>(outDev, inDev, singleScalarDev, len);
+  add_dev_scalar_kernel<math_t><<<grid, block, 0, stream>>>(outDev, inDev, singleScalarDev, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
diff --git a/cpp/include/raft/linalg/binary_op.cuh b/cpp/include/raft/linalg/binary_op.cuh
index 940d786e87..a49a433941 100644
--- a/cpp/include/raft/linalg/binary_op.cuh
+++ b/cpp/include/raft/linalg/binary_op.cuh
@@ -22,10 +22,10 @@
 namespace raft {
 namespace linalg {
 
-template <typename InType, int VecLen, typename Lambda, typename IdxType,
-          typename OutType>
-__global__ void binaryOpKernel(OutType *out, const InType *in1,
-                               const InType *in2, IdxType len, Lambda op) {
+template <typename InType, int VecLen, typename Lambda, typename IdxType, typename OutType>
+__global__ void binaryOpKernel(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op)
+{
   typedef TxN_t<InType, VecLen> InVecType;
   typedef TxN_t<OutType, VecLen> OutVecType;
   InVecType a, b;
@@ -42,12 +42,11 @@ __global__ void binaryOpKernel(OutType *out, const InType *in1,
   c.store(out, idx);
 }
 
-template <typename InType, int VecLen, typename Lambda, typename IdxType,
-          typename OutType, int TPB>
-void binaryOpImpl(OutType *out, const InType *in1, const InType *in2,
-                  IdxType len, Lambda op, cudaStream_t stream) {
-  const IdxType nblks =
-    raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB);
+template <typename InType, int VecLen, typename Lambda, typename IdxType, typename OutType, int TPB>
+void binaryOpImpl(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream)
+{
+  const IdxType nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB);
   binaryOpKernel<InType, VecLen, Lambda, IdxType, OutType>
     <<<nblks, TPB, 0, stream>>>(out, in1, in2, len, op);
   CUDA_CHECK(cudaPeekAtLastError());
@@ -56,8 +55,8 @@ void binaryOpImpl(OutType *out, const InType *in1, const InType *in2,
 /**
  * @brief Checks if addresses are aligned on N bytes
  */
-inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3,
-                           uint64_t N) {
+inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, uint64_t N)
+{
   return addr1 % N == 0 && addr2 % N == 0 && addr3 % N == 0;
 }
 
@@ -77,38 +76,36 @@ inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3,
  * @note Lambda must be a functor with the following signature:
  *       `OutType func(const InType& val1, const InType& val2);`
  */
-template <typename InType, typename Lambda, typename OutType = InType,
-          typename IdxType = int, int TPB = 256>
-void binaryOp(OutType *out, const InType *in1, const InType *in2, IdxType len,
-              Lambda op, cudaStream_t stream) {
-  constexpr auto maxSize =
-    sizeof(InType) > sizeof(OutType) ? sizeof(InType) : sizeof(OutType);
-  size_t bytes = len * maxSize;
-  uint64_t in1Addr = uint64_t(in1);
-  uint64_t in2Addr = uint64_t(in2);
-  uint64_t outAddr = uint64_t(out);
-  if (16 / maxSize && bytes % 16 == 0 &&
-      addressAligned(in1Addr, in2Addr, outAddr, 16)) {
+template <typename InType,
+          typename Lambda,
+          typename OutType = InType,
+          typename IdxType = int,
+          int TPB          = 256>
+void binaryOp(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream)
+{
+  constexpr auto maxSize = sizeof(InType) > sizeof(OutType) ? sizeof(InType) : sizeof(OutType);
+  size_t bytes           = len * maxSize;
+  uint64_t in1Addr       = uint64_t(in1);
+  uint64_t in2Addr       = uint64_t(in2);
+  uint64_t outAddr       = uint64_t(out);
+  if (16 / maxSize && bytes % 16 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 16)) {
     binaryOpImpl<InType, 16 / maxSize, Lambda, IdxType, OutType, TPB>(
       out, in1, in2, len, op, stream);
-  } else if (8 / maxSize && bytes % 8 == 0 &&
-             addressAligned(in1Addr, in2Addr, outAddr, 8)) {
+  } else if (8 / maxSize && bytes % 8 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 8)) {
     binaryOpImpl<InType, 8 / maxSize, Lambda, IdxType, OutType, TPB>(
       out, in1, in2, len, op, stream);
-  } else if (4 / maxSize && bytes % 4 == 0 &&
-             addressAligned(in1Addr, in2Addr, outAddr, 4)) {
+  } else if (4 / maxSize && bytes % 4 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 4)) {
     binaryOpImpl<InType, 4 / maxSize, Lambda, IdxType, OutType, TPB>(
       out, in1, in2, len, op, stream);
-  } else if (2 / maxSize && bytes % 2 == 0 &&
-             addressAligned(in1Addr, in2Addr, outAddr, 2)) {
+  } else if (2 / maxSize && bytes % 2 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 2)) {
     binaryOpImpl<InType, 2 / maxSize, Lambda, IdxType, OutType, TPB>(
       out, in1, in2, len, op, stream);
   } else if (1 / maxSize) {
     binaryOpImpl<InType, 1 / maxSize, Lambda, IdxType, OutType, TPB>(
       out, in1, in2, len, op, stream);
   } else {
-    binaryOpImpl<InType, 1, Lambda, IdxType, OutType, TPB>(out, in1, in2, len,
-                                                           op, stream);
+    binaryOpImpl<InType, 1, Lambda, IdxType, OutType, TPB>(out, in1, in2, len, op, stream);
   }
 }
 
diff --git a/cpp/include/raft/linalg/cholesky_r1_update.cuh b/cpp/include/raft/linalg/cholesky_r1_update.cuh
index b5a93c4953..b129fe4758 100644
--- a/cpp/include/raft/linalg/cholesky_r1_update.cuh
+++ b/cpp/include/raft/linalg/cholesky_r1_update.cuh
@@ -122,9 +122,16 @@ namespace linalg {
  *    conditioned systems. Negative values mean no regularizaton.
  */
 template <typename math_t>
-void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld,
-                         void *workspace, int *n_bytes, cublasFillMode_t uplo,
-                         cudaStream_t stream, math_t eps = -1) {
+void choleskyRank1Update(const raft::handle_t& handle,
+                         math_t* L,
+                         int n,
+                         int ld,
+                         void* workspace,
+                         int* n_bytes,
+                         cublasFillMode_t uplo,
+                         cudaStream_t stream,
+                         math_t eps = -1)
+{
   // The matrix A' is defined as:
   // A' = [[A_11, A_12]
   //       [A_21, A_22]]
@@ -144,18 +151,17 @@ void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld,
   // We need a workspace in device memory to store a scalar. Additionally, in
   // CUBLAS_FILL_MODE_LOWER we need space for n-1 floats.
   const int align = 256;
-  int offset = (uplo == CUBLAS_FILL_MODE_LOWER)
-                 ? raft::alignTo<int>(sizeof(math_t) * (n - 1), align)
-                 : 0;
+  int offset =
+    (uplo == CUBLAS_FILL_MODE_LOWER) ? raft::alignTo<int>(sizeof(math_t) * (n - 1), align) : 0;
   if (workspace == nullptr) {
     *n_bytes = offset + 1 * sizeof(math_t);
     return;
   }
-  math_t *s = reinterpret_cast<math_t *>(((char *)workspace) + offset);
-  math_t *L_22 = L + (n - 1) * ld + n - 1;
+  math_t* s    = reinterpret_cast<math_t*>(((char*)workspace) + offset);
+  math_t* L_22 = L + (n - 1) * ld + n - 1;
 
-  math_t *A_new;
-  math_t *A_row;
+  math_t* A_new;
+  math_t* A_row;
   if (uplo == CUBLAS_FILL_MODE_UPPER) {
     // A_new is stored as the n-1 th column of L
     A_new = L + (n - 1) * ld;
@@ -164,27 +170,36 @@ void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld,
     // as the n-th row of L. Since the matrix is column major, this is non
     // contiguous. We copy elements from A_row to a contiguous workspace A_new.
     A_row = L + n - 1;
-    A_new = reinterpret_cast<math_t *>(workspace);
-    CUBLAS_CHECK(raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1,
-                                          A_row, ld, A_new, 1, stream));
+    A_new = reinterpret_cast<math_t*>(workspace);
+    CUBLAS_CHECK(
+      raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_row, ld, A_new, 1, stream));
   }
-  cublasOperation_t op =
-    (uplo == CUBLAS_FILL_MODE_UPPER) ? CUBLAS_OP_T : CUBLAS_OP_N;
+  cublasOperation_t op = (uplo == CUBLAS_FILL_MODE_UPPER) ? CUBLAS_OP_T : CUBLAS_OP_N;
   if (n > 1) {
     // Calculate L_12 = x by solving equation L_11 x = A_12
     math_t alpha = 1;
-    CUBLAS_CHECK(raft::linalg::cublastrsm(
-      handle.get_cublas_handle(), CUBLAS_SIDE_LEFT, uplo, op,
-      CUBLAS_DIAG_NON_UNIT, n - 1, 1, &alpha, L, ld, A_new, n - 1, stream));
+    CUBLAS_CHECK(raft::linalg::cublastrsm(handle.get_cublas_handle(),
+                                          CUBLAS_SIDE_LEFT,
+                                          uplo,
+                                          op,
+                                          CUBLAS_DIAG_NON_UNIT,
+                                          n - 1,
+                                          1,
+                                          &alpha,
+                                          L,
+                                          ld,
+                                          A_new,
+                                          n - 1,
+                                          stream));
 
     // A_new now stores L_12, we calculate s = L_12 * L_12
-    CUBLAS_CHECK(raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1,
-                                         A_new, 1, A_new, 1, s, stream));
+    CUBLAS_CHECK(
+      raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1, A_new, 1, A_new, 1, s, stream));
 
     if (uplo == CUBLAS_FILL_MODE_LOWER) {
       // Copy back the L_12 elements as the n-th row of L
-      CUBLAS_CHECK(raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1,
-                                            A_new, 1, A_row, ld, stream));
+      CUBLAS_CHECK(
+        raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_new, 1, A_row, ld, stream));
     }
   } else {  // n == 1 case
     CUDA_CHECK(cudaMemsetAsync(s, 0, sizeof(math_t), stream));
@@ -202,9 +217,7 @@ void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld,
   // the system is very ill conditioned then the A_22 - L_12 * L_12 can be
   // negative, which would result L_22 = NaN. A small positive eps parameter
   // can be used to prevent this.
-  if (eps >= 0 && (std::isnan(L_22_host) || L_22_host < eps)) {
-    L_22_host = eps;
-  }
+  if (eps >= 0 && (std::isnan(L_22_host) || L_22_host < eps)) { L_22_host = eps; }
   ASSERT(!std::isnan(L_22_host), "Error during Cholesky rank one update");
   raft::update_device(L_22, &L_22_host, 1, stream);
 }
diff --git a/cpp/include/raft/linalg/coalesced_reduction.cuh b/cpp/include/raft/linalg/coalesced_reduction.cuh
index ef983ff3d0..7e0744f98a 100644
--- a/cpp/include/raft/linalg/coalesced_reduction.cuh
+++ b/cpp/include/raft/linalg/coalesced_reduction.cuh
@@ -26,18 +26,27 @@ namespace linalg {
 // of the matrix, i.e. reduce along rows for row major or reduce along columns
 // for column major layout. Kernel does an inplace reduction adding to original
 // values of dots.
-template <typename InType, typename OutType, typename IdxType, int TPB,
-          typename MainLambda, typename ReduceLambda, typename FinalLambda>
-__global__ void coalescedReductionKernel(OutType *dots, const InType *data,
-                                         int D, int N, OutType init,
+template <typename InType,
+          typename OutType,
+          typename IdxType,
+          int TPB,
+          typename MainLambda,
+          typename ReduceLambda,
+          typename FinalLambda>
+__global__ void coalescedReductionKernel(OutType* dots,
+                                         const InType* data,
+                                         int D,
+                                         int N,
+                                         OutType init,
                                          MainLambda main_op,
                                          ReduceLambda reduce_op,
                                          FinalLambda final_op,
-                                         bool inplace = false) {
+                                         bool inplace = false)
+{
   typedef cub::BlockReduce<OutType, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   OutType thread_data = init;
-  IdxType rowStart = blockIdx.x * D;
+  IdxType rowStart    = blockIdx.x * D;
   for (IdxType i = threadIdx.x; i < D; i += TPB) {
     IdxType idx = rowStart + i;
     thread_data = reduce_op(thread_data, main_op(data[idx], i));
@@ -79,33 +88,37 @@ __global__ void coalescedReductionKernel(OutType *dots, const InType *data,
  * @param inplace reduction result added inplace or overwrites old values?
  * @param stream cuda stream where to launch work
  */
-template <typename InType, typename OutType = InType, typename IdxType = int,
-          typename MainLambda = raft::Nop<InType, IdxType>,
+template <typename InType,
+          typename OutType      = InType,
+          typename IdxType      = int,
+          typename MainLambda   = raft::Nop<InType, IdxType>,
           typename ReduceLambda = raft::Sum<OutType>,
-          typename FinalLambda = raft::Nop<OutType>>
-void coalescedReduction(OutType *dots, const InType *data, int D, int N,
-                        OutType init, cudaStream_t stream, bool inplace = false,
-                        MainLambda main_op = raft::Nop<InType, IdxType>(),
+          typename FinalLambda  = raft::Nop<OutType>>
+void coalescedReduction(OutType* dots,
+                        const InType* data,
+                        int D,
+                        int N,
+                        OutType init,
+                        cudaStream_t stream,
+                        bool inplace           = false,
+                        MainLambda main_op     = raft::Nop<InType, IdxType>(),
                         ReduceLambda reduce_op = raft::Sum<OutType>(),
-                        FinalLambda final_op = raft::Nop<OutType>()) {
+                        FinalLambda final_op   = raft::Nop<OutType>())
+{
   // One block per reduction
   // Efficient only for large leading dimensions
   if (D <= 32) {
     coalescedReductionKernel<InType, OutType, IdxType, 32>
-      <<<N, 32, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op,
-                             final_op, inplace);
+      <<<N, 32, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace);
   } else if (D <= 64) {
     coalescedReductionKernel<InType, OutType, IdxType, 64>
-      <<<N, 64, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op,
-                             final_op, inplace);
+      <<<N, 64, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace);
   } else if (D <= 128) {
     coalescedReductionKernel<InType, OutType, IdxType, 128>
-      <<<N, 128, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op,
-                              final_op, inplace);
+      <<<N, 128, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace);
   } else {
     coalescedReductionKernel<InType, OutType, IdxType, 256>
-      <<<N, 256, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op,
-                              final_op, inplace);
+      <<<N, 256, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace);
   }
   CUDA_CHECK(cudaPeekAtLastError());
 }
diff --git a/cpp/include/raft/linalg/contractions.cuh b/cpp/include/raft/linalg/contractions.cuh
index aa711a9140..35d9d96ea4 100644
--- a/cpp/include/raft/linalg/contractions.cuh
+++ b/cpp/include/raft/linalg/contractions.cuh
@@ -55,8 +55,7 @@ namespace linalg {
  *                 thread block. This also determines the number of threads per
  *                 thread block
  */
-template <typename DataT, int _veclen, int _kblk, int _rpt, int _cpt, int _tr,
-          int _tc>
+template <typename DataT, int _veclen, int _kblk, int _rpt, int _cpt, int _tr, int _tc>
 struct KernelPolicy {
   enum {
     /** number of elements along K worked upon per main loop iteration */
@@ -101,8 +100,7 @@ struct KernelPolicy {
 
 };  // struct KernelPolicy
 
-template <typename DataT, int _veclen, int _kblk, int _rpt, int _cpt, int _tr,
-          int _tc>
+template <typename DataT, int _veclen, int _kblk, int _rpt, int _cpt, int _tr, int _tc>
 struct ColKernelPolicy {
   enum {
     /** number of elements along K worked upon per main loop iteration */
@@ -151,7 +149,8 @@ struct ColKernelPolicy {
  * @{
  */
 template <typename DataT, int _veclen>
-struct Policy4x4 {};
+struct Policy4x4 {
+};
 
 template <int _veclen>
 struct Policy4x4<float, _veclen> {
@@ -180,8 +179,7 @@ struct Policy4x4<double, _veclen> {
  * @tparam Policy policy used to customize memory access behavior.
  *                See documentation for `KernelPolicy` to know more.
  */
-template <typename DataT, typename IdxT, typename Policy,
-          bool isRowMajor = true>
+template <typename DataT, typename IdxT, typename Policy, bool isRowMajor = true>
 struct Contractions_NT {
  protected:
   typedef Policy P;
@@ -247,8 +245,7 @@ struct Contractions_NT {
    * @param[in] _k number of cols of X and Y
    * @param[in] _smem shared memory region used during computations
    */
-  DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n,
-                     IdxT _k, char* _smem)
+  DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, IdxT _k, char* _smem)
     : m(_m),
       n(_n),
       k(_k),
@@ -265,7 +262,9 @@ struct Contractions_NT {
       sx((DataT*)_smem),
       sy(&(sx[P::SmemPageX])),
       pageWr(0),
-      pageRd(0) {}
+      pageRd(0)
+  {
+  }
 
   /**
    * @brief Ctor
@@ -276,8 +275,15 @@ struct Contractions_NT {
    * @param[in] _k number of cols of X and Y
    * @param[in] _smem shared memory region used during computations
    */
-  DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n,
-                     IdxT _k, IdxT _lda, IdxT _ldb, IdxT _ldd, char* _smem)
+  DI Contractions_NT(const DataT* _x,
+                     const DataT* _y,
+                     IdxT _m,
+                     IdxT _n,
+                     IdxT _k,
+                     IdxT _lda,
+                     IdxT _ldb,
+                     IdxT _ldd,
+                     char* _smem)
     : m(_m),
       n(_n),
       k(_k),
@@ -291,17 +297,18 @@ struct Contractions_NT {
       sx((DataT*)_smem),
       sy(&(sx[P::SmemPageX])),
       pageWr(0),
-      pageRd(0) {
+      pageRd(0)
+  {
     if (isRowMajor) {
       xrowid = IdxT(blockIdx.y) * P::Mblk + srowid;
       yrowid = IdxT(blockIdx.x) * P::Nblk + srowid;
-      x = _x + xrowid * lda;
-      y = _y + yrowid * ldb;
+      x      = _x + xrowid * lda;
+      y      = _y + yrowid * ldb;
     } else {
       xrowid = IdxT(blockIdx.y) * P::Mblk;
       yrowid = IdxT(blockIdx.x) * P::Nblk;
-      x = _x + xrowid + srowid * lda;
-      y = _y + yrowid + srowid * ldb;
+      x      = _x + xrowid + srowid * lda;
+      y      = _y + yrowid + srowid * ldb;
     }
   }
 
@@ -310,7 +317,8 @@ struct Contractions_NT {
    * @brief Load current block of X/Y from global memory to registers
    * @param[in] kidx current start index of k to be loaded
    */
-  DI void ldgXY(IdxT kidx) {
+  DI void ldgXY(IdxT kidx)
+  {
     ldgX(kidx);
     ldgY(kidx);
   }
@@ -319,7 +327,8 @@ struct Contractions_NT {
    * @brief Store current block of X/Y from registers to smem
    * @param[in] kidx current start index of k to be loaded
    */
-  DI void stsXY() {
+  DI void stsXY()
+  {
     stsX(sx + pageWr * P::SmemPage);
     stsY(sy + pageWr * P::SmemPage);
   }
@@ -328,13 +337,15 @@ struct Contractions_NT {
    * @brief Load X and Y block from shared memory to registers
    * @param[in] kidx k value from the current k-block to be loaded from smem
    */
-  DI void ldsXY(int kidx) {
+  DI void ldsXY(int kidx)
+  {
     ldsX(kidx, sx + pageRd * P::SmemPage);
     ldsY(kidx, sy + pageRd * P::SmemPage);
   }
 
  private:
-  DI void ldgX(IdxT kidx) {
+  DI void ldgX(IdxT kidx)
+  {
     if (isRowMajor) {
       auto numRows = m;
       auto koffset = kidx + scolid;
@@ -351,11 +362,10 @@ struct Contractions_NT {
       }
     } else {
       const auto numRows = k;
-      auto koffset = scolid;
+      auto koffset       = scolid;
 #pragma unroll
       for (int i = 0; i < P::LdgPerThX; ++i) {
-        if ((koffset + xrowid) < lda &&
-            (srowid + kidx + i * P::LdgRowsX) < numRows) {
+        if ((koffset + xrowid) < lda && (srowid + kidx + i * P::LdgRowsX) < numRows) {
           ldg(ldgDataX[i], x + (kidx + i * P::LdgRowsX) * lda + koffset);
         } else {
 #pragma unroll
@@ -367,7 +377,8 @@ struct Contractions_NT {
     }
   }
 
-  DI void ldgY(IdxT kidx) {
+  DI void ldgY(IdxT kidx)
+  {
     if (isRowMajor) {
       auto numRows = n;
       auto koffset = kidx + scolid;
@@ -387,8 +398,7 @@ struct Contractions_NT {
       auto koffset = scolid;
 #pragma unroll
       for (int i = 0; i < P::LdgPerThY; ++i) {
-        if ((koffset + yrowid) < ldb &&
-            (srowid + kidx + i * P::LdgRowsY) < numRows) {
+        if ((koffset + yrowid) < ldb && (srowid + kidx + i * P::LdgRowsY) < numRows) {
           ldg(ldgDataY[i], y + (kidx + i * P::LdgRowsY) * ldb + koffset);
         } else {
 #pragma unroll
@@ -400,7 +410,8 @@ struct Contractions_NT {
     }
   }
 
-  DI void stsX(DataT* smem) {
+  DI void stsX(DataT* smem)
+  {
     auto* saddr = smem + srowid * P::SmemStride + scolid;
 #pragma unroll
     for (int i = 0; i < P::LdgPerThX; ++i) {
@@ -408,7 +419,8 @@ struct Contractions_NT {
     }
   }
 
-  DI void stsY(DataT* smem) {
+  DI void stsY(DataT* smem)
+  {
     auto* saddr = smem + srowid * P::SmemStride + scolid;
 #pragma unroll
     for (int i = 0; i < P::LdgPerThY; ++i) {
@@ -416,7 +428,8 @@ struct Contractions_NT {
     }
   }
 
-  DI void ldsX(int kidx, DataT* smem) {
+  DI void ldsX(int kidx, DataT* smem)
+  {
     if (isRowMajor) {
       auto* saddr = smem + accrowid * P::SmemStride + kidx;
 #pragma unroll
@@ -435,7 +448,8 @@ struct Contractions_NT {
     }
   }
 
-  DI void ldsY(int kidx, DataT* smem) {
+  DI void ldsY(int kidx, DataT* smem)
+  {
     if (isRowMajor) {
       auto* saddr = smem + acccolid * P::SmemStride + kidx;
 #pragma unroll
diff --git a/cpp/include/raft/linalg/cublas_wrappers.h b/cpp/include/raft/linalg/cublas_wrappers.h
index 7c79e6c91d..2d18691410 100644
--- a/cpp/include/raft/linalg/cublas_wrappers.h
+++ b/cpp/include/raft/linalg/cublas_wrappers.h
@@ -25,8 +25,7 @@
 #include <cstdint>
 
 #define _CUBLAS_ERR_TO_STR(err) \
-  case err:                     \
-    return #err
+  case err: return #err
 
 namespace raft {
 
@@ -34,15 +33,15 @@ namespace raft {
  * @brief Exception thrown when a cuBLAS error is encountered.
  */
 struct cublas_error : public raft::exception {
-  explicit cublas_error(char const *const message) : raft::exception(message) {}
-  explicit cublas_error(std::string const &message)
-    : raft::exception(message) {}
+  explicit cublas_error(char const* const message) : raft::exception(message) {}
+  explicit cublas_error(std::string const& message) : raft::exception(message) {}
 };
 
 namespace linalg {
 namespace detail {
 
-inline const char *cublas_error_to_string(cublasStatus_t err) {
+inline const char* cublas_error_to_string(cublasStatus_t err)
+{
   switch (err) {
     _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_SUCCESS);
     _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED);
@@ -54,8 +53,7 @@ inline const char *cublas_error_to_string(cublasStatus_t err) {
     _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR);
     _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED);
     _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_LICENSE_ERROR);
-    default:
-      return "CUBLAS_STATUS_UNKNOWN";
+    default: return "CUBLAS_STATUS_UNKNOWN";
   };
 }
 
@@ -71,16 +69,19 @@ inline const char *cublas_error_to_string(cublasStatus_t err) {
  * Invokes a cuBLAS runtime API function call, if the call does not return
  * CUBLAS_STATUS_SUCCESS, throws an exception detailing the cuBLAS error that occurred
  */
-#define CUBLAS_TRY(call)                                                      \
-  do {                                                                        \
-    cublasStatus_t const status = (call);                                     \
-    if (CUBLAS_STATUS_SUCCESS != status) {                                    \
-      std::string msg{};                                                      \
-      SET_ERROR_MSG(                                                          \
-        msg, "cuBLAS error encountered at: ", "call='%s', Reason=%d:%s",      \
-        #call, status, raft::linalg::detail::cublas_error_to_string(status)); \
-      throw raft::cublas_error(msg);                                          \
-    }                                                                         \
+#define CUBLAS_TRY(call)                                                   \
+  do {                                                                     \
+    cublasStatus_t const status = (call);                                  \
+    if (CUBLAS_STATUS_SUCCESS != status) {                                 \
+      std::string msg{};                                                   \
+      SET_ERROR_MSG(msg,                                                   \
+                    "cuBLAS error encountered at: ",                       \
+                    "call='%s', Reason=%d:%s",                             \
+                    #call,                                                 \
+                    status,                                                \
+                    raft::linalg::detail::cublas_error_to_string(status)); \
+      throw raft::cublas_error(msg);                                       \
+    }                                                                      \
   } while (0)
 
 /** FIXME: temporary alias for cuML compatibility */
@@ -107,22 +108,39 @@ namespace linalg {
  * @{
  */
 template <typename T>
-cublasStatus_t cublasaxpy(cublasHandle_t handle, int n, const T *alpha,
-                          const T *x, int incx, T *y, int incy,
+cublasStatus_t cublasaxpy(cublasHandle_t handle,
+                          int n,
+                          const T* alpha,
+                          const T* x,
+                          int incx,
+                          T* y,
+                          int incy,
                           cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int n,
-                                 const float *alpha, const float *x, int incx,
-                                 float *y, int incy, cudaStream_t stream) {
+inline cublasStatus_t cublasaxpy(cublasHandle_t handle,
+                                 int n,
+                                 const float* alpha,
+                                 const float* x,
+                                 int incx,
+                                 float* y,
+                                 int incy,
+                                 cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSaxpy(handle, n, alpha, x, incx, y, incy);
 }
 
 template <>
-inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int n,
-                                 const double *alpha, const double *x, int incx,
-                                 double *y, int incy, cudaStream_t stream) {
+inline cublasStatus_t cublasaxpy(cublasHandle_t handle,
+                                 int n,
+                                 const double* alpha,
+                                 const double* x,
+                                 int incx,
+                                 double* y,
+                                 int incy,
+                                 cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDaxpy(handle, n, alpha, x, incx, y, incy);
 }
@@ -133,21 +151,21 @@ inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int n,
  * @{
  */
 template <typename T>
-cublasStatus_t cublasSwap(cublasHandle_t handle, int n, T *x, int incx, T *y,
-                          int incy, cudaStream_t stream);
+cublasStatus_t cublasSwap(
+  cublasHandle_t handle, int n, T* x, int incx, T* y, int incy, cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublasSwap(cublasHandle_t handle, int n, float *x,
-                                 int incx, float *y, int incy,
-                                 cudaStream_t stream) {
+inline cublasStatus_t cublasSwap(
+  cublasHandle_t handle, int n, float* x, int incx, float* y, int incy, cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSswap(handle, n, x, incx, y, incy);
 }
 
 template <>
-inline cublasStatus_t cublasSwap(cublasHandle_t handle, int n, double *x,
-                                 int incx, double *y, int incy,
-                                 cudaStream_t stream) {
+inline cublasStatus_t cublasSwap(
+  cublasHandle_t handle, int n, double* x, int incx, double* y, int incy, cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDswap(handle, n, x, incx, y, incy);
 }
@@ -159,20 +177,20 @@ inline cublasStatus_t cublasSwap(cublasHandle_t handle, int n, double *x,
  * @{
  */
 template <typename T>
-cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const T *x, int incx,
-                          T *y, int incy, cudaStream_t stream);
+cublasStatus_t cublasCopy(
+  cublasHandle_t handle, int n, const T* x, int incx, T* y, int incy, cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const float *x,
-                                 int incx, float *y, int incy,
-                                 cudaStream_t stream) {
+inline cublasStatus_t cublasCopy(
+  cublasHandle_t handle, int n, const float* x, int incx, float* y, int incy, cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasScopy(handle, n, x, incx, y, incy);
 }
 template <>
-inline cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const double *x,
-                                 int incx, double *y, int incy,
-                                 cudaStream_t stream) {
+inline cublasStatus_t cublasCopy(
+  cublasHandle_t handle, int n, const double* x, int incx, double* y, int incy, cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDcopy(handle, n, x, incx, y, incy);
 }
@@ -183,31 +201,56 @@ inline cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const double *x,
  * @{
  */
 template <typename T>
-cublasStatus_t cublasgemv(cublasHandle_t handle, cublasOperation_t transA,
-                          int m, int n, const T *alfa, const T *A, int lda,
-                          const T *x, int incx, const T *beta, T *y, int incy,
+cublasStatus_t cublasgemv(cublasHandle_t handle,
+                          cublasOperation_t transA,
+                          int m,
+                          int n,
+                          const T* alfa,
+                          const T* A,
+                          int lda,
+                          const T* x,
+                          int incx,
+                          const T* beta,
+                          T* y,
+                          int incy,
                           cudaStream_t stream);
 
 template <>
 inline cublasStatus_t cublasgemv(cublasHandle_t handle,
-                                 cublasOperation_t transA, int m, int n,
-                                 const float *alfa, const float *A, int lda,
-                                 const float *x, int incx, const float *beta,
-                                 float *y, int incy, cudaStream_t stream) {
+                                 cublasOperation_t transA,
+                                 int m,
+                                 int n,
+                                 const float* alfa,
+                                 const float* A,
+                                 int lda,
+                                 const float* x,
+                                 int incx,
+                                 const float* beta,
+                                 float* y,
+                                 int incy,
+                                 cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasSgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y,
-                     incy);
+  return cublasSgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, incy);
 }
 
 template <>
 inline cublasStatus_t cublasgemv(cublasHandle_t handle,
-                                 cublasOperation_t transA, int m, int n,
-                                 const double *alfa, const double *A, int lda,
-                                 const double *x, int incx, const double *beta,
-                                 double *y, int incy, cudaStream_t stream) {
+                                 cublasOperation_t transA,
+                                 int m,
+                                 int n,
+                                 const double* alfa,
+                                 const double* A,
+                                 int lda,
+                                 const double* x,
+                                 int incx,
+                                 const double* beta,
+                                 double* y,
+                                 int incy,
+                                 cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasDgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y,
-                     incy);
+  return cublasDgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, incy);
 }
 /** @} */
 
@@ -216,23 +259,47 @@ inline cublasStatus_t cublasgemv(cublasHandle_t handle,
  * @{
  */
 template <typename T>
-cublasStatus_t cublasger(cublasHandle_t handle, int m, int n, const T *alpha,
-                         const T *x, int incx, const T *y, int incy, T *A,
-                         int lda, cudaStream_t stream);
+cublasStatus_t cublasger(cublasHandle_t handle,
+                         int m,
+                         int n,
+                         const T* alpha,
+                         const T* x,
+                         int incx,
+                         const T* y,
+                         int incy,
+                         T* A,
+                         int lda,
+                         cudaStream_t stream);
 template <>
-inline cublasStatus_t cublasger(cublasHandle_t handle, int m, int n,
-                                const float *alpha, const float *x, int incx,
-                                const float *y, int incy, float *A, int lda,
-                                cudaStream_t stream) {
+inline cublasStatus_t cublasger(cublasHandle_t handle,
+                                int m,
+                                int n,
+                                const float* alpha,
+                                const float* x,
+                                int incx,
+                                const float* y,
+                                int incy,
+                                float* A,
+                                int lda,
+                                cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSger(handle, m, n, alpha, x, incx, y, incy, A, lda);
 }
 
 template <>
-inline cublasStatus_t cublasger(cublasHandle_t handle, int m, int n,
-                                const double *alpha, const double *x, int incx,
-                                const double *y, int incy, double *A, int lda,
-                                cudaStream_t stream) {
+inline cublasStatus_t cublasger(cublasHandle_t handle,
+                                int m,
+                                int n,
+                                const double* alpha,
+                                const double* x,
+                                int incx,
+                                const double* y,
+                                int incy,
+                                double* A,
+                                int lda,
+                                cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDger(handle, m, n, alpha, x, incx, y, incy, A, lda);
 }
@@ -243,34 +310,62 @@ inline cublasStatus_t cublasger(cublasHandle_t handle, int m, int n,
  * @{
  */
 template <typename T>
-cublasStatus_t cublasgemm(cublasHandle_t handle, cublasOperation_t transA,
-                          cublasOperation_t transB, int m, int n, int k,
-                          const T *alfa, const T *A, int lda, const T *B,
-                          int ldb, const T *beta, T *C, int ldc,
+cublasStatus_t cublasgemm(cublasHandle_t handle,
+                          cublasOperation_t transA,
+                          cublasOperation_t transB,
+                          int m,
+                          int n,
+                          int k,
+                          const T* alfa,
+                          const T* A,
+                          int lda,
+                          const T* B,
+                          int ldb,
+                          const T* beta,
+                          T* C,
+                          int ldc,
                           cudaStream_t stream);
 
 template <>
 inline cublasStatus_t cublasgemm(cublasHandle_t handle,
                                  cublasOperation_t transA,
-                                 cublasOperation_t transB, int m, int n, int k,
-                                 const float *alfa, const float *A, int lda,
-                                 const float *B, int ldb, const float *beta,
-                                 float *C, int ldc, cudaStream_t stream) {
+                                 cublasOperation_t transB,
+                                 int m,
+                                 int n,
+                                 int k,
+                                 const float* alfa,
+                                 const float* A,
+                                 int lda,
+                                 const float* B,
+                                 int ldb,
+                                 const float* beta,
+                                 float* C,
+                                 int ldc,
+                                 cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasSgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb,
-                     beta, C, ldc);
+  return cublasSgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, beta, C, ldc);
 }
 
 template <>
 inline cublasStatus_t cublasgemm(cublasHandle_t handle,
                                  cublasOperation_t transA,
-                                 cublasOperation_t transB, int m, int n, int k,
-                                 const double *alfa, const double *A, int lda,
-                                 const double *B, int ldb, const double *beta,
-                                 double *C, int ldc, cudaStream_t stream) {
+                                 cublasOperation_t transB,
+                                 int m,
+                                 int n,
+                                 int k,
+                                 const double* alfa,
+                                 const double* A,
+                                 int lda,
+                                 const double* B,
+                                 int ldb,
+                                 const double* beta,
+                                 double* C,
+                                 int ldc,
+                                 cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasDgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb,
-                     beta, C, ldc);
+  return cublasDgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, beta, C, ldc);
 }
 /** @} */
 
@@ -281,38 +376,93 @@ inline cublasStatus_t cublasgemm(cublasHandle_t handle,
 template <typename T>
 cublasStatus_t cublasgemmBatched(cublasHandle_t handle,  // NOLINT
                                  cublasOperation_t transa,
-                                 cublasOperation_t transb, int m, int n, int k,
-                                 const T *alpha,
-                                 const T *const Aarray[],           // NOLINT
-                                 int lda, const T *const Barray[],  // NOLINT
-                                 int ldb, const T *beta,
-                                 T *Carray[],  // NOLINT
-                                 int ldc, int batchCount, cudaStream_t stream);
+                                 cublasOperation_t transb,
+                                 int m,
+                                 int n,
+                                 int k,
+                                 const T* alpha,
+                                 const T* const Aarray[],  // NOLINT
+                                 int lda,
+                                 const T* const Barray[],  // NOLINT
+                                 int ldb,
+                                 const T* beta,
+                                 T* Carray[],  // NOLINT
+                                 int ldc,
+                                 int batchCount,
+                                 cudaStream_t stream);
 
 template <>
 inline cublasStatus_t cublasgemmBatched(  // NOLINT
-  cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-  int m, int n, int k, const float *alpha,
-  const float *const Aarray[],                  // NOLINT
-  int lda, const float *const Barray[],         // NOLINT
-  int ldb, const float *beta, float *Carray[],  // NOLINT
-  int ldc, int batchCount, cudaStream_t stream) {
+  cublasHandle_t handle,
+  cublasOperation_t transa,
+  cublasOperation_t transb,
+  int m,
+  int n,
+  int k,
+  const float* alpha,
+  const float* const Aarray[],  // NOLINT
+  int lda,
+  const float* const Barray[],  // NOLINT
+  int ldb,
+  const float* beta,
+  float* Carray[],  // NOLINT
+  int ldc,
+  int batchCount,
+  cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasSgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda,
-                            Barray, ldb, beta, Carray, ldc, batchCount);
+  return cublasSgemmBatched(handle,
+                            transa,
+                            transb,
+                            m,
+                            n,
+                            k,
+                            alpha,
+                            Aarray,
+                            lda,
+                            Barray,
+                            ldb,
+                            beta,
+                            Carray,
+                            ldc,
+                            batchCount);
 }
 
 template <>
 inline cublasStatus_t cublasgemmBatched(  // NOLINT
-  cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-  int m, int n, int k, const double *alpha,
-  const double *const Aarray[],                   // NOLINT
-  int lda, const double *const Barray[],          // NOLINT
-  int ldb, const double *beta, double *Carray[],  // NOLINT
-  int ldc, int batchCount, cudaStream_t stream) {
+  cublasHandle_t handle,
+  cublasOperation_t transa,
+  cublasOperation_t transb,
+  int m,
+  int n,
+  int k,
+  const double* alpha,
+  const double* const Aarray[],  // NOLINT
+  int lda,
+  const double* const Barray[],  // NOLINT
+  int ldb,
+  const double* beta,
+  double* Carray[],  // NOLINT
+  int ldc,
+  int batchCount,
+  cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasDgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda,
-                            Barray, ldb, beta, Carray, ldc, batchCount);
+  return cublasDgemmBatched(handle,
+                            transa,
+                            transb,
+                            m,
+                            n,
+                            k,
+                            alpha,
+                            Aarray,
+                            lda,
+                            Barray,
+                            ldb,
+                            beta,
+                            Carray,
+                            ldc,
+                            batchCount);
 }
 /** @} */
 
@@ -322,36 +472,110 @@ inline cublasStatus_t cublasgemmBatched(  // NOLINT
  */
 template <typename T>
 cublasStatus_t cublasgemmStridedBatched(  // NOLINT
-  cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-  int m, int n, int k, const T *alpha, const T *const Aarray, int lda,
-  int64_t strideA, const T *const Barray, int ldb, int64_t strideB,
-  const T *beta, T *Carray, int ldc, int64_t strideC, int batchCount,
+  cublasHandle_t handle,
+  cublasOperation_t transa,
+  cublasOperation_t transb,
+  int m,
+  int n,
+  int k,
+  const T* alpha,
+  const T* const Aarray,
+  int lda,
+  int64_t strideA,
+  const T* const Barray,
+  int ldb,
+  int64_t strideB,
+  const T* beta,
+  T* Carray,
+  int ldc,
+  int64_t strideC,
+  int batchCount,
   cudaStream_t stream);
 
 template <>
 inline cublasStatus_t cublasgemmStridedBatched(  // NOLINT
-  cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-  int m, int n, int k, const float *alpha, const float *const Aarray, int lda,
-  int64_t strideA, const float *const Barray, int ldb, int64_t strideB,
-  const float *beta, float *Carray, int ldc, int64_t strideC, int batchCount,
-  cudaStream_t stream) {
+  cublasHandle_t handle,
+  cublasOperation_t transa,
+  cublasOperation_t transb,
+  int m,
+  int n,
+  int k,
+  const float* alpha,
+  const float* const Aarray,
+  int lda,
+  int64_t strideA,
+  const float* const Barray,
+  int ldb,
+  int64_t strideB,
+  const float* beta,
+  float* Carray,
+  int ldc,
+  int64_t strideC,
+  int batchCount,
+  cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasSgemmStridedBatched(handle, transa, transb, m, n, k, alpha,
-                                   Aarray, lda, strideA, Barray, ldb, strideB,
-                                   beta, Carray, ldc, strideC, batchCount);
+  return cublasSgemmStridedBatched(handle,
+                                   transa,
+                                   transb,
+                                   m,
+                                   n,
+                                   k,
+                                   alpha,
+                                   Aarray,
+                                   lda,
+                                   strideA,
+                                   Barray,
+                                   ldb,
+                                   strideB,
+                                   beta,
+                                   Carray,
+                                   ldc,
+                                   strideC,
+                                   batchCount);
 }
 
 template <>
 inline cublasStatus_t cublasgemmStridedBatched(  // NOLINT
-  cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-  int m, int n, int k, const double *alpha, const double *const Aarray, int lda,
-  int64_t strideA, const double *const Barray, int ldb, int64_t strideB,
-  const double *beta, double *Carray, int ldc, int64_t strideC, int batchCount,
-  cudaStream_t stream) {
+  cublasHandle_t handle,
+  cublasOperation_t transa,
+  cublasOperation_t transb,
+  int m,
+  int n,
+  int k,
+  const double* alpha,
+  const double* const Aarray,
+  int lda,
+  int64_t strideA,
+  const double* const Barray,
+  int ldb,
+  int64_t strideB,
+  const double* beta,
+  double* Carray,
+  int ldc,
+  int64_t strideC,
+  int batchCount,
+  cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasDgemmStridedBatched(handle, transa, transb, m, n, k, alpha,
-                                   Aarray, lda, strideA, Barray, ldb, strideB,
-                                   beta, Carray, ldc, strideC, batchCount);
+  return cublasDgemmStridedBatched(handle,
+                                   transa,
+                                   transb,
+                                   m,
+                                   n,
+                                   k,
+                                   alpha,
+                                   Aarray,
+                                   lda,
+                                   strideA,
+                                   Barray,
+                                   ldb,
+                                   strideB,
+                                   beta,
+                                   Carray,
+                                   ldc,
+                                   strideC,
+                                   batchCount);
 }
 /** @} */
 
@@ -361,51 +585,85 @@ inline cublasStatus_t cublasgemmStridedBatched(  // NOLINT
  */
 
 template <typename T>
-cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, int n,  // NOLINT
-                                  T *const A[],                  // NOLINT
-                                  int lda, int *P, int *info, int batchSize,
+cublasStatus_t cublasgetrfBatched(cublasHandle_t handle,
+                                  int n,         // NOLINT
+                                  T* const A[],  // NOLINT
+                                  int lda,
+                                  int* P,
+                                  int* info,
+                                  int batchSize,
                                   cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle,    // NOLINT
-                                         int n, float *const A[],  // NOLINT
-                                         int lda, int *P, int *info,
-                                         int batchSize, cudaStream_t stream) {
+inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle,  // NOLINT
+                                         int n,
+                                         float* const A[],  // NOLINT
+                                         int lda,
+                                         int* P,
+                                         int* info,
+                                         int batchSize,
+                                         cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSgetrfBatched(handle, n, A, lda, P, info, batchSize);
 }
 
 template <>
-inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle,     // NOLINT
-                                         int n, double *const A[],  // NOLINT
-                                         int lda, int *P, int *info,
-                                         int batchSize, cudaStream_t stream) {
+inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle,  // NOLINT
+                                         int n,
+                                         double* const A[],  // NOLINT
+                                         int lda,
+                                         int* P,
+                                         int* info,
+                                         int batchSize,
+                                         cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDgetrfBatched(handle, n, A, lda, P, info, batchSize);
 }
 
 template <typename T>
-cublasStatus_t cublasgetriBatched(cublasHandle_t handle, int n,  // NOLINT
-                                  const T *const A[],            // NOLINT
-                                  int lda, const int *P,
-                                  T *const C[],  // NOLINT
-                                  int ldc, int *info, int batchSize,
+cublasStatus_t cublasgetriBatched(cublasHandle_t handle,
+                                  int n,               // NOLINT
+                                  const T* const A[],  // NOLINT
+                                  int lda,
+                                  const int* P,
+                                  T* const C[],  // NOLINT
+                                  int ldc,
+                                  int* info,
+                                  int batchSize,
                                   cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublasgetriBatched(                // NOLINT
-  cublasHandle_t handle, int n, const float *const A[],  // NOLINT
-  int lda, const int *P, float *const C[],               // NOLINT
-  int ldc, int *info, int batchSize, cudaStream_t stream) {
+inline cublasStatus_t cublasgetriBatched(  // NOLINT
+  cublasHandle_t handle,
+  int n,
+  const float* const A[],  // NOLINT
+  int lda,
+  const int* P,
+  float* const C[],  // NOLINT
+  int ldc,
+  int* info,
+  int batchSize,
+  cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize);
 }
 
 template <>
-inline cublasStatus_t cublasgetriBatched(                 // NOLINT
-  cublasHandle_t handle, int n, const double *const A[],  // NOLINT
-  int lda, const int *P, double *const C[],               // NOLINT
-  int ldc, int *info, int batchSize, cudaStream_t stream) {
+inline cublasStatus_t cublasgetriBatched(  // NOLINT
+  cublasHandle_t handle,
+  int n,
+  const double* const A[],  // NOLINT
+  int lda,
+  const int* P,
+  double* const C[],  // NOLINT
+  int ldc,
+  int* info,
+  int batchSize,
+  cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize);
 }
@@ -419,34 +677,57 @@ inline cublasStatus_t cublasgetriBatched(                 // NOLINT
 
 template <typename T>
 inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle,  // NOLINT
-                                        cublasOperation_t trans, int m, int n,
-                                        int nrhs, T *Aarray[],  // NOLINT
-                                        int lda, T *Carray[],   // NOLINT
-                                        int ldc, int *info, int *devInfoArray,
-                                        int batchSize, cudaStream_t stream);
+                                        cublasOperation_t trans,
+                                        int m,
+                                        int n,
+                                        int nrhs,
+                                        T* Aarray[],  // NOLINT
+                                        int lda,
+                                        T* Carray[],  // NOLINT
+                                        int ldc,
+                                        int* info,
+                                        int* devInfoArray,
+                                        int batchSize,
+                                        cudaStream_t stream);
 
 template <>
 inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle,  // NOLINT
-                                        cublasOperation_t trans, int m, int n,
-                                        int nrhs, float *Aarray[],  // NOLINT
-                                        int lda, float *Carray[],   // NOLINT
-                                        int ldc, int *info, int *devInfoArray,
-                                        int batchSize, cudaStream_t stream) {
+                                        cublasOperation_t trans,
+                                        int m,
+                                        int n,
+                                        int nrhs,
+                                        float* Aarray[],  // NOLINT
+                                        int lda,
+                                        float* Carray[],  // NOLINT
+                                        int ldc,
+                                        int* info,
+                                        int* devInfoArray,
+                                        int batchSize,
+                                        cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasSgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc,
-                            info, devInfoArray, batchSize);
+  return cublasSgelsBatched(
+    handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize);
 }
 
 template <>
 inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle,  // NOLINT
-                                        cublasOperation_t trans, int m, int n,
-                                        int nrhs, double *Aarray[],  // NOLINT
-                                        int lda, double *Carray[],   // NOLINT
-                                        int ldc, int *info, int *devInfoArray,
-                                        int batchSize, cudaStream_t stream) {
+                                        cublasOperation_t trans,
+                                        int m,
+                                        int n,
+                                        int nrhs,
+                                        double* Aarray[],  // NOLINT
+                                        int lda,
+                                        double* Carray[],  // NOLINT
+                                        int ldc,
+                                        int* info,
+                                        int* devInfoArray,
+                                        int batchSize,
+                                        cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasDgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc,
-                            info, devInfoArray, batchSize);
+  return cublasDgelsBatched(
+    handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize);
 }
 
 /** @} */
@@ -456,33 +737,59 @@ inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle,  // NOLINT
  * @{
  */
 template <typename T>
-cublasStatus_t cublasgeam(cublasHandle_t handle, cublasOperation_t transA,
-                          cublasOperation_t transB, int m, int n, const T *alfa,
-                          const T *A, int lda, const T *beta, const T *B,
-                          int ldb, T *C, int ldc, cudaStream_t stream);
+cublasStatus_t cublasgeam(cublasHandle_t handle,
+                          cublasOperation_t transA,
+                          cublasOperation_t transB,
+                          int m,
+                          int n,
+                          const T* alfa,
+                          const T* A,
+                          int lda,
+                          const T* beta,
+                          const T* B,
+                          int ldb,
+                          T* C,
+                          int ldc,
+                          cudaStream_t stream);
 
 template <>
 inline cublasStatus_t cublasgeam(cublasHandle_t handle,
                                  cublasOperation_t transA,
-                                 cublasOperation_t transB, int m, int n,
-                                 const float *alfa, const float *A, int lda,
-                                 const float *beta, const float *B, int ldb,
-                                 float *C, int ldc, cudaStream_t stream) {
+                                 cublasOperation_t transB,
+                                 int m,
+                                 int n,
+                                 const float* alfa,
+                                 const float* A,
+                                 int lda,
+                                 const float* beta,
+                                 const float* B,
+                                 int ldb,
+                                 float* C,
+                                 int ldc,
+                                 cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasSgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb,
-                     C, ldc);
+  return cublasSgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, C, ldc);
 }
 
 template <>
 inline cublasStatus_t cublasgeam(cublasHandle_t handle,
                                  cublasOperation_t transA,
-                                 cublasOperation_t transB, int m, int n,
-                                 const double *alfa, const double *A, int lda,
-                                 const double *beta, const double *B, int ldb,
-                                 double *C, int ldc, cudaStream_t stream) {
+                                 cublasOperation_t transB,
+                                 int m,
+                                 int n,
+                                 const double* alfa,
+                                 const double* A,
+                                 int lda,
+                                 const double* beta,
+                                 const double* B,
+                                 int ldb,
+                                 double* C,
+                                 int ldc,
+                                 cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasDgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb,
-                     C, ldc);
+  return cublasDgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, C, ldc);
 }
 /** @} */
 
@@ -491,31 +798,59 @@ inline cublasStatus_t cublasgeam(cublasHandle_t handle,
  * @{
  */
 template <typename T>
-cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side,
-                          cublasFillMode_t uplo, int m, int n, const T *alpha,
-                          const T *A, int lda, const T *B, int ldb,
-                          const T *beta, T *C, int ldc, cudaStream_t stream);
+cublasStatus_t cublassymm(cublasHandle_t handle,
+                          cublasSideMode_t side,
+                          cublasFillMode_t uplo,
+                          int m,
+                          int n,
+                          const T* alpha,
+                          const T* A,
+                          int lda,
+                          const T* B,
+                          int ldb,
+                          const T* beta,
+                          T* C,
+                          int ldc,
+                          cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side,
-                                 cublasFillMode_t uplo, int m, int n,
-                                 const float *alpha, const float *A, int lda,
-                                 const float *B, int ldb, const float *beta,
-                                 float *C, int ldc, cudaStream_t stream) {
+inline cublasStatus_t cublassymm(cublasHandle_t handle,
+                                 cublasSideMode_t side,
+                                 cublasFillMode_t uplo,
+                                 int m,
+                                 int n,
+                                 const float* alpha,
+                                 const float* A,
+                                 int lda,
+                                 const float* B,
+                                 int ldb,
+                                 const float* beta,
+                                 float* C,
+                                 int ldc,
+                                 cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasSsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C,
-                     ldc);
+  return cublasSsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
 }
 
 template <>
-inline cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side,
-                                 cublasFillMode_t uplo, int m, int n,
-                                 const double *alpha, const double *A, int lda,
-                                 const double *B, int ldb, const double *beta,
-                                 double *C, int ldc, cudaStream_t stream) {
+inline cublasStatus_t cublassymm(cublasHandle_t handle,
+                                 cublasSideMode_t side,
+                                 cublasFillMode_t uplo,
+                                 int m,
+                                 int n,
+                                 const double* alpha,
+                                 const double* A,
+                                 int lda,
+                                 const double* B,
+                                 int ldb,
+                                 const double* beta,
+                                 double* C,
+                                 int ldc,
+                                 cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasDsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C,
-                     ldc);
+  return cublasDsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
 }
 /** @} */
 
@@ -524,27 +859,51 @@ inline cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side,
  * @{
  */
 template <typename T>
-cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo,
-                          cublasOperation_t trans, int n, int k, const T *alpha,
-                          const T *A, int lda, const T *beta, T *C, int ldc,
+cublasStatus_t cublassyrk(cublasHandle_t handle,
+                          cublasFillMode_t uplo,
+                          cublasOperation_t trans,
+                          int n,
+                          int k,
+                          const T* alpha,
+                          const T* A,
+                          int lda,
+                          const T* beta,
+                          T* C,
+                          int ldc,
                           cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo,
-                                 cublasOperation_t trans, int n, int k,
-                                 const float *alpha, const float *A, int lda,
-                                 const float *beta, float *C, int ldc,
-                                 cudaStream_t stream) {
+inline cublasStatus_t cublassyrk(cublasHandle_t handle,
+                                 cublasFillMode_t uplo,
+                                 cublasOperation_t trans,
+                                 int n,
+                                 int k,
+                                 const float* alpha,
+                                 const float* A,
+                                 int lda,
+                                 const float* beta,
+                                 float* C,
+                                 int ldc,
+                                 cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
 }
 
 template <>
-inline cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo,
-                                 cublasOperation_t trans, int n, int k,
-                                 const double *alpha, const double *A, int lda,
-                                 const double *beta, double *C, int ldc,
-                                 cudaStream_t stream) {
+inline cublasStatus_t cublassyrk(cublasHandle_t handle,
+                                 cublasFillMode_t uplo,
+                                 cublasOperation_t trans,
+                                 int n,
+                                 int k,
+                                 const double* alpha,
+                                 const double* A,
+                                 int lda,
+                                 const double* beta,
+                                 double* C,
+                                 int ldc,
+                                 cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
 }
@@ -555,52 +914,77 @@ inline cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo,
  * @{
  */
 template <typename T>
-cublasStatus_t cublasnrm2(cublasHandle_t handle, int n, const T *x, int incx,
-                          T *result, cudaStream_t stream);
+cublasStatus_t cublasnrm2(
+  cublasHandle_t handle, int n, const T* x, int incx, T* result, cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublasnrm2(cublasHandle_t handle, int n, const float *x,
-                                 int incx, float *result, cudaStream_t stream) {
+inline cublasStatus_t cublasnrm2(
+  cublasHandle_t handle, int n, const float* x, int incx, float* result, cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSnrm2(handle, n, x, incx, result);
 }
 
 template <>
-inline cublasStatus_t cublasnrm2(cublasHandle_t handle, int n, const double *x,
-                                 int incx, double *result,
-                                 cudaStream_t stream) {
+inline cublasStatus_t cublasnrm2(
+  cublasHandle_t handle, int n, const double* x, int incx, double* result, cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDnrm2(handle, n, x, incx, result);
 }
 /** @} */
 
 template <typename T>
-cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side,
-                          cublasFillMode_t uplo, cublasOperation_t trans,
-                          cublasDiagType_t diag, int m, int n, const T *alpha,
-                          const T *A, int lda, T *B, int ldb,
+cublasStatus_t cublastrsm(cublasHandle_t handle,
+                          cublasSideMode_t side,
+                          cublasFillMode_t uplo,
+                          cublasOperation_t trans,
+                          cublasDiagType_t diag,
+                          int m,
+                          int n,
+                          const T* alpha,
+                          const T* A,
+                          int lda,
+                          T* B,
+                          int ldb,
                           cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side,
-                                 cublasFillMode_t uplo, cublasOperation_t trans,
-                                 cublasDiagType_t diag, int m, int n,
-                                 const float *alpha, const float *A, int lda,
-                                 float *B, int ldb, cudaStream_t stream) {
+inline cublasStatus_t cublastrsm(cublasHandle_t handle,
+                                 cublasSideMode_t side,
+                                 cublasFillMode_t uplo,
+                                 cublasOperation_t trans,
+                                 cublasDiagType_t diag,
+                                 int m,
+                                 int n,
+                                 const float* alpha,
+                                 const float* A,
+                                 int lda,
+                                 float* B,
+                                 int ldb,
+                                 cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasStrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B,
-                     ldb);
+  return cublasStrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
 }
 
 template <>
-inline cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side,
-                                 cublasFillMode_t uplo, cublasOperation_t trans,
-                                 cublasDiagType_t diag, int m, int n,
-                                 const double *alpha, const double *A, int lda,
-                                 double *B, int ldb, cudaStream_t stream) {
+inline cublasStatus_t cublastrsm(cublasHandle_t handle,
+                                 cublasSideMode_t side,
+                                 cublasFillMode_t uplo,
+                                 cublasOperation_t trans,
+                                 cublasDiagType_t diag,
+                                 int m,
+                                 int n,
+                                 const double* alpha,
+                                 const double* A,
+                                 int lda,
+                                 double* B,
+                                 int ldb,
+                                 cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasDtrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B,
-                     ldb);
+  return cublasDtrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
 }
 
 /**
@@ -608,21 +992,39 @@ inline cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side,
  * @{
  */
 template <typename T>
-cublasStatus_t cublasdot(cublasHandle_t handle, int n, const T *x, int incx,
-                         const T *y, int incy, T *result, cudaStream_t stream);
+cublasStatus_t cublasdot(cublasHandle_t handle,
+                         int n,
+                         const T* x,
+                         int incx,
+                         const T* y,
+                         int incy,
+                         T* result,
+                         cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublasdot(cublasHandle_t handle, int n, const float *x,
-                                int incx, const float *y, int incy,
-                                float *result, cudaStream_t stream) {
+inline cublasStatus_t cublasdot(cublasHandle_t handle,
+                                int n,
+                                const float* x,
+                                int incx,
+                                const float* y,
+                                int incy,
+                                float* result,
+                                cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSdot(handle, n, x, incx, y, incy, result);
 }
 
 template <>
-inline cublasStatus_t cublasdot(cublasHandle_t handle, int n, const double *x,
-                                int incx, const double *y, int incy,
-                                double *result, cudaStream_t stream) {
+inline cublasStatus_t cublasdot(cublasHandle_t handle,
+                                int n,
+                                const double* x,
+                                int incx,
+                                const double* y,
+                                int incy,
+                                double* result,
+                                cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDdot(handle, n, x, incx, y, incy, result);
 }
@@ -642,7 +1044,8 @@ inline cublasStatus_t cublasdot(cublasHandle_t handle, int n, const double *x,
 // template<>
 inline cublasStatus_t cublassetpointermode(cublasHandle_t handle,
                                            cublasPointerMode_t mode,
-                                           cudaStream_t stream) {
+                                           cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSetPointerMode(handle, mode);
 }
@@ -653,21 +1056,21 @@ inline cublasStatus_t cublassetpointermode(cublasHandle_t handle,
  * @{
  */
 template <typename T>
-cublasStatus_t cublasscal(cublasHandle_t handle, int n, const T *alpha, T *x,
-                          int incx, cudaStream_t stream);
+cublasStatus_t cublasscal(
+  cublasHandle_t handle, int n, const T* alpha, T* x, int incx, cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublasscal(cublasHandle_t handle, int n,
-                                 const float *alpha, float *x, int incx,
-                                 cudaStream_t stream) {
+inline cublasStatus_t cublasscal(
+  cublasHandle_t handle, int n, const float* alpha, float* x, int incx, cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSscal(handle, n, alpha, x, incx);
 }
 
 template <>
-inline cublasStatus_t cublasscal(cublasHandle_t handle, int n,
-                                 const double *alpha, double *x, int incx,
-                                 cudaStream_t stream) {
+inline cublasStatus_t cublasscal(
+  cublasHandle_t handle, int n, const double* alpha, double* x, int incx, cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDscal(handle, n, alpha, x, incx);
 }
diff --git a/cpp/include/raft/linalg/cusolver_wrappers.h b/cpp/include/raft/linalg/cusolver_wrappers.h
index 0eadf47fe3..76a9f40f4d 100644
--- a/cpp/include/raft/linalg/cusolver_wrappers.h
+++ b/cpp/include/raft/linalg/cusolver_wrappers.h
@@ -24,8 +24,7 @@
 #include <type_traits>
 
 #define _CUSOLVER_ERR_TO_STR(err) \
-  case err:                       \
-    return #err;
+  case err: return #err;
 
 namespace raft {
 
@@ -33,16 +32,15 @@ namespace raft {
  * @brief Exception thrown when a cuSOLVER error is encountered.
  */
 struct cusolver_error : public raft::exception {
-  explicit cusolver_error(char const *const message)
-    : raft::exception(message) {}
-  explicit cusolver_error(std::string const &message)
-    : raft::exception(message) {}
+  explicit cusolver_error(char const* const message) : raft::exception(message) {}
+  explicit cusolver_error(std::string const& message) : raft::exception(message) {}
 };
 
 namespace linalg {
 namespace detail {
 
-inline const char *cusolver_error_to_string(cusolverStatus_t err) {
+inline const char* cusolver_error_to_string(cusolverStatus_t err)
+{
   switch (err) {
     _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_SUCCESS);
     _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_INITIALIZED);
@@ -54,8 +52,7 @@ inline const char *cusolver_error_to_string(cusolverStatus_t err) {
     _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED);
     _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ZERO_PIVOT);
     _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_SUPPORTED);
-    default:
-      return "CUSOLVER_STATUS_UNKNOWN";
+    default: return "CUSOLVER_STATUS_UNKNOWN";
   };
 }
 
@@ -76,8 +73,11 @@ inline const char *cusolver_error_to_string(cusolverStatus_t err) {
     cusolverStatus_t const status = (call);                                  \
     if (CUSOLVER_STATUS_SUCCESS != status) {                                 \
       std::string msg{};                                                     \
-      SET_ERROR_MSG(msg, "cuSOLVER error encountered at: ",                  \
-                    "call='%s', Reason=%d:%s", #call, status,                \
+      SET_ERROR_MSG(msg,                                                     \
+                    "cuSOLVER error encountered at: ",                       \
+                    "call='%s', Reason=%d:%s",                               \
+                    #call,                                                   \
+                    status,                                                  \
                     raft::linalg::detail::cusolver_error_to_string(status)); \
       throw raft::cusolver_error(msg);                                       \
     }                                                                        \
@@ -107,42 +107,76 @@ namespace linalg {
  * @{
  */
 template <typename T>
-cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, int m,  // NOLINT
-                                 int n, T *A, int lda, T *Workspace,
-                                 int *devIpiv, int *devInfo,
+cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle,  // NOLINT
+                                 int m,
+                                 int n,
+                                 T* A,
+                                 int lda,
+                                 T* Workspace,
+                                 int* devIpiv,
+                                 int* devInfo,
                                  cudaStream_t stream);
 
 template <>
 inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle,  // NOLINT
-                                        int m, int n, float *A, int lda,
-                                        float *Workspace, int *devIpiv,
-                                        int *devInfo, cudaStream_t stream) {
+                                        int m,
+                                        int n,
+                                        float* A,
+                                        int lda,
+                                        float* Workspace,
+                                        int* devIpiv,
+                                        int* devInfo,
+                                        cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnSgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo);
 }
 
 template <>
 inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle,  // NOLINT
-                                        int m, int n, double *A, int lda,
-                                        double *Workspace, int *devIpiv,
-                                        int *devInfo, cudaStream_t stream) {
+                                        int m,
+                                        int n,
+                                        double* A,
+                                        int lda,
+                                        double* Workspace,
+                                        int* devIpiv,
+                                        int* devInfo,
+                                        cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnDgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo);
 }
 
 template <typename T>
 cusolverStatus_t cusolverDngetrf_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, int m, int n, T *A, int lda, int *Lwork);
+  cusolverDnHandle_t handle,
+  int m,
+  int n,
+  T* A,
+  int lda,
+  int* Lwork);
 
 template <>
 inline cusolverStatus_t cusolverDngetrf_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *Lwork) {
+  cusolverDnHandle_t handle,
+  int m,
+  int n,
+  float* A,
+  int lda,
+  int* Lwork)
+{
   return cusolverDnSgetrf_bufferSize(handle, m, n, A, lda, Lwork);
 }
 
 template <>
 inline cusolverStatus_t cusolverDngetrf_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, int m, int n, double *A, int lda, int *Lwork) {
+  cusolverDnHandle_t handle,
+  int m,
+  int n,
+  double* A,
+  int lda,
+  int* Lwork)
+{
   return cusolverDnDgetrf_bufferSize(handle, m, n, A, lda, Lwork);
 }
 
@@ -152,30 +186,49 @@ inline cusolverStatus_t cusolverDngetrf_bufferSize(  // NOLINT
  */
 template <typename T>
 cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle,  // NOLINT
-                                 cublasOperation_t trans, int n, int nrhs,
-                                 const T *A, int lda, const int *devIpiv, T *B,
-                                 int ldb, int *devInfo, cudaStream_t stream);
+                                 cublasOperation_t trans,
+                                 int n,
+                                 int nrhs,
+                                 const T* A,
+                                 int lda,
+                                 const int* devIpiv,
+                                 T* B,
+                                 int ldb,
+                                 int* devInfo,
+                                 cudaStream_t stream);
 
 template <>
 inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle,  // NOLINT
-                                        cublasOperation_t trans, int n,
-                                        int nrhs, const float *A, int lda,
-                                        const int *devIpiv, float *B, int ldb,
-                                        int *devInfo, cudaStream_t stream) {
+                                        cublasOperation_t trans,
+                                        int n,
+                                        int nrhs,
+                                        const float* A,
+                                        int lda,
+                                        const int* devIpiv,
+                                        float* B,
+                                        int ldb,
+                                        int* devInfo,
+                                        cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnSgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb,
-                          devInfo);
+  return cusolverDnSgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo);
 }
 
 template <>
 inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle,  // NOLINT
-                                        cublasOperation_t trans, int n,
-                                        int nrhs, const double *A, int lda,
-                                        const int *devIpiv, double *B, int ldb,
-                                        int *devInfo, cudaStream_t stream) {
+                                        cublasOperation_t trans,
+                                        int n,
+                                        int nrhs,
+                                        const double* A,
+                                        int lda,
+                                        const int* devIpiv,
+                                        double* B,
+                                        int ldb,
+                                        int* devInfo,
+                                        cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnDgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb,
-                          devInfo);
+  return cusolverDnDgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo);
 }
 /** @} */
 
@@ -185,20 +238,40 @@ inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle,  // NOLINT
  */
 template <typename T>
 cusolverStatus_t cusolverDnsyevd_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-  int n, const T *A, int lda, const T *W, int *lwork);
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  cublasFillMode_t uplo,
+  int n,
+  const T* A,
+  int lda,
+  const T* W,
+  int* lwork);
 
 template <>
 inline cusolverStatus_t cusolverDnsyevd_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-  int n, const float *A, int lda, const float *W, int *lwork) {
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  cublasFillMode_t uplo,
+  int n,
+  const float* A,
+  int lda,
+  const float* W,
+  int* lwork)
+{
   return cusolverDnSsyevd_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnsyevd_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-  int n, const double *A, int lda, const double *W, int *lwork) {
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  cublasFillMode_t uplo,
+  int n,
+  const double* A,
+  int lda,
+  const double* W,
+  int* lwork)
+{
   return cusolverDnDsyevd_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork);
 }
 /** @} */
@@ -209,52 +282,96 @@ inline cusolverStatus_t cusolverDnsyevd_bufferSize(  // NOLINT
  */
 template <typename T>
 cusolverStatus_t cusolverDnsyevj(cusolverDnHandle_t handle,  // NOLINT
-                                 cusolverEigMode_t jobz, cublasFillMode_t uplo,
-                                 int n, T *A, int lda, T *W, T *work, int lwork,
-                                 int *info, syevjInfo_t params,
+                                 cusolverEigMode_t jobz,
+                                 cublasFillMode_t uplo,
+                                 int n,
+                                 T* A,
+                                 int lda,
+                                 T* W,
+                                 T* work,
+                                 int lwork,
+                                 int* info,
+                                 syevjInfo_t params,
                                  cudaStream_t stream);
 
 template <>
 inline cusolverStatus_t cusolverDnsyevj(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-  int n, float *A, int lda, float *W, float *work, int lwork, int *info,
-  syevjInfo_t params, cudaStream_t stream) {
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  cublasFillMode_t uplo,
+  int n,
+  float* A,
+  int lda,
+  float* W,
+  float* work,
+  int lwork,
+  int* info,
+  syevjInfo_t params,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnSsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info,
-                          params);
+  return cusolverDnSsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnsyevj(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-  int n, double *A, int lda, double *W, double *work, int lwork, int *info,
-  syevjInfo_t params, cudaStream_t stream) {
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  cublasFillMode_t uplo,
+  int n,
+  double* A,
+  int lda,
+  double* W,
+  double* work,
+  int lwork,
+  int* info,
+  syevjInfo_t params,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnDsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info,
-                          params);
+  return cusolverDnDsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params);
 }
 
 template <typename T>
 cusolverStatus_t cusolverDnsyevj_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-  int n, const T *A, int lda, const T *W, int *lwork, syevjInfo_t params);
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  cublasFillMode_t uplo,
+  int n,
+  const T* A,
+  int lda,
+  const T* W,
+  int* lwork,
+  syevjInfo_t params);
 
 template <>
 inline cusolverStatus_t cusolverDnsyevj_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-  int n, const float *A, int lda, const float *W, int *lwork,
-  syevjInfo_t params) {
-  return cusolverDnSsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork,
-                                     params);
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  cublasFillMode_t uplo,
+  int n,
+  const float* A,
+  int lda,
+  const float* W,
+  int* lwork,
+  syevjInfo_t params)
+{
+  return cusolverDnSsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, params);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnsyevj_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-  int n, const double *A, int lda, const double *W, int *lwork,
-  syevjInfo_t params) {
-  return cusolverDnDsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork,
-                                     params);
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  cublasFillMode_t uplo,
+  int n,
+  const double* A,
+  int lda,
+  const double* W,
+  int* lwork,
+  syevjInfo_t params)
+{
+  return cusolverDnDsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, params);
 }
 /** @} */
 
@@ -264,32 +381,49 @@ inline cusolverStatus_t cusolverDnsyevj_bufferSize(  // NOLINT
  */
 template <typename T>
 cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle,  // NOLINT
-                                 cusolverEigMode_t jobz, cublasFillMode_t uplo,
-                                 int n, T *A, int lda, T *W, T *work, int lwork,
-                                 int *devInfo, cudaStream_t stream);
+                                 cusolverEigMode_t jobz,
+                                 cublasFillMode_t uplo,
+                                 int n,
+                                 T* A,
+                                 int lda,
+                                 T* W,
+                                 T* work,
+                                 int lwork,
+                                 int* devInfo,
+                                 cudaStream_t stream);
 
 template <>
 inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle,  // NOLINT
                                         cusolverEigMode_t jobz,
-                                        cublasFillMode_t uplo, int n, float *A,
-                                        int lda, float *W, float *work,
-                                        int lwork, int *devInfo,
-                                        cudaStream_t stream) {
+                                        cublasFillMode_t uplo,
+                                        int n,
+                                        float* A,
+                                        int lda,
+                                        float* W,
+                                        float* work,
+                                        int lwork,
+                                        int* devInfo,
+                                        cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnSsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork,
-                          devInfo);
+  return cusolverDnSsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, devInfo);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle,  // NOLINT
                                         cusolverEigMode_t jobz,
-                                        cublasFillMode_t uplo, int n, double *A,
-                                        int lda, double *W, double *work,
-                                        int lwork, int *devInfo,
-                                        cudaStream_t stream) {
+                                        cublasFillMode_t uplo,
+                                        int n,
+                                        double* A,
+                                        int lda,
+                                        double* W,
+                                        double* work,
+                                        int lwork,
+                                        int* devInfo,
+                                        cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnDsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork,
-                          devInfo);
+  return cusolverDnDsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, devInfo);
 }
 /** @} */
 
@@ -297,57 +431,134 @@ inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle,  // NOLINT
 /**
  * @defgroup syevdx cusolver syevdx operations
  * @{
-*/
+ */
 template <typename T>
 cusolverStatus_t cusolverDnsyevdx_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-  cublasFillMode_t uplo, int n, const T *A, int lda, T vl, T vu, int il, int iu,
-  int *h_meig, const T *W, int *lwork);
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  cusolverEigRange_t range,
+  cublasFillMode_t uplo,
+  int n,
+  const T* A,
+  int lda,
+  T vl,
+  T vu,
+  int il,
+  int iu,
+  int* h_meig,
+  const T* W,
+  int* lwork);
 
 template <>
 inline cusolverStatus_t cusolverDnsyevdx_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-  cublasFillMode_t uplo, int n, const float *A, int lda, float vl, float vu,
-  int il, int iu, int *h_meig, const float *W, int *lwork) {
-  return cusolverDnSsyevdx_bufferSize(handle, jobz, range, uplo, n, A, lda, vl,
-                                      vu, il, iu, h_meig, W, lwork);
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  cusolverEigRange_t range,
+  cublasFillMode_t uplo,
+  int n,
+  const float* A,
+  int lda,
+  float vl,
+  float vu,
+  int il,
+  int iu,
+  int* h_meig,
+  const float* W,
+  int* lwork)
+{
+  return cusolverDnSsyevdx_bufferSize(
+    handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, lwork);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnsyevdx_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-  cublasFillMode_t uplo, int n, const double *A, int lda, double vl, double vu,
-  int il, int iu, int *h_meig, const double *W, int *lwork) {
-  return cusolverDnDsyevdx_bufferSize(handle, jobz, range, uplo, n, A, lda, vl,
-                                      vu, il, iu, h_meig, W, lwork);
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  cusolverEigRange_t range,
+  cublasFillMode_t uplo,
+  int n,
+  const double* A,
+  int lda,
+  double vl,
+  double vu,
+  int il,
+  int iu,
+  int* h_meig,
+  const double* W,
+  int* lwork)
+{
+  return cusolverDnDsyevdx_bufferSize(
+    handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, lwork);
 }
 
 template <typename T>
 cusolverStatus_t cusolverDnsyevdx(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-  cublasFillMode_t uplo, int n, T *A, int lda, T vl, T vu, int il, int iu,
-  int *h_meig, T *W, T *work, int lwork, int *devInfo, cudaStream_t stream);
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  cusolverEigRange_t range,
+  cublasFillMode_t uplo,
+  int n,
+  T* A,
+  int lda,
+  T vl,
+  T vu,
+  int il,
+  int iu,
+  int* h_meig,
+  T* W,
+  T* work,
+  int lwork,
+  int* devInfo,
+  cudaStream_t stream);
 
 template <>
 inline cusolverStatus_t cusolverDnsyevdx(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-  cublasFillMode_t uplo, int n, float *A, int lda, float vl, float vu, int il,
-  int iu, int *h_meig, float *W, float *work, int lwork, int *devInfo,
-  cudaStream_t stream) {
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  cusolverEigRange_t range,
+  cublasFillMode_t uplo,
+  int n,
+  float* A,
+  int lda,
+  float vl,
+  float vu,
+  int il,
+  int iu,
+  int* h_meig,
+  float* W,
+  float* work,
+  int lwork,
+  int* devInfo,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnSsyevdx(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu,
-                           h_meig, W, work, lwork, devInfo);
+  return cusolverDnSsyevdx(
+    handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, work, lwork, devInfo);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnsyevdx(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-  cublasFillMode_t uplo, int n, double *A, int lda, double vl, double vu,
-  int il, int iu, int *h_meig, double *W, double *work, int lwork, int *devInfo,
-  cudaStream_t stream) {
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  cusolverEigRange_t range,
+  cublasFillMode_t uplo,
+  int n,
+  double* A,
+  int lda,
+  double vl,
+  double vu,
+  int il,
+  int iu,
+  int* h_meig,
+  double* W,
+  double* work,
+  int lwork,
+  int* devInfo,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnDsyevdx(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu,
-                           h_meig, W, work, lwork, devInfo);
+  return cusolverDnDsyevdx(
+    handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, work, lwork, devInfo);
 }
 /** @} */
 #endif
@@ -358,7 +569,11 @@ inline cusolverStatus_t cusolverDnsyevdx(  // NOLINT
  */
 template <typename T>
 cusolverStatus_t cusolverDngesvd_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, int m, int n, int *lwork) {
+  cusolverDnHandle_t handle,
+  int m,
+  int n,
+  int* lwork)
+{
   if (std::is_same<std::decay_t<T>, float>::value) {
     return cusolverDnSgesvd_bufferSize(handle, m, n, lwork);
   } else {
@@ -367,72 +582,194 @@ cusolverStatus_t cusolverDngesvd_bufferSize(  // NOLINT
 }
 template <typename T>
 cusolverStatus_t cusolverDngesvd(  // NOLINT
-  cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, int n,
-  T *A, int lda, T *S, T *U, int ldu, T *VT, int ldvt, T *work, int lwork,
-  T *rwork, int *devInfo, cudaStream_t stream);
+  cusolverDnHandle_t handle,
+  signed char jobu,
+  signed char jobvt,
+  int m,
+  int n,
+  T* A,
+  int lda,
+  T* S,
+  T* U,
+  int ldu,
+  T* VT,
+  int ldvt,
+  T* work,
+  int lwork,
+  T* rwork,
+  int* devInfo,
+  cudaStream_t stream);
 template <>
 inline cusolverStatus_t cusolverDngesvd(  // NOLINT
-  cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, int n,
-  float *A, int lda, float *S, float *U, int ldu, float *VT, int ldvt,
-  float *work, int lwork, float *rwork, int *devInfo, cudaStream_t stream) {
+  cusolverDnHandle_t handle,
+  signed char jobu,
+  signed char jobvt,
+  int m,
+  int n,
+  float* A,
+  int lda,
+  float* S,
+  float* U,
+  int ldu,
+  float* VT,
+  int ldvt,
+  float* work,
+  int lwork,
+  float* rwork,
+  int* devInfo,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnSgesvd(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT,
-                          ldvt, work, lwork, rwork, devInfo);
+  return cusolverDnSgesvd(
+    handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, devInfo);
 }
 template <>
 inline cusolverStatus_t cusolverDngesvd(  // NOLINT
-  cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, int n,
-  double *A, int lda, double *S, double *U, int ldu, double *VT, int ldvt,
-  double *work, int lwork, double *rwork, int *devInfo, cudaStream_t stream) {
+  cusolverDnHandle_t handle,
+  signed char jobu,
+  signed char jobvt,
+  int m,
+  int n,
+  double* A,
+  int lda,
+  double* S,
+  double* U,
+  int ldu,
+  double* VT,
+  int ldvt,
+  double* work,
+  int lwork,
+  double* rwork,
+  int* devInfo,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnDgesvd(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT,
-                          ldvt, work, lwork, rwork, devInfo);
+  return cusolverDnDgesvd(
+    handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, devInfo);
 }
 
 template <typename T>
 inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-  const T *A, int lda, const T *S, const T *U, int ldu, const T *V, int ldv,
-  int *lwork, gesvdjInfo_t params);
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  int econ,
+  int m,
+  int n,
+  const T* A,
+  int lda,
+  const T* S,
+  const T* U,
+  int ldu,
+  const T* V,
+  int ldv,
+  int* lwork,
+  gesvdjInfo_t params);
 template <>
 inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-  const float *A, int lda, const float *S, const float *U, int ldu,
-  const float *V, int ldv, int *lwork, gesvdjInfo_t params) {
-  return cusolverDnSgesvdj_bufferSize(handle, jobz, econ, m, n, A, lda, S, U,
-                                      ldu, V, ldv, lwork, params);
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  int econ,
+  int m,
+  int n,
+  const float* A,
+  int lda,
+  const float* S,
+  const float* U,
+  int ldu,
+  const float* V,
+  int ldv,
+  int* lwork,
+  gesvdjInfo_t params)
+{
+  return cusolverDnSgesvdj_bufferSize(
+    handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, params);
 }
 template <>
 inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-  const double *A, int lda, const double *S, const double *U, int ldu,
-  const double *V, int ldv, int *lwork, gesvdjInfo_t params) {
-  return cusolverDnDgesvdj_bufferSize(handle, jobz, econ, m, n, A, lda, S, U,
-                                      ldu, V, ldv, lwork, params);
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  int econ,
+  int m,
+  int n,
+  const double* A,
+  int lda,
+  const double* S,
+  const double* U,
+  int ldu,
+  const double* V,
+  int ldv,
+  int* lwork,
+  gesvdjInfo_t params)
+{
+  return cusolverDnDgesvdj_bufferSize(
+    handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, params);
 }
 template <typename T>
 inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-  T *A, int lda, T *S, T *U, int ldu, T *V, int ldv, T *work, int lwork,
-  int *info, gesvdjInfo_t params, cudaStream_t stream);
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  int econ,
+  int m,
+  int n,
+  T* A,
+  int lda,
+  T* S,
+  T* U,
+  int ldu,
+  T* V,
+  int ldv,
+  T* work,
+  int lwork,
+  int* info,
+  gesvdjInfo_t params,
+  cudaStream_t stream);
 template <>
 inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-  float *A, int lda, float *S, float *U, int ldu, float *V, int ldv,
-  float *work, int lwork, int *info, gesvdjInfo_t params, cudaStream_t stream) {
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  int econ,
+  int m,
+  int n,
+  float* A,
+  int lda,
+  float* S,
+  float* U,
+  int ldu,
+  float* V,
+  int ldv,
+  float* work,
+  int lwork,
+  int* info,
+  gesvdjInfo_t params,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnSgesvdj(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv,
-                           work, lwork, info, params);
+  return cusolverDnSgesvdj(
+    handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params);
 }
 template <>
 inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-  double *A, int lda, double *S, double *U, int ldu, double *V, int ldv,
-  double *work, int lwork, int *info, gesvdjInfo_t params,
-  cudaStream_t stream) {
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  int econ,
+  int m,
+  int n,
+  double* A,
+  int lda,
+  double* S,
+  double* U,
+  int ldu,
+  double* V,
+  int ldv,
+  double* work,
+  int lwork,
+  int* info,
+  gesvdjInfo_t params,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnDgesvdj(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv,
-                           work, lwork, info, params);
+  return cusolverDnDgesvdj(
+    handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params);
 }
 /** @} */
 
@@ -442,43 +779,74 @@ inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj(  // NOLINT
  */
 template <typename T>
 cusolverStatus_t cusolverDnpotrf_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, T *A, int lda,
-  int *Lwork);
+  cusolverDnHandle_t handle,
+  cublasFillMode_t uplo,
+  int n,
+  T* A,
+  int lda,
+  int* Lwork);
 
 template <>
 inline cusolverStatus_t cusolverDnpotrf_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, float *A, int lda,
-  int *Lwork) {
+  cusolverDnHandle_t handle,
+  cublasFillMode_t uplo,
+  int n,
+  float* A,
+  int lda,
+  int* Lwork)
+{
   return cusolverDnSpotrf_bufferSize(handle, uplo, n, A, lda, Lwork);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnpotrf_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, double *A, int lda,
-  int *Lwork) {
+  cusolverDnHandle_t handle,
+  cublasFillMode_t uplo,
+  int n,
+  double* A,
+  int lda,
+  int* Lwork)
+{
   return cusolverDnDpotrf_bufferSize(handle, uplo, n, A, lda, Lwork);
 }
 
 template <typename T>
 inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle,  // NOLINT
-                                        cublasFillMode_t uplo, int n, T *A,
-                                        int lda, T *Workspace, int Lwork,
-                                        int *devInfo, cudaStream_t stream);
+                                        cublasFillMode_t uplo,
+                                        int n,
+                                        T* A,
+                                        int lda,
+                                        T* Workspace,
+                                        int Lwork,
+                                        int* devInfo,
+                                        cudaStream_t stream);
 
 template <>
 inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle,  // NOLINT
-                                        cublasFillMode_t uplo, int n, float *A,
-                                        int lda, float *Workspace, int Lwork,
-                                        int *devInfo, cudaStream_t stream) {
+                                        cublasFillMode_t uplo,
+                                        int n,
+                                        float* A,
+                                        int lda,
+                                        float* Workspace,
+                                        int Lwork,
+                                        int* devInfo,
+                                        cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnSpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle,  // NOLINT
-                                        cublasFillMode_t uplo, int n, double *A,
-                                        int lda, double *Workspace, int Lwork,
-                                        int *devInfo, cudaStream_t stream) {
+                                        cublasFillMode_t uplo,
+                                        int n,
+                                        double* A,
+                                        int lda,
+                                        double* Workspace,
+                                        int Lwork,
+                                        int* devInfo,
+                                        cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnDpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo);
 }
@@ -490,26 +858,44 @@ inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle,  // NOLINT
  */
 template <typename T>
 cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle,  // NOLINT
-                                 cublasFillMode_t uplo, int n, int nrhs,
-                                 const T *A, int lda, T *B, int ldb,
-                                 int *devInfo, cudaStream_t stream);
+                                 cublasFillMode_t uplo,
+                                 int n,
+                                 int nrhs,
+                                 const T* A,
+                                 int lda,
+                                 T* B,
+                                 int ldb,
+                                 int* devInfo,
+                                 cudaStream_t stream);
 
 template <>
 inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle,  // NOLINT
-                                        cublasFillMode_t uplo, int n, int nrhs,
-                                        const float *A, int lda, float *B,
-                                        int ldb, int *devInfo,
-                                        cudaStream_t stream) {
+                                        cublasFillMode_t uplo,
+                                        int n,
+                                        int nrhs,
+                                        const float* A,
+                                        int lda,
+                                        float* B,
+                                        int ldb,
+                                        int* devInfo,
+                                        cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnSpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle,  // NOLINT
-                                        cublasFillMode_t uplo, int n, int nrhs,
-                                        const double *A, int lda, double *B,
-                                        int ldb, int *devInfo,
-                                        cudaStream_t stream) {
+                                        cublasFillMode_t uplo,
+                                        int n,
+                                        int nrhs,
+                                        const double* A,
+                                        int lda,
+                                        double* B,
+                                        int ldb,
+                                        int* devInfo,
+                                        cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnDpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo);
 }
@@ -520,38 +906,75 @@ inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle,  // NOLINT
  * @{
  */
 template <typename T>
-cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, int m,  // NOLINT
-                                 int n, T *A, int lda, T *TAU, T *Workspace,
-                                 int Lwork, int *devInfo, cudaStream_t stream);
+cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle,  // NOLINT
+                                 int m,
+                                 int n,
+                                 T* A,
+                                 int lda,
+                                 T* TAU,
+                                 T* Workspace,
+                                 int Lwork,
+                                 int* devInfo,
+                                 cudaStream_t stream);
 template <>
 inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle,  // NOLINT
-                                        int m, int n, float *A, int lda,
-                                        float *TAU, float *Workspace, int Lwork,
-                                        int *devInfo, cudaStream_t stream) {
+                                        int m,
+                                        int n,
+                                        float* A,
+                                        int lda,
+                                        float* TAU,
+                                        float* Workspace,
+                                        int Lwork,
+                                        int* devInfo,
+                                        cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnSgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
 }
 template <>
 inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle,  // NOLINT
-                                        int m, int n, double *A, int lda,
-                                        double *TAU, double *Workspace,
-                                        int Lwork, int *devInfo,
-                                        cudaStream_t stream) {
+                                        int m,
+                                        int n,
+                                        double* A,
+                                        int lda,
+                                        double* TAU,
+                                        double* Workspace,
+                                        int Lwork,
+                                        int* devInfo,
+                                        cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnDgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
 }
 
 template <typename T>
 cusolverStatus_t cusolverDngeqrf_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, int m, int n, T *A, int lda, int *Lwork);
+  cusolverDnHandle_t handle,
+  int m,
+  int n,
+  T* A,
+  int lda,
+  int* Lwork);
 template <>
 inline cusolverStatus_t cusolverDngeqrf_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *Lwork) {
+  cusolverDnHandle_t handle,
+  int m,
+  int n,
+  float* A,
+  int lda,
+  int* Lwork)
+{
   return cusolverDnSgeqrf_bufferSize(handle, m, n, A, lda, Lwork);
 }
 template <>
 inline cusolverStatus_t cusolverDngeqrf_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, int m, int n, double *A, int lda, int *Lwork) {
+  cusolverDnHandle_t handle,
+  int m,
+  int n,
+  double* A,
+  int lda,
+  int* Lwork)
+{
   return cusolverDnDgeqrf_bufferSize(handle, m, n, A, lda, Lwork);
 }
 /** @} */
@@ -562,38 +985,86 @@ inline cusolverStatus_t cusolverDngeqrf_bufferSize(  // NOLINT
  */
 template <typename T>
 cusolverStatus_t cusolverDnorgqr(  // NOLINT
-  cusolverDnHandle_t handle, int m, int n, int k, T *A, int lda, const T *tau,
-  T *work, int lwork, int *devInfo, cudaStream_t stream);
+  cusolverDnHandle_t handle,
+  int m,
+  int n,
+  int k,
+  T* A,
+  int lda,
+  const T* tau,
+  T* work,
+  int lwork,
+  int* devInfo,
+  cudaStream_t stream);
 template <>
 inline cusolverStatus_t cusolverDnorgqr(  // NOLINT
-  cusolverDnHandle_t handle, int m, int n, int k, float *A, int lda,
-  const float *tau, float *work, int lwork, int *devInfo, cudaStream_t stream) {
+  cusolverDnHandle_t handle,
+  int m,
+  int n,
+  int k,
+  float* A,
+  int lda,
+  const float* tau,
+  float* work,
+  int lwork,
+  int* devInfo,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnSorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo);
 }
 template <>
 inline cusolverStatus_t cusolverDnorgqr(  // NOLINT
-  cusolverDnHandle_t handle, int m, int n, int k, double *A, int lda,
-  const double *tau, double *work, int lwork, int *devInfo,
-  cudaStream_t stream) {
+  cusolverDnHandle_t handle,
+  int m,
+  int n,
+  int k,
+  double* A,
+  int lda,
+  const double* tau,
+  double* work,
+  int lwork,
+  int* devInfo,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnDorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo);
 }
 
 template <typename T>
 cusolverStatus_t cusolverDnorgqr_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, int m, int n, int k, const T *A, int lda,
-  const T *TAU, int *lwork);
+  cusolverDnHandle_t handle,
+  int m,
+  int n,
+  int k,
+  const T* A,
+  int lda,
+  const T* TAU,
+  int* lwork);
 template <>
 inline cusolverStatus_t cusolverDnorgqr_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, int m, int n, int k, const float *A, int lda,
-  const float *TAU, int *lwork) {
+  cusolverDnHandle_t handle,
+  int m,
+  int n,
+  int k,
+  const float* A,
+  int lda,
+  const float* TAU,
+  int* lwork)
+{
   return cusolverDnSorgqr_bufferSize(handle, m, n, k, A, lda, TAU, lwork);
 }
 template <>
 inline cusolverStatus_t cusolverDnorgqr_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, int m, int n, int k, const double *A, int lda,
-  const double *TAU, int *lwork) {
+  cusolverDnHandle_t handle,
+  int m,
+  int n,
+  int k,
+  const double* A,
+  int lda,
+  const double* TAU,
+  int* lwork)
+{
   return cusolverDnDorgqr_bufferSize(handle, m, n, k, A, lda, TAU, lwork);
 }
 /** @} */
@@ -604,53 +1075,114 @@ inline cusolverStatus_t cusolverDnorgqr_bufferSize(  // NOLINT
  */
 template <typename T>
 cusolverStatus_t cusolverDnormqr(cusolverDnHandle_t handle,  // NOLINT
-                                 cublasSideMode_t side, cublasOperation_t trans,
-                                 int m, int n, int k, const T *A, int lda,
-                                 const T *tau, T *C, int ldc, T *work,
-                                 int lwork, int *devInfo, cudaStream_t stream);
+                                 cublasSideMode_t side,
+                                 cublasOperation_t trans,
+                                 int m,
+                                 int n,
+                                 int k,
+                                 const T* A,
+                                 int lda,
+                                 const T* tau,
+                                 T* C,
+                                 int ldc,
+                                 T* work,
+                                 int lwork,
+                                 int* devInfo,
+                                 cudaStream_t stream);
 
 template <>
 inline cusolverStatus_t cusolverDnormqr(  // NOLINT
-  cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-  int m, int n, int k, const float *A, int lda, const float *tau, float *C,
-  int ldc, float *work, int lwork, int *devInfo, cudaStream_t stream) {
+  cusolverDnHandle_t handle,
+  cublasSideMode_t side,
+  cublasOperation_t trans,
+  int m,
+  int n,
+  int k,
+  const float* A,
+  int lda,
+  const float* tau,
+  float* C,
+  int ldc,
+  float* work,
+  int lwork,
+  int* devInfo,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnSormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc,
-                          work, lwork, devInfo);
+  return cusolverDnSormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnormqr(  // NOLINT
-  cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-  int m, int n, int k, const double *A, int lda, const double *tau, double *C,
-  int ldc, double *work, int lwork, int *devInfo, cudaStream_t stream) {
+  cusolverDnHandle_t handle,
+  cublasSideMode_t side,
+  cublasOperation_t trans,
+  int m,
+  int n,
+  int k,
+  const double* A,
+  int lda,
+  const double* tau,
+  double* C,
+  int ldc,
+  double* work,
+  int lwork,
+  int* devInfo,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnDormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc,
-                          work, lwork, devInfo);
+  return cusolverDnDormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo);
 }
 
 template <typename T>
 cusolverStatus_t cusolverDnormqr_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-  int m, int n, int k, const T *A, int lda, const T *tau, const T *C, int ldc,
-  int *lwork);
+  cusolverDnHandle_t handle,
+  cublasSideMode_t side,
+  cublasOperation_t trans,
+  int m,
+  int n,
+  int k,
+  const T* A,
+  int lda,
+  const T* tau,
+  const T* C,
+  int ldc,
+  int* lwork);
 
 template <>
 inline cusolverStatus_t cusolverDnormqr_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-  int m, int n, int k, const float *A, int lda, const float *tau,
-  const float *C, int ldc, int *lwork) {
-  return cusolverDnSormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau,
-                                     C, ldc, lwork);
+  cusolverDnHandle_t handle,
+  cublasSideMode_t side,
+  cublasOperation_t trans,
+  int m,
+  int n,
+  int k,
+  const float* A,
+  int lda,
+  const float* tau,
+  const float* C,
+  int ldc,
+  int* lwork)
+{
+  return cusolverDnSormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnormqr_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-  int m, int n, int k, const double *A, int lda, const double *tau,
-  const double *C, int ldc, int *lwork) {
-  return cusolverDnDormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau,
-                                     C, ldc, lwork);
+  cusolverDnHandle_t handle,
+  cublasSideMode_t side,
+  cublasOperation_t trans,
+  int m,
+  int n,
+  int k,
+  const double* A,
+  int lda,
+  const double* tau,
+  const double* C,
+  int ldc,
+  int* lwork)
+{
+  return cusolverDnDormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
 }
 /** @} */
 
@@ -660,62 +1192,136 @@ inline cusolverStatus_t cusolverDnormqr_bufferSize(  // NOLINT
  */
 template <typename T>
 cusolverStatus_t cusolverSpcsrqrBufferInfoBatched(  // NOLINT
-  cusolverSpHandle_t handle, int m, int n, int nnzA,
-  const cusparseMatDescr_t descrA, const T *csrValA, const int *csrRowPtrA,
-  const int *csrColIndA, int batchSize, csrqrInfo_t info,
-  size_t *internalDataInBytes, size_t *workspaceInBytes);
+  cusolverSpHandle_t handle,
+  int m,
+  int n,
+  int nnzA,
+  const cusparseMatDescr_t descrA,
+  const T* csrValA,
+  const int* csrRowPtrA,
+  const int* csrColIndA,
+  int batchSize,
+  csrqrInfo_t info,
+  size_t* internalDataInBytes,
+  size_t* workspaceInBytes);
 
 template <>
 inline cusolverStatus_t cusolverSpcsrqrBufferInfoBatched(  // NOLINT
-  cusolverSpHandle_t handle, int m, int n, int nnzA,
-  const cusparseMatDescr_t descrA, const float *csrValA, const int *csrRowPtrA,
-  const int *csrColIndA, int batchSize, csrqrInfo_t info,
-  size_t *internalDataInBytes, size_t *workspaceInBytes) {
-  return cusolverSpScsrqrBufferInfoBatched(
-    handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, batchSize,
-    info, internalDataInBytes, workspaceInBytes);
+  cusolverSpHandle_t handle,
+  int m,
+  int n,
+  int nnzA,
+  const cusparseMatDescr_t descrA,
+  const float* csrValA,
+  const int* csrRowPtrA,
+  const int* csrColIndA,
+  int batchSize,
+  csrqrInfo_t info,
+  size_t* internalDataInBytes,
+  size_t* workspaceInBytes)
+{
+  return cusolverSpScsrqrBufferInfoBatched(handle,
+                                           m,
+                                           n,
+                                           nnzA,
+                                           descrA,
+                                           csrValA,
+                                           csrRowPtrA,
+                                           csrColIndA,
+                                           batchSize,
+                                           info,
+                                           internalDataInBytes,
+                                           workspaceInBytes);
 }
 
 template <>
 inline cusolverStatus_t cusolverSpcsrqrBufferInfoBatched(  // NOLINT
-  cusolverSpHandle_t handle, int m, int n, int nnzA,
-  const cusparseMatDescr_t descrA, const double *csrValA, const int *csrRowPtrA,
-  const int *csrColIndA, int batchSize, csrqrInfo_t info,
-  size_t *internalDataInBytes, size_t *workspaceInBytes) {
-  return cusolverSpDcsrqrBufferInfoBatched(
-    handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, batchSize,
-    info, internalDataInBytes, workspaceInBytes);
+  cusolverSpHandle_t handle,
+  int m,
+  int n,
+  int nnzA,
+  const cusparseMatDescr_t descrA,
+  const double* csrValA,
+  const int* csrRowPtrA,
+  const int* csrColIndA,
+  int batchSize,
+  csrqrInfo_t info,
+  size_t* internalDataInBytes,
+  size_t* workspaceInBytes)
+{
+  return cusolverSpDcsrqrBufferInfoBatched(handle,
+                                           m,
+                                           n,
+                                           nnzA,
+                                           descrA,
+                                           csrValA,
+                                           csrRowPtrA,
+                                           csrColIndA,
+                                           batchSize,
+                                           info,
+                                           internalDataInBytes,
+                                           workspaceInBytes);
 }
 
 template <typename T>
 cusolverStatus_t cusolverSpcsrqrsvBatched(  // NOLINT
-  cusolverSpHandle_t handle, int m, int n, int nnzA,
-  const cusparseMatDescr_t descrA, const T *csrValA, const int *csrRowPtrA,
-  const int *csrColIndA, const T *b, T *x, int batchSize, csrqrInfo_t info,
-  void *pBuffer, cudaStream_t stream);
+  cusolverSpHandle_t handle,
+  int m,
+  int n,
+  int nnzA,
+  const cusparseMatDescr_t descrA,
+  const T* csrValA,
+  const int* csrRowPtrA,
+  const int* csrColIndA,
+  const T* b,
+  T* x,
+  int batchSize,
+  csrqrInfo_t info,
+  void* pBuffer,
+  cudaStream_t stream);
 
 template <>
 inline cusolverStatus_t cusolverSpcsrqrsvBatched(  // NOLINT
-  cusolverSpHandle_t handle, int m, int n, int nnzA,
-  const cusparseMatDescr_t descrA, const float *csrValA, const int *csrRowPtrA,
-  const int *csrColIndA, const float *b, float *x, int batchSize,
-  csrqrInfo_t info, void *pBuffer, cudaStream_t stream) {
+  cusolverSpHandle_t handle,
+  int m,
+  int n,
+  int nnzA,
+  const cusparseMatDescr_t descrA,
+  const float* csrValA,
+  const int* csrRowPtrA,
+  const int* csrColIndA,
+  const float* b,
+  float* x,
+  int batchSize,
+  csrqrInfo_t info,
+  void* pBuffer,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverSpSetStream(handle, stream));
-  return cusolverSpScsrqrsvBatched(handle, m, n, nnzA, descrA, csrValA,
-                                   csrRowPtrA, csrColIndA, b, x, batchSize,
-                                   info, pBuffer);
+  return cusolverSpScsrqrsvBatched(
+    handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, b, x, batchSize, info, pBuffer);
 }
 
 template <>
 inline cusolverStatus_t cusolverSpcsrqrsvBatched(  // NOLINT
-  cusolverSpHandle_t handle, int m, int n, int nnzA,
-  const cusparseMatDescr_t descrA, const double *csrValA, const int *csrRowPtrA,
-  const int *csrColIndA, const double *b, double *x, int batchSize,
-  csrqrInfo_t info, void *pBuffer, cudaStream_t stream) {
+  cusolverSpHandle_t handle,
+  int m,
+  int n,
+  int nnzA,
+  const cusparseMatDescr_t descrA,
+  const double* csrValA,
+  const int* csrRowPtrA,
+  const int* csrColIndA,
+  const double* b,
+  double* x,
+  int batchSize,
+  csrqrInfo_t info,
+  void* pBuffer,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverSpSetStream(handle, stream));
-  return cusolverSpDcsrqrsvBatched(handle, m, n, nnzA, descrA, csrValA,
-                                   csrRowPtrA, csrColIndA, b, x, batchSize,
-                                   info, pBuffer);
+  return cusolverSpDcsrqrsvBatched(
+    handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, b, x, batchSize, info, pBuffer);
 }
 /** @} */
 
diff --git a/cpp/include/raft/linalg/divide.cuh b/cpp/include/raft/linalg/divide.cuh
index c848ac1f4b..562a3d8991 100644
--- a/cpp/include/raft/linalg/divide.cuh
+++ b/cpp/include/raft/linalg/divide.cuh
@@ -33,11 +33,10 @@ namespace linalg {
  * @{
  */
 template <typename math_t, typename IdxType = int>
-void divideScalar(math_t *out, const math_t *in, math_t scalar, IdxType len,
-                  cudaStream_t stream) {
+void divideScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream)
+{
   unaryOp(
-    out, in, len, [scalar] __device__(math_t in) { return in / scalar; },
-    stream);
+    out, in, len, [scalar] __device__(math_t in) { return in / scalar; }, stream);
 }
 /** @} */
 
diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh
index 6172618380..75e77ac0ce 100644
--- a/cpp/include/raft/linalg/eig.cuh
+++ b/cpp/include/raft/linalg/eig.cuh
@@ -41,26 +41,43 @@ namespace linalg {
  * @{
  */
 template <typename math_t>
-void eigDC(const raft::handle_t &handle, const math_t *in, int n_rows,
-           int n_cols, math_t *eig_vectors, math_t *eig_vals,
-           cudaStream_t stream) {
-  auto allocator = handle.get_device_allocator();
+void eigDC(const raft::handle_t& handle,
+           const math_t* in,
+           int n_rows,
+           int n_cols,
+           math_t* eig_vectors,
+           math_t* eig_vals,
+           cudaStream_t stream)
+{
+  auto allocator               = handle.get_device_allocator();
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
 
   int lwork;
-  CUSOLVER_CHECK(cusolverDnsyevd_bufferSize(cusolverH, CUSOLVER_EIG_MODE_VECTOR,
-                                            CUBLAS_FILL_MODE_UPPER, n_rows, in,
-                                            n_cols, eig_vals, &lwork));
+  CUSOLVER_CHECK(cusolverDnsyevd_bufferSize(cusolverH,
+                                            CUSOLVER_EIG_MODE_VECTOR,
+                                            CUBLAS_FILL_MODE_UPPER,
+                                            n_rows,
+                                            in,
+                                            n_cols,
+                                            eig_vals,
+                                            &lwork));
 
   raft::mr::device::buffer<math_t> d_work(allocator, stream, lwork);
   raft::mr::device::buffer<int> d_dev_info(allocator, stream, 1);
 
   raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream);
 
-  CUSOLVER_CHECK(cusolverDnsyevd(cusolverH, CUSOLVER_EIG_MODE_VECTOR,
-                                 CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors,
-                                 n_cols, eig_vals, d_work.data(), lwork,
-                                 d_dev_info.data(), stream));
+  CUSOLVER_CHECK(cusolverDnsyevd(cusolverH,
+                                 CUSOLVER_EIG_MODE_VECTOR,
+                                 CUBLAS_FILL_MODE_UPPER,
+                                 n_rows,
+                                 eig_vectors,
+                                 n_cols,
+                                 eig_vals,
+                                 d_work.data(),
+                                 lwork,
+                                 d_dev_info.data(),
+                                 stream));
   CUDA_CHECK(cudaGetLastError());
 
   int dev_info;
@@ -90,39 +107,80 @@ enum EigVecMemUsage { OVERWRITE_INPUT, COPY_INPUT };
  * @{
  */
 template <typename math_t>
-void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols,
-              int n_eig_vals, math_t *eig_vectors, math_t *eig_vals,
-              EigVecMemUsage memUsage, cudaStream_t stream) {
-  auto allocator = handle.get_device_allocator();
+void eigSelDC(const raft::handle_t& handle,
+              math_t* in,
+              int n_rows,
+              int n_cols,
+              int n_eig_vals,
+              math_t* eig_vectors,
+              math_t* eig_vals,
+              EigVecMemUsage memUsage,
+              cudaStream_t stream)
+{
+  auto allocator               = handle.get_device_allocator();
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
 
   int lwork;
   int h_meig;
 
-  CUSOLVER_CHECK(cusolverDnsyevdx_bufferSize(
-    cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I,
-    CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0),
-    n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, &lwork));
+  CUSOLVER_CHECK(cusolverDnsyevdx_bufferSize(cusolverH,
+                                             CUSOLVER_EIG_MODE_VECTOR,
+                                             CUSOLVER_EIG_RANGE_I,
+                                             CUBLAS_FILL_MODE_UPPER,
+                                             n_rows,
+                                             in,
+                                             n_cols,
+                                             math_t(0.0),
+                                             math_t(0.0),
+                                             n_cols - n_eig_vals + 1,
+                                             n_cols,
+                                             &h_meig,
+                                             eig_vals,
+                                             &lwork));
 
   raft::mr::device::buffer<math_t> d_work(allocator, stream, lwork);
   raft::mr::device::buffer<int> d_dev_info(allocator, stream, 1);
   raft::mr::device::buffer<math_t> d_eig_vectors(allocator, stream, 0);
 
   if (memUsage == OVERWRITE_INPUT) {
-    CUSOLVER_CHECK(cusolverDnsyevdx(
-      cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I,
-      CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0),
-      n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, d_work.data(), lwork,
-      d_dev_info.data(), stream));
+    CUSOLVER_CHECK(cusolverDnsyevdx(cusolverH,
+                                    CUSOLVER_EIG_MODE_VECTOR,
+                                    CUSOLVER_EIG_RANGE_I,
+                                    CUBLAS_FILL_MODE_UPPER,
+                                    n_rows,
+                                    in,
+                                    n_cols,
+                                    math_t(0.0),
+                                    math_t(0.0),
+                                    n_cols - n_eig_vals + 1,
+                                    n_cols,
+                                    &h_meig,
+                                    eig_vals,
+                                    d_work.data(),
+                                    lwork,
+                                    d_dev_info.data(),
+                                    stream));
   } else if (memUsage == COPY_INPUT) {
     d_eig_vectors.resize(n_rows * n_cols, stream);
     raft::matrix::copy(in, d_eig_vectors.data(), n_rows, n_cols, stream);
 
-    CUSOLVER_CHECK(cusolverDnsyevdx(
-      cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I,
-      CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, n_cols, math_t(0.0),
-      math_t(0.0), n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals,
-      d_work.data(), lwork, d_dev_info.data(), stream));
+    CUSOLVER_CHECK(cusolverDnsyevdx(cusolverH,
+                                    CUSOLVER_EIG_MODE_VECTOR,
+                                    CUSOLVER_EIG_RANGE_I,
+                                    CUBLAS_FILL_MODE_UPPER,
+                                    n_rows,
+                                    eig_vectors,
+                                    n_cols,
+                                    math_t(0.0),
+                                    math_t(0.0),
+                                    n_cols - n_eig_vals + 1,
+                                    n_cols,
+                                    &h_meig,
+                                    eig_vals,
+                                    d_work.data(),
+                                    lwork,
+                                    d_dev_info.data(),
+                                    stream));
   }
 
   CUDA_CHECK(cudaGetLastError());
@@ -135,11 +193,10 @@ void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols,
          "This usually occurs when some of the features do not vary enough.");
 
   if (memUsage == OVERWRITE_INPUT) {
-    raft::matrix::truncZeroOrigin(in, n_rows, eig_vectors, n_rows, n_eig_vals,
-                                  stream);
+    raft::matrix::truncZeroOrigin(in, n_rows, eig_vectors, n_rows, n_eig_vals, stream);
   } else if (memUsage == COPY_INPUT) {
-    raft::matrix::truncZeroOrigin(d_eig_vectors.data(), n_rows, eig_vectors,
-                                  n_rows, n_eig_vals, stream);
+    raft::matrix::truncZeroOrigin(
+      d_eig_vectors.data(), n_rows, eig_vectors, n_rows, n_eig_vals, stream);
   }
 }
 
@@ -160,10 +217,17 @@ void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols,
  * @{
  */
 template <typename math_t>
-void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows,
-               int n_cols, math_t *eig_vectors, math_t *eig_vals,
-               cudaStream_t stream, math_t tol = 1.e-7, int sweeps = 15) {
-  auto allocator = handle.get_device_allocator();
+void eigJacobi(const raft::handle_t& handle,
+               const math_t* in,
+               int n_rows,
+               int n_cols,
+               math_t* eig_vectors,
+               math_t* eig_vals,
+               cudaStream_t stream,
+               math_t tol = 1.e-7,
+               int sweeps = 15)
+{
+  auto allocator               = handle.get_device_allocator();
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
 
   syevjInfo_t syevj_params = nullptr;
@@ -172,23 +236,36 @@ void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows,
   CUSOLVER_CHECK(cusolverDnXsyevjSetMaxSweeps(syevj_params, sweeps));
 
   int lwork;
-  CUSOLVER_CHECK(cusolverDnsyevj_bufferSize(
-    cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, n_rows,
-    eig_vectors, n_cols, eig_vals, &lwork, syevj_params));
+  CUSOLVER_CHECK(cusolverDnsyevj_bufferSize(cusolverH,
+                                            CUSOLVER_EIG_MODE_VECTOR,
+                                            CUBLAS_FILL_MODE_UPPER,
+                                            n_rows,
+                                            eig_vectors,
+                                            n_cols,
+                                            eig_vals,
+                                            &lwork,
+                                            syevj_params));
 
   raft::mr::device::buffer<math_t> d_work(allocator, stream, lwork);
   raft::mr::device::buffer<int> dev_info(allocator, stream, 1);
 
   raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream);
 
-  CUSOLVER_CHECK(cusolverDnsyevj(cusolverH, CUSOLVER_EIG_MODE_VECTOR,
-                                 CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors,
-                                 n_cols, eig_vals, d_work.data(), lwork,
-                                 dev_info.data(), syevj_params, stream));
+  CUSOLVER_CHECK(cusolverDnsyevj(cusolverH,
+                                 CUSOLVER_EIG_MODE_VECTOR,
+                                 CUBLAS_FILL_MODE_UPPER,
+                                 n_rows,
+                                 eig_vectors,
+                                 n_cols,
+                                 eig_vals,
+                                 d_work.data(),
+                                 lwork,
+                                 dev_info.data(),
+                                 syevj_params,
+                                 stream));
 
   int executed_sweeps;
-  CUSOLVER_CHECK(
-    cusolverDnXsyevjGetSweeps(cusolverH, syevj_params, &executed_sweeps));
+  CUSOLVER_CHECK(cusolverDnXsyevjGetSweeps(cusolverH, syevj_params, &executed_sweeps));
 
   CUDA_CHECK(cudaGetLastError());
   CUSOLVER_CHECK(cusolverDnDestroySyevjInfo(syevj_params));
diff --git a/cpp/include/raft/linalg/eltwise.cuh b/cpp/include/raft/linalg/eltwise.cuh
index 1c6dee562d..097c3ac218 100644
--- a/cpp/include/raft/linalg/eltwise.cuh
+++ b/cpp/include/raft/linalg/eltwise.cuh
@@ -34,19 +34,17 @@ namespace linalg {
  * @{
  */
 template <typename InType, typename IdxType, typename OutType = InType>
-void scalarAdd(OutType *out, const InType *in, InType scalar, IdxType len,
-               cudaStream_t stream) {
+void scalarAdd(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream)
+{
   raft::linalg::unaryOp(
-    out, in, len, [scalar] __device__(InType in) { return in + scalar; },
-    stream);
+    out, in, len, [scalar] __device__(InType in) { return in + scalar; }, stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
-void scalarMultiply(OutType *out, const InType *in, InType scalar, IdxType len,
-                    cudaStream_t stream) {
+void scalarMultiply(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream)
+{
   raft::linalg::unaryOp(
-    out, in, len, [scalar] __device__(InType in) { return in * scalar; },
-    stream);
+    out, in, len, [scalar] __device__(InType in) { return in * scalar; }, stream);
 }
 /** @} */
 
@@ -62,42 +60,46 @@ void scalarMultiply(OutType *out, const InType *in, InType scalar, IdxType len,
  * @{
  */
 template <typename InType, typename IdxType, typename OutType = InType>
-void eltwiseAdd(OutType *out, const InType *in1, const InType *in2, IdxType len,
-                cudaStream_t stream) {
+void eltwiseAdd(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
+{
   binaryOp(
-    out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; },
-    stream);
+    out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
-void eltwiseSub(OutType *out, const InType *in1, const InType *in2, IdxType len,
-                cudaStream_t stream) {
+void eltwiseSub(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
+{
   binaryOp(
-    out, in1, in2, len, [] __device__(InType a, InType b) { return a - b; },
-    stream);
+    out, in1, in2, len, [] __device__(InType a, InType b) { return a - b; }, stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
-void eltwiseMultiply(OutType *out, const InType *in1, const InType *in2,
-                     IdxType len, cudaStream_t stream) {
+void eltwiseMultiply(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
+{
   binaryOp(
-    out, in1, in2, len, [] __device__(InType a, InType b) { return a * b; },
-    stream);
+    out, in1, in2, len, [] __device__(InType a, InType b) { return a * b; }, stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
-void eltwiseDivide(OutType *out, const InType *in1, const InType *in2,
-                   IdxType len, cudaStream_t stream) {
+void eltwiseDivide(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
+{
   binaryOp(
-    out, in1, in2, len, [] __device__(InType a, InType b) { return a / b; },
-    stream);
+    out, in1, in2, len, [] __device__(InType a, InType b) { return a / b; }, stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
-void eltwiseDivideCheckZero(OutType *out, const InType *in1, const InType *in2,
-                            IdxType len, cudaStream_t stream) {
+void eltwiseDivideCheckZero(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
+{
   binaryOp(
-    out, in1, in2, len,
+    out,
+    in1,
+    in2,
+    len,
     [] __device__(InType a, InType b) {
       if (b == InType(0.0))
         return InType(0.0);
diff --git a/cpp/include/raft/linalg/gemm.cuh b/cpp/include/raft/linalg/gemm.cuh
index 0a4897cc0b..d5942b7446 100644
--- a/cpp/include/raft/linalg/gemm.cuh
+++ b/cpp/include/raft/linalg/gemm.cuh
@@ -43,35 +43,53 @@ namespace linalg {
  * @param stream cuda stream
  */
 template <typename math_t>
-void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a,
-          int n_cols_a, const math_t *b, math_t *c, int n_rows_c, int n_cols_c,
-          cublasOperation_t trans_a, cublasOperation_t trans_b, math_t alpha,
-          math_t beta, cudaStream_t stream) {
+void gemm(const raft::handle_t& handle,
+          const math_t* a,
+          int n_rows_a,
+          int n_cols_a,
+          const math_t* b,
+          math_t* c,
+          int n_rows_c,
+          int n_cols_c,
+          cublasOperation_t trans_a,
+          cublasOperation_t trans_b,
+          math_t alpha,
+          math_t beta,
+          cudaStream_t stream)
+{
   cublasHandle_t cublas_h = handle.get_cublas_handle();
 
-  int m = n_rows_c;
-  int n = n_cols_c;
-  int k = trans_a == CUBLAS_OP_T ? n_rows_a : n_cols_a;
+  int m   = n_rows_c;
+  int n   = n_cols_c;
+  int k   = trans_a == CUBLAS_OP_T ? n_rows_a : n_cols_a;
   int lda = trans_a == CUBLAS_OP_T ? k : m;
   int ldb = trans_b == CUBLAS_OP_T ? n : k;
   int ldc = m;
-  CUBLAS_CHECK(cublasgemm(cublas_h, trans_a, trans_b, m, n, k, &alpha, a, lda,
-                          b, ldb, &beta, c, ldc, stream));
+  CUBLAS_CHECK(
+    cublasgemm(cublas_h, trans_a, trans_b, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc, stream));
 }
 
 template <typename math_t>
-void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a,
-          int n_cols_a, const math_t *b, math_t *c, int n_rows_c, int n_cols_c,
-          cublasOperation_t trans_a, cublasOperation_t trans_b,
-          cudaStream_t stream) {
+void gemm(const raft::handle_t& handle,
+          const math_t* a,
+          int n_rows_a,
+          int n_cols_a,
+          const math_t* b,
+          math_t* c,
+          int n_rows_c,
+          int n_cols_c,
+          cublasOperation_t trans_a,
+          cublasOperation_t trans_b,
+          cudaStream_t stream)
+{
   math_t alpha = math_t(1);
-  math_t beta = math_t(0);
-  gemm(handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a,
-       trans_b, alpha, beta, stream);
+  math_t beta  = math_t(0);
+  gemm(
+    handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream);
 }
 
 /**
- * @brief A wrapper for CUBLS GEMM function designed for handling all possible 
+ * @brief A wrapper for cuBLAS GEMM function designed for handling all possible
  * combinations of operand layouts.
  * It computes the following equation: Z = alpha . X * Y + beta . Z
  * @tparam T Data type of input/output matrices (float/double)
@@ -90,9 +108,20 @@ void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a,
  * @param beta scalar
  */
 template <typename T>
-void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N,
-          int _K, bool isZColMajor, bool isXColMajor, bool isYColMajor,
-          cudaStream_t stream, T alpha = T(1.0), T beta = T(0.0)) {
+void gemm(const raft::handle_t& handle,
+          T* z,
+          T* x,
+          T* y,
+          int _M,
+          int _N,
+          int _K,
+          bool isZColMajor,
+          bool isXColMajor,
+          bool isYColMajor,
+          cudaStream_t stream,
+          T alpha = T(1.0),
+          T beta  = T(0.0))
+{
   cublasHandle_t cublas_h = handle.get_cublas_handle();
 
   cublasOperation_t trans_a, trans_b;
@@ -119,13 +148,13 @@ void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N,
     // therefore trans_x needs to be CUBLAS_OP_T. If x is in column major
     // layout, trans_b needs to be CUBLAS_OP_N.
     trans_b = isYColMajor == true ? CUBLAS_OP_N : CUBLAS_OP_T;
-    ldb = isYColMajor == true ? _K : _N;
+    ldb     = isYColMajor == true ? _K : _N;
 
-    c = z;
+    c   = z;
     ldc = _M;
-    M = _M;
-    N = _N;
-    K = _K;
+    M   = _M;
+    N   = _N;
+    K   = _K;
   } else {
     // Result c is required in row major layout Thus we pick
     // a = y, b = x and c = a * b = y * x
@@ -154,7 +183,7 @@ void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N,
     // Set leading dimension appropriately
     ldb = isXColMajor == true ? _M : _K;
 
-    c = z;
+    c   = z;
     ldc = _N;
 
     M = _N;
@@ -162,8 +191,8 @@ void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N,
     K = _K;
   }
   // Actual cuBLAS call
-  CUBLAS_CHECK(cublasgemm(cublas_h, trans_a, trans_b, M, N, K, &alpha, a, lda,
-                          b, ldb, &beta, c, ldc, stream));
+  CUBLAS_CHECK(
+    cublasgemm(cublas_h, trans_a, trans_b, M, N, K, &alpha, a, lda, b, ldb, &beta, c, ldc, stream));
 }
 
 }  // end namespace linalg
diff --git a/cpp/include/raft/linalg/gemv.h b/cpp/include/raft/linalg/gemv.h
index edd18b3bee..a78480bb21 100644
--- a/cpp/include/raft/linalg/gemv.h
+++ b/cpp/include/raft/linalg/gemv.h
@@ -26,9 +26,19 @@ namespace raft {
 namespace linalg {
 
 template <typename math_t>
-void gemv(const raft::handle_t& handle, const math_t* a, int n_rows, int n_cols,
-          const math_t* x, int incx, math_t* y, int incy, bool trans_a,
-          math_t alpha, math_t beta, cudaStream_t stream) {
+void gemv(const raft::handle_t& handle,
+          const math_t* a,
+          int n_rows,
+          int n_cols,
+          const math_t* x,
+          int incx,
+          math_t* y,
+          int incy,
+          bool trans_a,
+          math_t alpha,
+          math_t beta,
+          cudaStream_t stream)
+{
   cublasHandle_t cublas_h = handle.get_cublas_handle();
 
   cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
@@ -40,33 +50,47 @@ void gemv(const raft::handle_t& handle, const math_t* a, int n_rows, int n_cols,
   //  n - number of columns in input matrix
   //  lda - purpose of it  to have ability to operate on submatrices of matrix without copying.
   //        If you're not think about it it's always should be equal to m
-  //  lda has deal with memory layout, but has nothing with the requirement for cuBLAS perform transpose
+  //  lda deals with memory layout, but has nothing to do with the requirement for cuBLAS to
+  //  perform a transpose
 
   // In Machine Learning:
   //  m - nunmber of columns in design matrix(number of features)
   //  n - number of rows in designed matrix (number of train examples)
 
-  int m = n_rows;
-  int n = n_cols;
+  int m   = n_rows;
+  int n   = n_cols;
   int lda = trans_a ? m : n;
 
-  CUBLAS_CHECK(cublasgemv(cublas_h, op_a, m, n, &alpha, a, lda, x, incx, &beta,
-                          y, incy, stream));
+  CUBLAS_CHECK(cublasgemv(cublas_h, op_a, m, n, &alpha, a, lda, x, incx, &beta, y, incy, stream));
 }
 
 template <typename math_t>
-void gemv(const raft::handle_t& handle, const math_t* a, int n_rows_a,
-          int n_cols_a, const math_t* x, math_t* y, bool trans_a, math_t alpha,
-          math_t beta, cudaStream_t stream) {
+void gemv(const raft::handle_t& handle,
+          const math_t* a,
+          int n_rows_a,
+          int n_cols_a,
+          const math_t* x,
+          math_t* y,
+          bool trans_a,
+          math_t alpha,
+          math_t beta,
+          cudaStream_t stream)
+{
   gemv(handle, a, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream);
 }
 
 template <typename math_t>
-void gemv(const raft::handle_t& handle, const math_t* a, int n_rows_a,
-          int n_cols_a, const math_t* x, math_t* y, bool trans_a,
-          cudaStream_t stream) {
+void gemv(const raft::handle_t& handle,
+          const math_t* a,
+          int n_rows_a,
+          int n_cols_a,
+          const math_t* x,
+          math_t* y,
+          bool trans_a,
+          cudaStream_t stream)
+{
   math_t alpha = math_t(1);
-  math_t beta = math_t(0);
+  math_t beta  = math_t(0);
 
   gemv(handle, a, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream);
 }
diff --git a/cpp/include/raft/linalg/init.h b/cpp/include/raft/linalg/init.h
index cb2e8ed1ab..2086172f5d 100644
--- a/cpp/include/raft/linalg/init.h
+++ b/cpp/include/raft/linalg/init.h
@@ -36,7 +36,8 @@ namespace {
  * \param [in] stream cuda stream
  */
 template <typename T>
-void range(T *out, int start, int end, cudaStream_t stream) {
+void range(T* out, int start, int end, cudaStream_t stream)
+{
   thrust::counting_iterator<int> first(start);
   thrust::counting_iterator<int> last = first + (end - start);
   thrust::device_ptr<T> ptr(out);
@@ -53,7 +54,8 @@ void range(T *out, int start, int end, cudaStream_t stream) {
  * \param [in] stream cuda stream
  */
 template <typename T, int TPB = 256>
-void range(T *out, int n, cudaStream_t stream) {
+void range(T* out, int n, cudaStream_t stream)
+{
   range(out, 0, n, stream);
 }
 }  // unnamed namespace
diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp
index b775a1f696..39089473e3 100644
--- a/cpp/include/raft/linalg/lanczos.hpp
+++ b/cpp/include/raft/linalg/lanczos.hpp
@@ -16,7 +16,7 @@
 
 #pragma once
 
-//for cmath:
+// for cmath:
 #define _USE_MATH_DEFINES
 
 #include <cmath>
@@ -40,14 +40,14 @@ using namespace linalg;
 namespace spectral {
 
 // curandGeneratorNormalX
-inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator,
-                                            float *outputPtr, size_t n,
-                                            float mean, float stddev) {
+inline curandStatus_t curandGenerateNormalX(
+  curandGenerator_t generator, float* outputPtr, size_t n, float mean, float stddev)
+{
   return curandGenerateNormal(generator, outputPtr, n, mean, stddev);
 }
-inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator,
-                                            double *outputPtr, size_t n,
-                                            double mean, double stddev) {
+inline curandStatus_t curandGenerateNormalX(
+  curandGenerator_t generator, double* outputPtr, size_t n, double mean, double stddev)
+{
   return curandGenerateNormalDouble(generator, outputPtr, n, mean, stddev);
 }
 
@@ -55,7 +55,7 @@ inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator,
 // Helper functions
 // =========================================================
 
-/**  
+/**
  *  @brief  Perform Lanczos iteration
  *    Lanczos iteration is performed on a shifted matrix A+shift*I.
  *  @tparam index_type_t the type of data used for indexing.
@@ -85,25 +85,30 @@ inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator,
  *  @return Zero if successful. Otherwise non-zero.
  */
 template <typename index_type_t, typename value_type_t>
-int performLanczosIteration(
-  handle_t const &handle, sparse_matrix_t<index_type_t, value_type_t> const *A,
-  index_type_t *iter, index_type_t maxIter, value_type_t shift,
-  value_type_t tol, bool reorthogonalize, value_type_t *__restrict__ alpha_host,
-  value_type_t *__restrict__ beta_host,
-  value_type_t *__restrict__ lanczosVecs_dev,
-  value_type_t *__restrict__ work_dev) {
+int performLanczosIteration(handle_t const& handle,
+                            sparse_matrix_t<index_type_t, value_type_t> const* A,
+                            index_type_t* iter,
+                            index_type_t maxIter,
+                            value_type_t shift,
+                            value_type_t tol,
+                            bool reorthogonalize,
+                            value_type_t* __restrict__ alpha_host,
+                            value_type_t* __restrict__ beta_host,
+                            value_type_t* __restrict__ lanczosVecs_dev,
+                            value_type_t* __restrict__ work_dev)
+{
   // -------------------------------------------------------
   // Variable declaration
   // -------------------------------------------------------
 
   // Useful variables
-  constexpr value_type_t one = 1;
+  constexpr value_type_t one    = 1;
   constexpr value_type_t negOne = -1;
-  constexpr value_type_t zero = 0;
+  constexpr value_type_t zero   = 0;
   value_type_t alpha;
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream = handle.get_stream();
+  auto stream   = handle.get_stream();
 
   RAFT_EXPECTS(A != nullptr, "Null matrix pointer.");
 
@@ -117,29 +122,28 @@ int performLanczosIteration(
 
     // Apply matrix
     if (shift != 0)
-      CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, lanczosVecs_dev,
+      CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n,
+                               lanczosVecs_dev,
                                n * sizeof(value_type_t),
-                               cudaMemcpyDeviceToDevice, stream));
+                               cudaMemcpyDeviceToDevice,
+                               stream));
     A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev + n);
 
     // Orthogonalize Lanczos vector
-    CUBLAS_CHECK(cublasdot(cublas_h, n, lanczosVecs_dev, 1,
-                           lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host,
-                           stream));
+    CUBLAS_CHECK(cublasdot(
+      cublas_h, n, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, stream));
 
     alpha = -alpha_host[0];
-    CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, lanczosVecs_dev, 1,
-                            lanczosVecs_dev + IDX(0, 1, n), 1, stream));
-    CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1,
-                            beta_host, stream));
+    CUBLAS_CHECK(cublasaxpy(
+      cublas_h, n, &alpha, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, stream));
+    CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, beta_host, stream));
 
     // Check if Lanczos has converged
     if (beta_host[0] <= tol) return 0;
 
     // Normalize Lanczos vector
     alpha = 1 / beta_host[0];
-    CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n),
-                            1, stream));
+    CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), 1, stream));
   }
 
   // -------------------------------------------------------
@@ -151,65 +155,121 @@ int performLanczosIteration(
 
     // Apply matrix
     if (shift != 0)
-      CUDA_TRY(cudaMemcpyAsync(
-        lanczosVecs_dev + (*iter) * n, lanczosVecs_dev + (*iter - 1) * n,
-        n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream));
-    A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift,
-          lanczosVecs_dev + IDX(0, *iter, n));
+      CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + (*iter) * n,
+                               lanczosVecs_dev + (*iter - 1) * n,
+                               n * sizeof(value_type_t),
+                               cudaMemcpyDeviceToDevice,
+                               stream));
+    A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, lanczosVecs_dev + IDX(0, *iter, n));
 
     // Full reorthogonalization
     //   "Twice is enough" algorithm per Kahan and Parlett
     if (reorthogonalize) {
-      CUBLAS_CHECK(cublasgemv(
-        cublas_h, CUBLAS_OP_T, n, *iter, &one, lanczosVecs_dev, n,
-        lanczosVecs_dev + IDX(0, *iter, n), 1, &zero, work_dev, 1, stream));
-
-      CUBLAS_CHECK(cublasgemv(cublas_h, CUBLAS_OP_N, n, *iter, &negOne,
-                              lanczosVecs_dev, n, work_dev, 1, &one,
-                              lanczosVecs_dev + IDX(0, *iter, n), 1, stream));
-
-      CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), work_dev + (*iter - 1),
-                               sizeof(value_type_t), cudaMemcpyDeviceToHost,
+      CUBLAS_CHECK(cublasgemv(cublas_h,
+                              CUBLAS_OP_T,
+                              n,
+                              *iter,
+                              &one,
+                              lanczosVecs_dev,
+                              n,
+                              lanczosVecs_dev + IDX(0, *iter, n),
+                              1,
+                              &zero,
+                              work_dev,
+                              1,
+                              stream));
+
+      CUBLAS_CHECK(cublasgemv(cublas_h,
+                              CUBLAS_OP_N,
+                              n,
+                              *iter,
+                              &negOne,
+                              lanczosVecs_dev,
+                              n,
+                              work_dev,
+                              1,
+                              &one,
+                              lanczosVecs_dev + IDX(0, *iter, n),
+                              1,
+                              stream));
+
+      CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1),
+                               work_dev + (*iter - 1),
+                               sizeof(value_type_t),
+                               cudaMemcpyDeviceToHost,
                                stream));
 
-      CUBLAS_CHECK(cublasgemv(
-        cublas_h, CUBLAS_OP_T, n, *iter, &one, lanczosVecs_dev, n,
-        lanczosVecs_dev + IDX(0, *iter, n), 1, &zero, work_dev, 1, stream));
-
-      CUBLAS_CHECK(cublasgemv(cublas_h, CUBLAS_OP_N, n, *iter, &negOne,
-                              lanczosVecs_dev, n, work_dev, 1, &one,
-                              lanczosVecs_dev + IDX(0, *iter, n), 1, stream));
+      CUBLAS_CHECK(cublasgemv(cublas_h,
+                              CUBLAS_OP_T,
+                              n,
+                              *iter,
+                              &one,
+                              lanczosVecs_dev,
+                              n,
+                              lanczosVecs_dev + IDX(0, *iter, n),
+                              1,
+                              &zero,
+                              work_dev,
+                              1,
+                              stream));
+
+      CUBLAS_CHECK(cublasgemv(cublas_h,
+                              CUBLAS_OP_N,
+                              n,
+                              *iter,
+                              &negOne,
+                              lanczosVecs_dev,
+                              n,
+                              work_dev,
+                              1,
+                              &one,
+                              lanczosVecs_dev + IDX(0, *iter, n),
+                              1,
+                              stream));
     }
 
     // Orthogonalization with 3-term recurrence relation
     else {
-      CUBLAS_CHECK(cublasdot(cublas_h, n,
-                             lanczosVecs_dev + IDX(0, *iter - 1, n), 1,
-                             lanczosVecs_dev + IDX(0, *iter, n), 1,
-                             alpha_host + (*iter - 1), stream));
+      CUBLAS_CHECK(cublasdot(cublas_h,
+                             n,
+                             lanczosVecs_dev + IDX(0, *iter - 1, n),
+                             1,
+                             lanczosVecs_dev + IDX(0, *iter, n),
+                             1,
+                             alpha_host + (*iter - 1),
+                             stream));
 
       auto alpha = -alpha_host[*iter - 1];
-      CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha,
-                              lanczosVecs_dev + IDX(0, *iter - 1, n), 1,
-                              lanczosVecs_dev + IDX(0, *iter, n), 1, stream));
+      CUBLAS_CHECK(cublasaxpy(cublas_h,
+                              n,
+                              &alpha,
+                              lanczosVecs_dev + IDX(0, *iter - 1, n),
+                              1,
+                              lanczosVecs_dev + IDX(0, *iter, n),
+                              1,
+                              stream));
 
       alpha = -beta_host[*iter - 2];
-      CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha,
-                              lanczosVecs_dev + IDX(0, *iter - 2, n), 1,
-                              lanczosVecs_dev + IDX(0, *iter, n), 1, stream));
+      CUBLAS_CHECK(cublasaxpy(cublas_h,
+                              n,
+                              &alpha,
+                              lanczosVecs_dev + IDX(0, *iter - 2, n),
+                              1,
+                              lanczosVecs_dev + IDX(0, *iter, n),
+                              1,
+                              stream));
     }
 
     // Compute residual
-    CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1,
-                            beta_host + *iter - 1, stream));
+    CUBLAS_CHECK(cublasnrm2(
+      cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, beta_host + *iter - 1, stream));
 
     // Check if Lanczos has converged
     if (beta_host[*iter - 1] <= tol) break;
 
     // Normalize Lanczos vector
     alpha = 1 / beta_host[*iter - 1];
-    CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha,
-                            lanczosVecs_dev + IDX(0, *iter, n), 1, stream));
+    CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, *iter, n), 1, stream));
   }
 
   CUDA_TRY(cudaStreamSynchronize(stream));
@@ -217,7 +277,7 @@ int performLanczosIteration(
   return 0;
 }
 
-/** 
+/**
  *  @brief  Find Householder transform for 3-dimensional system
  *    Given an input vector v=[x,y,z]', this function finds a
  *    Householder transform P such that P*v is a multiple of
@@ -235,8 +295,8 @@ int performLanczosIteration(
  *    matrix. Matrix dimensions are 3 x 3.
  */
 template <typename index_type_t, typename value_type_t>
-static void findHouseholder3(value_type_t *v, value_type_t *Pv,
-                             value_type_t *P) {
+static void findHouseholder3(value_type_t* v, value_type_t* Pv, value_type_t* P)
+{
   // Compute norm of vector
   *Pv = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
 
@@ -246,8 +306,7 @@ static void findHouseholder3(value_type_t *v, value_type_t *Pv,
   v[0] -= *Pv;
 
   // Normalize Householder vector
-  value_type_t normHouseholder =
-    std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
+  value_type_t normHouseholder = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
   if (normHouseholder != 0) {
     v[0] /= normHouseholder;
     v[1] /= normHouseholder;
@@ -261,11 +320,13 @@ static void findHouseholder3(value_type_t *v, value_type_t *Pv,
   // Construct Householder matrix
   index_type_t i, j;
   for (j = 0; j < 3; ++j)
-    for (i = 0; i < 3; ++i) P[IDX(i, j, 3)] = -2 * v[i] * v[j];
-  for (i = 0; i < 3; ++i) P[IDX(i, i, 3)] += 1;
+    for (i = 0; i < 3; ++i)
+      P[IDX(i, j, 3)] = -2 * v[i] * v[j];
+  for (i = 0; i < 3; ++i)
+    P[IDX(i, i, 3)] += 1;
 }
 
-/**  
+/**
  *  @brief  Apply 3-dimensional Householder transform to 4 x 4 matrix
  *    The Householder transform is pre-applied to the top three rows
  *  of the matrix and post-applied to the left three columns. The
@@ -277,7 +338,8 @@ static void findHouseholder3(value_type_t *v, value_type_t *Pv,
  *  @param A (Input/output, host memory, 16 entries) 4 x 4 matrix.
  */
 template <typename index_type_t, typename value_type_t>
-static void applyHouseholder3(const value_type_t *v, value_type_t *A) {
+static void applyHouseholder3(const value_type_t* v, value_type_t* A)
+{
   // Loop indices
   index_type_t i, j;
   // Dot product between Householder vector and matrix row/column
@@ -286,19 +348,23 @@ static void applyHouseholder3(const value_type_t *v, value_type_t *A) {
   // Pre-apply Householder transform
   for (j = 0; j < 4; ++j) {
     vDotA = 0;
-    for (i = 0; i < 3; ++i) vDotA += v[i] * A[IDX(i, j, 4)];
-    for (i = 0; i < 3; ++i) A[IDX(i, j, 4)] -= 2 * v[i] * vDotA;
+    for (i = 0; i < 3; ++i)
+      vDotA += v[i] * A[IDX(i, j, 4)];
+    for (i = 0; i < 3; ++i)
+      A[IDX(i, j, 4)] -= 2 * v[i] * vDotA;
   }
 
   // Post-apply Householder transform
   for (i = 0; i < 4; ++i) {
     vDotA = 0;
-    for (j = 0; j < 3; ++j) vDotA += A[IDX(i, j, 4)] * v[j];
-    for (j = 0; j < 3; ++j) A[IDX(i, j, 4)] -= 2 * vDotA * v[j];
+    for (j = 0; j < 3; ++j)
+      vDotA += A[IDX(i, j, 4)] * v[j];
+    for (j = 0; j < 3; ++j)
+      A[IDX(i, j, 4)] -= 2 * vDotA * v[j];
   }
 }
 
-/**  
+/**
  *  @brief  Perform one step of Francis QR algorithm
  *    Equivalent to two steps of the classical QR algorithm on a
  *    tridiagonal matrix.
@@ -319,10 +385,14 @@ static void applyHouseholder3(const value_type_t *v, value_type_t *A) {
  *  @return Zero if successful. Otherwise non-zero.
  */
 template <typename index_type_t, typename value_type_t>
-static int francisQRIteration(index_type_t n, value_type_t shift1,
-                              value_type_t shift2, value_type_t *alpha,
-                              value_type_t *beta, value_type_t *V,
-                              value_type_t *work) {
+static int francisQRIteration(index_type_t n,
+                              value_type_t shift1,
+                              value_type_t shift2,
+                              value_type_t* alpha,
+                              value_type_t* beta,
+                              value_type_t* V,
+                              value_type_t* work)
+{
   // -------------------------------------------------------
   // Variable declaration
   // -------------------------------------------------------
@@ -352,30 +422,30 @@ static int francisQRIteration(index_type_t n, value_type_t shift1,
   householder[0] = alpha[0] * alpha[0] + beta[0] * beta[0] + b * alpha[0] + c;
   householder[1] = beta[0] * (alpha[0] + alpha[1] + b);
   householder[2] = beta[0] * beta[1];
-  findHouseholder3<index_type_t, value_type_t>(householder, &temp,
-                                               householderMatrix);
+  findHouseholder3<index_type_t, value_type_t>(householder, &temp, householderMatrix);
 
   // Apply initial Householder transform to create bulge
   memset(bulge, 0, 16 * sizeof(value_type_t));
-  for (i = 0; i < 4; ++i) bulge[IDX(i, i, 4)] = alpha[i];
+  for (i = 0; i < 4; ++i)
+    bulge[IDX(i, i, 4)] = alpha[i];
   for (i = 0; i < 3; ++i) {
     bulge[IDX(i + 1, i, 4)] = beta[i];
     bulge[IDX(i, i + 1, 4)] = beta[i];
   }
   applyHouseholder3<index_type_t, value_type_t>(householder, bulge);
-  Lapack<value_type_t>::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix,
-                             3, 0, work, n);
+  Lapack<value_type_t>::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, 3, 0, work, n);
   memcpy(V, work, 3 * n * sizeof(value_type_t));
 
   // Chase bulge to bottom-right of matrix with Householder transforms
   for (pos = 0; pos < n - 4; ++pos) {
     // Move to next position
-    alpha[pos] = bulge[IDX(0, 0, 4)];
+    alpha[pos]     = bulge[IDX(0, 0, 4)];
     householder[0] = bulge[IDX(1, 0, 4)];
     householder[1] = bulge[IDX(2, 0, 4)];
     householder[2] = bulge[IDX(3, 0, 4)];
     for (j = 0; j < 3; ++j)
-      for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)];
+      for (i = 0; i < 3; ++i)
+        bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)];
     bulge[IDX(3, 0, 4)] = 0;
     bulge[IDX(3, 1, 4)] = 0;
     bulge[IDX(3, 2, 4)] = beta[pos + 3];
@@ -385,22 +455,22 @@ static int francisQRIteration(index_type_t n, value_type_t shift1,
     bulge[IDX(3, 3, 4)] = alpha[pos + 4];
 
     // Apply Householder transform
-    findHouseholder3<index_type_t, value_type_t>(householder, beta + pos,
-                                                 householderMatrix);
+    findHouseholder3<index_type_t, value_type_t>(householder, beta + pos, householderMatrix);
     applyHouseholder3<index_type_t, value_type_t>(householder, bulge);
-    Lapack<value_type_t>::gemm(false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n),
-                               n, householderMatrix, 3, 0, work, n);
+    Lapack<value_type_t>::gemm(
+      false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), n, householderMatrix, 3, 0, work, n);
     memcpy(V + IDX(0, pos + 1, n), work, 3 * n * sizeof(value_type_t));
   }
 
   // Apply penultimate Householder transform
   //   Values in the last row and column are zero
-  alpha[n - 4] = bulge[IDX(0, 0, 4)];
+  alpha[n - 4]   = bulge[IDX(0, 0, 4)];
   householder[0] = bulge[IDX(1, 0, 4)];
   householder[1] = bulge[IDX(2, 0, 4)];
   householder[2] = bulge[IDX(3, 0, 4)];
   for (j = 0; j < 3; ++j)
-    for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)];
+    for (i = 0; i < 3; ++i)
+      bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)];
   bulge[IDX(3, 0, 4)] = 0;
   bulge[IDX(3, 1, 4)] = 0;
   bulge[IDX(3, 2, 4)] = 0;
@@ -408,37 +478,36 @@ static int francisQRIteration(index_type_t n, value_type_t shift1,
   bulge[IDX(1, 3, 4)] = 0;
   bulge[IDX(2, 3, 4)] = 0;
   bulge[IDX(3, 3, 4)] = 0;
-  findHouseholder3<index_type_t, value_type_t>(householder, beta + n - 4,
-                                               householderMatrix);
+  findHouseholder3<index_type_t, value_type_t>(householder, beta + n - 4, householderMatrix);
   applyHouseholder3<index_type_t, value_type_t>(householder, bulge);
-  Lapack<value_type_t>::gemm(false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n,
-                             householderMatrix, 3, 0, work, n);
+  Lapack<value_type_t>::gemm(
+    false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, householderMatrix, 3, 0, work, n);
   memcpy(V + IDX(0, n - 3, n), work, 3 * n * sizeof(value_type_t));
 
   // Apply final Householder transform
   //   Values in the last two rows and columns are zero
-  alpha[n - 3] = bulge[IDX(0, 0, 4)];
+  alpha[n - 3]   = bulge[IDX(0, 0, 4)];
   householder[0] = bulge[IDX(1, 0, 4)];
   householder[1] = bulge[IDX(2, 0, 4)];
   householder[2] = 0;
   for (j = 0; j < 3; ++j)
-    for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)];
-  findHouseholder3<index_type_t, value_type_t>(householder, beta + n - 3,
-                                               householderMatrix);
+    for (i = 0; i < 3; ++i)
+      bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)];
+  findHouseholder3<index_type_t, value_type_t>(householder, beta + n - 3, householderMatrix);
   applyHouseholder3<index_type_t, value_type_t>(householder, bulge);
-  Lapack<value_type_t>::gemm(false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n,
-                             householderMatrix, 3, 0, work, n);
+  Lapack<value_type_t>::gemm(
+    false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, householderMatrix, 3, 0, work, n);
   memcpy(V + IDX(0, n - 2, n), work, 2 * n * sizeof(value_type_t));
 
   // Bulge has been eliminated
   alpha[n - 2] = bulge[IDX(0, 0, 4)];
   alpha[n - 1] = bulge[IDX(1, 1, 4)];
-  beta[n - 2] = bulge[IDX(1, 0, 4)];
+  beta[n - 2]  = bulge[IDX(1, 0, 4)];
 
   return 0;
 }
 
-/**  
+/**
  *  @brief  Perform implicit restart of Lanczos algorithm
  *    Shifts are Chebyshev nodes of unwanted region of matrix spectrum.
  *  @tparam index_type_t the type of data used for indexing.
@@ -474,23 +543,30 @@ static int francisQRIteration(index_type_t n, value_type_t shift1,
  *  @return error flag.
  */
 template <typename index_type_t, typename value_type_t>
-static int lanczosRestart(
-  handle_t const &handle, index_type_t n, index_type_t iter,
-  index_type_t iter_new, value_type_t *shiftUpper, value_type_t *shiftLower,
-  value_type_t *__restrict__ alpha_host, value_type_t *__restrict__ beta_host,
-  value_type_t *__restrict__ V_host, value_type_t *__restrict__ work_host,
-  value_type_t *__restrict__ lanczosVecs_dev,
-  value_type_t *__restrict__ work_dev, bool smallest_eig) {
+static int lanczosRestart(handle_t const& handle,
+                          index_type_t n,
+                          index_type_t iter,
+                          index_type_t iter_new,
+                          value_type_t* shiftUpper,
+                          value_type_t* shiftLower,
+                          value_type_t* __restrict__ alpha_host,
+                          value_type_t* __restrict__ beta_host,
+                          value_type_t* __restrict__ V_host,
+                          value_type_t* __restrict__ work_host,
+                          value_type_t* __restrict__ lanczosVecs_dev,
+                          value_type_t* __restrict__ work_dev,
+                          bool smallest_eig)
+{
   // -------------------------------------------------------
   // Variable declaration
   // -------------------------------------------------------
 
   // Useful constants
   constexpr value_type_t zero = 0;
-  constexpr value_type_t one = 1;
+  constexpr value_type_t one  = 1;
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream = handle.get_stream();
+  auto stream   = handle.get_stream();
 
   // Loop index
   index_type_t i;
@@ -501,12 +577,12 @@ static int lanczosRestart(
   index_type_t restartSteps = iter - iter_new;
 
   // Ritz values from Lanczos method
-  value_type_t *ritzVals_host = work_host + 3 * iter;
+  value_type_t* ritzVals_host = work_host + 3 * iter;
   // Shifts for implicit restart
-  value_type_t *shifts_host;
+  value_type_t* shifts_host;
 
   // Orthonormal matrix for similarity transform
-  value_type_t *V_dev = work_dev + n * iter;
+  value_type_t* V_dev = work_dev + n * iter;
 
   // -------------------------------------------------------
   // Implementation
@@ -524,7 +600,8 @@ static int lanczosRestart(
 
   // Initialize similarity transform with identity matrix
   memset(V_host, 0, iter * iter * sizeof(value_type_t));
-  for (i = 0; i < iter; ++i) V_host[IDX(i, i, iter)] = 1;
+  for (i = 0; i < iter; ++i)
+    V_host[IDX(i, i, iter)] = 1;
 
   // Determine interval to suppress eigenvalues
   if (smallest_eig) {
@@ -548,49 +625,71 @@ static int lanczosRestart(
   // Calculate Chebyshev nodes as shifts
   shifts_host = ritzVals_host;
   for (i = 0; i < restartSteps; ++i) {
-    shifts_host[i] =
-      cos((i + 0.5) * static_cast<value_type_t>(M_PI) / restartSteps);
+    shifts_host[i] = cos((i + 0.5) * static_cast<value_type_t>(M_PI) / restartSteps);
     shifts_host[i] *= 0.5 * ((*shiftUpper) - (*shiftLower));
     shifts_host[i] += 0.5 * ((*shiftUpper) + (*shiftLower));
   }
 
   // Apply Francis QR algorithm to implicitly restart Lanczos
   for (i = 0; i < restartSteps; i += 2)
-    if (francisQRIteration(iter, shifts_host[i], shifts_host[i + 1], alpha_host,
-                           beta_host, V_host, work_host))
+    if (francisQRIteration(
+          iter, shifts_host[i], shifts_host[i + 1], alpha_host, beta_host, V_host, work_host))
       WARNING("error in implicitly shifted QR algorithm");
 
   // Obtain new residual
-  CUDA_TRY(cudaMemcpyAsync(V_dev, V_host, iter * iter * sizeof(value_type_t),
-                           cudaMemcpyHostToDevice, stream));
-
-  beta_host[iter - 1] =
-    beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)];
-  CUBLAS_CHECK(cublasgemv(
-    cublas_h, CUBLAS_OP_N, n, iter, beta_host + iter_new - 1, lanczosVecs_dev,
-    n, V_dev + IDX(0, iter_new, iter), 1, beta_host + iter - 1,
-    lanczosVecs_dev + IDX(0, iter, n), 1, stream));
+  CUDA_TRY(cudaMemcpyAsync(
+    V_dev, V_host, iter * iter * sizeof(value_type_t), cudaMemcpyHostToDevice, stream));
+
+  beta_host[iter - 1] = beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)];
+  CUBLAS_CHECK(cublasgemv(cublas_h,
+                          CUBLAS_OP_N,
+                          n,
+                          iter,
+                          beta_host + iter_new - 1,
+                          lanczosVecs_dev,
+                          n,
+                          V_dev + IDX(0, iter_new, iter),
+                          1,
+                          beta_host + iter - 1,
+                          lanczosVecs_dev + IDX(0, iter, n),
+                          1,
+                          stream));
 
   // Obtain new Lanczos vectors
-  CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, iter_new, iter,
-                          &one, lanczosVecs_dev, n, V_dev, iter, &zero,
-                          work_dev, n, stream));
-
-  CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, work_dev,
+  CUBLAS_CHECK(cublasgemm(cublas_h,
+                          CUBLAS_OP_N,
+                          CUBLAS_OP_N,
+                          n,
+                          iter_new,
+                          iter,
+                          &one,
+                          lanczosVecs_dev,
+                          n,
+                          V_dev,
+                          iter,
+                          &zero,
+                          work_dev,
+                          n,
+                          stream));
+
+  CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev,
+                           work_dev,
                            n * iter_new * sizeof(value_type_t),
-                           cudaMemcpyDeviceToDevice, stream));
+                           cudaMemcpyDeviceToDevice,
+                           stream));
 
   // Normalize residual to obtain new Lanczos vector
-  CUDA_TRY(cudaMemcpyAsync(
-    lanczosVecs_dev + IDX(0, iter_new, n), lanczosVecs_dev + IDX(0, iter, n),
-    n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream));
+  CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + IDX(0, iter_new, n),
+                           lanczosVecs_dev + IDX(0, iter, n),
+                           n * sizeof(value_type_t),
+                           cudaMemcpyDeviceToDevice,
+                           stream));
 
-  CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1,
-                          beta_host + iter_new - 1, stream));
+  CUBLAS_CHECK(cublasnrm2(
+    cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1, beta_host + iter_new - 1, stream));
 
   auto h_beta = 1 / beta_host[iter_new - 1];
-  CUBLAS_CHECK(cublasscal(cublas_h, n, &h_beta,
-                          lanczosVecs_dev + IDX(0, iter_new, n), 1, stream));
+  CUBLAS_CHECK(cublasscal(cublas_h, n, &h_beta, lanczosVecs_dev + IDX(0, iter_new, n), 1, stream));
 
   return 0;
 }
@@ -601,7 +700,7 @@ static int lanczosRestart(
 // Eigensolver
 // =========================================================
 
-/**  
+/**
  * @brief  Compute smallest eigenvectors of symmetric matrix
  *    Computes eigenvalues and eigenvectors that are least
  *    positive. If matrix is positive definite or positive
@@ -651,19 +750,28 @@ static int lanczosRestart(
  *  @return error flag.
  */
 template <typename index_type_t, typename value_type_t>
-int computeSmallestEigenvectors(
-  handle_t const &handle, sparse_matrix_t<index_type_t, value_type_t> const *A,
-  index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter,
-  value_type_t tol, bool reorthogonalize, index_type_t *effIter,
-  index_type_t *totalIter, value_type_t *shift,
-  value_type_t *__restrict__ alpha_host, value_type_t *__restrict__ beta_host,
-  value_type_t *__restrict__ lanczosVecs_dev,
-  value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev,
-  value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) {
+int computeSmallestEigenvectors(handle_t const& handle,
+                                sparse_matrix_t<index_type_t, value_type_t> const* A,
+                                index_type_t nEigVecs,
+                                index_type_t maxIter,
+                                index_type_t restartIter,
+                                value_type_t tol,
+                                bool reorthogonalize,
+                                index_type_t* effIter,
+                                index_type_t* totalIter,
+                                value_type_t* shift,
+                                value_type_t* __restrict__ alpha_host,
+                                value_type_t* __restrict__ beta_host,
+                                value_type_t* __restrict__ lanczosVecs_dev,
+                                value_type_t* __restrict__ work_dev,
+                                value_type_t* __restrict__ eigVals_dev,
+                                value_type_t* __restrict__ eigVecs_dev,
+                                unsigned long long seed)
+{
   using namespace spectral;
 
   // Useful constants
-  constexpr value_type_t one = 1;
+  constexpr value_type_t one  = 1;
   constexpr value_type_t zero = 0;
 
   // Matrix dimension
@@ -683,21 +791,20 @@ int computeSmallestEigenvectors(
   index_type_t i;
 
   // Host memory
-  value_type_t *Z_host;     // Eigenvectors in Lanczos basis
-  value_type_t *work_host;  // Workspace
+  value_type_t* Z_host;     // Eigenvectors in Lanczos basis
+  value_type_t* work_host;  // Workspace
 
   // -------------------------------------------------------
   // Check that parameters are valid
   // -------------------------------------------------------
-  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n,
-               "Invalid number of eigenvectors.");
+  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors.");
   RAFT_EXPECTS(restartIter > 0, "Invalid restartIter.");
   RAFT_EXPECTS(tol > 0, "Invalid tolerance.");
   RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter.");
   RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter.");
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream = handle.get_stream();
+  auto stream   = handle.get_stream();
 
   // -------------------------------------------------------
   // Variable initialization
@@ -710,12 +817,11 @@ int computeSmallestEigenvectors(
   std::vector<value_type_t> Z_host_v(restartIter * restartIter);
   std::vector<value_type_t> work_host_v(4 * restartIter);
 
-  Z_host = Z_host_v.data();
+  Z_host    = Z_host_v.data();
   work_host = work_host_v.data();
 
   // Initialize cuBLAS
-  CUBLAS_CHECK(
-    cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+  CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
 
   // -------------------------------------------------------
   // Compute largest eigenvalue to determine shift
@@ -738,10 +844,18 @@ int computeSmallestEigenvectors(
 
   // Obtain tridiagonal matrix with Lanczos
   *effIter = 0;
-  *shift = 0;
-  status = performLanczosIteration<index_type_t, value_type_t>(
-    handle, A, effIter, maxIter_curr, *shift, 0.0, reorthogonalize, alpha_host,
-    beta_host, lanczosVecs_dev, work_dev);
+  *shift   = 0;
+  status   = performLanczosIteration<index_type_t, value_type_t>(handle,
+                                                               A,
+                                                               effIter,
+                                                               maxIter_curr,
+                                                               *shift,
+                                                               0.0,
+                                                               reorthogonalize,
+                                                               alpha_host,
+                                                               beta_host,
+                                                               lanczosVecs_dev,
+                                                               work_dev);
   if (status) WARNING("error in Lanczos iteration");
 
   // Determine largest eigenvalue
@@ -756,9 +870,17 @@ int computeSmallestEigenvectors(
   // Obtain tridiagonal matrix with Lanczos
   *effIter = 0;
 
-  status = performLanczosIteration<index_type_t, value_type_t>(
-    handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host,
-    beta_host, lanczosVecs_dev, work_dev);
+  status = performLanczosIteration<index_type_t, value_type_t>(handle,
+                                                               A,
+                                                               effIter,
+                                                               maxIter_curr,
+                                                               *shift,
+                                                               0,
+                                                               reorthogonalize,
+                                                               alpha_host,
+                                                               beta_host,
+                                                               lanczosVecs_dev,
+                                                               work_dev);
   if (status) WARNING("error in Lanczos iteration");
   *totalIter += *effIter;
 
@@ -775,9 +897,19 @@ int computeSmallestEigenvectors(
     if (iter_new == *effIter) break;
 
     // Implicit restart of Lanczos method
-    status = lanczosRestart<index_type_t, value_type_t>(
-      handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host,
-      beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, true);
+    status = lanczosRestart<index_type_t, value_type_t>(handle,
+                                                        n,
+                                                        *effIter,
+                                                        iter_new,
+                                                        &shiftUpper,
+                                                        &shiftLower,
+                                                        alpha_host,
+                                                        beta_host,
+                                                        Z_host,
+                                                        work_host,
+                                                        lanczosVecs_dev,
+                                                        work_dev,
+                                                        true);
     if (status) WARNING("error in Lanczos implicit restart");
     *effIter = iter_new;
 
@@ -786,9 +918,17 @@ int computeSmallestEigenvectors(
 
     // Proceed with Lanczos method
 
-    status = performLanczosIteration<index_type_t, value_type_t>(
-      handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower),
-      reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev);
+    status = performLanczosIteration<index_type_t, value_type_t>(handle,
+                                                                 A,
+                                                                 effIter,
+                                                                 maxIter_curr,
+                                                                 *shift,
+                                                                 tol * fabs(shiftLower),
+                                                                 reorthogonalize,
+                                                                 alpha_host,
+                                                                 beta_host,
+                                                                 lanczosVecs_dev,
+                                                                 work_dev);
     if (status) WARNING("error in Lanczos iteration");
     *totalIter += *effIter - iter_new;
   }
@@ -799,39 +939,59 @@ int computeSmallestEigenvectors(
   }
 
   // Solve tridiagonal system
-  memcpy(work_host + 2 * (*effIter), alpha_host,
-         (*effIter) * sizeof(value_type_t));
-  memcpy(work_host + 3 * (*effIter), beta_host,
-         (*effIter - 1) * sizeof(value_type_t));
-  Lapack<value_type_t>::steqr('I', *effIter, work_host + 2 * (*effIter),
-                              work_host + 3 * (*effIter), Z_host, *effIter,
+  memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(value_type_t));
+  memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(value_type_t));
+  Lapack<value_type_t>::steqr('I',
+                              *effIter,
+                              work_host + 2 * (*effIter),
+                              work_host + 3 * (*effIter),
+                              Z_host,
+                              *effIter,
                               work_host);
 
   // Obtain desired eigenvalues by applying shift
-  for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift;
-  for (i = *effIter; i < nEigVecs; ++i) work_host[i + 2 * (*effIter)] = 0;
+  for (i = 0; i < *effIter; ++i)
+    work_host[i + 2 * (*effIter)] -= *shift;
+  for (i = *effIter; i < nEigVecs; ++i)
+    work_host[i + 2 * (*effIter)] = 0;
 
   // Copy results to device memory
-  CUDA_TRY(cudaMemcpyAsync(eigVals_dev, work_host + 2 * (*effIter),
+  CUDA_TRY(cudaMemcpyAsync(eigVals_dev,
+                           work_host + 2 * (*effIter),
                            nEigVecs * sizeof(value_type_t),
-                           cudaMemcpyHostToDevice, stream));
+                           cudaMemcpyHostToDevice,
+                           stream));
 
-  CUDA_TRY(cudaMemcpyAsync(work_dev, Z_host,
+  CUDA_TRY(cudaMemcpyAsync(work_dev,
+                           Z_host,
                            (*effIter) * nEigVecs * sizeof(value_type_t),
-                           cudaMemcpyHostToDevice, stream));
+                           cudaMemcpyHostToDevice,
+                           stream));
   CHECK_CUDA(stream);
 
   // Convert eigenvectors from Lanczos basis to standard basis
-  CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs,
-                          *effIter, &one, lanczosVecs_dev, n, work_dev,
-                          *effIter, &zero, eigVecs_dev, n, stream));
+  CUBLAS_CHECK(cublasgemm(cublas_h,
+                          CUBLAS_OP_N,
+                          CUBLAS_OP_N,
+                          n,
+                          nEigVecs,
+                          *effIter,
+                          &one,
+                          lanczosVecs_dev,
+                          n,
+                          work_dev,
+                          *effIter,
+                          &zero,
+                          eigVecs_dev,
+                          n,
+                          stream));
 
   // Clean up and exit
   curandDestroyGenerator(randGen);
   return 0;
 }
 
-/**  
+/**
  *  @brief  Compute smallest eigenvectors of symmetric matrix
  *    Computes eigenvalues and eigenvectors that are least
  *    positive. If matrix is positive definite or positive
@@ -869,20 +1029,25 @@ int computeSmallestEigenvectors(
  *  @return error flag.
  */
 template <typename index_type_t, typename value_type_t>
-int computeSmallestEigenvectors(
-  handle_t const &handle, sparse_matrix_t<index_type_t, value_type_t> const &A,
-  index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter,
-  value_type_t tol, bool reorthogonalize, index_type_t &iter,
-  value_type_t *__restrict__ eigVals_dev,
-  value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 1234567) {
+int computeSmallestEigenvectors(handle_t const& handle,
+                                sparse_matrix_t<index_type_t, value_type_t> const& A,
+                                index_type_t nEigVecs,
+                                index_type_t maxIter,
+                                index_type_t restartIter,
+                                value_type_t tol,
+                                bool reorthogonalize,
+                                index_type_t& iter,
+                                value_type_t* __restrict__ eigVals_dev,
+                                value_type_t* __restrict__ eigVecs_dev,
+                                unsigned long long seed = 1234567)
+{
   using namespace spectral;
 
   // Matrix dimension
   index_type_t n = A.nrows_;
 
   // Check that parameters are valid
-  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n,
-               "Invalid number of eigenvectors.");
+  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors.");
   RAFT_EXPECTS(restartIter > 0, "Invalid restartIter.");
   RAFT_EXPECTS(tol > 0, "Invalid tolerance.");
   RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter.");
@@ -892,8 +1057,8 @@ int computeSmallestEigenvectors(
   std::vector<value_type_t> alpha_host_v(restartIter);
   std::vector<value_type_t> beta_host_v(restartIter);
 
-  value_type_t *alpha_host = alpha_host_v.data();
-  value_type_t *beta_host = beta_host_v.data();
+  value_type_t* alpha_host = alpha_host_v.data();
+  value_type_t* beta_host  = beta_host_v.data();
 
   vector_t<value_type_t> lanczosVecs_dev(handle, n * (restartIter + 1));
   vector_t<value_type_t> work_dev(handle, (n + restartIter) * restartIter);
@@ -901,10 +1066,23 @@ int computeSmallestEigenvectors(
   // Perform Lanczos method
   index_type_t effIter;
   value_type_t shift;
-  int status = computeSmallestEigenvectors(
-    handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter,
-    &iter, &shift, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(),
-    eigVals_dev, eigVecs_dev, seed);
+  int status = computeSmallestEigenvectors(handle,
+                                           &A,
+                                           nEigVecs,
+                                           maxIter,
+                                           restartIter,
+                                           tol,
+                                           reorthogonalize,
+                                           &effIter,
+                                           &iter,
+                                           &shift,
+                                           alpha_host,
+                                           beta_host,
+                                           lanczosVecs_dev.raw(),
+                                           work_dev.raw(),
+                                           eigVals_dev,
+                                           eigVecs_dev,
+                                           seed);
 
   // Clean up and return
   return status;
@@ -914,7 +1092,7 @@ int computeSmallestEigenvectors(
 // Eigensolver
 // =========================================================
 
-/**  
+/**
  *  @brief Compute largest eigenvectors of symmetric matrix
  *    Computes eigenvalues and eigenvectors that are least
  *    positive. If matrix is positive definite or positive
@@ -959,19 +1137,27 @@ int computeSmallestEigenvectors(
  *  @return error flag.
  */
 template <typename index_type_t, typename value_type_t>
-int computeLargestEigenvectors(
-  handle_t const &handle, sparse_matrix_t<index_type_t, value_type_t> const *A,
-  index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter,
-  value_type_t tol, bool reorthogonalize, index_type_t *effIter,
-  index_type_t *totalIter, value_type_t *__restrict__ alpha_host,
-  value_type_t *__restrict__ beta_host,
-  value_type_t *__restrict__ lanczosVecs_dev,
-  value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev,
-  value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) {
+int computeLargestEigenvectors(handle_t const& handle,
+                               sparse_matrix_t<index_type_t, value_type_t> const* A,
+                               index_type_t nEigVecs,
+                               index_type_t maxIter,
+                               index_type_t restartIter,
+                               value_type_t tol,
+                               bool reorthogonalize,
+                               index_type_t* effIter,
+                               index_type_t* totalIter,
+                               value_type_t* __restrict__ alpha_host,
+                               value_type_t* __restrict__ beta_host,
+                               value_type_t* __restrict__ lanczosVecs_dev,
+                               value_type_t* __restrict__ work_dev,
+                               value_type_t* __restrict__ eigVals_dev,
+                               value_type_t* __restrict__ eigVecs_dev,
+                               unsigned long long seed)
+{
   using namespace spectral;
 
   // Useful constants
-  constexpr value_type_t one = 1;
+  constexpr value_type_t one  = 1;
   constexpr value_type_t zero = 0;
 
   // Matrix dimension
@@ -987,8 +1173,8 @@ int computeLargestEigenvectors(
   index_type_t i;
 
   // Host memory
-  value_type_t *Z_host;     // Eigenvectors in Lanczos basis
-  value_type_t *work_host;  // Workspace
+  value_type_t* Z_host;     // Eigenvectors in Lanczos basis
+  value_type_t* work_host;  // Workspace
 
   // -------------------------------------------------------
   // Check that LAPACK is enabled
@@ -998,15 +1184,14 @@ int computeLargestEigenvectors(
   // -------------------------------------------------------
   // Check that parameters are valid
   // -------------------------------------------------------
-  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n,
-               "Invalid number of eigenvectors.");
+  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors.");
   RAFT_EXPECTS(restartIter > 0, "Invalid restartIter.");
   RAFT_EXPECTS(tol > 0, "Invalid tolerance.");
   RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter.");
   RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter.");
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream = handle.get_stream();
+  auto stream   = handle.get_stream();
 
   // -------------------------------------------------------
   // Variable initialization
@@ -1019,12 +1204,11 @@ int computeLargestEigenvectors(
   std::vector<value_type_t> Z_host_v(restartIter * restartIter);
   std::vector<value_type_t> work_host_v(4 * restartIter);
 
-  Z_host = Z_host_v.data();
+  Z_host    = Z_host_v.data();
   work_host = work_host_v.data();
 
   // Initialize cuBLAS
-  CUBLAS_CHECK(
-    cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+  CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
 
   // -------------------------------------------------------
   // Compute largest eigenvalue
@@ -1044,13 +1228,21 @@ int computeLargestEigenvectors(
   CUBLAS_CHECK(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream));
 
   // Obtain tridiagonal matrix with Lanczos
-  *effIter = 0;
+  *effIter               = 0;
   value_type_t shift_val = 0.0;
-  value_type_t *shift = &shift_val;
-
-  status = performLanczosIteration<index_type_t, value_type_t>(
-    handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host,
-    beta_host, lanczosVecs_dev, work_dev);
+  value_type_t* shift    = &shift_val;
+
+  status = performLanczosIteration<index_type_t, value_type_t>(handle,
+                                                               A,
+                                                               effIter,
+                                                               maxIter_curr,
+                                                               *shift,
+                                                               0,
+                                                               reorthogonalize,
+                                                               alpha_host,
+                                                               beta_host,
+                                                               lanczosVecs_dev,
+                                                               work_dev);
   if (status) WARNING("error in Lanczos iteration");
   *totalIter += *effIter;
 
@@ -1067,9 +1259,19 @@ int computeLargestEigenvectors(
     if (iter_new == *effIter) break;
 
     // Implicit restart of Lanczos method
-    status = lanczosRestart<index_type_t, value_type_t>(
-      handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host,
-      beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, false);
+    status = lanczosRestart<index_type_t, value_type_t>(handle,
+                                                        n,
+                                                        *effIter,
+                                                        iter_new,
+                                                        &shiftUpper,
+                                                        &shiftLower,
+                                                        alpha_host,
+                                                        beta_host,
+                                                        Z_host,
+                                                        work_host,
+                                                        lanczosVecs_dev,
+                                                        work_dev,
+                                                        false);
     if (status) WARNING("error in Lanczos implicit restart");
     *effIter = iter_new;
 
@@ -1078,9 +1280,17 @@ int computeLargestEigenvectors(
 
     // Proceed with Lanczos method
 
-    status = performLanczosIteration<index_type_t, value_type_t>(
-      handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower),
-      reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev);
+    status = performLanczosIteration<index_type_t, value_type_t>(handle,
+                                                                 A,
+                                                                 effIter,
+                                                                 maxIter_curr,
+                                                                 *shift,
+                                                                 tol * fabs(shiftLower),
+                                                                 reorthogonalize,
+                                                                 alpha_host,
+                                                                 beta_host,
+                                                                 lanczosVecs_dev,
+                                                                 work_dev);
     if (status) WARNING("error in Lanczos iteration");
     *totalIter += *effIter - iter_new;
   }
@@ -1090,15 +1300,18 @@ int computeLargestEigenvectors(
     WARNING("implicitly restarted Lanczos failed to converge");
   }
   for (int i = 0; i < restartIter; ++i) {
-    for (int j = 0; j < restartIter; ++j) Z_host[i * restartIter + j] = 0;
+    for (int j = 0; j < restartIter; ++j)
+      Z_host[i * restartIter + j] = 0;
   }
   // Solve tridiagonal system
-  memcpy(work_host + 2 * (*effIter), alpha_host,
-         (*effIter) * sizeof(value_type_t));
-  memcpy(work_host + 3 * (*effIter), beta_host,
-         (*effIter - 1) * sizeof(value_type_t));
-  Lapack<value_type_t>::steqr('I', *effIter, work_host + 2 * (*effIter),
-                              work_host + 3 * (*effIter), Z_host, *effIter,
+  memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(value_type_t));
+  memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(value_type_t));
+  Lapack<value_type_t>::steqr('I',
+                              *effIter,
+                              work_host + 2 * (*effIter),
+                              work_host + 3 * (*effIter),
+                              Z_host,
+                              *effIter,
                               work_host);
 
   // note: We need to pick the top nEigVecs eigenvalues
@@ -1123,36 +1336,52 @@ int computeLargestEigenvectors(
   //}
 
   // Obtain desired eigenvalues by applying shift
-  for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift;
+  for (i = 0; i < *effIter; ++i)
+    work_host[i + 2 * (*effIter)] -= *shift;
 
   for (i = 0; i < top_eigenparis_idx_offset; ++i)
     work_host[i + 2 * (*effIter)] = 0;
 
   // Copy results to device memory
   // skip smallest eigenvalue if needed
-  CUDA_TRY(cudaMemcpyAsync(
-    eigVals_dev, work_host + 2 * (*effIter) + top_eigenparis_idx_offset,
-    nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice, stream));
+  CUDA_TRY(cudaMemcpyAsync(eigVals_dev,
+                           work_host + 2 * (*effIter) + top_eigenparis_idx_offset,
+                           nEigVecs * sizeof(value_type_t),
+                           cudaMemcpyHostToDevice,
+                           stream));
 
   // skip smallest eigenvector if needed
   CUDA_TRY(cudaMemcpyAsync(work_dev,
                            Z_host + (top_eigenparis_idx_offset * (*effIter)),
                            (*effIter) * nEigVecs * sizeof(value_type_t),
-                           cudaMemcpyHostToDevice, stream));
+                           cudaMemcpyHostToDevice,
+                           stream));
 
   CHECK_CUDA(stream);
 
   // Convert eigenvectors from Lanczos basis to standard basis
-  CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs,
-                          *effIter, &one, lanczosVecs_dev, n, work_dev,
-                          *effIter, &zero, eigVecs_dev, n, stream));
+  CUBLAS_CHECK(cublasgemm(cublas_h,
+                          CUBLAS_OP_N,
+                          CUBLAS_OP_N,
+                          n,
+                          nEigVecs,
+                          *effIter,
+                          &one,
+                          lanczosVecs_dev,
+                          n,
+                          work_dev,
+                          *effIter,
+                          &zero,
+                          eigVecs_dev,
+                          n,
+                          stream));
 
   // Clean up and exit
   curandDestroyGenerator(randGen);
   return 0;
 }
 
-/**  
+/**
  *  @brief  Compute largest eigenvectors of symmetric matrix
  *    Computes eigenvalues and eigenvectors that are least
  *    positive. If matrix is positive definite or positive
@@ -1190,18 +1419,23 @@ int computeLargestEigenvectors(
  *  @return error flag.
  */
 template <typename index_type_t, typename value_type_t>
-int computeLargestEigenvectors(
-  handle_t const &handle, sparse_matrix_t<index_type_t, value_type_t> const &A,
-  index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter,
-  value_type_t tol, bool reorthogonalize, index_type_t &iter,
-  value_type_t *__restrict__ eigVals_dev,
-  value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 123456) {
+int computeLargestEigenvectors(handle_t const& handle,
+                               sparse_matrix_t<index_type_t, value_type_t> const& A,
+                               index_type_t nEigVecs,
+                               index_type_t maxIter,
+                               index_type_t restartIter,
+                               value_type_t tol,
+                               bool reorthogonalize,
+                               index_type_t& iter,
+                               value_type_t* __restrict__ eigVals_dev,
+                               value_type_t* __restrict__ eigVecs_dev,
+                               unsigned long long seed = 123456)
+{
   // Matrix dimension
   index_type_t n = A.nrows_;
 
   // Check that parameters are valid
-  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n,
-               "Invalid number of eigenvectors.");
+  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors.");
   RAFT_EXPECTS(restartIter > 0, "Invalid restartIter.");
   RAFT_EXPECTS(tol > 0, "Invalid tolerance.");
   RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter.");
@@ -1211,18 +1445,30 @@ int computeLargestEigenvectors(
   std::vector<value_type_t> alpha_host_v(restartIter);
   std::vector<value_type_t> beta_host_v(restartIter);
 
-  value_type_t *alpha_host = alpha_host_v.data();
-  value_type_t *beta_host = beta_host_v.data();
+  value_type_t* alpha_host = alpha_host_v.data();
+  value_type_t* beta_host  = beta_host_v.data();
 
   vector_t<value_type_t> lanczosVecs_dev(handle, n * (restartIter + 1));
   vector_t<value_type_t> work_dev(handle, (n + restartIter) * restartIter);
 
   // Perform Lanczos method
   index_type_t effIter;
-  int status = computeLargestEigenvectors(
-    handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter,
-    &iter, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(),
-    eigVals_dev, eigVecs_dev, seed);
+  int status = computeLargestEigenvectors(handle,
+                                          &A,
+                                          nEigVecs,
+                                          maxIter,
+                                          restartIter,
+                                          tol,
+                                          reorthogonalize,
+                                          &effIter,
+                                          &iter,
+                                          alpha_host,
+                                          beta_host,
+                                          lanczosVecs_dev.raw(),
+                                          work_dev.raw(),
+                                          eigVals_dev,
+                                          eigVecs_dev,
+                                          seed);
 
   // Clean up and return
   return status;
diff --git a/cpp/include/raft/linalg/map.cuh b/cpp/include/raft/linalg/map.cuh
index aff08da2d3..200818fdc3 100644
--- a/cpp/include/raft/linalg/map.cuh
+++ b/cpp/include/raft/linalg/map.cuh
@@ -24,21 +24,18 @@
 namespace raft {
 namespace linalg {
 
-template <typename InType, typename OutType, typename MapOp, int TPB,
-          typename... Args>
-__global__ void mapKernel(OutType *out, size_t len, MapOp map, const InType *in,
-                          Args... args) {
+template <typename InType, typename OutType, typename MapOp, int TPB, typename... Args>
+__global__ void mapKernel(OutType* out, size_t len, MapOp map, const InType* in, Args... args)
+{
   auto idx = (threadIdx.x + (blockIdx.x * blockDim.x));
 
-  if (idx < len) {
-    out[idx] = map(in[idx], args[idx]...);
-  }
+  if (idx < len) { out[idx] = map(in[idx], args[idx]...); }
 }
 
-template <typename InType, typename OutType, typename MapOp, int TPB,
-          typename... Args>
-void mapImpl(OutType *out, size_t len, MapOp map, cudaStream_t stream,
-             const InType *in, Args... args) {
+template <typename InType, typename OutType, typename MapOp, int TPB, typename... Args>
+void mapImpl(
+  OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args)
+{
   const int nblks = raft::ceildiv(len, (size_t)TPB);
   mapKernel<InType, OutType, MapOp, TPB, Args...>
     <<<nblks, TPB, 0, stream>>>(out, len, map, in, args...);
@@ -60,12 +57,14 @@ void mapImpl(OutType *out, size_t len, MapOp map, cudaStream_t stream,
  * @param args additional input arrays
  */
 
-template <typename InType, typename MapOp, int TPB = 256, typename... Args,
+template <typename InType,
+          typename MapOp,
+          int TPB = 256,
+          typename... Args,
           typename OutType = InType>
-void map(OutType *out, size_t len, MapOp map, cudaStream_t stream,
-         const InType *in, Args... args) {
-  mapImpl<InType, OutType, MapOp, TPB, Args...>(out, len, map, stream, in,
-                                                args...);
+void map(OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args)
+{
+  mapImpl<InType, OutType, MapOp, TPB, Args...>(out, len, map, stream, in, args...);
 }
 
 }  // namespace linalg
diff --git a/cpp/include/raft/linalg/map_then_reduce.cuh b/cpp/include/raft/linalg/map_then_reduce.cuh
index f2f198670a..78a7017c5c 100644
--- a/cpp/include/raft/linalg/map_then_reduce.cuh
+++ b/cpp/include/raft/linalg/map_then_reduce.cuh
@@ -24,50 +24,66 @@
 namespace raft {
 namespace linalg {
 
-struct sum_tag {};
+struct sum_tag {
+};
 
 template <typename InType, typename OutType, int TPB>
-__device__ void reduce(OutType *out, const InType acc, sum_tag) {
+__device__ void reduce(OutType* out, const InType acc, sum_tag)
+{
   typedef cub::BlockReduce<InType, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   OutType tmp = BlockReduce(temp_storage).Sum(acc);
-  if (threadIdx.x == 0) {
-    raft::myAtomicAdd(out, tmp);
-  }
+  if (threadIdx.x == 0) { raft::myAtomicAdd(out, tmp); }
 }
 
 template <typename InType, typename OutType, int TPB, typename ReduceLambda>
-__device__ void reduce(OutType *out, const InType acc, ReduceLambda op) {
+__device__ void reduce(OutType* out, const InType acc, ReduceLambda op)
+{
   typedef cub::BlockReduce<InType, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   OutType tmp = BlockReduce(temp_storage).Reduce(acc, op);
-  if (threadIdx.x == 0) {
-    raft::myAtomicReduce(out, tmp, op);
-  }
+  if (threadIdx.x == 0) { raft::myAtomicReduce(out, tmp, op); }
 }
 
-template <typename InType, typename OutType, typename MapOp,
-          typename ReduceLambda, int TPB, typename... Args>
-__global__ void mapThenReduceKernel(OutType *out, size_t len, OutType neutral,
-                                    MapOp map, ReduceLambda op,
-                                    const InType *in, Args... args) {
+template <typename InType,
+          typename OutType,
+          typename MapOp,
+          typename ReduceLambda,
+          int TPB,
+          typename... Args>
+__global__ void mapThenReduceKernel(OutType* out,
+                                    size_t len,
+                                    OutType neutral,
+                                    MapOp map,
+                                    ReduceLambda op,
+                                    const InType* in,
+                                    Args... args)
+{
   OutType acc = neutral;
-  auto idx = (threadIdx.x + (blockIdx.x * blockDim.x));
+  auto idx    = (threadIdx.x + (blockIdx.x * blockDim.x));
 
-  if (idx < len) {
-    acc = map(in[idx], args[idx]...);
-  }
+  if (idx < len) { acc = map(in[idx], args[idx]...); }
 
   __syncthreads();
 
   reduce<InType, OutType, TPB>(out, acc, op);
 }
 
-template <typename InType, typename OutType, typename MapOp,
-          typename ReduceLambda, int TPB, typename... Args>
-void mapThenReduceImpl(OutType *out, size_t len, OutType neutral, MapOp map,
-                       ReduceLambda op, cudaStream_t stream, const InType *in,
-                       Args... args) {
+template <typename InType,
+          typename OutType,
+          typename MapOp,
+          typename ReduceLambda,
+          int TPB,
+          typename... Args>
+void mapThenReduceImpl(OutType* out,
+                       size_t len,
+                       OutType neutral,
+                       MapOp map,
+                       ReduceLambda op,
+                       cudaStream_t stream,
+                       const InType* in,
+                       Args... args)
+{
   raft::update_device(out, &neutral, 1, stream);
   const int nblks = raft::ceildiv(len, (size_t)TPB);
   mapThenReduceKernel<InType, OutType, MapOp, ReduceLambda, TPB, Args...>
@@ -89,10 +105,14 @@ void mapThenReduceImpl(OutType *out, size_t len, OutType neutral, MapOp map,
  * @param args additional input arrays
  */
 
-template <typename InType, typename MapOp, int TPB = 256, typename... Args,
+template <typename InType,
+          typename MapOp,
+          int TPB = 256,
+          typename... Args,
           typename OutType = InType>
-void mapThenSumReduce(OutType *out, size_t len, MapOp map, cudaStream_t stream,
-                      const InType *in, Args... args) {
+void mapThenSumReduce(
+  OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args)
+{
   mapThenReduceImpl<InType, OutType, MapOp, sum_tag, TPB, Args...>(
     out, len, (OutType)0, map, sum_tag(), stream, in, args...);
 }
@@ -115,11 +135,21 @@ void mapThenSumReduce(OutType *out, size_t len, MapOp map, cudaStream_t stream,
  * @param args additional input arrays
  */
 
-template <typename InType, typename MapOp, typename ReduceLambda, int TPB = 256,
-          typename OutType = InType, typename... Args>
-void mapThenReduce(OutType *out, size_t len, OutType neutral, MapOp map,
-                   ReduceLambda op, cudaStream_t stream, const InType *in,
-                   Args... args) {
+template <typename InType,
+          typename MapOp,
+          typename ReduceLambda,
+          int TPB          = 256,
+          typename OutType = InType,
+          typename... Args>
+void mapThenReduce(OutType* out,
+                   size_t len,
+                   OutType neutral,
+                   MapOp map,
+                   ReduceLambda op,
+                   cudaStream_t stream,
+                   const InType* in,
+                   Args... args)
+{
   mapThenReduceImpl<InType, OutType, MapOp, ReduceLambda, TPB, Args...>(
     out, len, neutral, map, op, stream, in, args...);
 }
diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh
index 902816418f..98b5eaa809 100644
--- a/cpp/include/raft/linalg/matrix_vector_op.cuh
+++ b/cpp/include/raft/linalg/matrix_vector_op.cuh
@@ -23,10 +23,15 @@ namespace raft {
 namespace linalg {
 
 template <typename Type, int veclen_, typename Lambda, typename IdxType>
-__global__ void matrixVectorOpKernel(Type *out, const Type *matrix,
-                                     const Type *vector, IdxType D, IdxType N,
-                                     bool rowMajor, bool bcastAlongRows,
-                                     Lambda op) {
+__global__ void matrixVectorOpKernel(Type* out,
+                                     const Type* matrix,
+                                     const Type* vector,
+                                     IdxType D,
+                                     IdxType N,
+                                     bool rowMajor,
+                                     bool bcastAlongRows,
+                                     Lambda op)
+{
   typedef TxN_t<Type, veclen_> VecType;
   IdxType len = N * D;
   IdxType idx = threadIdx.x;
@@ -57,17 +62,21 @@ __global__ void matrixVectorOpKernel(Type *out, const Type *matrix,
   mat.store(out, idx);
 }
 
-template <typename Type, int veclen_, typename Lambda, typename IdxType,
-          int TPB>
-void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec,
-                        IdxType D, IdxType N, bool rowMajor,
-                        bool bcastAlongRows, Lambda op, cudaStream_t stream) {
-  IdxType len = N * D;
-  IdxType nblks =
-    raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB);
+template <typename Type, int veclen_, typename Lambda, typename IdxType, int TPB>
+void matrixVectorOpImpl(Type* out,
+                        const Type* matrix,
+                        const Type* vec,
+                        IdxType D,
+                        IdxType N,
+                        bool rowMajor,
+                        bool bcastAlongRows,
+                        Lambda op,
+                        cudaStream_t stream)
+{
+  IdxType len   = N * D;
+  IdxType nblks = raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB);
   matrixVectorOpKernel<Type, veclen_, Lambda, IdxType>
-    <<<nblks, TPB, 0, stream>>>(out, matrix, vec, D, N, rowMajor,
-                                bcastAlongRows, op);
+    <<<nblks, TPB, 0, stream>>>(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -89,11 +98,18 @@ void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec,
  * @param stream cuda stream where to launch work
  */
 template <typename Type, typename Lambda, typename IdxType = int, int TPB = 256>
-void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D,
-                    IdxType N, bool rowMajor, bool bcastAlongRows, Lambda op,
-                    cudaStream_t stream) {
+void matrixVectorOp(Type* out,
+                    const Type* matrix,
+                    const Type* vec,
+                    IdxType D,
+                    IdxType N,
+                    bool rowMajor,
+                    bool bcastAlongRows,
+                    Lambda op,
+                    cudaStream_t stream)
+{
   IdxType stride = rowMajor ? D : N;
-  size_t bytes = stride * sizeof(Type);
+  size_t bytes   = stride * sizeof(Type);
   if (16 / sizeof(Type) && bytes % 16 == 0) {
     matrixVectorOpImpl<Type, 16 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
@@ -118,10 +134,16 @@ void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D,
 ///@todo: come up with a cleaner interface to support these cases in future!
 
 template <typename Type, int veclen_, typename Lambda, typename IdxType>
-__global__ void matrixVectorOpKernel(Type *out, const Type *matrix,
-                                     const Type *vector1, const Type *vector2,
-                                     IdxType D, IdxType N, bool rowMajor,
-                                     bool bcastAlongRows, Lambda op) {
+__global__ void matrixVectorOpKernel(Type* out,
+                                     const Type* matrix,
+                                     const Type* vector1,
+                                     const Type* vector2,
+                                     IdxType D,
+                                     IdxType N,
+                                     bool rowMajor,
+                                     bool bcastAlongRows,
+                                     Lambda op)
+{
   typedef TxN_t<Type, veclen_> VecType;
   IdxType len = N * D;
   IdxType idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * VecType::Ratio;
@@ -154,15 +176,21 @@ __global__ void matrixVectorOpKernel(Type *out, const Type *matrix,
   mat.store(out, idx);
 }
 
-template <typename Type, int veclen_, typename Lambda, typename IdxType,
-          int TPB>
-void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec1,
-                        const Type *vec2, IdxType D, IdxType N, bool rowMajor,
-                        bool bcastAlongRows, Lambda op, cudaStream_t stream) {
+template <typename Type, int veclen_, typename Lambda, typename IdxType, int TPB>
+void matrixVectorOpImpl(Type* out,
+                        const Type* matrix,
+                        const Type* vec1,
+                        const Type* vec2,
+                        IdxType D,
+                        IdxType N,
+                        bool rowMajor,
+                        bool bcastAlongRows,
+                        Lambda op,
+                        cudaStream_t stream)
+{
   IdxType nblks = raft::ceildiv(N * D, (IdxType)TPB);
   matrixVectorOpKernel<Type, veclen_, Lambda, IdxType>
-    <<<nblks, TPB, 0, stream>>>(out, matrix, vec1, vec2, D, N, rowMajor,
-                                bcastAlongRows, op);
+    <<<nblks, TPB, 0, stream>>>(out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -185,11 +213,19 @@ void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec1,
  * @param stream cuda stream where to launch work
  */
 template <typename Type, typename Lambda, typename IdxType = int, int TPB = 256>
-void matrixVectorOp(Type *out, const Type *matrix, const Type *vec1,
-                    const Type *vec2, IdxType D, IdxType N, bool rowMajor,
-                    bool bcastAlongRows, Lambda op, cudaStream_t stream) {
+void matrixVectorOp(Type* out,
+                    const Type* matrix,
+                    const Type* vec1,
+                    const Type* vec2,
+                    IdxType D,
+                    IdxType N,
+                    bool rowMajor,
+                    bool bcastAlongRows,
+                    Lambda op,
+                    cudaStream_t stream)
+{
   IdxType stride = rowMajor ? D : N;
-  size_t bytes = stride * sizeof(Type);
+  size_t bytes   = stride * sizeof(Type);
   if (16 / sizeof(Type) && bytes % 16 == 0) {
     matrixVectorOpImpl<Type, 16 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
diff --git a/cpp/include/raft/linalg/mean_squared_error.cuh b/cpp/include/raft/linalg/mean_squared_error.cuh
index 9d1538c172..a3fcc5bac6 100644
--- a/cpp/include/raft/linalg/mean_squared_error.cuh
+++ b/cpp/include/raft/linalg/mean_squared_error.cuh
@@ -24,7 +24,7 @@ namespace linalg {
 /**
  * @brief CUDA version mean squared error function mean((A-B)**2)
  * @tparam math_t data-type upon which the math operation will be performed
- * @tparam TPB threads-per-block 
+ * @tparam TPB threads-per-block
  * @param out the output mean squared error value (assumed to be a device pointer)
  * @param A input array (assumed to be a device pointer)
  * @param B input array (assumed to be a device pointer)
@@ -33,14 +33,14 @@ namespace linalg {
  * @param stream cuda-stream where to launch this kernel
  */
 template <typename math_t, int TPB = 256>
-void meanSquaredError(math_t* out, const math_t* A, const math_t* B, size_t len,
-                      math_t weight, cudaStream_t stream) {
+void meanSquaredError(
+  math_t* out, const math_t* A, const math_t* B, size_t len, math_t weight, cudaStream_t stream)
+{
   auto sq_diff = [len, weight] __device__(const math_t a, const math_t b) {
     math_t diff = a - b;
     return diff * diff * weight / len;
   };
-  mapThenSumReduce<math_t, decltype(sq_diff), TPB>(out, len, sq_diff, stream, A,
-                                                   B);
+  mapThenSumReduce<math_t, decltype(sq_diff), TPB>(out, len, sq_diff, stream, A, B);
 }
 
 };  // end namespace linalg
diff --git a/cpp/include/raft/linalg/multiply.cuh b/cpp/include/raft/linalg/multiply.cuh
index ce948c927d..53d57ecd00 100644
--- a/cpp/include/raft/linalg/multiply.cuh
+++ b/cpp/include/raft/linalg/multiply.cuh
@@ -33,11 +33,10 @@ namespace linalg {
  * @{
  */
 template <typename math_t, typename IdxType = int>
-void multiplyScalar(math_t *out, const math_t *in, math_t scalar, IdxType len,
-                    cudaStream_t stream) {
+void multiplyScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream)
+{
   unaryOp(
-    out, in, len, [scalar] __device__(math_t in) { return in * scalar; },
-    stream);
+    out, in, len, [scalar] __device__(math_t in) { return in * scalar; }, stream);
 }
 /** @} */
 
diff --git a/cpp/include/raft/linalg/norm.cuh b/cpp/include/raft/linalg/norm.cuh
index 64930a7123..82558c8023 100644
--- a/cpp/include/raft/linalg/norm.cuh
+++ b/cpp/include/raft/linalg/norm.cuh
@@ -44,22 +44,46 @@ enum NormType { L1Norm = 0, L2Norm };
  * @param stream cuda stream where to launch work
  * @param fin_op the final lambda op
  */
-template <typename Type, typename IdxType = int,
-          typename Lambda = raft::Nop<Type, IdxType>>
-void rowNorm(Type *dots, const Type *data, IdxType D, IdxType N, NormType type,
-             bool rowMajor, cudaStream_t stream,
-             Lambda fin_op = raft::Nop<Type, IdxType>()) {
+template <typename Type, typename IdxType = int, typename Lambda = raft::Nop<Type, IdxType>>
+void rowNorm(Type* dots,
+             const Type* data,
+             IdxType D,
+             IdxType N,
+             NormType type,
+             bool rowMajor,
+             cudaStream_t stream,
+             Lambda fin_op = raft::Nop<Type, IdxType>())
+{
   switch (type) {
     case L1Norm:
-      reduce(dots, data, D, N, (Type)0, rowMajor, true, stream, false,
-             raft::L1Op<Type, IdxType>(), raft::Sum<Type>(), fin_op);
+      reduce(dots,
+             data,
+             D,
+             N,
+             (Type)0,
+             rowMajor,
+             true,
+             stream,
+             false,
+             raft::L1Op<Type, IdxType>(),
+             raft::Sum<Type>(),
+             fin_op);
       break;
     case L2Norm:
-      reduce(dots, data, D, N, (Type)0, rowMajor, true, stream, false,
-             raft::L2Op<Type>(), raft::Sum<Type>(), fin_op);
+      reduce(dots,
+             data,
+             D,
+             N,
+             (Type)0,
+             rowMajor,
+             true,
+             stream,
+             false,
+             raft::L2Op<Type>(),
+             raft::Sum<Type>(),
+             fin_op);
       break;
-    default:
-      ASSERT(false, "Invalid norm type passed! [%d]", type);
+    default: ASSERT(false, "Invalid norm type passed! [%d]", type);
   };
 }
 
@@ -77,22 +101,46 @@ void rowNorm(Type *dots, const Type *data, IdxType D, IdxType N, NormType type,
  * @param stream cuda stream where to launch work
  * @param fin_op the final lambda op
  */
-template <typename Type, typename IdxType = int,
-          typename Lambda = raft::Nop<Type, IdxType>>
-void colNorm(Type *dots, const Type *data, IdxType D, IdxType N, NormType type,
-             bool rowMajor, cudaStream_t stream,
-             Lambda fin_op = raft::Nop<Type, IdxType>()) {
+template <typename Type, typename IdxType = int, typename Lambda = raft::Nop<Type, IdxType>>
+void colNorm(Type* dots,
+             const Type* data,
+             IdxType D,
+             IdxType N,
+             NormType type,
+             bool rowMajor,
+             cudaStream_t stream,
+             Lambda fin_op = raft::Nop<Type, IdxType>())
+{
   switch (type) {
     case L1Norm:
-      reduce(dots, data, D, N, (Type)0, rowMajor, false, stream, false,
-             raft::L1Op<Type, IdxType>(), raft::Sum<Type>(), fin_op);
+      reduce(dots,
+             data,
+             D,
+             N,
+             (Type)0,
+             rowMajor,
+             false,
+             stream,
+             false,
+             raft::L1Op<Type, IdxType>(),
+             raft::Sum<Type>(),
+             fin_op);
       break;
     case L2Norm:
-      reduce(dots, data, D, N, (Type)0, rowMajor, false, stream, false,
-             raft::L2Op<Type, IdxType>(), raft::Sum<Type>(), fin_op);
+      reduce(dots,
+             data,
+             D,
+             N,
+             (Type)0,
+             rowMajor,
+             false,
+             stream,
+             false,
+             raft::L2Op<Type, IdxType>(),
+             raft::Sum<Type>(),
+             fin_op);
       break;
-    default:
-      ASSERT(false, "Invalid norm type passed! [%d]", type);
+    default: ASSERT(false, "Invalid norm type passed! [%d]", type);
   };
 }
 
diff --git a/cpp/include/raft/linalg/qr.cuh b/cpp/include/raft/linalg/qr.cuh
index cafa8d54f1..c2455ac3a8 100644
--- a/cpp/include/raft/linalg/qr.cuh
+++ b/cpp/include/raft/linalg/qr.cuh
@@ -40,15 +40,19 @@ namespace linalg {
  * @{
  */
 template <typename math_t>
-void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q,
-            int n_rows, int n_cols, cudaStream_t stream) {
-  auto allocator = handle.get_device_allocator();
+void qrGetQ(const raft::handle_t& handle,
+            const math_t* M,
+            math_t* Q,
+            int n_rows,
+            int n_cols,
+            cudaStream_t stream)
+{
+  auto allocator               = handle.get_device_allocator();
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
 
   int m = n_rows, n = n_cols;
   int k = min(m, n);
-  CUDA_CHECK(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n,
-                             cudaMemcpyDeviceToDevice, stream));
+  CUDA_CHECK(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream));
 
   raft::mr::device::buffer<math_t> tau(allocator, stream, k);
   CUDA_CHECK(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * k, stream));
@@ -58,19 +62,16 @@ void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q,
 
   CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, m, n, Q, m, &Lwork));
   raft::mr::device::buffer<math_t> workspace(allocator, stream, Lwork);
-  CUSOLVER_CHECK(cusolverDngeqrf(cusolverH, m, n, Q, m, tau.data(),
-                                 workspace.data(), Lwork, devInfo.data(),
-                                 stream));
+  CUSOLVER_CHECK(cusolverDngeqrf(
+    cusolverH, m, n, Q, m, tau.data(), workspace.data(), Lwork, devInfo.data(), stream));
   /// @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail.
 #if defined(CUDART_VERSION) && CUDART_VERSION <= 9020
   CUDA_CHECK(cudaDeviceSynchronize());
 #endif
-  CUSOLVER_CHECK(
-    cusolverDnorgqr_bufferSize(cusolverH, m, n, k, Q, m, tau.data(), &Lwork));
+  CUSOLVER_CHECK(cusolverDnorgqr_bufferSize(cusolverH, m, n, k, Q, m, tau.data(), &Lwork));
   workspace.resize(Lwork, stream);
-  CUSOLVER_CHECK(cusolverDnorgqr(cusolverH, m, n, k, Q, m, tau.data(),
-                                 workspace.data(), Lwork, devInfo.data(),
-                                 stream));
+  CUSOLVER_CHECK(cusolverDnorgqr(
+    cusolverH, m, n, k, Q, m, tau.data(), workspace.data(), Lwork, devInfo.data(), stream));
 }
 
 /**
@@ -84,30 +85,41 @@ void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q,
  * @param stream cuda stream
  */
 template <typename math_t>
-void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R,
-             int n_rows, int n_cols, cudaStream_t stream) {
-  auto allocator = handle.get_device_allocator();
+void qrGetQR(const raft::handle_t& handle,
+             math_t* M,
+             math_t* Q,
+             math_t* R,
+             int n_rows,
+             int n_cols,
+             cudaStream_t stream)
+{
+  auto allocator               = handle.get_device_allocator();
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
 
   int m = n_rows, n = n_cols;
   raft::mr::device::buffer<math_t> R_full(allocator, stream, m * n);
   raft::mr::device::buffer<math_t> tau(allocator, stream, min(m, n));
-  CUDA_CHECK(
-    cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * min(m, n), stream));
+  CUDA_CHECK(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * min(m, n), stream));
   int R_full_nrows = m, R_full_ncols = n;
-  CUDA_CHECK(cudaMemcpyAsync(R_full.data(), M, sizeof(math_t) * m * n,
-                             cudaMemcpyDeviceToDevice, stream));
+  CUDA_CHECK(
+    cudaMemcpyAsync(R_full.data(), M, sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream));
 
   int Lwork;
   raft::mr::device::buffer<int> devInfo(allocator, stream, 1);
 
-  CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, R_full_nrows,
-                                            R_full_ncols, R_full.data(),
-                                            R_full_nrows, &Lwork));
+  CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(
+    cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows, &Lwork));
   raft::mr::device::buffer<math_t> workspace(allocator, stream, Lwork);
-  CUSOLVER_CHECK(cusolverDngeqrf(
-    cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows,
-    tau.data(), workspace.data(), Lwork, devInfo.data(), stream));
+  CUSOLVER_CHECK(cusolverDngeqrf(cusolverH,
+                                 R_full_nrows,
+                                 R_full_ncols,
+                                 R_full.data(),
+                                 R_full_nrows,
+                                 tau.data(),
+                                 workspace.data(),
+                                 Lwork,
+                                 devInfo.data(),
+                                 stream));
   // @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail.
 #if defined(CUDART_VERSION) && CUDART_VERSION <= 9020
   CUDA_CHECK(cudaDeviceSynchronize());
@@ -115,17 +127,24 @@ void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R,
 
   raft::matrix::copyUpperTriangular(R_full.data(), R, m, n, stream);
 
-  CUDA_CHECK(cudaMemcpyAsync(Q, R_full.data(), sizeof(math_t) * m * n,
-                             cudaMemcpyDeviceToDevice, stream));
+  CUDA_CHECK(
+    cudaMemcpyAsync(Q, R_full.data(), sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream));
   int Q_nrows = m, Q_ncols = n;
 
-  CUSOLVER_CHECK(cusolverDnorgqr_bufferSize(cusolverH, Q_nrows, Q_ncols,
-                                            min(Q_ncols, Q_nrows), Q, Q_nrows,
-                                            tau.data(), &Lwork));
+  CUSOLVER_CHECK(cusolverDnorgqr_bufferSize(
+    cusolverH, Q_nrows, Q_ncols, min(Q_ncols, Q_nrows), Q, Q_nrows, tau.data(), &Lwork));
   workspace.resize(Lwork, stream);
-  CUSOLVER_CHECK(cusolverDnorgqr(
-    cusolverH, Q_nrows, Q_ncols, min(Q_ncols, Q_nrows), Q, Q_nrows, tau.data(),
-    workspace.data(), Lwork, devInfo.data(), stream));
+  CUSOLVER_CHECK(cusolverDnorgqr(cusolverH,
+                                 Q_nrows,
+                                 Q_ncols,
+                                 min(Q_ncols, Q_nrows),
+                                 Q,
+                                 Q_nrows,
+                                 tau.data(),
+                                 workspace.data(),
+                                 Lwork,
+                                 devInfo.data(),
+                                 stream));
 }
 /** @} */
 
diff --git a/cpp/include/raft/linalg/reduce.cuh b/cpp/include/raft/linalg/reduce.cuh
index d39577bbdd..693a797db9 100644
--- a/cpp/include/raft/linalg/reduce.cuh
+++ b/cpp/include/raft/linalg/reduce.cuh
@@ -52,28 +52,33 @@ namespace linalg {
  * @param reduce_op binary reduction operation
  * @param final_op elementwise operation to apply before storing results
  */
-template <typename InType, typename OutType = InType, typename IdxType = int,
-          typename MainLambda = raft::Nop<InType, IdxType>,
+template <typename InType,
+          typename OutType      = InType,
+          typename IdxType      = int,
+          typename MainLambda   = raft::Nop<InType, IdxType>,
           typename ReduceLambda = raft::Sum<OutType>,
-          typename FinalLambda = raft::Nop<OutType>>
-void reduce(OutType *dots, const InType *data, int D, int N, OutType init,
-            bool rowMajor, bool alongRows, cudaStream_t stream,
-            bool inplace = false,
-            MainLambda main_op = raft::Nop<InType, IdxType>(),
+          typename FinalLambda  = raft::Nop<OutType>>
+void reduce(OutType* dots,
+            const InType* data,
+            int D,
+            int N,
+            OutType init,
+            bool rowMajor,
+            bool alongRows,
+            cudaStream_t stream,
+            bool inplace           = false,
+            MainLambda main_op     = raft::Nop<InType, IdxType>(),
             ReduceLambda reduce_op = raft::Sum<OutType>(),
-            FinalLambda final_op = raft::Nop<OutType>()) {
+            FinalLambda final_op   = raft::Nop<OutType>())
+{
   if (rowMajor && alongRows) {
-    coalescedReduction(dots, data, D, N, init, stream, inplace, main_op,
-                       reduce_op, final_op);
+    coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
   } else if (rowMajor && !alongRows) {
-    stridedReduction(dots, data, D, N, init, stream, inplace, main_op,
-                     reduce_op, final_op);
+    stridedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
   } else if (!rowMajor && alongRows) {
-    stridedReduction(dots, data, N, D, init, stream, inplace, main_op,
-                     reduce_op, final_op);
+    stridedReduction(dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op);
   } else {
-    coalescedReduction(dots, data, N, D, init, stream, inplace, main_op,
-                       reduce_op, final_op);
+    coalescedReduction(dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op);
   }
 }
 
diff --git a/cpp/include/raft/linalg/strided_reduction.cuh b/cpp/include/raft/linalg/strided_reduction.cuh
index bba652e137..f931c976fd 100644
--- a/cpp/include/raft/linalg/strided_reduction.cuh
+++ b/cpp/include/raft/linalg/strided_reduction.cuh
@@ -28,14 +28,15 @@ namespace linalg {
 // of the matrix, i.e. reduce along columns for row major or reduce along rows
 // for column major layout
 template <typename Type, typename MainLambda>
-__global__ void stridedSummationKernel(Type *dots, const Type *data, int D,
-                                       int N, Type init, MainLambda main_op) {
+__global__ void stridedSummationKernel(
+  Type* dots, const Type* data, int D, int N, Type init, MainLambda main_op)
+{
   // Thread reduction
   Type thread_data = Type(init);
-  int colStart = blockIdx.x * blockDim.x + threadIdx.x;
+  int colStart     = blockIdx.x * blockDim.x + threadIdx.x;
   if (colStart < D) {
     int rowStart = blockIdx.y * blockDim.y + threadIdx.y;
-    int stride = blockDim.y * gridDim.y;
+    int stride   = blockDim.y * gridDim.y;
     for (int j = rowStart; j < N; j += stride) {
       int idx = colStart + j * D;
       thread_data += main_op(data[idx], j);
@@ -44,8 +45,8 @@ __global__ void stridedSummationKernel(Type *dots, const Type *data, int D,
 
   // Block reduction
   extern __shared__ char tmp[];  // One element per thread in block
-  Type *temp = (Type *)tmp;      // Cast to desired type
-  int myidx = threadIdx.x + blockDim.x * threadIdx.y;
+  Type* temp  = (Type*)tmp;      // Cast to desired type
+  int myidx   = threadIdx.x + blockDim.x * threadIdx.y;
   temp[myidx] = thread_data;
   __syncthreads();
   for (int j = blockDim.y / 2; j > 0; j /= 2) {
@@ -54,24 +55,31 @@ __global__ void stridedSummationKernel(Type *dots, const Type *data, int D,
   }
 
   // Grid reduction
-  if ((colStart < D) && (threadIdx.y == 0))
-    raft::myAtomicAdd(dots + colStart, temp[myidx]);
+  if ((colStart < D) && (threadIdx.y == 0)) raft::myAtomicAdd(dots + colStart, temp[myidx]);
 }
 
 // Kernel to perform reductions along the strided dimension
 // of the matrix, i.e. reduce along columns for row major or reduce along rows
 // for column major layout
-template <typename InType, typename OutType, typename IdxType,
-          typename MainLambda, typename ReduceLambda>
-__global__ void stridedReductionKernel(OutType *dots, const InType *data, int D,
-                                       int N, OutType init, MainLambda main_op,
-                                       ReduceLambda reduce_op) {
+template <typename InType,
+          typename OutType,
+          typename IdxType,
+          typename MainLambda,
+          typename ReduceLambda>
+__global__ void stridedReductionKernel(OutType* dots,
+                                       const InType* data,
+                                       int D,
+                                       int N,
+                                       OutType init,
+                                       MainLambda main_op,
+                                       ReduceLambda reduce_op)
+{
   // Thread reduction
   OutType thread_data = init;
-  IdxType colStart = blockIdx.x * blockDim.x + threadIdx.x;
+  IdxType colStart    = blockIdx.x * blockDim.x + threadIdx.x;
   if (colStart < D) {
     IdxType rowStart = blockIdx.y * blockDim.y + threadIdx.y;
-    IdxType stride = blockDim.y * gridDim.y;
+    IdxType stride   = blockDim.y * gridDim.y;
     for (IdxType j = rowStart; j < N; j += stride) {
       IdxType idx = colStart + j * D;
       thread_data = reduce_op(thread_data, main_op(data[idx], j));
@@ -79,14 +87,13 @@ __global__ void stridedReductionKernel(OutType *dots, const InType *data, int D,
   }
 
   // Block reduction
-  extern __shared__ char tmp[];  // One element per thread in block
-  auto *temp = (OutType *)tmp;   // Cast to desired type
+  extern __shared__ char tmp[];   // One element per thread in block
+  auto* temp    = (OutType*)tmp;  // Cast to desired type
   IdxType myidx = threadIdx.x + ((IdxType)blockDim.x * (IdxType)threadIdx.y);
-  temp[myidx] = thread_data;
+  temp[myidx]   = thread_data;
   __syncthreads();
   for (int j = blockDim.y / 2; j > 0; j /= 2) {
-    if (threadIdx.y < j)
-      temp[myidx] = reduce_op(temp[myidx], temp[myidx + j * blockDim.x]);
+    if (threadIdx.y < j) temp[myidx] = reduce_op(temp[myidx], temp[myidx + j * blockDim.x]);
     __syncthreads();
   }
 
@@ -122,15 +129,23 @@ __global__ void stridedReductionKernel(OutType *dots, const InType *data, int D,
  * @param inplace reduction result added inplace or overwrites old values?
  * @param stream cuda stream where to launch work
  */
-template <typename InType, typename OutType = InType, typename IdxType = int,
-          typename MainLambda = raft::Nop<InType, IdxType>,
+template <typename InType,
+          typename OutType      = InType,
+          typename IdxType      = int,
+          typename MainLambda   = raft::Nop<InType, IdxType>,
           typename ReduceLambda = raft::Sum<OutType>,
-          typename FinalLambda = raft::Nop<OutType>>
-void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N,
-                      OutType init, cudaStream_t stream, bool inplace = false,
-                      MainLambda main_op = raft::Nop<InType, IdxType>(),
+          typename FinalLambda  = raft::Nop<OutType>>
+void stridedReduction(OutType* dots,
+                      const InType* data,
+                      IdxType D,
+                      IdxType N,
+                      OutType init,
+                      cudaStream_t stream,
+                      bool inplace           = false,
+                      MainLambda main_op     = raft::Nop<InType, IdxType>(),
                       ReduceLambda reduce_op = raft::Sum<OutType>(),
-                      FinalLambda final_op = raft::Nop<OutType>()) {
+                      FinalLambda final_op   = raft::Nop<OutType>())
+{
   ///@todo: this extra should go away once we have eliminated the need
   /// for atomics in stridedKernel (redesign for this is already underway)
   if (!inplace)
@@ -140,7 +155,7 @@ void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N,
   // Arbitrary numbers for now, probably need to tune
   const dim3 thrds(32, 16);
   IdxType elemsPerThread = raft::ceildiv(N, (IdxType)thrds.y);
-  elemsPerThread = (elemsPerThread > 8) ? 8 : elemsPerThread;
+  elemsPerThread         = (elemsPerThread > 8) ? 8 : elemsPerThread;
   const dim3 nblks(raft::ceildiv(D, (IdxType)thrds.x),
                    raft::ceildiv(N, (IdxType)thrds.y * elemsPerThread));
   const size_t shmemSize = sizeof(OutType) * thrds.x * thrds.y;
@@ -153,8 +168,7 @@ void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N,
       <<<nblks, thrds, shmemSize, stream>>>(dots, data, D, N, init, main_op);
   else
     stridedReductionKernel<InType, OutType, IdxType>
-      <<<nblks, thrds, shmemSize, stream>>>(dots, data, D, N, init, main_op,
-                                            reduce_op);
+      <<<nblks, thrds, shmemSize, stream>>>(dots, data, D, N, init, main_op, reduce_op);
 
   ///@todo: this complication should go away once we have eliminated the need
   /// for atomics in stridedKernel (redesign for this is already underway)
diff --git a/cpp/include/raft/linalg/subtract.cuh b/cpp/include/raft/linalg/subtract.cuh
index 882c105689..43060d0818 100644
--- a/cpp/include/raft/linalg/subtract.cuh
+++ b/cpp/include/raft/linalg/subtract.cuh
@@ -38,8 +38,8 @@ namespace linalg {
  * @param stream cuda stream where to launch work
  */
 template <typename InT, typename OutT = InT, typename IdxType = int>
-void subtractScalar(OutT *out, const InT *in, InT scalar, IdxType len,
-                    cudaStream_t stream) {
+void subtractScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream)
+{
   auto op = [scalar] __device__(InT in) { return OutT(in - scalar); };
   unaryOp<InT, decltype(op), IdxType, OutT>(out, in, len, op, stream);
 }
@@ -58,24 +58,25 @@ void subtractScalar(OutT *out, const InT *in, InT scalar, IdxType len,
  * @param stream cuda stream where to launch work
  */
 template <typename InT, typename OutT = InT, typename IdxType = int>
-void subtract(OutT *out, const InT *in1, const InT *in2, IdxType len,
-              cudaStream_t stream) {
+void subtract(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream)
+{
   auto op = [] __device__(InT a, InT b) { return OutT(a - b); };
   binaryOp<InT, decltype(op), OutT, IdxType>(out, in1, in2, len, op, stream);
 }
 
 template <class math_t, typename IdxType>
-__global__ void subtract_dev_scalar_kernel(math_t *outDev, const math_t *inDev,
-                                           const math_t *singleScalarDev,
-                                           IdxType len) {
-  //TODO: kernel do not use shared memory in current implementation
+__global__ void subtract_dev_scalar_kernel(math_t* outDev,
+                                           const math_t* inDev,
+                                           const math_t* singleScalarDev,
+                                           IdxType len)
+{
+  // TODO: kernel does not use shared memory in the current implementation
   int i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x;
-  if (i < len) {
-    outDev[i] = inDev[i] - *singleScalarDev;
-  }
+  if (i < len) { outDev[i] = inDev[i] - *singleScalarDev; }
 }
 
-/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and write result to outDev[i]
+/** Subtract the single value pointed to by singleScalarDev in device memory from inDev[i] and
+ * write the result to outDev[i]
  * @tparam math_t data-type upon which the math operation will be performed
  * @tparam IdxType Integer type used to for addressing
  * @param outDev the output buffer
@@ -86,9 +87,12 @@ __global__ void subtract_dev_scalar_kernel(math_t *outDev, const math_t *inDev,
  * @remark block size has not been tuned
  */
 template <typename math_t, typename IdxType = int, int TPB = 256>
-void subtractDevScalar(math_t *outDev, const math_t *inDev,
-                       const math_t *singleScalarDev, IdxType len,
-                       cudaStream_t stream) {
+void subtractDevScalar(math_t* outDev,
+                       const math_t* inDev,
+                       const math_t* singleScalarDev,
+                       IdxType len,
+                       cudaStream_t stream)
+{
   // Just for the note - there is no way to express such operation with cuBLAS in effective way
   // https://stackoverflow.com/questions/14051064/add-scalar-to-vector-in-blas-cublas-cuda
   const IdxType nblks = raft::ceildiv(len, (IdxType)TPB);
diff --git a/cpp/include/raft/linalg/svd.cuh b/cpp/include/raft/linalg/svd.cuh
index 7357a68a4c..1cb8b7592f 100644
--- a/cpp/include/raft/linalg/svd.cuh
+++ b/cpp/include/raft/linalg/svd.cuh
@@ -50,14 +50,21 @@ namespace linalg {
 // TODO: couldn't template this function due to cusolverDnSgesvd and
 // cusolverSnSgesvd. Check if there is any other way.
 template <typename T>
-void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols,
-           T *sing_vals, T *left_sing_vecs, T *right_sing_vecs,
-           bool trans_right, bool gen_left_vec, bool gen_right_vec,
-           cudaStream_t stream) {
-  std::shared_ptr<raft::mr::device::allocator> allocator =
-    handle.get_device_allocator();
-  cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
-  cublasHandle_t cublasH = handle.get_cublas_handle();
+void svdQR(const raft::handle_t& handle,
+           T* in,
+           int n_rows,
+           int n_cols,
+           T* sing_vals,
+           T* left_sing_vecs,
+           T* right_sing_vecs,
+           bool trans_right,
+           bool gen_left_vec,
+           bool gen_right_vec,
+           cudaStream_t stream)
+{
+  std::shared_ptr<raft::mr::device::allocator> allocator = handle.get_device_allocator();
+  cusolverDnHandle_t cusolverH                           = handle.get_cusolver_dn_handle();
+  cublasHandle_t cublasH                                 = handle.get_cublas_handle();
 
 #if CUDART_VERSION >= 10010 && CUDART_VERSION < 11000
   // 46340: sqrt of max int value
@@ -72,14 +79,13 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols,
   const int n = n_cols;
 
   raft::mr::device::buffer<int> devInfo(allocator, stream, 1);
-  T *d_rwork = nullptr;
+  T* d_rwork = nullptr;
 
   int lwork = 0;
-  CUSOLVER_CHECK(
-    cusolverDngesvd_bufferSize<T>(cusolverH, n_rows, n_cols, &lwork));
+  CUSOLVER_CHECK(cusolverDngesvd_bufferSize<T>(cusolverH, n_rows, n_cols, &lwork));
   raft::mr::device::buffer<T> d_work(allocator, stream, lwork);
 
-  char jobu = 'S';
+  char jobu  = 'S';
   char jobvt = 'A';
 
   if (!gen_left_vec) {
@@ -92,9 +98,23 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols,
     strcpy(&jobvt, &new_vt);
   }
 
-  CUSOLVER_CHECK(cusolverDngesvd(
-    cusolverH, jobu, jobvt, m, n, in, m, sing_vals, left_sing_vecs, m,
-    right_sing_vecs, n, d_work.data(), lwork, d_rwork, devInfo.data(), stream));
+  CUSOLVER_CHECK(cusolverDngesvd(cusolverH,
+                                 jobu,
+                                 jobvt,
+                                 m,
+                                 n,
+                                 in,
+                                 m,
+                                 sing_vals,
+                                 left_sing_vecs,
+                                 m,
+                                 right_sing_vecs,
+                                 n,
+                                 d_work.data(),
+                                 lwork,
+                                 d_rwork,
+                                 devInfo.data(),
+                                 stream));
 
   // Transpose the right singular vector back
   if (trans_right) raft::linalg::transpose(right_sing_vecs, n_cols, stream);
@@ -110,19 +130,37 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols,
 }
 
 template <typename T>
-void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S,
-            T *U, T *V, bool gen_left_vec, cudaStream_t stream) {
-  auto allocator = handle.get_device_allocator();
+void svdEig(const raft::handle_t& handle,
+            T* in,
+            int n_rows,
+            int n_cols,
+            T* S,
+            T* U,
+            T* V,
+            bool gen_left_vec,
+            cudaStream_t stream)
+{
+  auto allocator               = handle.get_device_allocator();
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
-  cublasHandle_t cublasH = handle.get_cublas_handle();
+  cublasHandle_t cublasH       = handle.get_cublas_handle();
 
   int len = n_cols * n_cols;
   raft::mr::device::buffer<T> in_cross_mult(allocator, stream, len);
 
   T alpha = T(1);
-  T beta = T(0);
-  raft::linalg::gemm(handle, in, n_rows, n_cols, in, in_cross_mult.data(),
-                     n_cols, n_cols, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta,
+  T beta  = T(0);
+  raft::linalg::gemm(handle,
+                     in,
+                     n_rows,
+                     n_cols,
+                     in,
+                     in_cross_mult.data(),
+                     n_cols,
+                     n_cols,
+                     CUBLAS_OP_T,
+                     CUBLAS_OP_N,
+                     alpha,
+                     beta,
                      stream);
 
   eigDC(handle, in_cross_mult.data(), n_cols, n_cols, V, S, stream);
@@ -133,10 +171,20 @@ void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S,
   raft::matrix::seqRoot(S, S, alpha, n_cols, stream, true);
 
   if (gen_left_vec) {
-    raft::linalg::gemm(handle, in, n_rows, n_cols, V, U, n_rows, n_cols,
-                       CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream);
-    raft::matrix::matrixVectorBinaryDivSkipZero(U, S, n_rows, n_cols, false,
-                                                true, stream);
+    raft::linalg::gemm(handle,
+                       in,
+                       n_rows,
+                       n_cols,
+                       V,
+                       U,
+                       n_rows,
+                       n_cols,
+                       CUBLAS_OP_N,
+                       CUBLAS_OP_N,
+                       alpha,
+                       beta,
+                       stream);
+    raft::matrix::matrixVectorBinaryDivSkipZero(U, S, n_rows, n_cols, false, true, stream);
   }
 }
 
@@ -158,11 +206,20 @@ void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S,
  * @param stream cuda stream
  */
 template <typename math_t>
-void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols,
-               math_t *sing_vals, math_t *left_sing_vecs,
-               math_t *right_sing_vecs, bool gen_left_vec, bool gen_right_vec,
-               math_t tol, int max_sweeps, cudaStream_t stream) {
-  auto allocator = handle.get_device_allocator();
+void svdJacobi(const raft::handle_t& handle,
+               math_t* in,
+               int n_rows,
+               int n_cols,
+               math_t* sing_vals,
+               math_t* left_sing_vecs,
+               math_t* right_sing_vecs,
+               bool gen_left_vec,
+               bool gen_right_vec,
+               math_t tol,
+               int max_sweeps,
+               cudaStream_t stream)
+{
+  auto allocator               = handle.get_device_allocator();
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
 
   gesvdjInfo_t gesvdj_params = NULL;
@@ -177,18 +234,42 @@ void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols,
   raft::mr::device::buffer<int> devInfo(allocator, stream, 1);
 
   int lwork = 0;
-  int econ = 1;
-
-  CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj_bufferSize(
-    cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals,
-    left_sing_vecs, m, right_sing_vecs, n, &lwork, gesvdj_params));
+  int econ  = 1;
+
+  CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj_bufferSize(cusolverH,
+                                                           CUSOLVER_EIG_MODE_VECTOR,
+                                                           econ,
+                                                           m,
+                                                           n,
+                                                           in,
+                                                           m,
+                                                           sing_vals,
+                                                           left_sing_vecs,
+                                                           m,
+                                                           right_sing_vecs,
+                                                           n,
+                                                           &lwork,
+                                                           gesvdj_params));
 
   raft::mr::device::buffer<math_t> d_work(allocator, stream, lwork);
 
-  CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj(
-    cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals,
-    left_sing_vecs, m, right_sing_vecs, n, d_work.data(), lwork, devInfo.data(),
-    gesvdj_params, stream));
+  CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj(cusolverH,
+                                                CUSOLVER_EIG_MODE_VECTOR,
+                                                econ,
+                                                m,
+                                                n,
+                                                in,
+                                                m,
+                                                sing_vals,
+                                                left_sing_vecs,
+                                                m,
+                                                right_sing_vecs,
+                                                n,
+                                                d_work.data(),
+                                                lwork,
+                                                devInfo.data(),
+                                                gesvdj_params,
+                                                stream));
 
   CUSOLVER_CHECK(cusolverDnDestroyGesvdjInfo(gesvdj_params));
 }
@@ -207,18 +288,36 @@ void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols,
  * @param stream cuda stream
  */
 template <typename math_t>
-void svdReconstruction(const raft::handle_t &handle, math_t *U, math_t *S,
-                       math_t *V, math_t *out, int n_rows, int n_cols, int k,
-                       cudaStream_t stream) {
+void svdReconstruction(const raft::handle_t& handle,
+                       math_t* U,
+                       math_t* S,
+                       math_t* V,
+                       math_t* out,
+                       int n_rows,
+                       int n_cols,
+                       int k,
+                       cudaStream_t stream)
+{
   auto allocator = handle.get_device_allocator();
 
   const math_t alpha = 1.0, beta = 0.0;
   raft::mr::device::buffer<math_t> SVT(allocator, stream, k * n_cols);
 
-  raft::linalg::gemm(handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N,
-                     CUBLAS_OP_T, alpha, beta, stream);
-  raft::linalg::gemm(handle, U, n_rows, k, SVT.data(), out, n_rows, n_cols,
-                     CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream);
+  raft::linalg::gemm(
+    handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N, CUBLAS_OP_T, alpha, beta, stream);
+  raft::linalg::gemm(handle,
+                     U,
+                     n_rows,
+                     k,
+                     SVT.data(),
+                     out,
+                     n_rows,
+                     n_cols,
+                     CUBLAS_OP_N,
+                     CUBLAS_OP_N,
+                     alpha,
+                     beta,
+                     stream);
 }
 
 /**
@@ -236,10 +335,18 @@ void svdReconstruction(const raft::handle_t &handle, math_t *U, math_t *S,
  * @param stream cuda stream
  */
 template <typename math_t>
-bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U,
-                         math_t *S_vec, math_t *V, int n_rows, int n_cols,
-                         int k, math_t tol, cudaStream_t stream) {
-  auto allocator = handle.get_device_allocator();
+bool evaluateSVDByL2Norm(const raft::handle_t& handle,
+                         math_t* A_d,
+                         math_t* U,
+                         math_t* S_vec,
+                         math_t* V,
+                         int n_rows,
+                         int n_cols,
+                         int k,
+                         math_t tol,
+                         cudaStream_t stream)
+{
+  auto allocator         = handle.get_device_allocator();
   cublasHandle_t cublasH = handle.get_cublas_handle();
 
   int m = n_rows, n = n_cols;
@@ -263,16 +370,25 @@ bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U,
   // calculate percent error
   const math_t alpha = 1.0, beta = -1.0;
   raft::mr::device::buffer<math_t> A_minus_P(allocator, stream, m * n);
-  CUDA_CHECK(
-    cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream));
-
-  CUBLAS_CHECK(raft::linalg::cublasgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, m, n,
-                                        &alpha, A_d, m, &beta, P_d.data(), m,
-                                        A_minus_P.data(), m, stream));
-
-  math_t norm_A_minus_P =
-    raft::matrix::getL2Norm(handle, A_minus_P.data(), m * n, stream);
-  math_t percent_error = 100.0 * norm_A_minus_P / normA;
+  CUDA_CHECK(cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream));
+
+  CUBLAS_CHECK(raft::linalg::cublasgeam(cublasH,
+                                        CUBLAS_OP_N,
+                                        CUBLAS_OP_N,
+                                        m,
+                                        n,
+                                        &alpha,
+                                        A_d,
+                                        m,
+                                        &beta,
+                                        P_d.data(),
+                                        m,
+                                        A_minus_P.data(),
+                                        m,
+                                        stream));
+
+  math_t norm_A_minus_P = raft::matrix::getL2Norm(handle, A_minus_P.data(), m * n, stream);
+  math_t percent_error  = 100.0 * norm_A_minus_P / normA;
   return (percent_error / 100.0 < tol);
 }
 
diff --git a/cpp/include/raft/linalg/transpose.h b/cpp/include/raft/linalg/transpose.h
index d90f6271fa..9b954c29c1 100644
--- a/cpp/include/raft/linalg/transpose.h
+++ b/cpp/include/raft/linalg/transpose.h
@@ -33,18 +33,34 @@ namespace linalg {
  * @param stream: cuda stream
  */
 template <typename math_t>
-void transpose(const raft::handle_t &handle, math_t *in, math_t *out,
-               int n_rows, int n_cols, cudaStream_t stream) {
+void transpose(const raft::handle_t& handle,
+               math_t* in,
+               math_t* out,
+               int n_rows,
+               int n_cols,
+               cudaStream_t stream)
+{
   cublasHandle_t cublas_h = handle.get_cublas_handle();
 
   int out_n_rows = n_cols;
   int out_n_cols = n_rows;
 
   const math_t alpha = 1.0;
-  const math_t beta = 0.0;
-  CUBLAS_CHECK(raft::linalg::cublasgeam(
-    cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, out_n_rows, out_n_cols, &alpha, in,
-    n_rows, &beta, out, out_n_rows, out, out_n_rows, stream));
+  const math_t beta  = 0.0;
+  CUBLAS_CHECK(raft::linalg::cublasgeam(cublas_h,
+                                        CUBLAS_OP_T,
+                                        CUBLAS_OP_N,
+                                        out_n_rows,
+                                        out_n_cols,
+                                        &alpha,
+                                        in,
+                                        n_rows,
+                                        &beta,
+                                        out,
+                                        out_n_rows,
+                                        out,
+                                        out_n_rows,
+                                        stream));
 }
 
 /**
@@ -54,24 +70,25 @@ void transpose(const raft::handle_t &handle, math_t *in, math_t *out,
  * @param stream: cuda stream
  */
 template <typename math_t>
-void transpose(math_t *inout, int n, cudaStream_t stream) {
-  auto m = n;
-  auto size = n * n;
-  auto d_inout = inout;
+void transpose(math_t* inout, int n, cudaStream_t stream)
+{
+  auto m        = n;
+  auto size     = n * n;
+  auto d_inout  = inout;
   auto counting = thrust::make_counting_iterator<int>(0);
 
-  thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size,
-                   [=] __device__(int idx) {
-                     int s_row = idx % m;
-                     int s_col = idx / m;
-                     int d_row = s_col;
-                     int d_col = s_row;
-                     if (s_row < s_col) {
-                       auto temp = d_inout[d_col * m + d_row];
-                       d_inout[d_col * m + d_row] = d_inout[s_col * m + s_row];
-                       d_inout[s_col * m + s_row] = temp;
-                     }
-                   });
+  thrust::for_each(
+    thrust::cuda::par.on(stream), counting, counting + size, [=] __device__(int idx) {
+      int s_row = idx % m;
+      int s_col = idx / m;
+      int d_row = s_col;
+      int d_col = s_row;
+      if (s_row < s_col) {
+        auto temp                  = d_inout[d_col * m + d_row];
+        d_inout[d_col * m + d_row] = d_inout[s_col * m + s_row];
+        d_inout[s_col * m + s_row] = temp;
+      }
+    });
 }
 
 };  // end namespace linalg
diff --git a/cpp/include/raft/linalg/unary_op.cuh b/cpp/include/raft/linalg/unary_op.cuh
index 46b4d296cb..198b9b2b10 100644
--- a/cpp/include/raft/linalg/unary_op.cuh
+++ b/cpp/include/raft/linalg/unary_op.cuh
@@ -23,10 +23,9 @@
 namespace raft {
 namespace linalg {
 
-template <typename InType, int VecLen, typename Lambda, typename OutType,
-          typename IdxType>
-__global__ void unaryOpKernel(OutType *out, const InType *in, IdxType len,
-                              Lambda op) {
+template <typename InType, int VecLen, typename Lambda, typename OutType, typename IdxType>
+__global__ void unaryOpKernel(OutType* out, const InType* in, IdxType len, Lambda op)
+{
   typedef TxN_t<InType, VecLen> InVecType;
   typedef TxN_t<OutType, VecLen> OutVecType;
   InVecType a;
@@ -42,12 +41,10 @@ __global__ void unaryOpKernel(OutType *out, const InType *in, IdxType len,
   b.store(out, idx);
 }
 
-template <typename InType, int VecLen, typename Lambda, typename OutType,
-          typename IdxType, int TPB>
-void unaryOpImpl(OutType *out, const InType *in, IdxType len, Lambda op,
-                 cudaStream_t stream) {
-  const IdxType nblks =
-    raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB);
+template <typename InType, int VecLen, typename Lambda, typename OutType, typename IdxType, int TPB>
+void unaryOpImpl(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream)
+{
+  const IdxType nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB);
   unaryOpKernel<InType, VecLen, Lambda, OutType, IdxType>
     <<<nblks, TPB, 0, stream>>>(out, in, len, op);
   CUDA_CHECK(cudaPeekAtLastError());
@@ -68,47 +65,38 @@ void unaryOpImpl(OutType *out, const InType *in, IdxType len, Lambda op,
  * @note Lambda must be a functor with the following signature:
  *       `OutType func(const InType& val);`
  */
-template <typename InType, typename Lambda, typename IdxType = int,
-          typename OutType = InType, int TPB = 256>
-void unaryOp(OutType *out, const InType *in, IdxType len, Lambda op,
-             cudaStream_t stream) {
-  if (len <= 0) return;  //silently skip in case of 0 length input
-  constexpr auto maxSize =
-    sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType);
-  size_t bytes = len * maxSize;
-  uint64_t inAddr = uint64_t(in);
-  uint64_t outAddr = uint64_t(out);
-  if (16 / maxSize && bytes % 16 == 0 && inAddr % 16 == 0 &&
-      outAddr % 16 == 0) {
-    unaryOpImpl<InType, 16 / maxSize, Lambda, OutType, IdxType, TPB>(
-      out, in, len, op, stream);
-  } else if (8 / maxSize && bytes % 8 == 0 && inAddr % 8 == 0 &&
-             outAddr % 8 == 0) {
-    unaryOpImpl<InType, 8 / maxSize, Lambda, OutType, IdxType, TPB>(
-      out, in, len, op, stream);
-  } else if (4 / maxSize && bytes % 4 == 0 && inAddr % 4 == 0 &&
-             outAddr % 4 == 0) {
-    unaryOpImpl<InType, 4 / maxSize, Lambda, OutType, IdxType, TPB>(
-      out, in, len, op, stream);
-  } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 &&
-             outAddr % 2 == 0) {
-    unaryOpImpl<InType, 2 / maxSize, Lambda, OutType, IdxType, TPB>(
-      out, in, len, op, stream);
+template <typename InType,
+          typename Lambda,
+          typename IdxType = int,
+          typename OutType = InType,
+          int TPB          = 256>
+void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream)
+{
+  if (len <= 0) return;  // silently skip in case of 0 length input
+  constexpr auto maxSize = sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType);
+  size_t bytes           = len * maxSize;
+  uint64_t inAddr        = uint64_t(in);
+  uint64_t outAddr       = uint64_t(out);
+  if (16 / maxSize && bytes % 16 == 0 && inAddr % 16 == 0 && outAddr % 16 == 0) {
+    unaryOpImpl<InType, 16 / maxSize, Lambda, OutType, IdxType, TPB>(out, in, len, op, stream);
+  } else if (8 / maxSize && bytes % 8 == 0 && inAddr % 8 == 0 && outAddr % 8 == 0) {
+    unaryOpImpl<InType, 8 / maxSize, Lambda, OutType, IdxType, TPB>(out, in, len, op, stream);
+  } else if (4 / maxSize && bytes % 4 == 0 && inAddr % 4 == 0 && outAddr % 4 == 0) {
+    unaryOpImpl<InType, 4 / maxSize, Lambda, OutType, IdxType, TPB>(out, in, len, op, stream);
+  } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 && outAddr % 2 == 0) {
+    unaryOpImpl<InType, 2 / maxSize, Lambda, OutType, IdxType, TPB>(out, in, len, op, stream);
   } else if (1 / maxSize) {
-    unaryOpImpl<InType, 1 / maxSize, Lambda, OutType, IdxType, TPB>(
-      out, in, len, op, stream);
+    unaryOpImpl<InType, 1 / maxSize, Lambda, OutType, IdxType, TPB>(out, in, len, op, stream);
   } else {
-    unaryOpImpl<InType, 1, Lambda, OutType, IdxType, TPB>(out, in, len, op,
-                                                          stream);
+    unaryOpImpl<InType, 1, Lambda, OutType, IdxType, TPB>(out, in, len, op, stream);
   }
 }
 
 template <typename OutType, typename Lambda, typename IdxType>
-__global__ void writeOnlyUnaryOpKernel(OutType *out, IdxType len, Lambda op) {
+__global__ void writeOnlyUnaryOpKernel(OutType* out, IdxType len, Lambda op)
+{
   IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x);
-  if (idx < len) {
-    op(out + idx, idx);
-  }
+  if (idx < len) { op(out + idx, idx); }
 }
 
 /**
@@ -128,14 +116,12 @@ __global__ void writeOnlyUnaryOpKernel(OutType *out, IdxType len, Lambda op) {
  *                    where outLocationOffset will be out + idx.
  * @param[in]  stream cuda stream where to launch work
  */
-template <typename OutType, typename Lambda, typename IdxType = int,
-          int TPB = 256>
-void writeOnlyUnaryOp(OutType *out, IdxType len, Lambda op,
-                      cudaStream_t stream) {
+template <typename OutType, typename Lambda, typename IdxType = int, int TPB = 256>
+void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream)
+{
   if (len <= 0) return;  // silently skip in case of 0 length input
   auto nblks = raft::ceildiv<IdxType>(len, TPB);
-  writeOnlyUnaryOpKernel<OutType, Lambda, IdxType>
-    <<<nblks, TPB, 0, stream>>>(out, len, op);
+  writeOnlyUnaryOpKernel<OutType, Lambda, IdxType><<<nblks, TPB, 0, stream>>>(out, len, op);
   CUDA_CHECK(cudaGetLastError());
 }
 
diff --git a/cpp/include/raft/matrix/math.cuh b/cpp/include/raft/matrix/math.cuh
index 0a72117140..579491b5cc 100644
--- a/cpp/include/raft/matrix/math.cuh
+++ b/cpp/include/raft/matrix/math.cuh
@@ -41,14 +41,18 @@ namespace matrix {
  * @param stream cuda stream
  */
 template <typename math_t>
-void power(math_t *in, math_t *out, math_t scalar, int len,
-           cudaStream_t stream) {
-  auto d_src = in;
+void power(math_t* in, math_t* out, math_t scalar, int len, cudaStream_t stream)
+{
+  auto d_src  = in;
   auto d_dest = out;
 
   raft::linalg::binaryOp(
-    d_dest, d_src, d_src, len,
-    [=] __device__(math_t a, math_t b) { return scalar * a * b; }, stream);
+    d_dest,
+    d_src,
+    d_src,
+    len,
+    [=] __device__(math_t a, math_t b) { return scalar * a * b; },
+    stream);
 }
 
 /**
@@ -59,7 +63,8 @@ void power(math_t *in, math_t *out, math_t scalar, int len,
  * @param stream cuda stream
  */
 template <typename math_t>
-void power(math_t *inout, math_t scalar, int len, cudaStream_t stream) {
+void power(math_t* inout, math_t scalar, int len, cudaStream_t stream)
+{
   power(inout, inout, scalar, len, stream);
 }
 
@@ -70,7 +75,8 @@ void power(math_t *inout, math_t scalar, int len, cudaStream_t stream) {
  * @param stream cuda stream
  */
 template <typename math_t>
-void power(math_t *inout, int len, cudaStream_t stream) {
+void power(math_t* inout, int len, cudaStream_t stream)
+{
   math_t scalar = 1.0;
   power(inout, scalar, len, stream);
 }
@@ -84,7 +90,8 @@ void power(math_t *inout, int len, cudaStream_t stream) {
  * @{
  */
 template <typename math_t>
-void power(math_t *in, math_t *out, int len, cudaStream_t stream) {
+void power(math_t* in, math_t* out, int len, cudaStream_t stream)
+{
   math_t scalar = 1.0;
   power(in, out, scalar, len, stream);
 }
@@ -101,13 +108,20 @@ void power(math_t *in, math_t *out, int len, cudaStream_t stream) {
  * @param set_neg_zero whether to set negative numbers to zero
  */
 template <typename math_t, typename IdxType = int>
-void seqRoot(math_t *in, math_t *out, math_t scalar, IdxType len,
-             cudaStream_t stream, bool set_neg_zero = false) {
-  auto d_src = in;
+void seqRoot(math_t* in,
+             math_t* out,
+             math_t scalar,
+             IdxType len,
+             cudaStream_t stream,
+             bool set_neg_zero = false)
+{
+  auto d_src  = in;
   auto d_dest = out;
 
   raft::linalg::unaryOp(
-    d_dest, d_src, len,
+    d_dest,
+    d_src,
+    len,
     [=] __device__(math_t a) {
       if (set_neg_zero) {
         if (a < math_t(0)) {
@@ -133,8 +147,9 @@ void seqRoot(math_t *in, math_t *out, math_t scalar, IdxType len,
  * @param set_neg_zero whether to set negative numbers to zero
  */
 template <typename math_t, typename IdxType = int>
-void seqRoot(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream,
-             bool set_neg_zero = false) {
+void seqRoot(
+  math_t* inout, math_t scalar, IdxType len, cudaStream_t stream, bool set_neg_zero = false)
+{
   seqRoot(inout, inout, scalar, len, stream, set_neg_zero);
 }
 
@@ -148,22 +163,27 @@ void seqRoot(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream,
  * @param stream cuda stream
  */
 template <typename math_t, typename IdxType = int>
-void seqRoot(math_t *in, math_t *out, IdxType len, cudaStream_t stream) {
+void seqRoot(math_t* in, math_t* out, IdxType len, cudaStream_t stream)
+{
   math_t scalar = 1.0;
   seqRoot(in, out, scalar, len, stream);
 }
 
 template <typename math_t, typename IdxType = int>
-void seqRoot(math_t *inout, IdxType len, cudaStream_t stream) {
+void seqRoot(math_t* inout, IdxType len, cudaStream_t stream)
+{
   math_t scalar = 1.0;
   seqRoot(inout, inout, scalar, len, stream);
 }
 
 template <typename math_t, typename IdxType = int>
-void setSmallValuesZero(math_t *out, const math_t *in, IdxType len,
-                        cudaStream_t stream, math_t thres = 1e-15) {
+void setSmallValuesZero(
+  math_t* out, const math_t* in, IdxType len, cudaStream_t stream, math_t thres = 1e-15)
+{
   raft::linalg::unaryOp(
-    out, in, len,
+    out,
+    in,
+    len,
     [=] __device__(math_t a) {
       if (a <= thres && -a <= thres) {
         return math_t(0);
@@ -184,8 +204,8 @@ void setSmallValuesZero(math_t *out, const math_t *in, IdxType len,
  * @param thres: threshold
  */
 template <typename math_t, typename IdxType = int>
-void setSmallValuesZero(math_t *inout, IdxType len, cudaStream_t stream,
-                        math_t thres = 1e-15) {
+void setSmallValuesZero(math_t* inout, IdxType len, cudaStream_t stream, math_t thres = 1e-15)
+{
   setSmallValuesZero(inout, inout, len, stream, thres);
 }
 
@@ -203,14 +223,21 @@ void setSmallValuesZero(math_t *inout, IdxType len, cudaStream_t stream,
  * @{
  */
 template <typename math_t, typename IdxType = int>
-void reciprocal(math_t *in, math_t *out, math_t scalar, int len,
-                cudaStream_t stream, bool setzero = false,
-                math_t thres = 1e-15) {
-  auto d_src = in;
+void reciprocal(math_t* in,
+                math_t* out,
+                math_t scalar,
+                int len,
+                cudaStream_t stream,
+                bool setzero = false,
+                math_t thres = 1e-15)
+{
+  auto d_src  = in;
   auto d_dest = out;
 
   raft::linalg::unaryOp(
-    d_dest, d_src, len,
+    d_dest,
+    d_src,
+    len,
     [=] __device__(math_t a) {
       if (setzero) {
         if (abs(a) <= thres) {
@@ -237,8 +264,13 @@ void reciprocal(math_t *in, math_t *out, math_t scalar, int len,
  * @param thres: Threshold to avoid dividing by zero (|value| < thres -> result = 0)
  */
 template <typename math_t, typename IdxType = int>
-void reciprocal(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream,
-                bool setzero = false, math_t thres = 1e-15) {
+void reciprocal(math_t* inout,
+                math_t scalar,
+                IdxType len,
+                cudaStream_t stream,
+                bool setzero = false,
+                math_t thres = 1e-15)
+{
   reciprocal(inout, inout, scalar, len, stream, setzero, thres);
 }
 
@@ -251,7 +283,8 @@ void reciprocal(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream,
  * @param stream cuda stream
  */
 template <typename math_t, typename IdxType = int>
-void reciprocal(math_t *inout, IdxType len, cudaStream_t stream) {
+void reciprocal(math_t* inout, IdxType len, cudaStream_t stream)
+{
   math_t scalar = 1.0;
   reciprocal(inout, scalar, len, stream);
 }
@@ -266,14 +299,15 @@ void reciprocal(math_t *inout, IdxType len, cudaStream_t stream) {
  * @param stream cuda stream
  */
 template <typename math_t, typename IdxType = int>
-void reciprocal(math_t *in, math_t *out, IdxType len, cudaStream_t stream) {
+void reciprocal(math_t* in, math_t* out, IdxType len, cudaStream_t stream)
+{
   math_t scalar = 1.0;
   reciprocal(in, out, scalar, len, stream);
 }
 
 template <typename math_t>
-void setValue(math_t *out, const math_t *in, math_t scalar, int len,
-              cudaStream_t stream = 0) {
+void setValue(math_t* out, const math_t* in, math_t scalar, int len, cudaStream_t stream = 0)
+{
   raft::linalg::unaryOp(
     out, in, len, [scalar] __device__(math_t in) { return scalar; }, stream);
 }
@@ -289,46 +323,44 @@ void setValue(math_t *out, const math_t *in, math_t scalar, int len,
  * @param stream cuda stream
  */
 template <typename math_t, typename IdxType = int>
-void ratio(const raft::handle_t &handle, math_t *src, math_t *dest, IdxType len,
-           cudaStream_t stream) {
-  auto d_src = src;
+void ratio(
+  const raft::handle_t& handle, math_t* src, math_t* dest, IdxType len, cudaStream_t stream)
+{
+  auto d_src  = src;
   auto d_dest = dest;
 
-  std::shared_ptr<raft::mr::device::allocator> allocator =
-    handle.get_device_allocator();
+  std::shared_ptr<raft::mr::device::allocator> allocator = handle.get_device_allocator();
 
   raft::mr::device::buffer<math_t> d_sum(allocator, stream, 1);
-  auto *d_sum_ptr = d_sum.data();
-  auto no_op = [] __device__(math_t in) { return in; };
+  auto* d_sum_ptr = d_sum.data();
+  auto no_op      = [] __device__(math_t in) { return in; };
   raft::linalg::mapThenSumReduce(d_sum_ptr, len, no_op, stream, src);
   raft::linalg::unaryOp(
-    d_dest, d_src, len, [=] __device__(math_t a) { return a / (*d_sum_ptr); },
-    stream);
+    d_dest, d_src, len, [=] __device__(math_t a) { return a / (*d_sum_ptr); }, stream);
 }
 
 /** @} */
 
 // Computes the argmax(d_in) column-wise in a DxN matrix
 template <typename T, int TPB>
-__global__ void argmaxKernel(const T *d_in, int D, int N, T *argmax) {
+__global__ void argmaxKernel(const T* d_in, int D, int N, T* argmax)
+{
   typedef cub::BlockReduce<cub::KeyValuePair<int, T>, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
 
   // compute maxIndex=argMax  index for column
-  using KVP = cub::KeyValuePair<int, T>;
+  using KVP    = cub::KeyValuePair<int, T>;
   int rowStart = blockIdx.x * D;
   KVP thread_data(-1, -raft::myInf<T>());
 
   for (int i = threadIdx.x; i < D; i += TPB) {
-    int idx = rowStart + i;
+    int idx     = rowStart + i;
     thread_data = cub::ArgMax()(thread_data, KVP(i, d_in[idx]));
   }
 
   auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, cub::ArgMax());
 
-  if (threadIdx.x == 0) {
-    argmax[blockIdx.x] = maxKV.key;
-  }
+  if (threadIdx.x == 0) { argmax[blockIdx.x] = maxKV.key; }
 }
 
 /**
@@ -340,8 +372,8 @@ __global__ void argmaxKernel(const T *d_in, int D, int N, T *argmax) {
  * @param stream: cuda stream
  */
 template <typename math_t>
-void argmax(const math_t *in, int n_rows, int n_cols, math_t *out,
-            cudaStream_t stream) {
+void argmax(const math_t* in, int n_rows, int n_cols, math_t* out, cudaStream_t stream)
+{
   int D = n_rows;
   int N = n_cols;
   if (D <= 32) {
@@ -360,30 +392,29 @@ void argmax(const math_t *in, int n_rows, int n_cols, math_t *out,
 // Computes the argmax(abs(d_in)) column-wise in a DxN matrix followed by
 // flipping the sign if the |max| value for each column is negative.
 template <typename T, int TPB>
-__global__ void signFlipKernel(T *d_in, int D, int N) {
+__global__ void signFlipKernel(T* d_in, int D, int N)
+{
   typedef cub::BlockReduce<cub::KeyValuePair<int, T>, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
 
   // compute maxIndex=argMax (with abs()) index for column
-  using KVP = cub::KeyValuePair<int, T>;
+  using KVP    = cub::KeyValuePair<int, T>;
   int rowStart = blockIdx.x * D;
   KVP thread_data(0, 0);
   for (int i = threadIdx.x; i < D; i += TPB) {
-    int idx = rowStart + i;
+    int idx     = rowStart + i;
     thread_data = cub::ArgMax()(thread_data, KVP(idx, abs(d_in[idx])));
   }
   auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, cub::ArgMax());
 
   // flip column sign if d_in[maxIndex] < 0
   __shared__ bool need_sign_flip;
-  if (threadIdx.x == 0) {
-    need_sign_flip = d_in[maxKV.key] < T(0);
-  }
+  if (threadIdx.x == 0) { need_sign_flip = d_in[maxKV.key] < T(0); }
   __syncthreads();
 
   if (need_sign_flip) {
     for (int i = threadIdx.x; i < D; i += TPB) {
-      int idx = rowStart + i;
+      int idx   = rowStart + i;
       d_in[idx] = -d_in[idx];
     }
   }
@@ -398,9 +429,10 @@ __global__ void signFlipKernel(T *d_in, int D, int N) {
  * @param stream cuda stream
  */
 template <typename math_t>
-void signFlip(math_t *inout, int n_rows, int n_cols, cudaStream_t stream) {
-  int D = n_rows;
-  int N = n_cols;
+void signFlip(math_t* inout, int n_rows, int n_cols, cudaStream_t stream)
+{
+  int D     = n_rows;
+  int N     = n_cols;
   auto data = inout;
   if (D <= 32) {
     signFlipKernel<math_t, 32><<<N, 32, 0, stream>>>(data, D, N);
@@ -415,20 +447,43 @@ void signFlip(math_t *inout, int n_rows, int n_cols, cudaStream_t stream) {
 }
 
 template <typename Type, typename IdxType = int, int TPB = 256>
-void matrixVectorBinaryMult(Type *data, const Type *vec, IdxType n_row,
-                            IdxType n_col, bool rowMajor, bool bcastAlongRows,
-                            cudaStream_t stream) {
+void matrixVectorBinaryMult(Type* data,
+                            const Type* vec,
+                            IdxType n_row,
+                            IdxType n_col,
+                            bool rowMajor,
+                            bool bcastAlongRows,
+                            cudaStream_t stream)
+{
   raft::linalg::matrixVectorOp(
-    data, data, vec, n_col, n_row, rowMajor, bcastAlongRows,
-    [] __device__(Type a, Type b) { return a * b; }, stream);
+    data,
+    data,
+    vec,
+    n_col,
+    n_row,
+    rowMajor,
+    bcastAlongRows,
+    [] __device__(Type a, Type b) { return a * b; },
+    stream);
 }
 
 template <typename Type, typename IdxType = int, int TPB = 256>
-void matrixVectorBinaryMultSkipZero(Type *data, const Type *vec, IdxType n_row,
-                                    IdxType n_col, bool rowMajor,
-                                    bool bcastAlongRows, cudaStream_t stream) {
+void matrixVectorBinaryMultSkipZero(Type* data,
+                                    const Type* vec,
+                                    IdxType n_row,
+                                    IdxType n_col,
+                                    bool rowMajor,
+                                    bool bcastAlongRows,
+                                    cudaStream_t stream)
+{
   raft::linalg::matrixVectorOp(
-    data, data, vec, n_col, n_row, rowMajor, bcastAlongRows,
+    data,
+    data,
+    vec,
+    n_col,
+    n_row,
+    rowMajor,
+    bcastAlongRows,
     [] __device__(Type a, Type b) {
       if (b == Type(0))
         return a;
@@ -439,22 +494,45 @@ void matrixVectorBinaryMultSkipZero(Type *data, const Type *vec, IdxType n_row,
 }
 
 template <typename Type, typename IdxType = int, int TPB = 256>
-void matrixVectorBinaryDiv(Type *data, const Type *vec, IdxType n_row,
-                           IdxType n_col, bool rowMajor, bool bcastAlongRows,
-                           cudaStream_t stream) {
+void matrixVectorBinaryDiv(Type* data,
+                           const Type* vec,
+                           IdxType n_row,
+                           IdxType n_col,
+                           bool rowMajor,
+                           bool bcastAlongRows,
+                           cudaStream_t stream)
+{
   raft::linalg::matrixVectorOp(
-    data, data, vec, n_col, n_row, rowMajor, bcastAlongRows,
-    [] __device__(Type a, Type b) { return a / b; }, stream);
+    data,
+    data,
+    vec,
+    n_col,
+    n_row,
+    rowMajor,
+    bcastAlongRows,
+    [] __device__(Type a, Type b) { return a / b; },
+    stream);
 }
 
 template <typename Type, typename IdxType = int, int TPB = 256>
-void matrixVectorBinaryDivSkipZero(Type *data, const Type *vec, IdxType n_row,
-                                   IdxType n_col, bool rowMajor,
-                                   bool bcastAlongRows, cudaStream_t stream,
-                                   bool return_zero = false) {
+void matrixVectorBinaryDivSkipZero(Type* data,
+                                   const Type* vec,
+                                   IdxType n_row,
+                                   IdxType n_col,
+                                   bool rowMajor,
+                                   bool bcastAlongRows,
+                                   cudaStream_t stream,
+                                   bool return_zero = false)
+{
   if (return_zero) {
     raft::linalg::matrixVectorOp(
-      data, data, vec, n_col, n_row, rowMajor, bcastAlongRows,
+      data,
+      data,
+      vec,
+      n_col,
+      n_row,
+      rowMajor,
+      bcastAlongRows,
       [] __device__(Type a, Type b) {
         if (raft::myAbs(b) < Type(1e-10))
           return Type(0);
@@ -464,7 +542,13 @@ void matrixVectorBinaryDivSkipZero(Type *data, const Type *vec, IdxType n_row,
       stream);
   } else {
     raft::linalg::matrixVectorOp(
-      data, data, vec, n_col, n_row, rowMajor, bcastAlongRows,
+      data,
+      data,
+      vec,
+      n_col,
+      n_row,
+      rowMajor,
+      bcastAlongRows,
       [] __device__(Type a, Type b) {
         if (raft::myAbs(b) < Type(1e-10))
           return a;
@@ -476,21 +560,45 @@ void matrixVectorBinaryDivSkipZero(Type *data, const Type *vec, IdxType n_row,
 }
 
 template <typename Type, typename IdxType = int, int TPB = 256>
-void matrixVectorBinaryAdd(Type *data, const Type *vec, IdxType n_row,
-                           IdxType n_col, bool rowMajor, bool bcastAlongRows,
-                           cudaStream_t stream) {
+void matrixVectorBinaryAdd(Type* data,
+                           const Type* vec,
+                           IdxType n_row,
+                           IdxType n_col,
+                           bool rowMajor,
+                           bool bcastAlongRows,
+                           cudaStream_t stream)
+{
   raft::linalg::matrixVectorOp(
-    data, data, vec, n_col, n_row, rowMajor, bcastAlongRows,
-    [] __device__(Type a, Type b) { return a + b; }, stream);
+    data,
+    data,
+    vec,
+    n_col,
+    n_row,
+    rowMajor,
+    bcastAlongRows,
+    [] __device__(Type a, Type b) { return a + b; },
+    stream);
 }
 
 template <typename Type, typename IdxType = int, int TPB = 256>
-void matrixVectorBinarySub(Type *data, const Type *vec, IdxType n_row,
-                           IdxType n_col, bool rowMajor, bool bcastAlongRows,
-                           cudaStream_t stream) {
+void matrixVectorBinarySub(Type* data,
+                           const Type* vec,
+                           IdxType n_row,
+                           IdxType n_col,
+                           bool rowMajor,
+                           bool bcastAlongRows,
+                           cudaStream_t stream)
+{
   raft::linalg::matrixVectorOp(
-    data, data, vec, n_col, n_row, rowMajor, bcastAlongRows,
-    [] __device__(Type a, Type b) { return a - b; }, stream);
+    data,
+    data,
+    vec,
+    n_col,
+    n_row,
+    rowMajor,
+    bcastAlongRows,
+    [] __device__(Type a, Type b) { return a - b; },
+    stream);
 }
 
 };  // end namespace matrix
diff --git a/cpp/include/raft/matrix/matrix.cuh b/cpp/include/raft/matrix/matrix.cuh
index 5f5755e24e..71a2888545 100644
--- a/cpp/include/raft/matrix/matrix.cuh
+++ b/cpp/include/raft/matrix/matrix.cuh
@@ -49,29 +49,33 @@ using namespace std;
  * @param rowMajor whether the matrix has row major layout
  */
 template <typename m_t, typename idx_array_t = int, typename idx_t = size_t>
-void copyRows(const m_t *in, idx_t n_rows, idx_t n_cols, m_t *out,
-              const idx_array_t *indices, idx_t n_rows_indices,
-              cudaStream_t stream, bool rowMajor = false) {
+void copyRows(const m_t* in,
+              idx_t n_rows,
+              idx_t n_cols,
+              m_t* out,
+              const idx_array_t* indices,
+              idx_t n_rows_indices,
+              cudaStream_t stream,
+              bool rowMajor = false)
+{
   if (rowMajor) {
     const idx_t TPB = 256;
-    cache::
-      get_vecs<<<raft::ceildiv(n_rows_indices * n_cols, TPB), TPB, 0, stream>>>(
-        in, n_cols, indices, n_rows_indices, out);
+    cache::get_vecs<<<raft::ceildiv(n_rows_indices * n_cols, TPB), TPB, 0, stream>>>(
+      in, n_cols, indices, n_rows_indices, out);
     CUDA_CHECK(cudaPeekAtLastError());
     return;
   }
 
-  idx_t size = n_rows_indices * n_cols;
+  idx_t size    = n_rows_indices * n_cols;
   auto counting = thrust::make_counting_iterator<idx_t>(0);
 
-  thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size,
-                   [=] __device__(idx_t idx) {
-                     idx_t row = idx % n_rows_indices;
-                     idx_t col = idx / n_rows_indices;
+  thrust::for_each(
+    thrust::cuda::par.on(stream), counting, counting + size, [=] __device__(idx_t idx) {
+      idx_t row = idx % n_rows_indices;
+      idx_t col = idx / n_rows_indices;
 
-                     out[col * n_rows_indices + row] =
-                       in[col * n_rows + indices[row]];
-                   });
+      out[col * n_rows_indices + row] = in[col * n_rows + indices[row]];
+    });
 }
 
 /**
@@ -83,8 +87,8 @@ void copyRows(const m_t *in, idx_t n_rows, idx_t n_cols, m_t *out,
  * @param stream: cuda stream
  */
 template <typename m_t, typename idx_t = int>
-void copy(const m_t *in, m_t *out, idx_t n_rows, idx_t n_cols,
-          cudaStream_t stream) {
+void copy(const m_t* in, m_t* out, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
+{
   raft::copy_async(out, in, n_rows * n_cols, stream);
 }
 
@@ -99,21 +103,22 @@ void copy(const m_t *in, m_t *out, idx_t n_rows, idx_t n_cols,
  * @param stream: cuda stream
  */
 template <typename m_t, typename idx_t = int>
-void truncZeroOrigin(m_t *in, idx_t in_n_rows, m_t *out, idx_t out_n_rows,
-                     idx_t out_n_cols, cudaStream_t stream) {
-  auto m = out_n_rows;
-  auto k = in_n_rows;
-  idx_t size = out_n_rows * out_n_cols;
-  auto d_q = in;
+void truncZeroOrigin(
+  m_t* in, idx_t in_n_rows, m_t* out, idx_t out_n_rows, idx_t out_n_cols, cudaStream_t stream)
+{
+  auto m         = out_n_rows;
+  auto k         = in_n_rows;
+  idx_t size     = out_n_rows * out_n_cols;
+  auto d_q       = in;
   auto d_q_trunc = out;
-  auto counting = thrust::make_counting_iterator<idx_t>(0);
+  auto counting  = thrust::make_counting_iterator<idx_t>(0);
 
-  thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size,
-                   [=] __device__(idx_t idx) {
-                     idx_t row = idx % m;
-                     idx_t col = idx / m;
-                     d_q_trunc[col * m + row] = d_q[col * k + row];
-                   });
+  thrust::for_each(
+    thrust::cuda::par.on(stream), counting, counting + size, [=] __device__(idx_t idx) {
+      idx_t row                = idx % m;
+      idx_t col                = idx / m;
+      d_q_trunc[col * m + row] = d_q[col * k + row];
+    });
 }
 
 /**
@@ -125,24 +130,25 @@ void truncZeroOrigin(m_t *in, idx_t in_n_rows, m_t *out, idx_t out_n_rows,
  * @param stream: cuda stream
  */
 template <typename m_t, typename idx_t = int>
-void colReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) {
-  auto n = n_cols;
-  auto m = n_rows;
-  idx_t size = n_rows * n_cols;
-  auto d_q = inout;
+void colReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
+{
+  auto n            = n_cols;
+  auto m            = n_rows;
+  idx_t size        = n_rows * n_cols;
+  auto d_q          = inout;
   auto d_q_reversed = inout;
-  auto counting = thrust::make_counting_iterator<idx_t>(0);
+  auto counting     = thrust::make_counting_iterator<idx_t>(0);
 
-  thrust::for_each(thrust::cuda::par.on(stream), counting,
-                   counting + (size / 2), [=] __device__(idx_t idx) {
-                     idx_t dest_row = idx % m;
-                     idx_t dest_col = idx / m;
-                     idx_t src_row = dest_row;
-                     idx_t src_col = (n - dest_col) - 1;
-                     m_t temp = (m_t)d_q_reversed[idx];
-                     d_q_reversed[idx] = d_q[src_col * m + src_row];
-                     d_q[src_col * m + src_row] = temp;
-                   });
+  thrust::for_each(
+    thrust::cuda::par.on(stream), counting, counting + (size / 2), [=] __device__(idx_t idx) {
+      idx_t dest_row             = idx % m;
+      idx_t dest_col             = idx / m;
+      idx_t src_row              = dest_row;
+      idx_t src_col              = (n - dest_col) - 1;
+      m_t temp                   = (m_t)d_q_reversed[idx];
+      d_q_reversed[idx]          = d_q[src_col * m + src_row];
+      d_q[src_col * m + src_row] = temp;
+    });
 }
 
 /**
@@ -154,25 +160,26 @@ void colReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) {
  * @param stream: cuda stream
  */
 template <typename m_t, typename idx_t = int>
-void rowReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) {
-  auto m = n_rows;
-  idx_t size = n_rows * n_cols;
-  auto d_q = inout;
+void rowReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
+{
+  auto m            = n_rows;
+  idx_t size        = n_rows * n_cols;
+  auto d_q          = inout;
   auto d_q_reversed = inout;
-  auto counting = thrust::make_counting_iterator<idx_t>(0);
+  auto counting     = thrust::make_counting_iterator<idx_t>(0);
 
-  thrust::for_each(thrust::cuda::par.on(stream), counting,
-                   counting + (size / 2), [=] __device__(idx_t idx) {
-                     idx_t dest_row = idx % m;
-                     idx_t dest_col = idx / m;
-                     idx_t src_row = (m - dest_row) - 1;
-                     ;
-                     idx_t src_col = dest_col;
+  thrust::for_each(
+    thrust::cuda::par.on(stream), counting, counting + (size / 2), [=] __device__(idx_t idx) {
+      idx_t dest_row = idx % m;
+      idx_t dest_col = idx / m;
+      idx_t src_row  = (m - dest_row) - 1;
+      ;
+      idx_t src_col = dest_col;
 
-                     m_t temp = (m_t)d_q_reversed[idx];
-                     d_q_reversed[idx] = d_q[src_col * m + src_row];
-                     d_q[src_col * m + src_row] = temp;
-                   });
+      m_t temp                   = (m_t)d_q_reversed[idx];
+      d_q_reversed[idx]          = d_q[src_col * m + src_row];
+      d_q[src_col * m + src_row] = temp;
+    });
 }
 
 /**
@@ -184,16 +191,16 @@ void rowReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) {
  * @param v_separator: vertical separator character
  */
 template <typename m_t, typename idx_t = int>
-void print(const m_t *in, idx_t n_rows, idx_t n_cols, char h_separator = ' ',
-           char v_separator = '\n') {
+void print(
+  const m_t* in, idx_t n_rows, idx_t n_cols, char h_separator = ' ', char v_separator = '\n')
+{
   std::vector<m_t> h_matrix = std::vector<m_t>(n_cols * n_rows);
-  CUDA_CHECK(cudaMemcpy(h_matrix.data(), in, n_cols * n_rows * sizeof(m_t),
-                        cudaMemcpyDeviceToHost));
+  CUDA_CHECK(
+    cudaMemcpy(h_matrix.data(), in, n_cols * n_rows * sizeof(m_t), cudaMemcpyDeviceToHost));
 
   for (idx_t i = 0; i < n_rows; i++) {
     for (idx_t j = 0; j < n_cols; j++) {
-      printf("%1.4f%c", h_matrix[j * n_rows + i],
-             j < n_cols - 1 ? h_separator : v_separator);
+      printf("%1.4f%c", h_matrix[j * n_rows + i], j < n_cols - 1 ? h_separator : v_separator);
     }
   }
 }
@@ -205,7 +212,8 @@ void print(const m_t *in, idx_t n_rows, idx_t n_cols, char h_separator = ' ',
  * @param n_cols: number of columns of input matrix
  */
 template <typename m_t, typename idx_t = int>
-void printHost(const m_t *in, idx_t n_rows, idx_t n_cols) {
+void printHost(const m_t* in, idx_t n_rows, idx_t n_cols)
+{
   for (idx_t i = 0; i < n_rows; i++) {
     for (idx_t j = 0; j < n_cols; j++) {
       printf("%1.4f ", in[j * n_rows + i]);
@@ -226,8 +234,9 @@ void printHost(const m_t *in, idx_t n_rows, idx_t n_cols) {
  * (1-based)
  */
 template <typename m_t, typename idx_t = int>
-__global__ void slice(m_t *src_d, idx_t m, idx_t n, m_t *dst_d, idx_t x1,
-                      idx_t y1, idx_t x2, idx_t y2) {
+__global__ void slice(
+  m_t* src_d, idx_t m, idx_t n, m_t* dst_d, idx_t x1, idx_t y1, idx_t x2, idx_t y2)
+{
   idx_t idx = threadIdx.x + blockDim.x * blockIdx.x;
   idx_t dm = x2 - x1, dn = y2 - y1;
   if (idx < dm * dn) {
@@ -251,8 +260,16 @@ __global__ void slice(m_t *src_d, idx_t m, idx_t n, m_t *dst_d, idx_t x1,
  * @param stream: cuda stream
  */
 template <typename m_t, typename idx_t = int>
-void sliceMatrix(m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, idx_t x1,
-                 idx_t y1, idx_t x2, idx_t y2, cudaStream_t stream) {
+void sliceMatrix(m_t* in,
+                 idx_t n_rows,
+                 idx_t n_cols,
+                 m_t* out,
+                 idx_t x1,
+                 idx_t y1,
+                 idx_t x2,
+                 idx_t y2,
+                 cudaStream_t stream)
+{
   // Slicing
   dim3 block(64);
   dim3 grid(((x2 - x1) * (y2 - y1) + block.x - 1) / block.x);
@@ -268,15 +285,13 @@ void sliceMatrix(m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, idx_t x1,
  * @param k: min(n_rows, n_cols)
  */
 template <typename m_t, typename idx_t = int>
-__global__ void getUpperTriangular(m_t *src, m_t *dst, idx_t n_rows,
-                                   idx_t n_cols, idx_t k) {
+__global__ void getUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, idx_t k)
+{
   idx_t idx = threadIdx.x + blockDim.x * blockIdx.x;
   idx_t m = n_rows, n = n_cols;
   if (idx < m * n) {
     idx_t i = idx % m, j = idx / m;
-    if (i < k && j < k && j >= i) {
-      dst[i + j * k] = src[idx];
-    }
+    if (i < k && j < k && j >= i) { dst[i + j * k] = src[idx]; }
   }
 }
 
@@ -289,8 +304,8 @@ __global__ void getUpperTriangular(m_t *src, m_t *dst, idx_t n_rows,
  * @param stream: cuda stream
  */
 template <typename m_t, typename idx_t = int>
-void copyUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, idx_t n_cols,
-                         cudaStream_t stream) {
+void copyUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
+{
   idx_t m = n_rows, n = n_cols;
   idx_t k = min(m, n);
   dim3 block(64);
@@ -307,13 +322,11 @@ void copyUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, idx_t n_cols,
  * @param k: dimensionality
  */
 template <typename m_t, typename idx_t = int>
-__global__ void copyVectorToMatrixDiagonal(m_t *vec, m_t *matrix, idx_t m,
-                                           idx_t n, idx_t k) {
+__global__ void copyVectorToMatrixDiagonal(m_t* vec, m_t* matrix, idx_t m, idx_t n, idx_t k)
+{
   idx_t idx = threadIdx.x + blockDim.x * blockIdx.x;
 
-  if (idx < k) {
-    matrix[idx + idx * m] = vec[idx];
-  }
+  if (idx < k) { matrix[idx + idx * m] = vec[idx]; }
 }
 
 /**
@@ -325,13 +338,13 @@ __global__ void copyVectorToMatrixDiagonal(m_t *vec, m_t *matrix, idx_t m,
  * @param stream: cuda stream
  */
 template <typename m_t, typename idx_t = int>
-void initializeDiagonalMatrix(m_t *vec, m_t *matrix, idx_t n_rows, idx_t n_cols,
-                              cudaStream_t stream) {
+void initializeDiagonalMatrix(
+  m_t* vec, m_t* matrix, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
+{
   idx_t k = min(n_rows, n_cols);
   dim3 block(64);
   dim3 grid((k + block.x - 1) / block.x);
-  copyVectorToMatrixDiagonal<<<grid, block, 0, stream>>>(vec, matrix, n_rows,
-                                                         n_cols, k);
+  copyVectorToMatrixDiagonal<<<grid, block, 0, stream>>>(vec, matrix, n_rows, n_cols, k);
 }
 
 /**
@@ -341,11 +354,10 @@ void initializeDiagonalMatrix(m_t *vec, m_t *matrix, idx_t n_rows, idx_t n_cols,
  * @param len: size of one side of the matrix
  */
 template <typename m_t, typename idx_t = int>
-__global__ void matrixDiagonalInverse(m_t *in, idx_t len) {
+__global__ void matrixDiagonalInverse(m_t* in, idx_t len)
+{
   idx_t idx = threadIdx.x + blockDim.x * blockIdx.x;
-  if (idx < len) {
-    in[idx + idx * len] = 1.0 / in[idx + idx * len];
-  }
+  if (idx < len) { in[idx + idx * len] = 1.0 / in[idx + idx * len]; }
 }
 
 /**
@@ -355,7 +367,8 @@ __global__ void matrixDiagonalInverse(m_t *in, idx_t len) {
  * @param stream: cuda stream
  */
 template <typename m_t, typename idx_t = int>
-void getDiagonalInverseMatrix(m_t *in, idx_t len, cudaStream_t stream) {
+void getDiagonalInverseMatrix(m_t* in, idx_t len, cudaStream_t stream)
+{
   dim3 block(64);
   dim3 grid((len + block.x - 1) / block.x);
   matrixDiagonalInverse<m_t><<<grid, block, 0, stream>>>(in, len);
@@ -369,12 +382,11 @@ void getDiagonalInverseMatrix(m_t *in, idx_t len, cudaStream_t stream) {
  * @param stream: cuda stream
  */
 template <typename m_t, typename idx_t = int>
-m_t getL2Norm(const raft::handle_t &handle, m_t *in, idx_t size,
-              cudaStream_t stream) {
+m_t getL2Norm(const raft::handle_t& handle, m_t* in, idx_t size, cudaStream_t stream)
+{
   cublasHandle_t cublasH = handle.get_cublas_handle();
-  m_t normval = 0;
-  CUBLAS_CHECK(
-    raft::linalg::cublasnrm2(cublasH, size, in, 1, &normval, stream));
+  m_t normval            = 0;
+  CUBLAS_CHECK(raft::linalg::cublasnrm2(cublasH, size, in, 1, &normval, stream));
   return normval;
 }
 
diff --git a/cpp/include/raft/mr/buffer_base.hpp b/cpp/include/raft/mr/buffer_base.hpp
index 29e0d7cfcd..18c8be5f45 100644
--- a/cpp/include/raft/mr/buffer_base.hpp
+++ b/cpp/include/raft/mr/buffer_base.hpp
@@ -35,11 +35,11 @@ namespace mr {
 template <typename T, typename AllocatorT>
 class buffer_base {
  public:
-  using size_type = std::size_t;
-  using value_type = T;
-  using iterator = value_type*;
-  using const_iterator = const value_type*;
-  using reference = T&;
+  using size_type       = std::size_t;
+  using value_type      = T;
+  using iterator        = value_type*;
+  using const_iterator  = const value_type*;
+  using reference       = T&;
   using const_reference = const T&;
 
   buffer_base() = delete;
@@ -55,16 +55,12 @@ class buffer_base {
    * @param[in] stream    cuda stream where this allocation operations are async
    * @param[in] n         size of the buffer (in number of elements)
    */
-  buffer_base(std::shared_ptr<AllocatorT> allocator, cudaStream_t stream,
-              size_type n = 0)
-    : data_(nullptr),
-      size_(n),
-      capacity_(n),
-      stream_(stream),
-      allocator_(std::move(allocator)) {
+  buffer_base(std::shared_ptr<AllocatorT> allocator, cudaStream_t stream, size_type n = 0)
+    : data_(nullptr), size_(n), capacity_(n), stream_(stream), allocator_(std::move(allocator))
+  {
     if (capacity_ > 0) {
-      data_ = static_cast<value_type*>(
-        allocator_->allocate(capacity_ * sizeof(value_type), stream_));
+      data_ =
+        static_cast<value_type*>(allocator_->allocate(capacity_ * sizeof(value_type), stream_));
       CUDA_CHECK(cudaStreamSynchronize(stream_));
     }
   }
@@ -98,23 +94,23 @@ class buffer_base {
    * @param[in] stream       cuda stream where allocation operations are queued
    * @{
    */
-  void reserve(size_type new_capacity) {
+  void reserve(size_type new_capacity)
+  {
     if (new_capacity > capacity_) {
-      auto* new_data = static_cast<value_type*>(
-        allocator_->allocate(new_capacity * sizeof(value_type), stream_));
-      if (size_ > 0) {
-        raft::copy(new_data, data_, size_, stream_);
-      }
+      auto* new_data =
+        static_cast<value_type*>(allocator_->allocate(new_capacity * sizeof(value_type), stream_));
+      if (size_ > 0) { raft::copy(new_data, data_, size_, stream_); }
       // Only deallocate if we have allocated a pointer
       if (nullptr != data_) {
         allocator_->deallocate(data_, capacity_ * sizeof(value_type), stream_);
       }
-      data_ = new_data;
+      data_     = new_data;
       capacity_ = new_capacity;
     }
   }
 
-  void reserve(size_type new_capacity, cudaStream_t stream) {
+  void reserve(size_type new_capacity, cudaStream_t stream)
+  {
     set_stream(stream);
     reserve(new_capacity);
   }
@@ -127,12 +123,14 @@ class buffer_base {
    * @param[in] stream   cuda stream where the work will be queued
    * @{
    */
-  void resize(const size_type new_size) {
+  void resize(const size_type new_size)
+  {
     reserve(new_size);
     size_ = new_size;
   }
 
-  void resize(const size_type new_size, cudaStream_t stream) {
+  void resize(const size_type new_size, cudaStream_t stream)
+  {
     set_stream(stream);
     resize(new_size);
   }
@@ -146,16 +144,18 @@ class buffer_base {
    * @param[in] stream   cuda stream where the work will be queued
    * @{
    */
-  void release() {
+  void release()
+  {
     if (nullptr != data_) {
       allocator_->deallocate(data_, capacity_ * sizeof(value_type), stream_);
     }
-    data_ = nullptr;
+    data_     = nullptr;
     capacity_ = 0;
-    size_ = 0;
+    size_     = 0;
   }
 
-  void release(cudaStream_t stream) {
+  void release(cudaStream_t stream)
+  {
     set_stream(stream);
     release();
   }
@@ -195,7 +195,8 @@ class buffer_base {
    * @param[in] stream new cuda stream to be set. If it is the same as the
    *                   current one, then this method will be a no-op.
    */
-  void set_stream(cudaStream_t stream) {
+  void set_stream(cudaStream_t stream)
+  {
     if (stream_ != stream) {
       cudaEvent_t event;
       CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
diff --git a/cpp/include/raft/mr/device/allocator.hpp b/cpp/include/raft/mr/device/allocator.hpp
index 889e1640db..e930b617e0 100644
--- a/cpp/include/raft/mr/device/allocator.hpp
+++ b/cpp/include/raft/mr/device/allocator.hpp
@@ -32,17 +32,20 @@ namespace device {
  * further to the ones listed in `Allocator`:
  * - Allocations may be always on the device that was specified on construction.
  */
-class allocator : public base_allocator {};
+class allocator : public base_allocator {
+};
 
 /** Default device allocator based on the one provided by RMM */
 class default_allocator : public allocator {
  public:
-  void* allocate(std::size_t n, cudaStream_t stream) override {
+  void* allocate(std::size_t n, cudaStream_t stream) override
+  {
     void* ptr = rmm::mr::get_current_device_resource()->allocate(n, stream);
     return ptr;
   }
 
-  void deallocate(void* p, std::size_t n, cudaStream_t stream) override {
+  void deallocate(void* p, std::size_t n, cudaStream_t stream) override
+  {
     rmm::mr::get_current_device_resource()->deallocate(p, n, stream);
   }
 };  // class default_allocator
diff --git a/cpp/include/raft/mr/device/buffer.hpp b/cpp/include/raft/mr/device/buffer.hpp
index 39b5674ce4..2b9d84368f 100644
--- a/cpp/include/raft/mr/device/buffer.hpp
+++ b/cpp/include/raft/mr/device/buffer.hpp
@@ -46,11 +46,11 @@ namespace device {
 template <typename T>
 class buffer : public buffer_base<T, allocator> {
  public:
-  using size_type = typename buffer_base<T, allocator>::size_type;
-  using value_type = typename buffer_base<T, allocator>::value_type;
-  using iterator = typename buffer_base<T, allocator>::iterator;
-  using const_iterator = typename buffer_base<T, allocator>::const_iterator;
-  using reference = typename buffer_base<T, allocator>::reference;
+  using size_type       = typename buffer_base<T, allocator>::size_type;
+  using value_type      = typename buffer_base<T, allocator>::value_type;
+  using iterator        = typename buffer_base<T, allocator>::iterator;
+  using const_iterator  = typename buffer_base<T, allocator>::const_iterator;
+  using reference       = typename buffer_base<T, allocator>::reference;
   using const_reference = typename buffer_base<T, allocator>::const_reference;
 
   buffer() = delete;
@@ -60,7 +60,9 @@ class buffer : public buffer_base<T, allocator> {
   buffer& operator=(const buffer& other) = delete;
 
   buffer(std::shared_ptr<allocator> alloc, cudaStream_t stream, size_type n = 0)
-    : buffer_base<T, device::allocator>(alloc, stream, n) {}
+    : buffer_base<T, device::allocator>(alloc, stream, n)
+  {
+  }
 };  // class buffer
 
 };  // namespace device
diff --git a/cpp/include/raft/mr/host/allocator.hpp b/cpp/include/raft/mr/host/allocator.hpp
index 8af266d4f0..62b6826211 100644
--- a/cpp/include/raft/mr/host/allocator.hpp
+++ b/cpp/include/raft/mr/host/allocator.hpp
@@ -34,20 +34,23 @@ namespace host {
  * further to the ones listed in `Allocator`:
  * - Allocations don't need to be zero copy accessible form a device.
  */
-class allocator : public base_allocator {};
+class allocator : public base_allocator {
+};
 
 /** Default cudaMallocHost/cudaFreeHost based host allocator */
 class default_allocator : public allocator {
  public:
-  void* allocate(std::size_t n, cudaStream_t stream) override {
+  void* allocate(std::size_t n, cudaStream_t stream) override
+  {
     void* ptr = nullptr;
     CUDA_CHECK(cudaMallocHost(&ptr, n));
     return ptr;
   }
 
-  void deallocate(void* p, std::size_t n, cudaStream_t stream) override {
-    //Must call _NO_THROW here since this is called frequently from object
-    //destructors which are "nothrow" by default
+  void deallocate(void* p, std::size_t n, cudaStream_t stream) override
+  {
+    // Must call _NO_THROW here since this is called frequently from object
+    // destructors which are "nothrow" by default
     CUDA_CHECK_NO_THROW(cudaFreeHost(p));
   }
 };  // class default_allocator
diff --git a/cpp/include/raft/mr/host/buffer.hpp b/cpp/include/raft/mr/host/buffer.hpp
index 3c505bf2ed..52475ad6ec 100644
--- a/cpp/include/raft/mr/host/buffer.hpp
+++ b/cpp/include/raft/mr/host/buffer.hpp
@@ -48,11 +48,11 @@ namespace host {
 template <typename T>
 class buffer : public buffer_base<T, allocator> {
  public:
-  using size_type = typename buffer_base<T, allocator>::size_type;
-  using value_type = typename buffer_base<T, allocator>::value_type;
-  using iterator = typename buffer_base<T, allocator>::iterator;
-  using const_iterator = typename buffer_base<T, allocator>::const_iterator;
-  using reference = typename buffer_base<T, allocator>::reference;
+  using size_type       = typename buffer_base<T, allocator>::size_type;
+  using value_type      = typename buffer_base<T, allocator>::value_type;
+  using iterator        = typename buffer_base<T, allocator>::iterator;
+  using const_iterator  = typename buffer_base<T, allocator>::const_iterator;
+  using reference       = typename buffer_base<T, allocator>::reference;
   using const_reference = typename buffer_base<T, allocator>::const_reference;
 
   buffer() = delete;
@@ -62,14 +62,15 @@ class buffer : public buffer_base<T, allocator> {
   buffer& operator=(const buffer& other) = delete;
 
   buffer(std::shared_ptr<allocator> alloc, const device::buffer<T>& other)
-    : buffer_base<T, allocator>(alloc, other.get_stream(), other.size()) {
-    if (other.size() > 0) {
-      raft::copy(data_, other.data(), other.size(), other.get_stream());
-    }
+    : buffer_base<T, allocator>(alloc, other.get_stream(), other.size())
+  {
+    if (other.size() > 0) { raft::copy(data_, other.data(), other.size(), other.get_stream()); }
   }
 
   buffer(std::shared_ptr<allocator> alloc, cudaStream_t stream, size_type n = 0)
-    : buffer_base<T, allocator>(alloc, stream, n) {}
+    : buffer_base<T, allocator>(alloc, stream, n)
+  {
+  }
 
   reference operator[](size_type pos) { return data_[pos]; }
 
diff --git a/cpp/include/raft/random/rng.cuh b/cpp/include/raft/random/rng.cuh
index 56710ea81f..5267770e8a 100644
--- a/cpp/include/raft/random/rng.cuh
+++ b/cpp/include/raft/random/rng.cuh
@@ -43,10 +43,9 @@ enum GeneratorType {
   GenKiss99
 };
 
-template <typename OutType, typename MathType, typename GenType,
-          typename LenType, typename Lambda>
-__global__ void randKernel(uint64_t seed, uint64_t offset, OutType *ptr,
-                           LenType len, Lambda randOp) {
+template <typename OutType, typename MathType, typename GenType, typename LenType, typename Lambda>
+__global__ void randKernel(uint64_t seed, uint64_t offset, OutType* ptr, LenType len, Lambda randOp)
+{
   LenType tid = (blockIdx.x * blockDim.x) + threadIdx.x;
   detail::Generator<GenType> gen(seed, (uint64_t)tid, offset);
   const LenType stride = gridDim.x * blockDim.x;
@@ -58,10 +57,10 @@ __global__ void randKernel(uint64_t seed, uint64_t offset, OutType *ptr,
 }
 
 // used for Box-Muller type transformations
-template <typename OutType, typename MathType, typename GenType,
-          typename LenType, typename Lambda2>
-__global__ void rand2Kernel(uint64_t seed, uint64_t offset, OutType *ptr,
-                            LenType len, Lambda2 rand2Op) {
+template <typename OutType, typename MathType, typename GenType, typename LenType, typename Lambda2>
+__global__ void rand2Kernel(
+  uint64_t seed, uint64_t offset, OutType* ptr, LenType len, Lambda2 rand2Op)
+{
   LenType tid = (blockIdx.x * blockDim.x) + threadIdx.x;
   detail::Generator<GenType> gen(seed, (uint64_t)tid, offset);
   const LenType stride = gridDim.x * blockDim.x;
@@ -77,8 +76,9 @@ __global__ void rand2Kernel(uint64_t seed, uint64_t offset, OutType *ptr,
 }
 
 template <typename Type>
-__global__ void constFillKernel(Type *ptr, int len, Type val) {
-  unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
+__global__ void constFillKernel(Type* ptr, int len, Type val)
+{
+  unsigned tid          = (blockIdx.x * blockDim.x) + threadIdx.x;
   const unsigned stride = gridDim.x * blockDim.x;
   for (unsigned idx = tid; idx < len; idx += stride) {
     ptr[idx] = val;
@@ -99,19 +99,20 @@ __global__ void constFillKernel(Type *ptr, int len, Type val) {
  * @{
  */
 template <typename Type>
-DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1,
-                             Type sigma2, Type mu2) {
-  constexpr Type twoPi = Type(2.0) * Type(3.141592654);
+DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1, Type sigma2, Type mu2)
+{
+  constexpr Type twoPi  = Type(2.0) * Type(3.141592654);
   constexpr Type minus2 = -Type(2.0);
-  Type R = raft::mySqrt(minus2 * raft::myLog(val1));
-  Type theta = twoPi * val2;
+  Type R                = raft::mySqrt(minus2 * raft::myLog(val1));
+  Type theta            = twoPi * val2;
   Type s, c;
   raft::mySinCos(theta, s, c);
   val1 = R * c * sigma1 + mu1;
   val2 = R * s * sigma2 + mu2;
 }
 template <typename Type>
-DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1) {
+DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1)
+{
   box_muller_transform<Type>(val1, val2, sigma1, mu1, sigma1, mu1);
 }
 /** @} */
@@ -131,7 +132,8 @@ class Rng {
       // simple heuristic to make sure all SMs will be occupied properly
       // and also not too many initialization calls will be made by each thread
       nBlocks(4 * getMultiProcessorCount()),
-      gen() {
+      gen()
+  {
     seed(_s);
   }
 
@@ -142,7 +144,8 @@ class Rng {
    *       function of timestamp. Another example is to use the c++11's
    *       `std::random_device` for setting seed.
    */
-  void seed(uint64_t _s) {
+  void seed(uint64_t _s)
+  {
     gen.seed(_s);
     offset = 0;
   }
@@ -158,7 +161,8 @@ class Rng {
    * @param[out] b intercept parameter
    */
   template <typename IdxT>
-  void affine_transform_params(IdxT n, IdxT &a, IdxT &b) {
+  void affine_transform_params(IdxT n, IdxT& a, IdxT& b)
+  {
     // always keep 'a' to be coprime to 'n'
     a = gen() % n;
     while (gcd(a, n) != 1) {
@@ -181,27 +185,24 @@ class Rng {
    * @{
    */
   template <typename Type, typename LenType = int>
-  void uniform(Type *ptr, LenType len, Type start, Type end,
-               cudaStream_t stream) {
+  void uniform(Type* ptr, LenType len, Type start, Type end, cudaStream_t stream)
+  {
     static_assert(std::is_floating_point<Type>::value,
                   "Type for 'uniform' can only be floating point!");
     custom_distribution(
-      ptr, len,
-      [=] __device__(Type val, LenType idx) {
-        return (val * (end - start)) + start;
-      },
+      ptr,
+      len,
+      [=] __device__(Type val, LenType idx) { return (val * (end - start)) + start; },
       stream);
   }
   template <typename IntType, typename LenType = int>
-  void uniformInt(IntType *ptr, LenType len, IntType start, IntType end,
-                  cudaStream_t stream) {
-    static_assert(std::is_integral<IntType>::value,
-                  "Type for 'uniformInt' can only be integer!");
+  void uniformInt(IntType* ptr, LenType len, IntType start, IntType end, cudaStream_t stream)
+  {
+    static_assert(std::is_integral<IntType>::value, "Type for 'uniformInt' can only be integer!");
     custom_distribution(
-      ptr, len,
-      [=] __device__(IntType val, LenType idx) {
-        return (val % (end - start)) + start;
-      },
+      ptr,
+      len,
+      [=] __device__(IntType val, LenType idx) { return (val % (end - start)) + start; },
       stream);
   }
   /** @} */
@@ -218,28 +219,37 @@ class Rng {
    * @{
    */
   template <typename Type, typename LenType = int>
-  void normal(Type *ptr, LenType len, Type mu, Type sigma,
-              cudaStream_t stream) {
+  void normal(Type* ptr, LenType len, Type mu, Type sigma, cudaStream_t stream)
+  {
     static_assert(std::is_floating_point<Type>::value,
                   "Type for 'normal' can only be floating point!");
     rand2Impl(
-      offset, ptr, len,
+      offset,
+      ptr,
+      len,
       [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) {
         box_muller_transform<Type>(val1, val2, sigma, mu);
       },
-      NumThreads, nBlocks, type, stream);
+      NumThreads,
+      nBlocks,
+      type,
+      stream);
   }
   template <typename IntType, typename LenType = int>
-  void normalInt(IntType *ptr, LenType len, IntType mu, IntType sigma,
-                 cudaStream_t stream) {
-    static_assert(std::is_integral<IntType>::value,
-                  "Type for 'normalInt' can only be integer!");
+  void normalInt(IntType* ptr, LenType len, IntType mu, IntType sigma, cudaStream_t stream)
+  {
+    static_assert(std::is_integral<IntType>::value, "Type for 'normalInt' can only be integer!");
     rand2Impl<IntType, double>(
-      offset, ptr, len,
-      [=] __device__(double &val1, double &val2, LenType idx1, LenType idx2) {
+      offset,
+      ptr,
+      len,
+      [=] __device__(double& val1, double& val2, LenType idx1, LenType idx2) {
         box_muller_transform<double>(val1, val2, sigma, mu);
       },
-      NumThreads, nBlocks, type, stream);
+      NumThreads,
+      nBlocks,
+      type,
+      stream);
   }
   /** @} */
 
@@ -264,21 +274,32 @@ class Rng {
    * @param stream stream where to launch the kernel
    */
   template <typename Type, typename LenType = int>
-  void normalTable(Type *ptr, LenType n_rows, LenType n_cols, const Type *mu,
-                   const Type *sigma_vec, Type sigma, cudaStream_t stream) {
+  void normalTable(Type* ptr,
+                   LenType n_rows,
+                   LenType n_cols,
+                   const Type* mu,
+                   const Type* sigma_vec,
+                   Type sigma,
+                   cudaStream_t stream)
+  {
     rand2Impl(
-      offset, ptr, n_rows * n_cols,
+      offset,
+      ptr,
+      n_rows * n_cols,
       [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) {
         // yikes! use fast-int-div
-        auto col1 = idx1 % n_cols;
-        auto col2 = idx2 % n_cols;
+        auto col1  = idx1 % n_cols;
+        auto col2  = idx2 % n_cols;
         auto mean1 = mu[col1];
         auto mean2 = mu[col2];
-        auto sig1 = sigma_vec == nullptr ? sigma : sigma_vec[col1];
-        auto sig2 = sigma_vec == nullptr ? sigma : sigma_vec[col2];
+        auto sig1  = sigma_vec == nullptr ? sigma : sigma_vec[col1];
+        auto sig2  = sigma_vec == nullptr ? sigma : sigma_vec[col2];
         box_muller_transform<Type>(val1, val2, sig1, mean1, sig2, mean2);
       },
-      NumThreads, nBlocks, type, stream);
+      NumThreads,
+      nBlocks,
+      type,
+      stream);
   }
 
   /**
@@ -291,7 +312,8 @@ class Rng {
    * @param stream stream where to launch the kernel
    */
   template <typename Type, typename LenType = int>
-  void fill(Type *ptr, LenType len, Type val, cudaStream_t stream) {
+  void fill(Type* ptr, LenType len, Type val, cudaStream_t stream)
+  {
     constFillKernel<Type><<<nBlocks, NumThreads, 0, stream>>>(ptr, len, val);
     CUDA_CHECK(cudaPeekAtLastError());
   }
@@ -309,10 +331,10 @@ class Rng {
    * @param[in]  stream stream where to launch the kernel
    */
   template <typename Type, typename OutType = bool, typename LenType = int>
-  void bernoulli(OutType *ptr, LenType len, Type prob, cudaStream_t stream) {
+  void bernoulli(OutType* ptr, LenType len, Type prob, cudaStream_t stream)
+  {
     custom_distribution<OutType, Type>(
-      ptr, len, [=] __device__(Type val, LenType idx) { return val > prob; },
-      stream);
+      ptr, len, [=] __device__(Type val, LenType idx) { return val > prob; }, stream);
   }
 
   /**
@@ -326,15 +348,14 @@ class Rng {
    * @param stream stream where to launch the kernel
    */
   template <typename Type, typename LenType = int>
-  void scaled_bernoulli(Type *ptr, LenType len, Type prob, Type scale,
-                        cudaStream_t stream) {
+  void scaled_bernoulli(Type* ptr, LenType len, Type prob, Type scale, cudaStream_t stream)
+  {
     static_assert(std::is_floating_point<Type>::value,
                   "Type for 'scaled_bernoulli' can only be floating point!");
     custom_distribution(
-      ptr, len,
-      [=] __device__(Type val, LenType idx) {
-        return val > prob ? -scale : scale;
-      },
+      ptr,
+      len,
+      [=] __device__(Type val, LenType idx) { return val > prob ? -scale : scale; },
       stream);
   }
 
@@ -350,12 +371,12 @@ class Rng {
    * @note https://en.wikipedia.org/wiki/Gumbel_distribution
    */
   template <typename Type, typename LenType = int>
-  void gumbel(Type *ptr, LenType len, Type mu, Type beta, cudaStream_t stream) {
+  void gumbel(Type* ptr, LenType len, Type mu, Type beta, cudaStream_t stream)
+  {
     custom_distribution(
-      ptr, len,
-      [=] __device__(Type val, LenType idx) {
-        return mu - beta * raft::myLog(-raft::myLog(val));
-      },
+      ptr,
+      len,
+      [=] __device__(Type val, LenType idx) { return mu - beta * raft::myLog(-raft::myLog(val)); },
       stream);
   }
 
@@ -370,16 +391,21 @@ class Rng {
    * @param stream stream where to launch the kernel
    */
   template <typename Type, typename LenType = int>
-  void lognormal(Type *ptr, LenType len, Type mu, Type sigma,
-                 cudaStream_t stream) {
+  void lognormal(Type* ptr, LenType len, Type mu, Type sigma, cudaStream_t stream)
+  {
     rand2Impl(
-      offset, ptr, len,
+      offset,
+      ptr,
+      len,
       [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) {
         box_muller_transform<Type>(val1, val2, sigma, mu);
         val1 = raft::myExp(val1);
         val2 = raft::myExp(val2);
       },
-      NumThreads, nBlocks, type, stream);
+      NumThreads,
+      nBlocks,
+      type,
+      stream);
   }
 
   /**
@@ -393,10 +419,11 @@ class Rng {
    * @param stream stream where to launch the kernel
    */
   template <typename Type, typename LenType = int>
-  void logistic(Type *ptr, LenType len, Type mu, Type scale,
-                cudaStream_t stream) {
+  void logistic(Type* ptr, LenType len, Type mu, Type scale, cudaStream_t stream)
+  {
     custom_distribution(
-      ptr, len,
+      ptr,
+      len,
       [=] __device__(Type val, LenType idx) {
         constexpr Type one = (Type)1.0;
         return mu - scale * raft::myLog(one / val - one);
@@ -414,9 +441,11 @@ class Rng {
    * @param stream stream where to launch the kernel
    */
   template <typename Type, typename LenType = int>
-  void exponential(Type *ptr, LenType len, Type lambda, cudaStream_t stream) {
+  void exponential(Type* ptr, LenType len, Type lambda, cudaStream_t stream)
+  {
     custom_distribution(
-      ptr, len,
+      ptr,
+      len,
       [=] __device__(Type val, LenType idx) {
         constexpr Type one = (Type)1.0;
         return -raft::myLog(one - val) / lambda;
@@ -434,9 +463,11 @@ class Rng {
    * @param stream stream where to launch the kernel
    */
   template <typename Type, typename LenType = int>
-  void rayleigh(Type *ptr, LenType len, Type sigma, cudaStream_t stream) {
+  void rayleigh(Type* ptr, LenType len, Type sigma, cudaStream_t stream)
+  {
     custom_distribution(
-      ptr, len,
+      ptr,
+      len,
       [=] __device__(Type val, LenType idx) {
         constexpr Type one = (Type)1.0;
         constexpr Type two = (Type)2.0;
@@ -456,13 +487,14 @@ class Rng {
    * @param stream stream where to launch the kernel
    */
   template <typename Type, typename LenType = int>
-  void laplace(Type *ptr, LenType len, Type mu, Type scale,
-               cudaStream_t stream) {
+  void laplace(Type* ptr, LenType len, Type mu, Type scale, cudaStream_t stream)
+  {
     custom_distribution(
-      ptr, len,
+      ptr,
+      len,
       [=] __device__(Type val, LenType idx) {
-        constexpr Type one = (Type)1.0;
-        constexpr Type two = (Type)2.0;
+        constexpr Type one     = (Type)1.0;
+        constexpr Type two     = (Type)2.0;
         constexpr Type oneHalf = (Type)0.5;
         Type out;
         if (val <= oneHalf) {
@@ -502,43 +534,44 @@ class Rng {
    * @param stream cuda stream
    */
   template <typename DataT, typename WeightsT, typename IdxT = int>
-  void sampleWithoutReplacement(const raft::handle_t &handle, DataT *out,
-                                IdxT *outIdx, const DataT *in,
-                                const WeightsT *wts, IdxT sampledLen, IdxT len,
-                                cudaStream_t stream) {
-    ASSERT(sampledLen <= len,
-           "sampleWithoutReplacement: 'sampledLen' cant be more than 'len'.");
-
-    std::shared_ptr<raft::mr::device::allocator> allocator =
-      handle.get_device_allocator();
+  void sampleWithoutReplacement(const raft::handle_t& handle,
+                                DataT* out,
+                                IdxT* outIdx,
+                                const DataT* in,
+                                const WeightsT* wts,
+                                IdxT sampledLen,
+                                IdxT len,
+                                cudaStream_t stream)
+  {
+    ASSERT(sampledLen <= len, "sampleWithoutReplacement: 'sampledLen' cant be more than 'len'.");
+
+    std::shared_ptr<raft::mr::device::allocator> allocator = handle.get_device_allocator();
 
     raft::mr::device::buffer<WeightsT> expWts(allocator, stream, len);
     raft::mr::device::buffer<WeightsT> sortedWts(allocator, stream, len);
     raft::mr::device::buffer<IdxT> inIdx(allocator, stream, len);
     raft::mr::device::buffer<IdxT> outIdxBuff(allocator, stream, len);
-    auto *inIdxPtr = inIdx.data();
+    auto* inIdxPtr = inIdx.data();
     // generate modified weights
     custom_distribution(
-      expWts.data(), len,
+      expWts.data(),
+      len,
       [wts, inIdxPtr] __device__(WeightsT val, IdxT idx) {
-        inIdxPtr[idx] = idx;
+        inIdxPtr[idx]          = idx;
         constexpr WeightsT one = (WeightsT)1.0;
-        auto exp = -raft::myLog(one - val);
-        if (wts != nullptr) {
-          return exp / wts[idx];
-        }
+        auto exp               = -raft::myLog(one - val);
+        if (wts != nullptr) { return exp / wts[idx]; }
         return exp;
       },
       stream);
     ///@todo: use a more efficient partitioning scheme instead of full sort
     // sort the array and pick the top sampledLen items
-    IdxT *outIdxPtr = outIdxBuff.data();
+    IdxT* outIdxPtr = outIdxBuff.data();
     raft::mr::device::buffer<char> workspace(allocator, stream);
-    sortPairs(workspace, expWts.data(), sortedWts.data(), inIdxPtr, outIdxPtr,
-              (int)len, stream);
+    sortPairs(workspace, expWts.data(), sortedWts.data(), inIdxPtr, outIdxPtr, (int)len, stream);
     if (outIdx != nullptr) {
-      CUDA_CHECK(cudaMemcpyAsync(outIdx, outIdxPtr, sizeof(IdxT) * sampledLen,
-                                 cudaMemcpyDeviceToDevice, stream));
+      CUDA_CHECK(cudaMemcpyAsync(
+        outIdx, outIdxPtr, sizeof(IdxT) * sampledLen, cudaMemcpyDeviceToDevice, stream));
     }
     scatter<DataT, IdxT>(out, in, outIdxPtr, sampledLen, stream);
   }
@@ -558,17 +591,15 @@ class Rng {
    * @param[in]  stream cuda stream
    * @{
    */
-  template <typename OutType, typename MathType = OutType,
-            typename LenType = int, typename Lambda>
-  void custom_distribution(OutType *ptr, LenType len, Lambda randOp,
-                           cudaStream_t stream) {
+  template <typename OutType, typename MathType = OutType, typename LenType = int, typename Lambda>
+  void custom_distribution(OutType* ptr, LenType len, Lambda randOp, cudaStream_t stream)
+  {
     randImpl<OutType, MathType, LenType, Lambda>(
       offset, ptr, len, randOp, NumThreads, nBlocks, type, stream);
   }
-  template <typename OutType, typename MathType = OutType,
-            typename LenType = int, typename Lambda>
-  void custom_distribution2(OutType *ptr, LenType len, Lambda randOp,
-                            cudaStream_t stream) {
+  template <typename OutType, typename MathType = OutType, typename LenType = int, typename Lambda>
+  void custom_distribution2(OutType* ptr, LenType len, Lambda randOp, cudaStream_t stream)
+  {
     rand2Impl<OutType, MathType, LenType, Lambda>(
       offset, ptr, len, randOp, NumThreads, nBlocks, type, stream);
   }
@@ -591,12 +622,10 @@ class Rng {
   static const int NumThreads = 256;
 
   template <bool IsNormal, typename Type, typename LenType>
-  uint64_t _setupSeeds(uint64_t &seed, uint64_t &offset, LenType len,
-                       int nThreads, int nBlocks) {
+  uint64_t _setupSeeds(uint64_t& seed, uint64_t& offset, LenType len, int nThreads, int nBlocks)
+  {
     LenType itemsPerThread = raft::ceildiv(len, LenType(nBlocks * nThreads));
-    if (IsNormal && itemsPerThread % 2 == 1) {
-      ++itemsPerThread;
-    }
+    if (IsNormal && itemsPerThread % 2 == 1) { ++itemsPerThread; }
     // curand uses 2 32b uint's to generate one double
     uint64_t factor = sizeof(Type) / sizeof(float);
     if (factor == 0) ++factor;
@@ -604,22 +633,26 @@ class Rng {
     // If not, then generate new seed and start from zero offset
     uint64_t newOffset = offset + LenType(itemsPerThread) * factor;
     if (newOffset < offset) {
-      offset = 0;
-      seed = gen();
+      offset    = 0;
+      seed      = gen();
       newOffset = itemsPerThread * factor;
     }
     return newOffset;
   }
 
-  template <typename OutType, typename MathType = OutType,
-            typename LenType = int, typename Lambda>
-  void randImpl(uint64_t &offset, OutType *ptr, LenType len, Lambda randOp,
-                int nThreads, int nBlocks, GeneratorType type,
-                cudaStream_t stream) {
+  template <typename OutType, typename MathType = OutType, typename LenType = int, typename Lambda>
+  void randImpl(uint64_t& offset,
+                OutType* ptr,
+                LenType len,
+                Lambda randOp,
+                int nThreads,
+                int nBlocks,
+                GeneratorType type,
+                cudaStream_t stream)
+  {
     if (len <= 0) return;
-    uint64_t seed = gen();
-    auto newOffset = _setupSeeds<false, MathType, LenType>(seed, offset, len,
-                                                           nThreads, nBlocks);
+    uint64_t seed  = gen();
+    auto newOffset = _setupSeeds<false, MathType, LenType>(seed, offset, len, nThreads, nBlocks);
     switch (type) {
       case GenPhilox:
         randKernel<OutType, MathType, detail::PhiloxGenerator, LenType, Lambda>
@@ -633,26 +666,28 @@ class Rng {
         randKernel<OutType, MathType, detail::Kiss99Generator, LenType, Lambda>
           <<<nBlocks, nThreads, 0, stream>>>(seed, offset, ptr, len, randOp);
         break;
-      default:
-        ASSERT(false, "randImpl: Incorrect generator type! %d", type);
+      default: ASSERT(false, "randImpl: Incorrect generator type! %d", type);
     };
     CUDA_CHECK(cudaGetLastError());
     offset = newOffset;
   }
 
-  template <typename OutType, typename MathType = OutType,
-            typename LenType = int, typename Lambda2>
-  void rand2Impl(uint64_t &offset, OutType *ptr, LenType len, Lambda2 rand2Op,
-                 int nThreads, int nBlocks, GeneratorType type,
-                 cudaStream_t stream) {
+  template <typename OutType, typename MathType = OutType, typename LenType = int, typename Lambda2>
+  void rand2Impl(uint64_t& offset,
+                 OutType* ptr,
+                 LenType len,
+                 Lambda2 rand2Op,
+                 int nThreads,
+                 int nBlocks,
+                 GeneratorType type,
+                 cudaStream_t stream)
+  {
     if (len <= 0) return;
-    auto seed = gen();
-    auto newOffset = _setupSeeds<true, MathType, LenType>(seed, offset, len,
-                                                          nThreads, nBlocks);
+    auto seed      = gen();
+    auto newOffset = _setupSeeds<true, MathType, LenType>(seed, offset, len, nThreads, nBlocks);
     switch (type) {
       case GenPhilox:
-        rand2Kernel<OutType, MathType, detail::PhiloxGenerator, LenType,
-                    Lambda2>
+        rand2Kernel<OutType, MathType, detail::PhiloxGenerator, LenType, Lambda2>
           <<<nBlocks, nThreads, 0, stream>>>(seed, offset, ptr, len, rand2Op);
         break;
       case GenTaps:
@@ -660,12 +695,10 @@ class Rng {
           <<<nBlocks, nThreads, 0, stream>>>(seed, offset, ptr, len, rand2Op);
         break;
       case GenKiss99:
-        rand2Kernel<OutType, MathType, detail::Kiss99Generator, LenType,
-                    Lambda2>
+        rand2Kernel<OutType, MathType, detail::Kiss99Generator, LenType, Lambda2>
           <<<nBlocks, nThreads, 0, stream>>>(seed, offset, ptr, len, rand2Op);
         break;
-      default:
-        ASSERT(false, "rand2Impl: Incorrect generator type! %d", type);
+      default: ASSERT(false, "rand2Impl: Incorrect generator type! %d", type);
     };
     CUDA_CHECK(cudaGetLastError());
     offset = newOffset;
diff --git a/cpp/include/raft/random/rng_impl.cuh b/cpp/include/raft/random/rng_impl.cuh
index d44c6f018b..485f4ddd68 100644
--- a/cpp/include/raft/random/rng_impl.cuh
+++ b/cpp/include/raft/random/rng_impl.cuh
@@ -33,7 +33,8 @@ struct PhiloxGenerator {
    * @param subsequence as found in curand docs
    * @param offset as found in curand docs
    */
-  DI PhiloxGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) {
+  DI PhiloxGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset)
+  {
     curand_init(seed, subsequence, offset, &state);
   }
 
@@ -44,18 +45,21 @@ struct PhiloxGenerator {
   DI void next(float& ret) { ret = curand_uniform(&(this->state)); }
   DI void next(double& ret) { ret = curand_uniform_double(&(this->state)); }
   DI void next(uint32_t& ret) { ret = curand(&(this->state)); }
-  DI void next(uint64_t& ret) {
+  DI void next(uint64_t& ret)
+  {
     uint32_t a, b;
     next(a);
     next(b);
     ret = (uint64_t)a | ((uint64_t)b << 32);
   }
-  DI void next(int32_t& ret) {
+  DI void next(int32_t& ret)
+  {
     uint32_t val;
     next(val);
     ret = int32_t(val & 0x7fffffff);
   }
-  DI void next(int64_t& ret) {
+  DI void next(int64_t& ret)
+  {
     uint64_t val;
     next(val);
     ret = int64_t(val & 0x7fffffffffffffff);
@@ -76,8 +80,9 @@ struct TapsGenerator {
    * @param subsequence unused
    * @param offset unused
    */
-  DI TapsGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) {
-    uint64_t delta = (blockIdx.x * blockDim.x) + threadIdx.x;
+  DI TapsGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset)
+  {
+    uint64_t delta  = (blockIdx.x * blockDim.x) + threadIdx.x;
     uint64_t stride = blockDim.x * gridDim.x;
     delta += ((blockIdx.y * blockDim.y) + threadIdx.y) * stride;
     stride *= blockDim.y * gridDim.y;
@@ -90,31 +95,36 @@ struct TapsGenerator {
    * @{
    */
   template <typename Type>
-  DI void next(Type& ret) {
+  DI void next(Type& ret)
+  {
     constexpr double ULL_LARGE = 1.8446744073709551614e19;
     uint64_t val;
     next(val);
     ret = static_cast<Type>(val);
     ret /= static_cast<Type>(ULL_LARGE);
   }
-  DI void next(uint64_t& ret) {
+  DI void next(uint64_t& ret)
+  {
     constexpr uint64_t TAPS = 0x8000100040002000ULL;
-    constexpr int ROUNDS = 128;
+    constexpr int ROUNDS    = 128;
     for (int i = 0; i < ROUNDS; i++)
       state = (state >> 1) ^ (-(state & 1ULL) & TAPS);
     ret = state;
   }
-  DI void next(uint32_t& ret) {
+  DI void next(uint32_t& ret)
+  {
     uint64_t val;
     next(val);
     ret = (uint32_t)val;
   }
-  DI void next(int32_t& ret) {
+  DI void next(int32_t& ret)
+  {
     uint32_t val;
     next(val);
     ret = int32_t(val & 0x7fffffff);
   }
-  DI void next(int64_t& ret) {
+  DI void next(int64_t& ret)
+  {
     uint64_t val;
     next(val);
     ret = int64_t(val & 0x7fffffffffffffff);
@@ -135,46 +145,49 @@ struct Kiss99Generator {
    * @param subsequence unused
    * @param offset unused
    */
-  DI Kiss99Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) {
-    initKiss99(seed);
-  }
+  DI Kiss99Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) { initKiss99(seed); }
 
   /**
    * @defgroup NextRand Generate the next random number
    * @{
    */
   template <typename Type>
-  DI void next(Type& ret) {
+  DI void next(Type& ret)
+  {
     constexpr double U_LARGE = 4.294967295e9;
     uint32_t val;
     next(val);
     ret = static_cast<Type>(val);
     ret /= static_cast<Type>(U_LARGE);
   }
-  DI void next(uint32_t& ret) {
+  DI void next(uint32_t& ret)
+  {
     uint32_t MWC;
-    z = 36969 * (z & 65535) + (z >> 16);
-    w = 18000 * (w & 65535) + (w >> 16);
+    z   = 36969 * (z & 65535) + (z >> 16);
+    w   = 18000 * (w & 65535) + (w >> 16);
     MWC = ((z << 16) + w);
     jsr ^= (jsr << 17);
     jsr ^= (jsr >> 13);
     jsr ^= (jsr << 5);
     jcong = 69069 * jcong + 1234567;
-    MWC = ((MWC ^ jcong) + jsr);
-    ret = MWC;
+    MWC   = ((MWC ^ jcong) + jsr);
+    ret   = MWC;
   }
-  DI void next(uint64_t& ret) {
+  DI void next(uint64_t& ret)
+  {
     uint32_t a, b;
     next(a);
     next(b);
     ret = (uint64_t)a | ((uint64_t)b << 32);
   }
-  DI void next(int32_t& ret) {
+  DI void next(int32_t& ret)
+  {
     uint32_t val;
     next(val);
     ret = int32_t(val & 0x7fffffff);
   }
-  DI void next(int64_t& ret) {
+  DI void next(int64_t& ret)
+  {
     uint64_t val;
     next(val);
     ret = int64_t(val & 0x7fffffffffffffff);
@@ -193,7 +206,8 @@ struct Kiss99Generator {
 
   // This function multiplies 128-bit hash by 128-bit FNV prime and returns lower
   // 128 bits. It uses 32-bit wide multiply only.
-  DI void mulByFnv1a128Prime(uint32_t* h) {
+  DI void mulByFnv1a128Prime(uint32_t* h)
+  {
     typedef union {
       uint32_t u32[2];
       uint64_t u64[1];
@@ -217,12 +231,12 @@ struct Kiss99Generator {
     // h_n[2] = HI(h[1]*p[0]) + LO(h[2]*p[0]) + LO(h[0]*p[2]);
     // h_n[3] = HI(h[2]*p[0]) + HI(h[0]*p[2]) + LO(h[3]*p[0]) + LO(h[1]*p[2]);
     uint32_t carry = 0;
-    h[0] = h0p0.u32[0];
+    h[0]           = h0p0.u32[0];
 
-    h[1] = h0p0.u32[1] + h1p0.u32[0];
+    h[1]  = h0p0.u32[1] + h1p0.u32[0];
     carry = h[1] < h0p0.u32[1] ? 1 : 0;
 
-    h[2] = h1p0.u32[1] + carry;
+    h[2]  = h1p0.u32[1] + carry;
     carry = h[2] < h1p0.u32[1] ? 1 : 0;
     h[2] += h2p0.u32[0];
     carry = h[2] < h2p0.u32[0] ? carry + 1 : carry;
@@ -233,7 +247,8 @@ struct Kiss99Generator {
     return;
   }
 
-  DI void fnv1a128(uint32_t* hash, uint32_t txt) {
+  DI void fnv1a128(uint32_t* hash, uint32_t txt)
+  {
     hash[0] ^= (txt >> 0) & 0xFF;
     mulByFnv1a128Prime(hash);
     hash[0] ^= (txt >> 8) & 0xFF;
@@ -244,7 +259,8 @@ struct Kiss99Generator {
     mulByFnv1a128Prime(hash);
   }
 
-  DI void initKiss99(uint64_t seed) {
+  DI void initKiss99(uint64_t seed)
+  {
     // Initialize hash to 128-bit FNV1a basis
     uint32_t hash[4] = {1653982605UL, 1656234357UL, 129696066UL, 1818371886UL};
 
@@ -259,9 +275,9 @@ struct Kiss99Generator {
     fnv1a128(hash, uint32_t(seed >> 32));
 
     // Initialize KISS99 state with hash
-    z = hash[0];
-    w = hash[1];
-    jsr = hash[2];
+    z     = hash[0];
+    w     = hash[1];
+    jsr   = hash[2];
     jcong = hash[3];
   }
 };
@@ -273,10 +289,13 @@ struct Kiss99Generator {
 template <typename GenType>
 struct Generator {
   DI Generator(uint64_t seed, uint64_t subsequence, uint64_t offset)
-    : gen(seed, subsequence, offset) {}
+    : gen(seed, subsequence, offset)
+  {
+  }
 
   template <typename Type>
-  DI void next(Type& ret) {
+  DI void next(Type& ret)
+  {
     gen.next(ret);
   }
 
diff --git a/cpp/include/raft/sparse/convert/coo.cuh b/cpp/include/raft/sparse/convert/coo.cuh
index e367550060..5d38bdf4a8 100644
--- a/cpp/include/raft/sparse/convert/coo.cuh
+++ b/cpp/include/raft/sparse/convert/coo.cuh
@@ -37,14 +37,18 @@ namespace sparse {
 namespace convert {
 
 template <typename value_idx = int, int TPB_X = 32>
-__global__ void csr_to_coo_kernel(const value_idx *row_ind, value_idx m,
-                                  value_idx *coo_rows, value_idx nnz) {
+__global__ void csr_to_coo_kernel(const value_idx* row_ind,
+                                  value_idx m,
+                                  value_idx* coo_rows,
+                                  value_idx nnz)
+{
   // row-based matrix 1 thread per row
   value_idx row = (blockIdx.x * TPB_X) + threadIdx.x;
   if (row < m) {
     value_idx start_idx = row_ind[row];
-    value_idx stop_idx = get_stop_idx(row, m, nnz, row_ind);
-    for (value_idx i = start_idx; i < stop_idx; i++) coo_rows[i] = row;
+    value_idx stop_idx  = get_stop_idx(row, m, nnz, row_ind);
+    for (value_idx i = start_idx; i < stop_idx; i++)
+      coo_rows[i] = row;
   }
 }
 
@@ -57,14 +61,14 @@ __global__ void csr_to_coo_kernel(const value_idx *row_ind, value_idx m,
  * @param stream: cuda stream to use
  */
 template <typename value_idx = int, int TPB_X = 32>
-void csr_to_coo(const value_idx *row_ind, value_idx m, value_idx *coo_rows,
-                value_idx nnz, cudaStream_t stream) {
+void csr_to_coo(
+  const value_idx* row_ind, value_idx m, value_idx* coo_rows, value_idx nnz, cudaStream_t stream)
+{
   // @TODO: Use cusparse for this.
   dim3 grid(raft::ceildiv(m, (value_idx)TPB_X), 1, 1);
   dim3 blk(TPB_X, 1, 1);
 
-  csr_to_coo_kernel<value_idx, TPB_X>
-    <<<grid, blk, 0, stream>>>(row_ind, m, coo_rows, nnz);
+  csr_to_coo_kernel<value_idx, TPB_X><<<grid, blk, 0, stream>>>(row_ind, m, coo_rows, nnz);
 
   CUDA_CHECK(cudaGetLastError());
 }
diff --git a/cpp/include/raft/sparse/convert/csr.cuh b/cpp/include/raft/sparse/convert/csr.cuh
index a034bdbda8..2191f5edd1 100644
--- a/cpp/include/raft/sparse/convert/csr.cuh
+++ b/cpp/include/raft/sparse/convert/csr.cuh
@@ -44,29 +44,33 @@ namespace sparse {
 namespace convert {
 
 template <typename value_t>
-void coo_to_csr(const raft::handle_t &handle, const int *srcRows,
-                const int *srcCols, const value_t *srcVals, int nnz, int m,
-                int *dst_offsets, int *dstCols, value_t *dstVals) {
-  auto stream = handle.get_stream();
+void coo_to_csr(const raft::handle_t& handle,
+                const int* srcRows,
+                const int* srcCols,
+                const value_t* srcVals,
+                int nnz,
+                int m,
+                int* dst_offsets,
+                int* dstCols,
+                value_t* dstVals)
+{
+  auto stream         = handle.get_stream();
   auto cusparseHandle = handle.get_cusparse_handle();
-  auto d_alloc = handle.get_device_allocator();
+  auto d_alloc        = handle.get_device_allocator();
   raft::mr::device::buffer<int> dstRows(d_alloc, stream, nnz);
-  CUDA_CHECK(cudaMemcpyAsync(dstRows.data(), srcRows, sizeof(int) * nnz,
-                             cudaMemcpyDeviceToDevice, stream));
-  CUDA_CHECK(cudaMemcpyAsync(dstCols, srcCols, sizeof(int) * nnz,
-                             cudaMemcpyDeviceToDevice, stream));
+  CUDA_CHECK(
+    cudaMemcpyAsync(dstRows.data(), srcRows, sizeof(int) * nnz, cudaMemcpyDeviceToDevice, stream));
+  CUDA_CHECK(
+    cudaMemcpyAsync(dstCols, srcCols, sizeof(int) * nnz, cudaMemcpyDeviceToDevice, stream));
   auto buffSize = raft::sparse::cusparsecoosort_bufferSizeExt(
     cusparseHandle, m, m, nnz, srcRows, srcCols, stream);
   raft::mr::device::buffer<char> pBuffer(d_alloc, stream, buffSize);
   raft::mr::device::buffer<int> P(d_alloc, stream, nnz);
-  CUSPARSE_CHECK(
-    cusparseCreateIdentityPermutation(cusparseHandle, nnz, P.data()));
-  raft::sparse::cusparsecoosortByRow(cusparseHandle, m, m, nnz, dstRows.data(),
-                                     dstCols, P.data(), pBuffer.data(), stream);
-  raft::sparse::cusparsegthr(cusparseHandle, nnz, srcVals, dstVals, P.data(),
-                             stream);
-  raft::sparse::cusparsecoo2csr(cusparseHandle, dstRows.data(), nnz, m,
-                                dst_offsets, stream);
+  CUSPARSE_CHECK(cusparseCreateIdentityPermutation(cusparseHandle, nnz, P.data()));
+  raft::sparse::cusparsecoosortByRow(
+    cusparseHandle, m, m, nnz, dstRows.data(), dstCols, P.data(), pBuffer.data(), stream);
+  raft::sparse::cusparsegthr(cusparseHandle, nnz, srcVals, dstVals, P.data(), stream);
+  raft::sparse::cusparsecoo2csr(cusparseHandle, dstRows.data(), nnz, m, dst_offsets, stream);
   CUDA_CHECK(cudaDeviceSynchronize());
 }
 
@@ -85,14 +89,20 @@ void coo_to_csr(const raft::handle_t &handle, const int *srcRows,
  * @param stream cuda stream to use
  * @param fused_op: the fused operation
  */
-template <typename Index_, int TPB_X = 32,
-          typename Lambda = auto(Index_, Index_, Index_)->void>
-void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz,
-                           Index_ batchSize, const bool *adj,
-                           Index_ *row_ind_ptr, cudaStream_t stream,
-                           Lambda fused_op) {
+template <typename Index_, int TPB_X = 32, typename Lambda = auto(Index_, Index_, Index_)->void>
+void csr_adj_graph_batched(const Index_* row_ind,
+                           Index_ total_rows,
+                           Index_ nnz,
+                           Index_ batchSize,
+                           const bool* adj,
+                           Index_* row_ind_ptr,
+                           cudaStream_t stream,
+                           Lambda fused_op)
+{
   op::csr_row_op<Index_, TPB_X>(
-    row_ind, batchSize, nnz,
+    row_ind,
+    batchSize,
+    nnz,
     [fused_op, adj, total_rows, row_ind_ptr, batchSize, nnz] __device__(
       Index_ row, Index_ start_idx, Index_ stop_idx) {
       fused_op(row, start_idx, stop_idx);
@@ -108,14 +118,23 @@ void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz,
     stream);
 }
 
-template <typename Index_, int TPB_X = 32,
-          typename Lambda = auto(Index_, Index_, Index_)->void>
-void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz,
-                           Index_ batchSize, const bool *adj,
-                           Index_ *row_ind_ptr, cudaStream_t stream) {
-  csr_adj_graph_batched(
-    row_ind, total_rows, nnz, batchSize, adj, row_ind_ptr, stream,
-    [] __device__(Index_ row, Index_ start_idx, Index_ stop_idx) {});
+template <typename Index_, int TPB_X = 32, typename Lambda = auto(Index_, Index_, Index_)->void>
+void csr_adj_graph_batched(const Index_* row_ind,
+                           Index_ total_rows,
+                           Index_ nnz,
+                           Index_ batchSize,
+                           const bool* adj,
+                           Index_* row_ind_ptr,
+                           cudaStream_t stream)
+{
+  csr_adj_graph_batched(row_ind,
+                        total_rows,
+                        nnz,
+                        batchSize,
+                        adj,
+                        row_ind_ptr,
+                        stream,
+                        [] __device__(Index_ row, Index_ start_idx, Index_ stop_idx) {});
 }
 
 /**
@@ -131,13 +150,17 @@ void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz,
  * @param stream cuda stream to use
  * @param fused_op the fused operation
  */
-template <typename Index_, int TPB_X = 32,
-          typename Lambda = auto(Index_, Index_, Index_)->void>
-void csr_adj_graph(const Index_ *row_ind, Index_ total_rows, Index_ nnz,
-                   const bool *adj, Index_ *row_ind_ptr, cudaStream_t stream,
-                   Lambda fused_op) {
-  csr_adj_graph_batched<Index_, TPB_X>(row_ind, total_rows, nnz, total_rows,
-                                       adj, row_ind_ptr, stream, fused_op);
+template <typename Index_, int TPB_X = 32, typename Lambda = auto(Index_, Index_, Index_)->void>
+void csr_adj_graph(const Index_* row_ind,
+                   Index_ total_rows,
+                   Index_ nnz,
+                   const bool* adj,
+                   Index_* row_ind_ptr,
+                   cudaStream_t stream,
+                   Lambda fused_op)
+{
+  csr_adj_graph_batched<Index_, TPB_X>(
+    row_ind, total_rows, nnz, total_rows, adj, row_ind_ptr, stream, fused_op);
 }
 
 /**
@@ -151,9 +174,13 @@ void csr_adj_graph(const Index_ *row_ind, Index_ total_rows, Index_ nnz,
  * @param stream: cuda stream to use
  */
 template <typename T>
-void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m,
+void sorted_coo_to_csr(const T* rows,
+                       int nnz,
+                       T* row_ind,
+                       int m,
                        std::shared_ptr<raft::mr::device::allocator> d_alloc,
-                       cudaStream_t stream) {
+                       cudaStream_t stream)
+{
   raft::mr::device::buffer<T> row_counts(d_alloc, stream, m);
 
   CUDA_CHECK(cudaMemsetAsync(row_counts.data(), 0, m * sizeof(T), stream));
@@ -161,11 +188,9 @@ void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m,
   linalg::coo_degree<32>(rows, nnz, row_counts.data(), stream);
 
   // create csr compressed row index from row counts
-  thrust::device_ptr<T> row_counts_d =
-    thrust::device_pointer_cast(row_counts.data());
-  thrust::device_ptr<T> c_ind_d = thrust::device_pointer_cast(row_ind);
-  exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m,
-                 c_ind_d);
+  thrust::device_ptr<T> row_counts_d = thrust::device_pointer_cast(row_counts.data());
+  thrust::device_ptr<T> c_ind_d      = thrust::device_pointer_cast(row_ind);
+  exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m, c_ind_d);
 }
 
 /**
@@ -177,11 +202,12 @@ void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m,
  * @param stream: cuda stream to use
  */
 template <typename T>
-void sorted_coo_to_csr(COO<T> *coo, int *row_ind,
+void sorted_coo_to_csr(COO<T>* coo,
+                       int* row_ind,
                        std::shared_ptr<raft::mr::device::allocator> d_alloc,
-                       cudaStream_t stream) {
-  sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, d_alloc,
-                    stream);
+                       cudaStream_t stream)
+{
+  sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, d_alloc, stream);
 }
 
 };  // end NAMESPACE convert
diff --git a/cpp/include/raft/sparse/convert/dense.cuh b/cpp/include/raft/sparse/convert/dense.cuh
index 299f9d36d4..e90882b501 100644
--- a/cpp/include/raft/sparse/convert/dense.cuh
+++ b/cpp/include/raft/sparse/convert/dense.cuh
@@ -37,22 +37,20 @@ namespace sparse {
 namespace convert {
 
 template <typename value_t>
-__global__ void csr_to_dense_warp_per_row_kernel(int n_cols,
-                                                 const value_t *csrVal,
-                                                 const int *csrRowPtr,
-                                                 const int *csrColInd,
-                                                 value_t *a) {
+__global__ void csr_to_dense_warp_per_row_kernel(
+  int n_cols, const value_t* csrVal, const int* csrRowPtr, const int* csrColInd, value_t* a)
+{
   int row = blockIdx.x;
   int tid = threadIdx.x;
 
   int colStart = csrRowPtr[row];
-  int colEnd = csrRowPtr[row + 1];
-  int rowNnz = colEnd - colStart;
+  int colEnd   = csrRowPtr[row + 1];
+  int rowNnz   = colEnd - colStart;
 
   for (int i = tid; i < rowNnz; i += blockDim.x) {
     int colIdx = colStart + i;
     if (colIdx < colEnd) {
-      int col = csrColInd[colIdx];
+      int col               = csrColInd[colIdx];
       a[row * n_cols + col] = csrVal[colIdx];
     }
   }
@@ -77,10 +75,17 @@ __global__ void csr_to_dense_warp_per_row_kernel(int n_cols,
  * @param[in] row_major : Is row-major output desired?
  */
 template <typename value_idx, typename value_t>
-void csr_to_dense(cusparseHandle_t handle, value_idx nrows, value_idx ncols,
-                  const value_idx *csr_indptr, const value_idx *csr_indices,
-                  const value_t *csr_data, value_idx lda, value_t *out,
-                  cudaStream_t stream, bool row_major = true) {
+void csr_to_dense(cusparseHandle_t handle,
+                  value_idx nrows,
+                  value_idx ncols,
+                  const value_idx* csr_indptr,
+                  const value_idx* csr_indices,
+                  const value_t* csr_data,
+                  value_idx lda,
+                  value_t* out,
+                  cudaStream_t stream,
+                  bool row_major = true)
+{
   if (!row_major) {
     /**
      * If we need col-major, use cusparse.
@@ -91,15 +96,13 @@ void csr_to_dense(cusparseHandle_t handle, value_idx nrows, value_idx ncols,
     CUSPARSE_CHECK(cusparseSetMatType(out_mat, CUSPARSE_MATRIX_TYPE_GENERAL));
 
     CUSPARSE_CHECK(raft::sparse::cusparsecsr2dense(
-      handle, nrows, ncols, out_mat, csr_data, csr_indptr, csr_indices, out,
-      lda, stream));
+      handle, nrows, ncols, out_mat, csr_data, csr_indptr, csr_indices, out, lda, stream));
 
     CUSPARSE_CHECK_NO_THROW(cusparseDestroyMatDescr(out_mat));
 
   } else {
     int blockdim = block_dim(ncols);
-    CUDA_CHECK(
-      cudaMemsetAsync(out, 0, nrows * ncols * sizeof(value_t), stream));
+    CUDA_CHECK(cudaMemsetAsync(out, 0, nrows * ncols * sizeof(value_t), stream));
     csr_to_dense_warp_per_row_kernel<<<nrows, blockdim, 0, stream>>>(
       ncols, csr_data, csr_indptr, csr_indices, out);
   }
diff --git a/cpp/include/raft/sparse/coo.cuh b/cpp/include/raft/sparse/coo.cuh
index 73120fea8c..348ed5eab2 100644
--- a/cpp/include/raft/sparse/coo.cuh
+++ b/cpp/include/raft/sparse/coo.cuh
@@ -68,83 +68,87 @@ class COO {
   Index_Type n_cols;
 
   /**
-    * @param d_alloc: the device allocator to use for the underlying buffers
-    * @param stream: CUDA stream to use
-    */
+   * @param d_alloc: the device allocator to use for the underlying buffers
+   * @param stream: CUDA stream to use
+   */
   COO(std::shared_ptr<raft::mr::device::allocator> d_alloc, cudaStream_t stream)
     : rows_arr(d_alloc, stream, 0),
       cols_arr(d_alloc, stream, 0),
       vals_arr(d_alloc, stream, 0),
       nnz(0),
       n_rows(0),
-      n_cols(0) {}
+      n_cols(0)
+  {
+  }
 
   /**
-    * @param rows: coo rows array
-    * @param cols: coo cols array
-    * @param vals: coo vals array
-    * @param nnz: size of the rows/cols/vals arrays
-    * @param n_rows: number of rows in the dense matrix
-    * @param n_cols: number of cols in the dense matrix
-    */
-  COO(raft::mr::device::buffer<Index_Type> &rows,
-      raft::mr::device::buffer<Index_Type> &cols,
-      raft::mr::device::buffer<T> &vals, Index_Type nnz, Index_Type n_rows = 0,
+   * @param rows: coo rows array
+   * @param cols: coo cols array
+   * @param vals: coo vals array
+   * @param nnz: size of the rows/cols/vals arrays
+   * @param n_rows: number of rows in the dense matrix
+   * @param n_cols: number of cols in the dense matrix
+   */
+  COO(raft::mr::device::buffer<Index_Type>& rows,
+      raft::mr::device::buffer<Index_Type>& cols,
+      raft::mr::device::buffer<T>& vals,
+      Index_Type nnz,
+      Index_Type n_rows = 0,
       Index_Type n_cols = 0)
-    : rows_arr(rows),
-      cols_arr(cols),
-      vals_arr(vals),
-      nnz(nnz),
-      n_rows(n_rows),
-      n_cols(n_cols) {}
+    : rows_arr(rows), cols_arr(cols), vals_arr(vals), nnz(nnz), n_rows(n_rows), n_cols(n_cols)
+  {
+  }
 
   /**
-    * @param d_alloc: the device allocator use
-    * @param stream: CUDA stream to use
-    * @param nnz: size of the rows/cols/vals arrays
-    * @param n_rows: number of rows in the dense matrix
-    * @param n_cols: number of cols in the dense matrix
-    * @param init: initialize arrays with zeros
-    */
-  COO(std::shared_ptr<raft::mr::device::allocator> d_alloc, cudaStream_t stream,
-      Index_Type nnz, Index_Type n_rows = 0, Index_Type n_cols = 0,
-      bool init = true)
+   * @param d_alloc: the device allocator to use
+   * @param stream: CUDA stream to use
+   * @param nnz: size of the rows/cols/vals arrays
+   * @param n_rows: number of rows in the dense matrix
+   * @param n_cols: number of cols in the dense matrix
+   * @param init: initialize arrays with zeros
+   */
+  COO(std::shared_ptr<raft::mr::device::allocator> d_alloc,
+      cudaStream_t stream,
+      Index_Type nnz,
+      Index_Type n_rows = 0,
+      Index_Type n_cols = 0,
+      bool init         = true)
     : rows_arr(d_alloc, stream, nnz),
       cols_arr(d_alloc, stream, nnz),
       vals_arr(d_alloc, stream, nnz),
       nnz(nnz),
       n_rows(n_rows),
-      n_cols(n_cols) {
+      n_cols(n_cols)
+  {
     if (init) init_arrays(stream);
   }
 
-  void init_arrays(cudaStream_t stream) {
-    CUDA_CHECK(cudaMemsetAsync(this->rows_arr.data(), 0,
-                               this->nnz * sizeof(Index_Type), stream));
-    CUDA_CHECK(cudaMemsetAsync(this->cols_arr.data(), 0,
-                               this->nnz * sizeof(Index_Type), stream));
-    CUDA_CHECK(
-      cudaMemsetAsync(this->vals_arr.data(), 0, this->nnz * sizeof(T), stream));
+  void init_arrays(cudaStream_t stream)
+  {
+    CUDA_CHECK(cudaMemsetAsync(this->rows_arr.data(), 0, this->nnz * sizeof(Index_Type), stream));
+    CUDA_CHECK(cudaMemsetAsync(this->cols_arr.data(), 0, this->nnz * sizeof(Index_Type), stream));
+    CUDA_CHECK(cudaMemsetAsync(this->vals_arr.data(), 0, this->nnz * sizeof(T), stream));
   }
 
   ~COO() {}
 
   /**
-    * @brief Size should be > 0, with the number of rows
-    * and cols in the dense matrix being > 0.
-    */
-  bool validate_size() const {
+   * @brief Sizes must be non-negative: nnz and the number of rows
+   * and cols in the dense matrix must all be >= 0.
+   */
+  bool validate_size() const
+  {
     if (this->nnz < 0 || n_rows < 0 || n_cols < 0) return false;
     return true;
   }
 
   /**
-    * @brief If the underlying arrays have not been set,
-    * return false. Otherwise true.
-    */
-  bool validate_mem() const {
-    if (this->rows_arr.size() == 0 || this->cols_arr.size() == 0 ||
-        this->vals_arr.size() == 0) {
+   * @brief If the underlying arrays have not been set,
+   * return false. Otherwise true.
+   */
+  bool validate_mem() const
+  {
+    if (this->rows_arr.size() == 0 || this->cols_arr.size() == 0 || this->vals_arr.size() == 0) {
       return false;
     }
 
@@ -154,33 +158,30 @@ class COO {
   /*
    * @brief Returns the rows array
    */
-  Index_Type *rows() { return this->rows_arr.data(); }
+  Index_Type* rows() { return this->rows_arr.data(); }
 
   /**
    * @brief Returns the cols array
    */
-  Index_Type *cols() { return this->cols_arr.data(); }
+  Index_Type* cols() { return this->cols_arr.data(); }
 
   /**
    * @brief Returns the vals array
    */
-  T *vals() { return this->vals_arr.data(); }
+  T* vals() { return this->vals_arr.data(); }
 
   /**
-    * @brief Send human-readable state information to output stream
-    */
-  friend std::ostream &operator<<(std::ostream &out,
-                                  const COO<T, Index_Type> &c) {
+   * @brief Send human-readable state information to output stream
+   */
+  friend std::ostream& operator<<(std::ostream& out, const COO<T, Index_Type>& c)
+  {
     if (c.validate_size() && c.validate_mem()) {
       cudaStream_t stream;
       CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
 
-      out << raft::arr2Str(c.rows_arr.data(), c.nnz, "rows", stream)
-          << std::endl;
-      out << raft::arr2Str(c.cols_arr.data(), c.nnz, "cols", stream)
-          << std::endl;
-      out << raft::arr2Str(c.vals_arr.data(), c.nnz, "vals", stream)
-          << std::endl;
+      out << raft::arr2Str(c.rows_arr.data(), c.nnz, "rows", stream) << std::endl;
+      out << raft::arr2Str(c.cols_arr.data(), c.nnz, "cols", stream) << std::endl;
+      out << raft::arr2Str(c.vals_arr.data(), c.nnz, "vals", stream) << std::endl;
       out << "nnz=" << c.nnz << std::endl;
       out << "n_rows=" << c.n_rows << std::endl;
       out << "n_cols=" << c.n_cols << std::endl;
@@ -194,58 +195,59 @@ class COO {
   }
 
   /**
-    * @brief Set the number of rows and cols
-    * @param n_rows: number of rows in the dense matrix
-    * @param n_cols: number of columns in the dense matrix
-    */
-  void setSize(int n_rows, int n_cols) {
+   * @brief Set the number of rows and cols
+   * @param n_rows: number of rows in the dense matrix
+   * @param n_cols: number of columns in the dense matrix
+   */
+  void setSize(int n_rows, int n_cols)
+  {
     this->n_rows = n_rows;
     this->n_cols = n_cols;
   }
 
   /**
-    * @brief Set the number of rows and cols for a square dense matrix
-    * @param n: number of rows and cols
-    */
-  void setSize(int n) {
+   * @brief Set the number of rows and cols for a square dense matrix
+   * @param n: number of rows and cols
+   */
+  void setSize(int n)
+  {
     this->n_rows = n;
     this->n_cols = n;
   }
 
   /**
-    * @brief Allocate the underlying arrays
-    * @param nnz: size of underlying row/col/val arrays
-    * @param init: should values be initialized to 0?
-    * @param stream: CUDA stream to use
-    */
-  void allocate(int nnz, bool init, cudaStream_t stream) {
-    this->allocate(nnz, 0, init, stream);
-  }
+   * @brief Allocate the underlying arrays
+   * @param nnz: size of underlying row/col/val arrays
+   * @param init: should values be initialized to 0?
+   * @param stream: CUDA stream to use
+   */
+  void allocate(int nnz, bool init, cudaStream_t stream) { this->allocate(nnz, 0, init, stream); }
 
   /**
-    * @brief Allocate the underlying arrays
-    * @param nnz: size of the underlying row/col/val arrays
-    * @param size: the number of rows/cols in a square dense matrix
-    * @param init: should values be initialized to 0?
-    * @param stream: CUDA stream to use
-    */
-  void allocate(int nnz, int size, bool init, cudaStream_t stream) {
+   * @brief Allocate the underlying arrays
+   * @param nnz: size of the underlying row/col/val arrays
+   * @param size: the number of rows/cols in a square dense matrix
+   * @param init: should values be initialized to 0?
+   * @param stream: CUDA stream to use
+   */
+  void allocate(int nnz, int size, bool init, cudaStream_t stream)
+  {
     this->allocate(nnz, size, size, init, stream);
   }
 
   /**
-    * @brief Allocate the underlying arrays
-    * @param nnz: size of the underlying row/col/val arrays
-    * @param n_rows: number of rows in the dense matrix
-    * @param n_cols: number of columns in the dense matrix
-    * @param init: should values be initialized to 0?
-    * @param stream: stream to use for init
-    */
-  void allocate(int nnz, int n_rows, int n_cols, bool init,
-                cudaStream_t stream) {
+   * @brief Allocate the underlying arrays
+   * @param nnz: size of the underlying row/col/val arrays
+   * @param n_rows: number of rows in the dense matrix
+   * @param n_cols: number of columns in the dense matrix
+   * @param init: should values be initialized to 0?
+   * @param stream: stream to use for init
+   */
+  void allocate(int nnz, int n_rows, int n_cols, bool init, cudaStream_t stream)
+  {
     this->n_rows = n_rows;
     this->n_cols = n_cols;
-    this->nnz = nnz;
+    this->nnz    = nnz;
 
     this->rows_arr.resize(this->nnz, stream);
     this->cols_arr.resize(this->nnz, stream);
diff --git a/cpp/include/raft/sparse/csr.cuh b/cpp/include/raft/sparse/csr.cuh
index bc4a68d296..17f3c735af 100644
--- a/cpp/include/raft/sparse/csr.cuh
+++ b/cpp/include/raft/sparse/csr.cuh
@@ -41,57 +41,64 @@ namespace sparse {
 
 struct WeakCCState {
  public:
-  bool *m;
-  WeakCCState(bool *m) : m(m) {}
+  bool* m;
+  WeakCCState(bool* m) : m(m) {}
 };
 
 template <typename Index_, int TPB_X = 256, typename Lambda>
-__global__ void weak_cc_label_device(Index_ *__restrict__ labels,
-                                     const Index_ *__restrict__ row_ind,
-                                     const Index_ *__restrict__ row_ind_ptr,
-                                     Index_ nnz, bool *__restrict__ m,
-                                     Index_ start_vertex_id, Index_ batch_size,
-                                     Index_ N, Lambda filter_op) {
-  Index_ tid = threadIdx.x + blockIdx.x * TPB_X;
+__global__ void weak_cc_label_device(Index_* __restrict__ labels,
+                                     const Index_* __restrict__ row_ind,
+                                     const Index_* __restrict__ row_ind_ptr,
+                                     Index_ nnz,
+                                     bool* __restrict__ m,
+                                     Index_ start_vertex_id,
+                                     Index_ batch_size,
+                                     Index_ N,
+                                     Lambda filter_op)
+{
+  Index_ tid       = threadIdx.x + blockIdx.x * TPB_X;
   Index_ global_id = tid + start_vertex_id;
   if (tid < batch_size && global_id < N) {
     Index_ start = __ldg(row_ind + tid);
 
     Index_ ci, cj;
-    bool ci_mod = false;
-    ci = labels[global_id];
+    bool ci_mod        = false;
+    ci                 = labels[global_id];
     bool ci_allow_prop = filter_op(global_id);
 
     Index_ end = get_stop_idx(tid, batch_size, nnz, row_ind);
     /// TODO: add one element to row_ind and avoid get_stop_idx
     for (Index_ j = start; j < end; j++) {
-      Index_ j_ind = __ldg(row_ind_ptr + j);
-      cj = labels[j_ind];
+      Index_ j_ind       = __ldg(row_ind_ptr + j);
+      cj                 = labels[j_ind];
       bool cj_allow_prop = filter_op(j_ind);
       if (ci < cj && ci_allow_prop) {
         if (sizeof(Index_) == 4)
-          atomicMin((int *)(labels + j_ind), ci);
+          atomicMin((int*)(labels + j_ind), ci);
         else if (sizeof(Index_) == 8)
-          atomicMin((long long int *)(labels + j_ind), ci);
+          atomicMin((long long int*)(labels + j_ind), ci);
         if (cj_allow_prop) *m = true;
       } else if (ci > cj && cj_allow_prop) {
-        ci = cj;
+        ci     = cj;
         ci_mod = true;
       }
     }
     if (ci_mod) {
       if (sizeof(Index_) == 4)
-        atomicMin((int *)(labels + global_id), ci);
+        atomicMin((int*)(labels + global_id), ci);
       else if (sizeof(Index_) == 8)
-        atomicMin((long long int *)(labels + global_id), ci);
+        atomicMin((long long int*)(labels + global_id), ci);
       if (ci_allow_prop) *m = true;
     }
   }
 }
 
 template <typename Index_, int TPB_X = 256, typename Lambda>
-__global__ void weak_cc_init_all_kernel(Index_ *labels, Index_ N,
-                                        Index_ MAX_LABEL, Lambda filter_op) {
+__global__ void weak_cc_init_all_kernel(Index_* labels,
+                                        Index_ N,
+                                        Index_ MAX_LABEL,
+                                        Lambda filter_op)
+{
   Index_ tid = threadIdx.x + blockIdx.x * TPB_X;
   if (tid < N) {
     if (filter_op(tid))
@@ -123,22 +130,25 @@ __global__ void weak_cc_init_all_kernel(Index_ *labels, Index_ N,
  * @param filter_op an optional filtering function to determine which points
  * should get considered for labeling. It gets global indexes (not batch-wide!)
  */
-template <typename Index_, int TPB_X = 256,
-          typename Lambda = auto(Index_)->bool>
-void weak_cc_batched(Index_ *labels, const Index_ *row_ind,
-                     const Index_ *row_ind_ptr, Index_ nnz, Index_ N,
-                     Index_ start_vertex_id, Index_ batch_size,
-                     WeakCCState *state, cudaStream_t stream,
-                     Lambda filter_op) {
-  ASSERT(sizeof(Index_) == 4 || sizeof(Index_) == 8,
-         "Index_ should be 4 or 8 bytes");
+template <typename Index_, int TPB_X = 256, typename Lambda = auto(Index_)->bool>
+void weak_cc_batched(Index_* labels,
+                     const Index_* row_ind,
+                     const Index_* row_ind_ptr,
+                     Index_ nnz,
+                     Index_ N,
+                     Index_ start_vertex_id,
+                     Index_ batch_size,
+                     WeakCCState* state,
+                     cudaStream_t stream,
+                     Lambda filter_op)
+{
+  ASSERT(sizeof(Index_) == 4 || sizeof(Index_) == 8, "Index_ should be 4 or 8 bytes");
 
   bool host_m;
 
   Index_ MAX_LABEL = std::numeric_limits<Index_>::max();
   weak_cc_init_all_kernel<Index_, TPB_X>
-    <<<raft::ceildiv(N, Index_(TPB_X)), TPB_X, 0, stream>>>(
-      labels, N, MAX_LABEL, filter_op);
+    <<<raft::ceildiv(N, Index_(TPB_X)), TPB_X, 0, stream>>>(labels, N, MAX_LABEL, filter_op);
   CUDA_CHECK(cudaPeekAtLastError());
 
   int n_iters = 0;
@@ -147,8 +157,7 @@ void weak_cc_batched(Index_ *labels, const Index_ *row_ind,
 
     weak_cc_label_device<Index_, TPB_X>
       <<<raft::ceildiv(batch_size, Index_(TPB_X)), TPB_X, 0, stream>>>(
-        labels, row_ind, row_ind_ptr, nnz, state->m, start_vertex_id,
-        batch_size, N, filter_op);
+        labels, row_ind, row_ind_ptr, nnz, state->m, start_vertex_id, batch_size, N, filter_op);
     CUDA_CHECK(cudaPeekAtLastError());
 
     //** Updating m *
@@ -180,12 +189,25 @@ void weak_cc_batched(Index_ *labels, const Index_ *row_ind,
  * @param stream the cuda stream to use
  */
 template <typename Index_, int TPB_X = 256>
-void weak_cc_batched(Index_ *labels, const Index_ *row_ind,
-                     const Index_ *row_ind_ptr, Index_ nnz, Index_ N,
-                     Index_ start_vertex_id, Index_ batch_size,
-                     WeakCCState *state, cudaStream_t stream) {
-  weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, start_vertex_id,
-                  batch_size, state, stream,
+void weak_cc_batched(Index_* labels,
+                     const Index_* row_ind,
+                     const Index_* row_ind_ptr,
+                     Index_ nnz,
+                     Index_ N,
+                     Index_ start_vertex_id,
+                     Index_ batch_size,
+                     WeakCCState* state,
+                     cudaStream_t stream)
+{
+  weak_cc_batched(labels,
+                  row_ind,
+                  row_ind_ptr,
+                  nnz,
+                  N,
+                  start_vertex_id,
+                  batch_size,
+                  state,
+                  stream,
                   [] __device__(Index_ tid) { return true; });
 }
 
@@ -213,17 +235,20 @@ void weak_cc_batched(Index_ *labels, const Index_ *row_ind,
  * @param filter_op an optional filtering function to determine which points
  * should get considered for labeling. It gets global indexes (not batch-wide!)
  */
-template <typename Index_ = int, int TPB_X = 256,
-          typename Lambda = auto(Index_)->bool>
-void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr,
-             Index_ nnz, Index_ N,
+template <typename Index_ = int, int TPB_X = 256, typename Lambda = auto(Index_)->bool>
+void weak_cc(Index_* labels,
+             const Index_* row_ind,
+             const Index_* row_ind_ptr,
+             Index_ nnz,
+             Index_ N,
              std::shared_ptr<raft::mr::device::allocator> d_alloc,
-             cudaStream_t stream, Lambda filter_op) {
+             cudaStream_t stream,
+             Lambda filter_op)
+{
   raft::mr::device::buffer<bool> m(d_alloc, stream, 1);
 
   WeakCCState state(m.data());
-  weak_cc_batched<Index_, TPB_X>(labels, row_ind, row_ind_ptr, nnz, N, 0, N,
-                                 stream, filter_op);
+  weak_cc_batched<Index_, TPB_X>(labels, row_ind, row_ind_ptr, nnz, N, 0, N, &state, stream, filter_op);
 }
 
 /**
@@ -249,14 +274,18 @@ void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr,
  * @param stream the cuda stream to use
  */
 template <typename Index_, int TPB_X = 256>
-void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr,
-             Index_ nnz, Index_ N,
+void weak_cc(Index_* labels,
+             const Index_* row_ind,
+             const Index_* row_ind_ptr,
+             Index_ nnz,
+             Index_ N,
              std::shared_ptr<raft::mr::device::allocator> d_alloc,
-             cudaStream_t stream) {
+             cudaStream_t stream)
+{
   raft::mr::device::buffer<bool> m(d_alloc, stream, 1);
   WeakCCState state(m.data());
-  weak_cc_batched<Index_, TPB_X>(labels, row_ind, row_ind_ptr, nnz, N, 0, N,
-                                 stream, [](Index_) { return true; });
+  // Delegate to the unfiltered batched overload, which supplies the always-true device filter.
+  weak_cc_batched<Index_, TPB_X>(labels, row_ind, row_ind_ptr, nnz, N, 0, N, &state, stream);
 }
 
 };  // namespace sparse
diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h
index 360832f557..9d42ec34cb 100644
--- a/cpp/include/raft/sparse/cusparse_wrappers.h
+++ b/cpp/include/raft/sparse/cusparse_wrappers.h
@@ -23,10 +23,9 @@
 //#include <cuml/common/logger.hpp>
 
 #define _CUSPARSE_ERR_TO_STR(err) \
-  case err:                       \
-    return #err;
+  case err: return #err;
 
-//Notes:
+// Notes:
 //(1.) CUDA_VER_10_1_UP aggregates all the CUDA version selection logic;
 //(2.) to enforce a lower version,
 //
@@ -43,16 +42,15 @@ namespace raft {
  * @brief Exception thrown when a cuSparse error is encountered.
  */
 struct cusparse_error : public raft::exception {
-  explicit cusparse_error(char const* const message)
-    : raft::exception(message) {}
-  explicit cusparse_error(std::string const& message)
-    : raft::exception(message) {}
+  explicit cusparse_error(char const* const message) : raft::exception(message) {}
+  explicit cusparse_error(std::string const& message) : raft::exception(message) {}
 };
 
 namespace sparse {
 namespace detail {
 
-inline const char* cusparse_error_to_string(cusparseStatus_t err) {
+inline const char* cusparse_error_to_string(cusparseStatus_t err)
+{
 #if defined(CUDART_VERSION) && CUDART_VERSION >= 10100
   return cusparseGetErrorString(err);
 #else  // CUDART_VERSION
@@ -65,8 +63,7 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err) {
     _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_EXECUTION_FAILED);
     _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INTERNAL_ERROR);
     _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED);
-    default:
-      return "CUSPARSE_STATUS_UNKNOWN";
+    default: return "CUSPARSE_STATUS_UNKNOWN";
   };
 #endif  // CUDART_VERSION
 }
@@ -88,8 +85,11 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err) {
     cusparseStatus_t const status = (call);                                  \
     if (CUSPARSE_STATUS_SUCCESS != status) {                                 \
       std::string msg{};                                                     \
-      SET_ERROR_MSG(msg, "cuSparse error encountered at: ",                  \
-                    "call='%s', Reason=%d:%s", #call, status,                \
+      SET_ERROR_MSG(msg,                                                     \
+                    "cuSparse error encountered at: ",                       \
+                    "call='%s', Reason=%d:%s",                               \
+                    #call,                                                   \
+                    status,                                                  \
                     raft::sparse::detail::cusparse_error_to_string(status)); \
       throw raft::cusparse_error(msg);                                       \
     }                                                                        \
@@ -100,13 +100,15 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err) {
 
 //@todo: use logger here once logging is enabled
 /** check for cusparse runtime API errors but do not assert */
-#define CUSPARSE_CHECK_NO_THROW(call)                                  \
-  do {                                                                 \
-    cusparseStatus_t err = call;                                       \
-    if (err != CUSPARSE_STATUS_SUCCESS) {                              \
-      printf("CUSPARSE call='%s' got errorcode=%d err=%s", #call, err, \
-             raft::sparse::detail::cusparse_error_to_string(err));     \
-    }                                                                  \
+#define CUSPARSE_CHECK_NO_THROW(call)                              \
+  do {                                                             \
+    cusparseStatus_t err = call;                                   \
+    if (err != CUSPARSE_STATUS_SUCCESS) {                          \
+      printf("CUSPARSE call='%s' got errorcode=%d err=%s",         \
+             #call,                                                \
+             err,                                                  \
+             raft::sparse::detail::cusparse_error_to_string(err)); \
+    }                                                              \
   } while (0)
 
 namespace raft {
@@ -117,28 +119,34 @@ namespace sparse {
  * @{
  */
 template <typename T>
-cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz, const T* vals,
-                              T* vals_sorted, int* d_P, cudaStream_t stream);
+cusparseStatus_t cusparsegthr(
+  cusparseHandle_t handle, int nnz, const T* vals, T* vals_sorted, int* d_P, cudaStream_t stream);
 template <>
-inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz,
-                                     const double* vals, double* vals_sorted,
-                                     int* d_P, cudaStream_t stream) {
+inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle,
+                                     int nnz,
+                                     const double* vals,
+                                     double* vals_sorted,
+                                     int* d_P,
+                                     cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-  return cusparseDgthr(handle, nnz, vals, vals_sorted, d_P,
-                       CUSPARSE_INDEX_BASE_ZERO);
+  return cusparseDgthr(handle, nnz, vals, vals_sorted, d_P, CUSPARSE_INDEX_BASE_ZERO);
 #pragma GCC diagnostic pop
 }
 template <>
-inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz,
-                                     const float* vals, float* vals_sorted,
-                                     int* d_P, cudaStream_t stream) {
+inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle,
+                                     int nnz,
+                                     const float* vals,
+                                     float* vals_sorted,
+                                     int* d_P,
+                                     cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-  return cusparseSgthr(handle, nnz, vals, vals_sorted, d_P,
-                       CUSPARSE_INDEX_BASE_ZERO);
+  return cusparseSgthr(handle, nnz, vals, vals_sorted, d_P, CUSPARSE_INDEX_BASE_ZERO);
 #pragma GCC diagnostic pop
 }
 /** @} */
@@ -148,15 +156,18 @@ inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz,
  * @{
  */
 template <typename T>
-void cusparsecoo2csr(cusparseHandle_t handle, const T* cooRowInd, int nnz,
-                     int m, T* csrRowPtr, cudaStream_t stream);
+void cusparsecoo2csr(
+  cusparseHandle_t handle, const T* cooRowInd, int nnz, int m, T* csrRowPtr, cudaStream_t stream);
 template <>
-inline void cusparsecoo2csr(cusparseHandle_t handle, const int* cooRowInd,
-                            int nnz, int m, int* csrRowPtr,
-                            cudaStream_t stream) {
+inline void cusparsecoo2csr(cusparseHandle_t handle,
+                            const int* cooRowInd,
+                            int nnz,
+                            int m,
+                            int* csrRowPtr,
+                            cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  CUSPARSE_CHECK(cusparseXcoo2csr(handle, cooRowInd, nnz, m, csrRowPtr,
-                                  CUSPARSE_INDEX_BASE_ZERO));
+  CUSPARSE_CHECK(cusparseXcoo2csr(handle, cooRowInd, nnz, m, csrRowPtr, CUSPARSE_INDEX_BASE_ZERO));
 }
 /** @} */
 
@@ -166,30 +177,54 @@ inline void cusparsecoo2csr(cusparseHandle_t handle, const int* cooRowInd,
  */
 template <typename T>
 size_t cusparsecoosort_bufferSizeExt(  // NOLINT
-  cusparseHandle_t handle, int m, int n, int nnz, const T* cooRows,
-  const T* cooCols, cudaStream_t stream);
+  cusparseHandle_t handle,
+  int m,
+  int n,
+  int nnz,
+  const T* cooRows,
+  const T* cooCols,
+  cudaStream_t stream);
 template <>
 inline size_t cusparsecoosort_bufferSizeExt(  // NOLINT
-  cusparseHandle_t handle, int m, int n, int nnz, const int* cooRows,
-  const int* cooCols, cudaStream_t stream) {
+  cusparseHandle_t handle,
+  int m,
+  int n,
+  int nnz,
+  const int* cooRows,
+  const int* cooCols,
+  cudaStream_t stream)
+{
   size_t val;
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  CUSPARSE_CHECK(
-    cusparseXcoosort_bufferSizeExt(handle, m, n, nnz, cooRows, cooCols, &val));
+  CUSPARSE_CHECK(cusparseXcoosort_bufferSizeExt(handle, m, n, nnz, cooRows, cooCols, &val));
   return val;
 }
 
 template <typename T>
 void cusparsecoosortByRow(  // NOLINT
-  cusparseHandle_t handle, int m, int n, int nnz, T* cooRows, T* cooCols, T* P,
-  void* pBuffer, cudaStream_t stream);
+  cusparseHandle_t handle,
+  int m,
+  int n,
+  int nnz,
+  T* cooRows,
+  T* cooCols,
+  T* P,
+  void* pBuffer,
+  cudaStream_t stream);
 template <>
 inline void cusparsecoosortByRow(  // NOLINT
-  cusparseHandle_t handle, int m, int n, int nnz, int* cooRows, int* cooCols,
-  int* P, void* pBuffer, cudaStream_t stream) {
+  cusparseHandle_t handle,
+  int m,
+  int n,
+  int nnz,
+  int* cooRows,
+  int* cooCols,
+  int* P,
+  void* pBuffer,
+  cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  CUSPARSE_CHECK(
-    cusparseXcoosortByRow(handle, m, n, nnz, cooRows, cooCols, P, pBuffer));
+  CUSPARSE_CHECK(cusparseXcoosortByRow(handle, m, n, nnz, cooRows, cooCols, P, pBuffer));
 }
 /** @} */
 
@@ -199,37 +234,67 @@ inline void cusparsecoosortByRow(  // NOLINT
  */
 template <typename T>
 cusparseStatus_t cusparsegemmi(  // NOLINT
-  cusparseHandle_t handle, int m, int n, int k, int nnz, const T* alpha,
-  const T* A, int lda, const T* cscValB, const int* cscColPtrB,
-  const int* cscRowIndB, const T* beta, T* C, int ldc, cudaStream_t stream);
+  cusparseHandle_t handle,
+  int m,
+  int n,
+  int k,
+  int nnz,
+  const T* alpha,
+  const T* A,
+  int lda,
+  const T* cscValB,
+  const int* cscColPtrB,
+  const int* cscRowIndB,
+  const T* beta,
+  T* C,
+  int ldc,
+  cudaStream_t stream);
 template <>
-inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n,
-                                      int k, int nnz, const float* alpha,
-                                      const float* A, int lda,
+inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle,
+                                      int m,
+                                      int n,
+                                      int k,
+                                      int nnz,
+                                      const float* alpha,
+                                      const float* A,
+                                      int lda,
                                       const float* cscValB,
                                       const int* cscColPtrB,
-                                      const int* cscRowIndB, const float* beta,
-                                      float* C, int ldc, cudaStream_t stream) {
+                                      const int* cscRowIndB,
+                                      const float* beta,
+                                      float* C,
+                                      int ldc,
+                                      cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-  return cusparseSgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB,
-                        cscColPtrB, cscRowIndB, beta, C, ldc);
+  return cusparseSgemmi(
+    handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc);
 #pragma GCC diagnostic pop
 }
 template <>
-inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n,
-                                      int k, int nnz, const double* alpha,
-                                      const double* A, int lda,
+inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle,
+                                      int m,
+                                      int n,
+                                      int k,
+                                      int nnz,
+                                      const double* alpha,
+                                      const double* A,
+                                      int lda,
                                       const double* cscValB,
                                       const int* cscColPtrB,
-                                      const int* cscRowIndB, const double* beta,
-                                      double* C, int ldc, cudaStream_t stream) {
+                                      const int* cscRowIndB,
+                                      const double* beta,
+                                      double* C,
+                                      int ldc,
+                                      cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-  return cusparseDgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB,
-                        cscColPtrB, cscRowIndB, beta, C, ldc);
+  return cusparseDgemmi(
+    handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc);
 #pragma GCC diagnostic pop
 }
 /** @} */
@@ -241,49 +306,94 @@ inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n,
  */
 template <typename IndexT, typename ValueT>
 cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr,
-                                   int64_t rows, int64_t cols, int64_t nnz,
-                                   IndexT* csrRowOffsets, IndexT* csrColInd,
+                                   int64_t rows,
+                                   int64_t cols,
+                                   int64_t nnz,
+                                   IndexT* csrRowOffsets,
+                                   IndexT* csrColInd,
                                    ValueT* csrValues);
 template <>
 inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr,
-                                          int64_t rows, int64_t cols,
-                                          int64_t nnz, int* csrRowOffsets,
-                                          int* csrColInd, float* csrValues) {
-  return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets,
-                           csrColInd, csrValues, CUSPARSE_INDEX_32I,
-                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO,
+                                          int64_t rows,
+                                          int64_t cols,
+                                          int64_t nnz,
+                                          int* csrRowOffsets,
+                                          int* csrColInd,
+                                          float* csrValues)
+{
+  return cusparseCreateCsr(spMatDescr,
+                           rows,
+                           cols,
+                           nnz,
+                           csrRowOffsets,
+                           csrColInd,
+                           csrValues,
+                           CUSPARSE_INDEX_32I,
+                           CUSPARSE_INDEX_32I,
+                           CUSPARSE_INDEX_BASE_ZERO,
                            CUDA_R_32F);
 }
 template <>
 inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr,
-                                          int64_t rows, int64_t cols,
-                                          int64_t nnz, int* csrRowOffsets,
-                                          int* csrColInd, double* csrValues) {
-  return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets,
-                           csrColInd, csrValues, CUSPARSE_INDEX_32I,
-                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO,
+                                          int64_t rows,
+                                          int64_t cols,
+                                          int64_t nnz,
+                                          int* csrRowOffsets,
+                                          int* csrColInd,
+                                          double* csrValues)
+{
+  return cusparseCreateCsr(spMatDescr,
+                           rows,
+                           cols,
+                           nnz,
+                           csrRowOffsets,
+                           csrColInd,
+                           csrValues,
+                           CUSPARSE_INDEX_32I,
+                           CUSPARSE_INDEX_32I,
+                           CUSPARSE_INDEX_BASE_ZERO,
                            CUDA_R_64F);
 }
 template <>
 inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr,
-                                          int64_t rows, int64_t cols,
-                                          int64_t nnz, int64_t* csrRowOffsets,
+                                          int64_t rows,
+                                          int64_t cols,
+                                          int64_t nnz,
+                                          int64_t* csrRowOffsets,
                                           int64_t* csrColInd,
-                                          float* csrValues) {
-  return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets,
-                           csrColInd, csrValues, CUSPARSE_INDEX_64I,
-                           CUSPARSE_INDEX_64I, CUSPARSE_INDEX_BASE_ZERO,
+                                          float* csrValues)
+{
+  return cusparseCreateCsr(spMatDescr,
+                           rows,
+                           cols,
+                           nnz,
+                           csrRowOffsets,
+                           csrColInd,
+                           csrValues,
+                           CUSPARSE_INDEX_64I,
+                           CUSPARSE_INDEX_64I,
+                           CUSPARSE_INDEX_BASE_ZERO,
                            CUDA_R_32F);
 }
 template <>
 inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr,
-                                          int64_t rows, int64_t cols,
-                                          int64_t nnz, int64_t* csrRowOffsets,
+                                          int64_t rows,
+                                          int64_t cols,
+                                          int64_t nnz,
+                                          int64_t* csrRowOffsets,
                                           int64_t* csrColInd,
-                                          double* csrValues) {
-  return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets,
-                           csrColInd, csrValues, CUSPARSE_INDEX_64I,
-                           CUSPARSE_INDEX_64I, CUSPARSE_INDEX_BASE_ZERO,
+                                          double* csrValues)
+{
+  return cusparseCreateCsr(spMatDescr,
+                           rows,
+                           cols,
+                           nnz,
+                           csrRowOffsets,
+                           csrColInd,
+                           csrValues,
+                           CUSPARSE_INDEX_64I,
+                           CUSPARSE_INDEX_64I,
+                           CUSPARSE_INDEX_BASE_ZERO,
                            CUDA_R_64F);
 }
 /** @} */
@@ -292,16 +402,19 @@ inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr,
  * @{
  */
 template <typename T>
-cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr,
-                                     int64_t size, T* values);
+cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, int64_t size, T* values);
 template <>
 inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr,
-                                            int64_t size, float* values) {
+                                            int64_t size,
+                                            float* values)
+{
   return cusparseCreateDnVec(dnVecDescr, size, values, CUDA_R_32F);
 }
 template <>
 inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr,
-                                            int64_t size, double* values) {
+                                            int64_t size,
+                                            double* values)
+{
   return cusparseCreateDnVec(dnVecDescr, size, values, CUDA_R_64F);
 }
 /** @} */
@@ -312,23 +425,30 @@ inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr,
  */
 template <typename T>
 cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr,
-                                     int64_t rows, int64_t cols, int64_t ld,
-                                     T* values, cusparseOrder_t order);
+                                     int64_t rows,
+                                     int64_t cols,
+                                     int64_t ld,
+                                     T* values,
+                                     cusparseOrder_t order);
 template <>
 inline cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr,
-                                            int64_t rows, int64_t cols,
-                                            int64_t ld, float* values,
-                                            cusparseOrder_t order) {
-  return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_32F,
-                             order);
+                                            int64_t rows,
+                                            int64_t cols,
+                                            int64_t ld,
+                                            float* values,
+                                            cusparseOrder_t order)
+{
+  return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_32F, order);
 }
 template <>
 inline cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr,
-                                            int64_t rows, int64_t cols,
-                                            int64_t ld, double* values,
-                                            cusparseOrder_t order) {
-  return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_64F,
-                             order);
+                                            int64_t rows,
+                                            int64_t cols,
+                                            int64_t ld,
+                                            double* values,
+                                            cusparseOrder_t order)
+{
+  return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_64F, order);
 }
 /** @} */
 
@@ -337,58 +457,89 @@ inline cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr,
  * @{
  */
 template <typename T>
-cusparseStatus_t cusparsespmv_buffersize(
-  cusparseHandle_t handle, cusparseOperation_t opA, const T* alpha,
-  const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX,
-  const T* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg,
-  size_t* bufferSize, cudaStream_t stream);
+cusparseStatus_t cusparsespmv_buffersize(cusparseHandle_t handle,
+                                         cusparseOperation_t opA,
+                                         const T* alpha,
+                                         const cusparseSpMatDescr_t matA,
+                                         const cusparseDnVecDescr_t vecX,
+                                         const T* beta,
+                                         const cusparseDnVecDescr_t vecY,
+                                         cusparseSpMVAlg_t alg,
+                                         size_t* bufferSize,
+                                         cudaStream_t stream);
 template <>
-inline cusparseStatus_t cusparsespmv_buffersize(
-  cusparseHandle_t handle, cusparseOperation_t opA, const float* alpha,
-  const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX,
-  const float* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg,
-  size_t* bufferSize, cudaStream_t stream) {
+inline cusparseStatus_t cusparsespmv_buffersize(cusparseHandle_t handle,
+                                                cusparseOperation_t opA,
+                                                const float* alpha,
+                                                const cusparseSpMatDescr_t matA,
+                                                const cusparseDnVecDescr_t vecX,
+                                                const float* beta,
+                                                const cusparseDnVecDescr_t vecY,
+                                                cusparseSpMVAlg_t alg,
+                                                size_t* bufferSize,
+                                                cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseSpMV_bufferSize(handle, opA, alpha, matA, vecX, beta, vecY,
-                                 CUDA_R_32F, alg, bufferSize);
+  return cusparseSpMV_bufferSize(
+    handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_32F, alg, bufferSize);
 }
 template <>
-inline cusparseStatus_t cusparsespmv_buffersize(
-  cusparseHandle_t handle, cusparseOperation_t opA, const double* alpha,
-  const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX,
-  const double* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg,
-  size_t* bufferSize, cudaStream_t stream) {
+inline cusparseStatus_t cusparsespmv_buffersize(cusparseHandle_t handle,
+                                                cusparseOperation_t opA,
+                                                const double* alpha,
+                                                const cusparseSpMatDescr_t matA,
+                                                const cusparseDnVecDescr_t vecX,
+                                                const double* beta,
+                                                const cusparseDnVecDescr_t vecY,
+                                                cusparseSpMVAlg_t alg,
+                                                size_t* bufferSize,
+                                                cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseSpMV_bufferSize(handle, opA, alpha, matA, vecX, beta, vecY,
-                                 CUDA_R_64F, alg, bufferSize);
+  return cusparseSpMV_bufferSize(
+    handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_64F, alg, bufferSize);
 }
 
 template <typename T>
-cusparseStatus_t cusparsespmv(cusparseHandle_t handle, cusparseOperation_t opA,
-                              const T* alpha, const cusparseSpMatDescr_t matA,
-                              const cusparseDnVecDescr_t vecX, const T* beta,
+cusparseStatus_t cusparsespmv(cusparseHandle_t handle,
+                              cusparseOperation_t opA,
+                              const T* alpha,
+                              const cusparseSpMatDescr_t matA,
+                              const cusparseDnVecDescr_t vecX,
+                              const T* beta,
                               const cusparseDnVecDescr_t vecY,
-                              cusparseSpMVAlg_t alg, T* externalBuffer,
+                              cusparseSpMVAlg_t alg,
+                              T* externalBuffer,
                               cudaStream_t stream);
 template <>
-inline cusparseStatus_t cusparsespmv(
-  cusparseHandle_t handle, cusparseOperation_t opA, const float* alpha,
-  const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX,
-  const float* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg,
-  float* externalBuffer, cudaStream_t stream) {
+inline cusparseStatus_t cusparsespmv(cusparseHandle_t handle,
+                                     cusparseOperation_t opA,
+                                     const float* alpha,
+                                     const cusparseSpMatDescr_t matA,
+                                     const cusparseDnVecDescr_t vecX,
+                                     const float* beta,
+                                     const cusparseDnVecDescr_t vecY,
+                                     cusparseSpMVAlg_t alg,
+                                     float* externalBuffer,
+                                     cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_32F,
-                      alg, externalBuffer);
+  return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_32F, alg, externalBuffer);
 }
 template <>
-inline cusparseStatus_t cusparsespmv(
-  cusparseHandle_t handle, cusparseOperation_t opA, const double* alpha,
-  const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX,
-  const double* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg,
-  double* externalBuffer, cudaStream_t stream) {
+inline cusparseStatus_t cusparsespmv(cusparseHandle_t handle,
+                                     cusparseOperation_t opA,
+                                     const double* alpha,
+                                     const cusparseSpMatDescr_t matA,
+                                     const cusparseDnVecDescr_t vecX,
+                                     const double* beta,
+                                     const cusparseDnVecDescr_t vecY,
+                                     cusparseSpMVAlg_t alg,
+                                     double* externalBuffer,
+                                     cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_64F,
-                      alg, externalBuffer);
+  return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_64F, alg, externalBuffer);
 }
 /** @} */
 #else
@@ -398,29 +549,59 @@ inline cusparseStatus_t cusparsespmv(
  */
 template <typename T>
 cusparseStatus_t cusparsecsrmv(  // NOLINT
-  cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz,
-  const T* alpha, const cusparseMatDescr_t descr, const T* csrVal,
-  const int* csrRowPtr, const int* csrColInd, const T* x, const T* beta, T* y,
+  cusparseHandle_t handle,
+  cusparseOperation_t trans,
+  int m,
+  int n,
+  int nnz,
+  const T* alpha,
+  const cusparseMatDescr_t descr,
+  const T* csrVal,
+  const int* csrRowPtr,
+  const int* csrColInd,
+  const T* x,
+  const T* beta,
+  T* y,
   cudaStream_t stream);
 template <>
-inline cusparseStatus_t cusparsecsrmv(
-  cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz,
-  const float* alpha, const cusparseMatDescr_t descr, const float* csrVal,
-  const int* csrRowPtr, const int* csrColInd, const float* x, const float* beta,
-  float* y, cudaStream_t stream) {
+inline cusparseStatus_t cusparsecsrmv(cusparseHandle_t handle,
+                                      cusparseOperation_t trans,
+                                      int m,
+                                      int n,
+                                      int nnz,
+                                      const float* alpha,
+                                      const cusparseMatDescr_t descr,
+                                      const float* csrVal,
+                                      const int* csrRowPtr,
+                                      const int* csrColInd,
+                                      const float* x,
+                                      const float* beta,
+                                      float* y,
+                                      cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseScsrmv(handle, trans, m, n, nnz, alpha, descr, csrVal,
-                        csrRowPtr, csrColInd, x, beta, y);
+  return cusparseScsrmv(
+    handle, trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y);
 }
 template <>
-inline cusparseStatus_t cusparsecsrmv(
-  cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz,
-  const double* alpha, const cusparseMatDescr_t descr, const double* csrVal,
-  const int* csrRowPtr, const int* csrColInd, const double* x,
-  const double* beta, double* y, cudaStream_t stream) {
+inline cusparseStatus_t cusparsecsrmv(cusparseHandle_t handle,
+                                      cusparseOperation_t trans,
+                                      int m,
+                                      int n,
+                                      int nnz,
+                                      const double* alpha,
+                                      const cusparseMatDescr_t descr,
+                                      const double* csrVal,
+                                      const int* csrRowPtr,
+                                      const int* csrColInd,
+                                      const double* x,
+                                      const double* beta,
+                                      double* y,
+                                      cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseDcsrmv(handle, trans, m, n, nnz, alpha, descr, csrVal,
-                        csrRowPtr, csrColInd, x, beta, y);
+  return cusparseDcsrmv(
+    handle, trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y);
 }
 /** @} */
 #endif
@@ -431,58 +612,96 @@ inline cusparseStatus_t cusparsecsrmv(
  * @{
  */
 template <typename T>
-cusparseStatus_t cusparsespmm_bufferSize(
-  cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-  const T* alpha, const cusparseSpMatDescr_t matA,
-  const cusparseDnMatDescr_t matB, const T* beta, cusparseDnMatDescr_t matC,
-  cusparseSpMMAlg_t alg, size_t* bufferSize, cudaStream_t stream);
+cusparseStatus_t cusparsespmm_bufferSize(cusparseHandle_t handle,
+                                         cusparseOperation_t opA,
+                                         cusparseOperation_t opB,
+                                         const T* alpha,
+                                         const cusparseSpMatDescr_t matA,
+                                         const cusparseDnMatDescr_t matB,
+                                         const T* beta,
+                                         cusparseDnMatDescr_t matC,
+                                         cusparseSpMMAlg_t alg,
+                                         size_t* bufferSize,
+                                         cudaStream_t stream);
 template <>
-inline cusparseStatus_t cusparsespmm_bufferSize(
-  cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-  const float* alpha, const cusparseSpMatDescr_t matA,
-  const cusparseDnMatDescr_t matB, const float* beta, cusparseDnMatDescr_t matC,
-  cusparseSpMMAlg_t alg, size_t* bufferSize, cudaStream_t stream) {
+inline cusparseStatus_t cusparsespmm_bufferSize(cusparseHandle_t handle,
+                                                cusparseOperation_t opA,
+                                                cusparseOperation_t opB,
+                                                const float* alpha,
+                                                const cusparseSpMatDescr_t matA,
+                                                const cusparseDnMatDescr_t matB,
+                                                const float* beta,
+                                                cusparseDnMatDescr_t matC,
+                                                cusparseSpMMAlg_t alg,
+                                                size_t* bufferSize,
+                                                cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseSpMM_bufferSize(handle, opA, opB, alpha, matA, matB, beta,
-                                 matC, CUDA_R_32F, alg, bufferSize);
+  return cusparseSpMM_bufferSize(
+    handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_32F, alg, bufferSize);
 }
 template <>
-inline cusparseStatus_t cusparsespmm_bufferSize(
-  cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-  const double* alpha, const cusparseSpMatDescr_t matA,
-  const cusparseDnMatDescr_t matB, const double* beta,
-  cusparseDnMatDescr_t matC, cusparseSpMMAlg_t alg, size_t* bufferSize,
-  cudaStream_t stream) {
+inline cusparseStatus_t cusparsespmm_bufferSize(cusparseHandle_t handle,
+                                                cusparseOperation_t opA,
+                                                cusparseOperation_t opB,
+                                                const double* alpha,
+                                                const cusparseSpMatDescr_t matA,
+                                                const cusparseDnMatDescr_t matB,
+                                                const double* beta,
+                                                cusparseDnMatDescr_t matC,
+                                                cusparseSpMMAlg_t alg,
+                                                size_t* bufferSize,
+                                                cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseSpMM_bufferSize(handle, opA, opB, alpha, matA, matB, beta,
-                                 matC, CUDA_R_64F, alg, bufferSize);
+  return cusparseSpMM_bufferSize(
+    handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_64F, alg, bufferSize);
 }
 template <typename T>
-inline cusparseStatus_t cusparsespmm(
-  cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-  const T* alpha, const cusparseSpMatDescr_t matA,
-  const cusparseDnMatDescr_t matB, const T* beta, cusparseDnMatDescr_t matC,
-  cusparseSpMMAlg_t alg, T* externalBuffer, cudaStream_t stream);
+inline cusparseStatus_t cusparsespmm(cusparseHandle_t handle,
+                                     cusparseOperation_t opA,
+                                     cusparseOperation_t opB,
+                                     const T* alpha,
+                                     const cusparseSpMatDescr_t matA,
+                                     const cusparseDnMatDescr_t matB,
+                                     const T* beta,
+                                     cusparseDnMatDescr_t matC,
+                                     cusparseSpMMAlg_t alg,
+                                     T* externalBuffer,
+                                     cudaStream_t stream);
 template <>
-inline cusparseStatus_t cusparsespmm(
-  cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-  const float* alpha, const cusparseSpMatDescr_t matA,
-  const cusparseDnMatDescr_t matB, const float* beta, cusparseDnMatDescr_t matC,
-  cusparseSpMMAlg_t alg, float* externalBuffer, cudaStream_t stream) {
+inline cusparseStatus_t cusparsespmm(cusparseHandle_t handle,
+                                     cusparseOperation_t opA,
+                                     cusparseOperation_t opB,
+                                     const float* alpha,
+                                     const cusparseSpMatDescr_t matA,
+                                     const cusparseDnMatDescr_t matB,
+                                     const float* beta,
+                                     cusparseDnMatDescr_t matC,
+                                     cusparseSpMMAlg_t alg,
+                                     float* externalBuffer,
+                                     cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseSpMM(handle, opA, opB, alpha, matA, matB, beta, matC,
-                      CUDA_R_32F, alg, externalBuffer);
+  return cusparseSpMM(
+    handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_32F, alg, externalBuffer);
 }
 template <>
-inline cusparseStatus_t cusparsespmm(
-  cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-  const double* alpha, const cusparseSpMatDescr_t matA,
-  const cusparseDnMatDescr_t matB, const double* beta,
-  cusparseDnMatDescr_t matC, cusparseSpMMAlg_t alg, double* externalBuffer,
-  cudaStream_t stream) {
+inline cusparseStatus_t cusparsespmm(cusparseHandle_t handle,
+                                     cusparseOperation_t opA,
+                                     cusparseOperation_t opB,
+                                     const double* alpha,
+                                     const cusparseSpMatDescr_t matA,
+                                     const cusparseDnMatDescr_t matB,
+                                     const double* beta,
+                                     cusparseDnMatDescr_t matC,
+                                     cusparseSpMMAlg_t alg,
+                                     double* externalBuffer,
+                                     cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseSpMM(handle, opA, opB, alpha, matA, matB, beta, matC,
-                      CUDA_R_64F, alg, externalBuffer);
+  return cusparseSpMM(
+    handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_64F, alg, externalBuffer);
 }
 /** @} */
 #else
@@ -492,31 +711,68 @@ inline cusparseStatus_t cusparsespmm(
  */
 template <typename T>
 cusparseStatus_t cusparsecsrmm(  // NOLINT
-  cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k,
-  int nnz, const T* alpha, const cusparseMatDescr_t descr, const T* csrVal,
-  const int* csrRowPtr, const int* csrColInd, const T* x, const int ldx,
-  const T* beta, T* y, const int ldy, cudaStream_t stream);
+  cusparseHandle_t handle,
+  cusparseOperation_t trans,
+  int m,
+  int n,
+  int k,
+  int nnz,
+  const T* alpha,
+  const cusparseMatDescr_t descr,
+  const T* csrVal,
+  const int* csrRowPtr,
+  const int* csrColInd,
+  const T* x,
+  const int ldx,
+  const T* beta,
+  T* y,
+  const int ldy,
+  cudaStream_t stream);
 template <>
-inline cusparseStatus_t cusparsecsrmm(
-  cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k,
-  int nnz, const float* alpha, const cusparseMatDescr_t descr,
-  const float* csrVal, const int* csrRowPtr, const int* csrColInd,
-  const float* x, const int ldx, const float* beta, float* y, const int ldy,
-  cudaStream_t stream) {
+inline cusparseStatus_t cusparsecsrmm(cusparseHandle_t handle,
+                                      cusparseOperation_t trans,
+                                      int m,
+                                      int n,
+                                      int k,
+                                      int nnz,
+                                      const float* alpha,
+                                      const cusparseMatDescr_t descr,
+                                      const float* csrVal,
+                                      const int* csrRowPtr,
+                                      const int* csrColInd,
+                                      const float* x,
+                                      const int ldx,
+                                      const float* beta,
+                                      float* y,
+                                      const int ldy,
+                                      cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseScsrmm(handle, trans, m, n, k, nnz, alpha, descr, csrVal,
-                        csrRowPtr, csrColInd, x, ldx, beta, y, ldy);
+  return cusparseScsrmm(
+    handle, trans, m, n, k, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, ldx, beta, y, ldy);
 }
 template <>
-inline cusparseStatus_t cusparsecsrmm(
-  cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k,
-  int nnz, const double* alpha, const cusparseMatDescr_t descr,
-  const double* csrVal, const int* csrRowPtr, const int* csrColInd,
-  const double* x, const int ldx, const double* beta, double* y, const int ldy,
-  cudaStream_t stream) {
+inline cusparseStatus_t cusparsecsrmm(cusparseHandle_t handle,
+                                      cusparseOperation_t trans,
+                                      int m,
+                                      int n,
+                                      int k,
+                                      int nnz,
+                                      const double* alpha,
+                                      const cusparseMatDescr_t descr,
+                                      const double* csrVal,
+                                      const int* csrRowPtr,
+                                      const int* csrColInd,
+                                      const double* x,
+                                      const int ldx,
+                                      const double* beta,
+                                      double* y,
+                                      const int ldy,
+                                      cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseDcsrmm(handle, trans, m, n, k, nnz, alpha, descr, csrVal,
-                        csrRowPtr, csrColInd, x, ldx, beta, y, ldy);
+  return cusparseDcsrmm(
+    handle, trans, m, n, k, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, ldx, beta, y, ldy);
 }
 /** @} */
 #endif
@@ -527,15 +783,22 @@ inline cusparseStatus_t cusparsecsrmm(
  */
 template <typename T>
 void cusparsecsr2coo(  // NOLINT
-  cusparseHandle_t handle, const int n, const int nnz, const T* csrRowPtr,
-  T* cooRowInd, cudaStream_t stream);
+  cusparseHandle_t handle,
+  const int n,
+  const int nnz,
+  const T* csrRowPtr,
+  T* cooRowInd,
+  cudaStream_t stream);
 template <>
-inline void cusparsecsr2coo(cusparseHandle_t handle, const int n, const int nnz,
-                            const int* csrRowPtr, int* cooRowInd,
-                            cudaStream_t stream) {
+inline void cusparsecsr2coo(cusparseHandle_t handle,
+                            const int n,
+                            const int nnz,
+                            const int* csrRowPtr,
+                            int* cooRowInd,
+                            cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  CUSPARSE_CHECK(cusparseXcsr2coo(handle, csrRowPtr, nnz, n, cooRowInd,
-                                  CUSPARSE_INDEX_BASE_ZERO));
+  CUSPARSE_CHECK(cusparseXcsr2coo(handle, csrRowPtr, nnz, n, cooRowInd, CUSPARSE_INDEX_BASE_ZERO));
 }
 /** @} */
 
@@ -553,7 +816,8 @@ inline void cusparsecsr2coo(cusparseHandle_t handle, const int n, const int nnz,
 // template<>
 inline cusparseStatus_t cusparsesetpointermode(cusparseHandle_t handle,
                                                cusparsePointerMode_t mode,
-                                               cudaStream_t stream) {
+                                               cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
   return cusparseSetPointerMode(handle, mode);
 }
@@ -564,69 +828,203 @@ inline cusparseStatus_t cusparsesetpointermode(cusparseHandle_t handle,
  * @{
  */
 template <typename T>
-cusparseStatus_t cusparsecsrmvex_bufferSize(
-  cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
-  int m, int n, int nnz, const T* alpha, const cusparseMatDescr_t descrA,
-  const T* csrValA, const int* csrRowPtrA, const int* csrColIndA, const T* x,
-  const T* beta, T* y, size_t* bufferSizeInBytes, cudaStream_t stream);
-template <>
-inline cusparseStatus_t cusparsecsrmvex_bufferSize(
-  cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
-  int m, int n, int nnz, const float* alpha, const cusparseMatDescr_t descrA,
-  const float* csrValA, const int* csrRowPtrA, const int* csrColIndA,
-  const float* x, const float* beta, float* y, size_t* bufferSizeInBytes,
-  cudaStream_t stream) {
-  CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseCsrmvEx_bufferSize(
-    handle, alg, transA, m, n, nnz, alpha, CUDA_R_32F, descrA, csrValA,
-    CUDA_R_32F, csrRowPtrA, csrColIndA, x, CUDA_R_32F, beta, CUDA_R_32F, y,
-    CUDA_R_32F, CUDA_R_32F, bufferSizeInBytes);
-}
-template <>
-inline cusparseStatus_t cusparsecsrmvex_bufferSize(
-  cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
-  int m, int n, int nnz, const double* alpha, const cusparseMatDescr_t descrA,
-  const double* csrValA, const int* csrRowPtrA, const int* csrColIndA,
-  const double* x, const double* beta, double* y, size_t* bufferSizeInBytes,
-  cudaStream_t stream) {
-  CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseCsrmvEx_bufferSize(
-    handle, alg, transA, m, n, nnz, alpha, CUDA_R_64F, descrA, csrValA,
-    CUDA_R_64F, csrRowPtrA, csrColIndA, x, CUDA_R_64F, beta, CUDA_R_64F, y,
-    CUDA_R_64F, CUDA_R_64F, bufferSizeInBytes);
+cusparseStatus_t cusparsecsrmvex_bufferSize(cusparseHandle_t handle,
+                                            cusparseAlgMode_t alg,
+                                            cusparseOperation_t transA,
+                                            int m,
+                                            int n,
+                                            int nnz,
+                                            const T* alpha,
+                                            const cusparseMatDescr_t descrA,
+                                            const T* csrValA,
+                                            const int* csrRowPtrA,
+                                            const int* csrColIndA,
+                                            const T* x,
+                                            const T* beta,
+                                            T* y,
+                                            size_t* bufferSizeInBytes,
+                                            cudaStream_t stream);
+template <>
+inline cusparseStatus_t cusparsecsrmvex_bufferSize(cusparseHandle_t handle,
+                                                   cusparseAlgMode_t alg,
+                                                   cusparseOperation_t transA,
+                                                   int m,
+                                                   int n,
+                                                   int nnz,
+                                                   const float* alpha,
+                                                   const cusparseMatDescr_t descrA,
+                                                   const float* csrValA,
+                                                   const int* csrRowPtrA,
+                                                   const int* csrColIndA,
+                                                   const float* x,
+                                                   const float* beta,
+                                                   float* y,
+                                                   size_t* bufferSizeInBytes,
+                                                   cudaStream_t stream)
+{
+  CUSPARSE_CHECK(cusparseSetStream(handle, stream));
+  return cusparseCsrmvEx_bufferSize(handle,
+                                    alg,
+                                    transA,
+                                    m,
+                                    n,
+                                    nnz,
+                                    alpha,
+                                    CUDA_R_32F,
+                                    descrA,
+                                    csrValA,
+                                    CUDA_R_32F,
+                                    csrRowPtrA,
+                                    csrColIndA,
+                                    x,
+                                    CUDA_R_32F,
+                                    beta,
+                                    CUDA_R_32F,
+                                    y,
+                                    CUDA_R_32F,
+                                    CUDA_R_32F,
+                                    bufferSizeInBytes);
+}
+template <>
+inline cusparseStatus_t cusparsecsrmvex_bufferSize(cusparseHandle_t handle,
+                                                   cusparseAlgMode_t alg,
+                                                   cusparseOperation_t transA,
+                                                   int m,
+                                                   int n,
+                                                   int nnz,
+                                                   const double* alpha,
+                                                   const cusparseMatDescr_t descrA,
+                                                   const double* csrValA,
+                                                   const int* csrRowPtrA,
+                                                   const int* csrColIndA,
+                                                   const double* x,
+                                                   const double* beta,
+                                                   double* y,
+                                                   size_t* bufferSizeInBytes,
+                                                   cudaStream_t stream)
+{
+  CUSPARSE_CHECK(cusparseSetStream(handle, stream));
+  return cusparseCsrmvEx_bufferSize(handle,
+                                    alg,
+                                    transA,
+                                    m,
+                                    n,
+                                    nnz,
+                                    alpha,
+                                    CUDA_R_64F,
+                                    descrA,
+                                    csrValA,
+                                    CUDA_R_64F,
+                                    csrRowPtrA,
+                                    csrColIndA,
+                                    x,
+                                    CUDA_R_64F,
+                                    beta,
+                                    CUDA_R_64F,
+                                    y,
+                                    CUDA_R_64F,
+                                    CUDA_R_64F,
+                                    bufferSizeInBytes);
 }
 
 template <typename T>
-cusparseStatus_t cusparsecsrmvex(
-  cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
-  int m, int n, int nnz, const T* alpha, const cusparseMatDescr_t descrA,
-  const T* csrValA, const int* csrRowPtrA, const int* csrColIndA, const T* x,
-  const T* beta, T* y, T* buffer, cudaStream_t stream);
-template <>
-inline cusparseStatus_t cusparsecsrmvex(
-  cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
-  int m, int n, int nnz, const float* alpha, const cusparseMatDescr_t descrA,
-  const float* csrValA, const int* csrRowPtrA, const int* csrColIndA,
-  const float* x, const float* beta, float* y, float* buffer,
-  cudaStream_t stream) {
-  CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseCsrmvEx(handle, alg, transA, m, n, nnz, alpha, CUDA_R_32F,
-                         descrA, csrValA, CUDA_R_32F, csrRowPtrA, csrColIndA, x,
-                         CUDA_R_32F, beta, CUDA_R_32F, y, CUDA_R_32F,
-                         CUDA_R_32F, buffer);
-}
-template <>
-inline cusparseStatus_t cusparsecsrmvex(
-  cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
-  int m, int n, int nnz, const double* alpha, const cusparseMatDescr_t descrA,
-  const double* csrValA, const int* csrRowPtrA, const int* csrColIndA,
-  const double* x, const double* beta, double* y, double* buffer,
-  cudaStream_t stream) {
-  CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseCsrmvEx(handle, alg, transA, m, n, nnz, alpha, CUDA_R_64F,
-                         descrA, csrValA, CUDA_R_64F, csrRowPtrA, csrColIndA, x,
-                         CUDA_R_64F, beta, CUDA_R_64F, y, CUDA_R_64F,
-                         CUDA_R_64F, buffer);
+cusparseStatus_t cusparsecsrmvex(cusparseHandle_t handle,
+                                 cusparseAlgMode_t alg,
+                                 cusparseOperation_t transA,
+                                 int m,
+                                 int n,
+                                 int nnz,
+                                 const T* alpha,
+                                 const cusparseMatDescr_t descrA,
+                                 const T* csrValA,
+                                 const int* csrRowPtrA,
+                                 const int* csrColIndA,
+                                 const T* x,
+                                 const T* beta,
+                                 T* y,
+                                 T* buffer,
+                                 cudaStream_t stream);
+template <>
+inline cusparseStatus_t cusparsecsrmvex(cusparseHandle_t handle,
+                                        cusparseAlgMode_t alg,
+                                        cusparseOperation_t transA,
+                                        int m,
+                                        int n,
+                                        int nnz,
+                                        const float* alpha,
+                                        const cusparseMatDescr_t descrA,
+                                        const float* csrValA,
+                                        const int* csrRowPtrA,
+                                        const int* csrColIndA,
+                                        const float* x,
+                                        const float* beta,
+                                        float* y,
+                                        float* buffer,
+                                        cudaStream_t stream)
+{
+  CUSPARSE_CHECK(cusparseSetStream(handle, stream));
+  return cusparseCsrmvEx(handle,
+                         alg,
+                         transA,
+                         m,
+                         n,
+                         nnz,
+                         alpha,
+                         CUDA_R_32F,
+                         descrA,
+                         csrValA,
+                         CUDA_R_32F,
+                         csrRowPtrA,
+                         csrColIndA,
+                         x,
+                         CUDA_R_32F,
+                         beta,
+                         CUDA_R_32F,
+                         y,
+                         CUDA_R_32F,
+                         CUDA_R_32F,
+                         buffer);
+}
+template <>
+inline cusparseStatus_t cusparsecsrmvex(cusparseHandle_t handle,
+                                        cusparseAlgMode_t alg,
+                                        cusparseOperation_t transA,
+                                        int m,
+                                        int n,
+                                        int nnz,
+                                        const double* alpha,
+                                        const cusparseMatDescr_t descrA,
+                                        const double* csrValA,
+                                        const int* csrRowPtrA,
+                                        const int* csrColIndA,
+                                        const double* x,
+                                        const double* beta,
+                                        double* y,
+                                        double* buffer,
+                                        cudaStream_t stream)
+{
+  CUSPARSE_CHECK(cusparseSetStream(handle, stream));
+  return cusparseCsrmvEx(handle,
+                         alg,
+                         transA,
+                         m,
+                         n,
+                         nnz,
+                         alpha,
+                         CUDA_R_64F,
+                         descrA,
+                         csrValA,
+                         CUDA_R_64F,
+                         csrRowPtrA,
+                         csrColIndA,
+                         x,
+                         CUDA_R_64F,
+                         beta,
+                         CUDA_R_64F,
+                         y,
+                         CUDA_R_64F,
+                         CUDA_R_64F,
+                         buffer);
 }
 
 /** @} */
@@ -637,68 +1035,180 @@ inline cusparseStatus_t cusparsecsrmvex(
  */
 
 template <typename T>
-cusparseStatus_t cusparsecsr2csc_bufferSize(
-  cusparseHandle_t handle, int m, int n, int nnz, const T* csrVal,
-  const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr,
-  int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase,
-  cusparseCsr2CscAlg_t alg, size_t* bufferSize, cudaStream_t stream);
+cusparseStatus_t cusparsecsr2csc_bufferSize(cusparseHandle_t handle,
+                                            int m,
+                                            int n,
+                                            int nnz,
+                                            const T* csrVal,
+                                            const int* csrRowPtr,
+                                            const int* csrColInd,
+                                            void* cscVal,
+                                            int* cscColPtr,
+                                            int* cscRowInd,
+                                            cusparseAction_t copyValues,
+                                            cusparseIndexBase_t idxBase,
+                                            cusparseCsr2CscAlg_t alg,
+                                            size_t* bufferSize,
+                                            cudaStream_t stream);
 
 template <>
-inline cusparseStatus_t cusparsecsr2csc_bufferSize(
-  cusparseHandle_t handle, int m, int n, int nnz, const float* csrVal,
-  const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr,
-  int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase,
-  cusparseCsr2CscAlg_t alg, size_t* bufferSize, cudaStream_t stream) {
+inline cusparseStatus_t cusparsecsr2csc_bufferSize(cusparseHandle_t handle,
+                                                   int m,
+                                                   int n,
+                                                   int nnz,
+                                                   const float* csrVal,
+                                                   const int* csrRowPtr,
+                                                   const int* csrColInd,
+                                                   void* cscVal,
+                                                   int* cscColPtr,
+                                                   int* cscRowInd,
+                                                   cusparseAction_t copyValues,
+                                                   cusparseIndexBase_t idxBase,
+                                                   cusparseCsr2CscAlg_t alg,
+                                                   size_t* bufferSize,
+                                                   cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 
-  return cusparseCsr2cscEx2_bufferSize(
-    handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscColPtr,
-    cscRowInd, CUDA_R_32F, copyValues, idxBase, alg, bufferSize);
+  return cusparseCsr2cscEx2_bufferSize(handle,
+                                       m,
+                                       n,
+                                       nnz,
+                                       csrVal,
+                                       csrRowPtr,
+                                       csrColInd,
+                                       cscVal,
+                                       cscColPtr,
+                                       cscRowInd,
+                                       CUDA_R_32F,
+                                       copyValues,
+                                       idxBase,
+                                       alg,
+                                       bufferSize);
 }
 template <>
-inline cusparseStatus_t cusparsecsr2csc_bufferSize(
-  cusparseHandle_t handle, int m, int n, int nnz, const double* csrVal,
-  const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr,
-  int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase,
-  cusparseCsr2CscAlg_t alg, size_t* bufferSize, cudaStream_t stream) {
+inline cusparseStatus_t cusparsecsr2csc_bufferSize(cusparseHandle_t handle,
+                                                   int m,
+                                                   int n,
+                                                   int nnz,
+                                                   const double* csrVal,
+                                                   const int* csrRowPtr,
+                                                   const int* csrColInd,
+                                                   void* cscVal,
+                                                   int* cscColPtr,
+                                                   int* cscRowInd,
+                                                   cusparseAction_t copyValues,
+                                                   cusparseIndexBase_t idxBase,
+                                                   cusparseCsr2CscAlg_t alg,
+                                                   size_t* bufferSize,
+                                                   cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 
-  return cusparseCsr2cscEx2_bufferSize(
-    handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscColPtr,
-    cscRowInd, CUDA_R_64F, copyValues, idxBase, alg, bufferSize);
+  return cusparseCsr2cscEx2_bufferSize(handle,
+                                       m,
+                                       n,
+                                       nnz,
+                                       csrVal,
+                                       csrRowPtr,
+                                       csrColInd,
+                                       cscVal,
+                                       cscColPtr,
+                                       cscRowInd,
+                                       CUDA_R_64F,
+                                       copyValues,
+                                       idxBase,
+                                       alg,
+                                       bufferSize);
 }
 
 template <typename T>
-cusparseStatus_t cusparsecsr2csc(
-  cusparseHandle_t handle, int m, int n, int nnz, const T* csrVal,
-  const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr,
-  int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase,
-  cusparseCsr2CscAlg_t alg, void* buffer, cudaStream_t stream);
+cusparseStatus_t cusparsecsr2csc(cusparseHandle_t handle,
+                                 int m,
+                                 int n,
+                                 int nnz,
+                                 const T* csrVal,
+                                 const int* csrRowPtr,
+                                 const int* csrColInd,
+                                 void* cscVal,
+                                 int* cscColPtr,
+                                 int* cscRowInd,
+                                 cusparseAction_t copyValues,
+                                 cusparseIndexBase_t idxBase,
+                                 cusparseCsr2CscAlg_t alg,
+                                 void* buffer,
+                                 cudaStream_t stream);
 
 template <>
-inline cusparseStatus_t cusparsecsr2csc(
-  cusparseHandle_t handle, int m, int n, int nnz, const float* csrVal,
-  const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr,
-  int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase,
-  cusparseCsr2CscAlg_t alg, void* buffer, cudaStream_t stream) {
+inline cusparseStatus_t cusparsecsr2csc(cusparseHandle_t handle,
+                                        int m,
+                                        int n,
+                                        int nnz,
+                                        const float* csrVal,
+                                        const int* csrRowPtr,
+                                        const int* csrColInd,
+                                        void* cscVal,
+                                        int* cscColPtr,
+                                        int* cscRowInd,
+                                        cusparseAction_t copyValues,
+                                        cusparseIndexBase_t idxBase,
+                                        cusparseCsr2CscAlg_t alg,
+                                        void* buffer,
+                                        cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 
-  return cusparseCsr2cscEx2(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd,
-                            cscVal, cscColPtr, cscRowInd, CUDA_R_32F,
-                            copyValues, idxBase, alg, buffer);
+  return cusparseCsr2cscEx2(handle,
+                            m,
+                            n,
+                            nnz,
+                            csrVal,
+                            csrRowPtr,
+                            csrColInd,
+                            cscVal,
+                            cscColPtr,
+                            cscRowInd,
+                            CUDA_R_32F,
+                            copyValues,
+                            idxBase,
+                            alg,
+                            buffer);
 }
 
 template <>
-inline cusparseStatus_t cusparsecsr2csc(
-  cusparseHandle_t handle, int m, int n, int nnz, const double* csrVal,
-  const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr,
-  int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase,
-  cusparseCsr2CscAlg_t alg, void* buffer, cudaStream_t stream) {
+inline cusparseStatus_t cusparsecsr2csc(cusparseHandle_t handle,
+                                        int m,
+                                        int n,
+                                        int nnz,
+                                        const double* csrVal,
+                                        const int* csrRowPtr,
+                                        const int* csrColInd,
+                                        void* cscVal,
+                                        int* cscColPtr,
+                                        int* cscRowInd,
+                                        cusparseAction_t copyValues,
+                                        cusparseIndexBase_t idxBase,
+                                        cusparseCsr2CscAlg_t alg,
+                                        void* buffer,
+                                        cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 
-  return cusparseCsr2cscEx2(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd,
-                            cscVal, cscColPtr, cscRowInd, CUDA_R_64F,
-                            copyValues, idxBase, alg, buffer);
+  return cusparseCsr2cscEx2(handle,
+                            m,
+                            n,
+                            nnz,
+                            csrVal,
+                            csrRowPtr,
+                            csrColInd,
+                            cscVal,
+                            cscColPtr,
+                            cscRowInd,
+                            CUDA_R_64F,
+                            copyValues,
+                            idxBase,
+                            alg,
+                            buffer);
 }
 
 /** @} */
@@ -709,120 +1219,329 @@ inline cusparseStatus_t cusparsecsr2csc(
  */
 
 template <typename T>
-cusparseStatus_t cusparsecsrgemm2_buffersizeext(
-  cusparseHandle_t handle, int m, int n, int k, const T* alpha, const T* beta,
-  const cusparseMatDescr_t matA, int nnzA, const int* rowindA,
-  const int* indicesA, const cusparseMatDescr_t matB, int nnzB,
-  const int* rowindB, const int* indicesB, const cusparseMatDescr_t matD,
-  int nnzD, const int* rowindD, const int* indicesD, csrgemm2Info_t info,
-  size_t* pBufferSizeInBytes, cudaStream_t stream);
-
-template <>
-inline cusparseStatus_t cusparsecsrgemm2_buffersizeext(
-  cusparseHandle_t handle, int m, int n, int k, const float* alpha,
-  const float* beta, const cusparseMatDescr_t matA, int nnzA,
-  const int* rowindA, const int* indicesA, const cusparseMatDescr_t matB,
-  int nnzB, const int* rowindB, const int* indicesB,
-  const cusparseMatDescr_t matD, int nnzD, const int* rowindD,
-  const int* indicesD, csrgemm2Info_t info, size_t* pBufferSizeInBytes,
-  cudaStream_t stream) {
+cusparseStatus_t cusparsecsrgemm2_buffersizeext(cusparseHandle_t handle,
+                                                int m,
+                                                int n,
+                                                int k,
+                                                const T* alpha,
+                                                const T* beta,
+                                                const cusparseMatDescr_t matA,
+                                                int nnzA,
+                                                const int* rowindA,
+                                                const int* indicesA,
+                                                const cusparseMatDescr_t matB,
+                                                int nnzB,
+                                                const int* rowindB,
+                                                const int* indicesB,
+                                                const cusparseMatDescr_t matD,
+                                                int nnzD,
+                                                const int* rowindD,
+                                                const int* indicesD,
+                                                csrgemm2Info_t info,
+                                                size_t* pBufferSizeInBytes,
+                                                cudaStream_t stream);
+
+template <>
+inline cusparseStatus_t cusparsecsrgemm2_buffersizeext(cusparseHandle_t handle,
+                                                       int m,
+                                                       int n,
+                                                       int k,
+                                                       const float* alpha,
+                                                       const float* beta,
+                                                       const cusparseMatDescr_t matA,
+                                                       int nnzA,
+                                                       const int* rowindA,
+                                                       const int* indicesA,
+                                                       const cusparseMatDescr_t matB,
+                                                       int nnzB,
+                                                       const int* rowindB,
+                                                       const int* indicesB,
+                                                       const cusparseMatDescr_t matD,
+                                                       int nnzD,
+                                                       const int* rowindD,
+                                                       const int* indicesD,
+                                                       csrgemm2Info_t info,
+                                                       size_t* pBufferSizeInBytes,
+                                                       cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-  return cusparseScsrgemm2_bufferSizeExt(
-    handle, m, n, k, alpha, matA, nnzA, rowindA, indicesA, matB, nnzB, rowindB,
-    indicesB, beta, matD, nnzD, rowindD, indicesD, info, pBufferSizeInBytes);
+  return cusparseScsrgemm2_bufferSizeExt(handle,
+                                         m,
+                                         n,
+                                         k,
+                                         alpha,
+                                         matA,
+                                         nnzA,
+                                         rowindA,
+                                         indicesA,
+                                         matB,
+                                         nnzB,
+                                         rowindB,
+                                         indicesB,
+                                         beta,
+                                         matD,
+                                         nnzD,
+                                         rowindD,
+                                         indicesD,
+                                         info,
+                                         pBufferSizeInBytes);
 #pragma GCC diagnostic pop
 }
 
 template <>
-inline cusparseStatus_t cusparsecsrgemm2_buffersizeext(
-  cusparseHandle_t handle, int m, int n, int k, const double* alpha,
-  const double* beta, const cusparseMatDescr_t matA, int nnzA,
-  const int* rowindA, const int* indicesA, const cusparseMatDescr_t matB,
-  int nnzB, const int* rowindB, const int* indicesB,
-  const cusparseMatDescr_t matD, int nnzD, const int* rowindD,
-  const int* indicesD, csrgemm2Info_t info, size_t* pBufferSizeInBytes,
-  cudaStream_t stream) {
+inline cusparseStatus_t cusparsecsrgemm2_buffersizeext(cusparseHandle_t handle,
+                                                       int m,
+                                                       int n,
+                                                       int k,
+                                                       const double* alpha,
+                                                       const double* beta,
+                                                       const cusparseMatDescr_t matA,
+                                                       int nnzA,
+                                                       const int* rowindA,
+                                                       const int* indicesA,
+                                                       const cusparseMatDescr_t matB,
+                                                       int nnzB,
+                                                       const int* rowindB,
+                                                       const int* indicesB,
+                                                       const cusparseMatDescr_t matD,
+                                                       int nnzD,
+                                                       const int* rowindD,
+                                                       const int* indicesD,
+                                                       csrgemm2Info_t info,
+                                                       size_t* pBufferSizeInBytes,
+                                                       cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-  return cusparseDcsrgemm2_bufferSizeExt(
-    handle, m, n, k, alpha, matA, nnzA, rowindA, indicesA, matB, nnzB, rowindB,
-    indicesB, beta, matD, nnzD, rowindD, indicesD, info, pBufferSizeInBytes);
+  return cusparseDcsrgemm2_bufferSizeExt(handle,
+                                         m,
+                                         n,
+                                         k,
+                                         alpha,
+                                         matA,
+                                         nnzA,
+                                         rowindA,
+                                         indicesA,
+                                         matB,
+                                         nnzB,
+                                         rowindB,
+                                         indicesB,
+                                         beta,
+                                         matD,
+                                         nnzD,
+                                         rowindD,
+                                         indicesD,
+                                         info,
+                                         pBufferSizeInBytes);
 #pragma GCC diagnostic pop
 }
 
-inline cusparseStatus_t cusparsecsrgemm2nnz(
-  cusparseHandle_t handle, int m, int n, int k, const cusparseMatDescr_t matA,
-  int nnzA, const int* rowindA, const int* indicesA,
-  const cusparseMatDescr_t matB, int nnzB, const int* rowindB,
-  const int* indicesB, const cusparseMatDescr_t matD, int nnzD,
-  const int* rowindD, const int* indicesD, const cusparseMatDescr_t matC,
-  int* rowindC, int* nnzC, const csrgemm2Info_t info, void* pBuffer,
-  cudaStream_t stream) {
+inline cusparseStatus_t cusparsecsrgemm2nnz(cusparseHandle_t handle,
+                                            int m,
+                                            int n,
+                                            int k,
+                                            const cusparseMatDescr_t matA,
+                                            int nnzA,
+                                            const int* rowindA,
+                                            const int* indicesA,
+                                            const cusparseMatDescr_t matB,
+                                            int nnzB,
+                                            const int* rowindB,
+                                            const int* indicesB,
+                                            const cusparseMatDescr_t matD,
+                                            int nnzD,
+                                            const int* rowindD,
+                                            const int* indicesD,
+                                            const cusparseMatDescr_t matC,
+                                            int* rowindC,
+                                            int* nnzC,
+                                            const csrgemm2Info_t info,
+                                            void* pBuffer,
+                                            cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-  return cusparseXcsrgemm2Nnz(handle, m, n, k, matA, nnzA, rowindA, indicesA,
-                              matB, nnzB, rowindB, indicesB, matD, nnzD,
-                              rowindD, indicesD, matC, rowindC, nnzC, info,
+  return cusparseXcsrgemm2Nnz(handle,
+                              m,
+                              n,
+                              k,
+                              matA,
+                              nnzA,
+                              rowindA,
+                              indicesA,
+                              matB,
+                              nnzB,
+                              rowindB,
+                              indicesB,
+                              matD,
+                              nnzD,
+                              rowindD,
+                              indicesD,
+                              matC,
+                              rowindC,
+                              nnzC,
+                              info,
                               pBuffer);
 #pragma GCC diagnostic pop
 }
 
 template <typename T>
-cusparseStatus_t cusparsecsrgemm2(
-  cusparseHandle_t handle, int m, int n, int k, const T* alpha,
-  const cusparseMatDescr_t descrA, int nnzA, const T* csrValA,
-  const int* csrRowPtrA, const int* csrColIndA, const cusparseMatDescr_t descrB,
-  int nnzB, const T* csrValB, const int* csrRowPtrB, const int* csrColIndB,
-  const T* beta, const cusparseMatDescr_t descrD, int nnzD, const T* csrValD,
-  const int* csrRowPtrD, const int* csrColIndD, const cusparseMatDescr_t descrC,
-  T* csrValC, const int* csrRowPtrC, int* csrColIndC, const csrgemm2Info_t info,
-  void* pBuffer, cudaStream_t stream);
-
-template <>
-inline cusparseStatus_t cusparsecsrgemm2(
-  cusparseHandle_t handle, int m, int n, int k, const float* alpha,
-  const cusparseMatDescr_t descrA, int nnzA, const float* csrValA,
-  const int* csrRowPtrA, const int* csrColIndA, const cusparseMatDescr_t descrB,
-  int nnzB, const float* csrValB, const int* csrRowPtrB, const int* csrColIndB,
-  const float* beta, const cusparseMatDescr_t descrD, int nnzD,
-  const float* csrValD, const int* csrRowPtrD, const int* csrColIndD,
-  const cusparseMatDescr_t descrC, float* csrValC, const int* csrRowPtrC,
-  int* csrColIndC, const csrgemm2Info_t info, void* pBuffer,
-  cudaStream_t stream) {
+cusparseStatus_t cusparsecsrgemm2(cusparseHandle_t handle,
+                                  int m,
+                                  int n,
+                                  int k,
+                                  const T* alpha,
+                                  const cusparseMatDescr_t descrA,
+                                  int nnzA,
+                                  const T* csrValA,
+                                  const int* csrRowPtrA,
+                                  const int* csrColIndA,
+                                  const cusparseMatDescr_t descrB,
+                                  int nnzB,
+                                  const T* csrValB,
+                                  const int* csrRowPtrB,
+                                  const int* csrColIndB,
+                                  const T* beta,
+                                  const cusparseMatDescr_t descrD,
+                                  int nnzD,
+                                  const T* csrValD,
+                                  const int* csrRowPtrD,
+                                  const int* csrColIndD,
+                                  const cusparseMatDescr_t descrC,
+                                  T* csrValC,
+                                  const int* csrRowPtrC,
+                                  int* csrColIndC,
+                                  const csrgemm2Info_t info,
+                                  void* pBuffer,
+                                  cudaStream_t stream);
+
+template <>
+inline cusparseStatus_t cusparsecsrgemm2(cusparseHandle_t handle,
+                                         int m,
+                                         int n,
+                                         int k,
+                                         const float* alpha,
+                                         const cusparseMatDescr_t descrA,
+                                         int nnzA,
+                                         const float* csrValA,
+                                         const int* csrRowPtrA,
+                                         const int* csrColIndA,
+                                         const cusparseMatDescr_t descrB,
+                                         int nnzB,
+                                         const float* csrValB,
+                                         const int* csrRowPtrB,
+                                         const int* csrColIndB,
+                                         const float* beta,
+                                         const cusparseMatDescr_t descrD,
+                                         int nnzD,
+                                         const float* csrValD,
+                                         const int* csrRowPtrD,
+                                         const int* csrColIndD,
+                                         const cusparseMatDescr_t descrC,
+                                         float* csrValC,
+                                         const int* csrRowPtrC,
+                                         int* csrColIndC,
+                                         const csrgemm2Info_t info,
+                                         void* pBuffer,
+                                         cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-  return cusparseScsrgemm2(handle, m, n, k, alpha, descrA, nnzA, csrValA,
-                           csrRowPtrA, csrColIndA, descrB, nnzB, csrValB,
-                           csrRowPtrB, csrColIndB, beta, descrD, nnzD, csrValD,
-                           csrRowPtrD, csrColIndD, descrC, csrValC, csrRowPtrC,
-                           csrColIndC, info, pBuffer);
+  return cusparseScsrgemm2(handle,
+                           m,
+                           n,
+                           k,
+                           alpha,
+                           descrA,
+                           nnzA,
+                           csrValA,
+                           csrRowPtrA,
+                           csrColIndA,
+                           descrB,
+                           nnzB,
+                           csrValB,
+                           csrRowPtrB,
+                           csrColIndB,
+                           beta,
+                           descrD,
+                           nnzD,
+                           csrValD,
+                           csrRowPtrD,
+                           csrColIndD,
+                           descrC,
+                           csrValC,
+                           csrRowPtrC,
+                           csrColIndC,
+                           info,
+                           pBuffer);
 #pragma GCC diagnostic pop
 }
 
 template <>
-inline cusparseStatus_t cusparsecsrgemm2(
-  cusparseHandle_t handle, int m, int n, int k, const double* alpha,
-  const cusparseMatDescr_t descrA, int nnzA, const double* csrValA,
-  const int* csrRowPtrA, const int* csrColIndA, const cusparseMatDescr_t descrB,
-  int nnzB, const double* csrValB, const int* csrRowPtrB, const int* csrColIndB,
-  const double* beta, const cusparseMatDescr_t descrD, int nnzD,
-  const double* csrValD, const int* csrRowPtrD, const int* csrColIndD,
-  const cusparseMatDescr_t descrC, double* csrValC, const int* csrRowPtrC,
-  int* csrColIndC, const csrgemm2Info_t info, void* pBuffer,
-  cudaStream_t stream) {
+inline cusparseStatus_t cusparsecsrgemm2(cusparseHandle_t handle,
+                                         int m,
+                                         int n,
+                                         int k,
+                                         const double* alpha,
+                                         const cusparseMatDescr_t descrA,
+                                         int nnzA,
+                                         const double* csrValA,
+                                         const int* csrRowPtrA,
+                                         const int* csrColIndA,
+                                         const cusparseMatDescr_t descrB,
+                                         int nnzB,
+                                         const double* csrValB,
+                                         const int* csrRowPtrB,
+                                         const int* csrColIndB,
+                                         const double* beta,
+                                         const cusparseMatDescr_t descrD,
+                                         int nnzD,
+                                         const double* csrValD,
+                                         const int* csrRowPtrD,
+                                         const int* csrColIndD,
+                                         const cusparseMatDescr_t descrC,
+                                         double* csrValC,
+                                         const int* csrRowPtrC,
+                                         int* csrColIndC,
+                                         const csrgemm2Info_t info,
+                                         void* pBuffer,
+                                         cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-  return cusparseDcsrgemm2(handle, m, n, k, alpha, descrA, nnzA, csrValA,
-                           csrRowPtrA, csrColIndA, descrB, nnzB, csrValB,
-                           csrRowPtrB, csrColIndB, beta, descrD, nnzD, csrValD,
-                           csrRowPtrD, csrColIndD, descrC, csrValC, csrRowPtrC,
-                           csrColIndC, info, pBuffer);
+  return cusparseDcsrgemm2(handle,
+                           m,
+                           n,
+                           k,
+                           alpha,
+                           descrA,
+                           nnzA,
+                           csrValA,
+                           csrRowPtrA,
+                           csrColIndA,
+                           descrB,
+                           nnzB,
+                           csrValB,
+                           csrRowPtrB,
+                           csrColIndB,
+                           beta,
+                           descrD,
+                           nnzD,
+                           csrValD,
+                           csrRowPtrD,
+                           csrColIndD,
+                           descrC,
+                           csrValC,
+                           csrRowPtrC,
+                           csrColIndC,
+                           info,
+                           pBuffer);
 #pragma GCC diagnostic pop
 }
 
@@ -834,33 +1553,46 @@ inline cusparseStatus_t cusparsecsrgemm2(
  */
 
 template <typename T>
-cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, int m, int n,
+cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle,
+                                   int m,
+                                   int n,
                                    const cusparseMatDescr_t descrA,
-                                   const T* csrValA, const int* csrRowPtrA,
-                                   const int* csrColIndA, T* A, int lda,
+                                   const T* csrValA,
+                                   const int* csrRowPtrA,
+                                   const int* csrColIndA,
+                                   T* A,
+                                   int lda,
                                    cudaStream_t stream);
 
 template <>
-inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, int m, int n,
+inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle,
+                                          int m,
+                                          int n,
                                           const cusparseMatDescr_t descrA,
                                           const float* csrValA,
                                           const int* csrRowPtrA,
-                                          const int* csrColIndA, float* A,
-                                          int lda, cudaStream_t stream) {
+                                          const int* csrColIndA,
+                                          float* A,
+                                          int lda,
+                                          cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseScsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA,
-                            csrColIndA, A, lda);
+  return cusparseScsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A, lda);
 }
 template <>
-inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, int m, int n,
+inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle,
+                                          int m,
+                                          int n,
                                           const cusparseMatDescr_t descrA,
                                           const double* csrValA,
                                           const int* csrRowPtrA,
-                                          const int* csrColIndA, double* A,
-                                          int lda, cudaStream_t stream) {
+                                          const int* csrColIndA,
+                                          double* A,
+                                          int lda,
+                                          cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseDcsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA,
-                            csrColIndA, A, lda);
+  return cusparseDcsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A, lda);
 }
 
 /** @} */
diff --git a/cpp/include/raft/sparse/distance/bin_distance.cuh b/cpp/include/raft/sparse/distance/bin_distance.cuh
index f3109556b7..aef19122da 100644
--- a/cpp/include/raft/sparse/distance/bin_distance.cuh
+++ b/cpp/include/raft/sparse/distance/bin_distance.cuh
@@ -37,9 +37,11 @@ namespace distance {
 
 // @TODO: Move this into sparse prims (coo_norm)
 template <typename value_idx, typename value_t>
-__global__ void compute_binary_row_norm_kernel(
-  value_t *out, const value_idx *__restrict__ coo_rows,
-  const value_t *__restrict__ data, value_idx nnz) {
+__global__ void compute_binary_row_norm_kernel(value_t* out,
+                                               const value_idx* __restrict__ coo_rows,
+                                               const value_t* __restrict__ data,
+                                               value_idx nnz)
+{
   value_idx i = blockDim.x * blockIdx.x + threadIdx.x;
   if (i < nnz) {
     // We do conditional here only because it's
@@ -51,55 +53,64 @@ __global__ void compute_binary_row_norm_kernel(
 }
 
 template <typename value_idx, typename value_t, typename expansion_f>
-__global__ void compute_binary_warp_kernel(value_t *__restrict__ C,
-                                           const value_t *__restrict__ Q_norms,
-                                           const value_t *__restrict__ R_norms,
-                                           value_idx n_rows, value_idx n_cols,
-                                           expansion_f expansion_func) {
+__global__ void compute_binary_warp_kernel(value_t* __restrict__ C,
+                                           const value_t* __restrict__ Q_norms,
+                                           const value_t* __restrict__ R_norms,
+                                           value_idx n_rows,
+                                           value_idx n_cols,
+                                           expansion_f expansion_func)
+{
   value_idx tid = blockDim.x * blockIdx.x + threadIdx.x;
-  value_idx i = tid / n_cols;
-  value_idx j = tid % n_cols;
+  value_idx i   = tid / n_cols;
+  value_idx j   = tid % n_cols;
 
   if (i >= n_rows || j >= n_cols) return;
 
-  value_t q_norm = Q_norms[i];
-  value_t r_norm = R_norms[j];
-  value_t dot = C[(size_t)i * n_cols + j];
+  value_t q_norm            = Q_norms[i];
+  value_t r_norm            = R_norms[j];
+  value_t dot               = C[(size_t)i * n_cols + j];
   C[(size_t)i * n_cols + j] = expansion_func(dot, q_norm, r_norm);
 }
 
-template <typename value_idx, typename value_t, typename expansion_f,
-          int tpb = 1024>
-void compute_binary(value_t *C, const value_t *Q_norms, const value_t *R_norms,
-                    value_idx n_rows, value_idx n_cols,
-                    expansion_f expansion_func, cudaStream_t stream) {
+template <typename value_idx, typename value_t, typename expansion_f, int tpb = 1024>
+void compute_binary(value_t* C,
+                    const value_t* Q_norms,
+                    const value_t* R_norms,
+                    value_idx n_rows,
+                    value_idx n_cols,
+                    expansion_f expansion_func,
+                    cudaStream_t stream)
+{
   int blocks = raft::ceildiv<size_t>((size_t)n_rows * n_cols, tpb);
   compute_binary_warp_kernel<<<blocks, tpb, 0, stream>>>(
     C, Q_norms, R_norms, n_rows, n_cols, expansion_func);
 }
 
-template <typename value_idx, typename value_t, typename expansion_f,
-          int tpb = 1024>
-void compute_bin_distance(value_t *out, const value_idx *Q_coo_rows,
-                          const value_t *Q_data, value_idx Q_nnz,
-                          const value_idx *R_coo_rows, const value_t *R_data,
-                          value_idx R_nnz, value_idx m, value_idx n,
+template <typename value_idx, typename value_t, typename expansion_f, int tpb = 1024>
+void compute_bin_distance(value_t* out,
+                          const value_idx* Q_coo_rows,
+                          const value_t* Q_data,
+                          value_idx Q_nnz,
+                          const value_idx* R_coo_rows,
+                          const value_t* R_data,
+                          value_idx R_nnz,
+                          value_idx m,
+                          value_idx n,
                           std::shared_ptr<raft::mr::device::allocator> alloc,
-                          cudaStream_t stream, expansion_f expansion_func) {
+                          cudaStream_t stream,
+                          expansion_f expansion_func)
+{
   rmm::device_uvector<value_t> Q_norms(m, stream);
   rmm::device_uvector<value_t> R_norms(n, stream);
-  CUDA_CHECK(
-    cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t)));
-  CUDA_CHECK(
-    cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t)));
+  CUDA_CHECK(cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t), stream));
+  CUDA_CHECK(cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t), stream));
 
   compute_binary_row_norm_kernel<<<raft::ceildiv(Q_nnz, tpb), tpb, 0, stream>>>(
     Q_norms.data(), Q_coo_rows, Q_data, Q_nnz);
   compute_binary_row_norm_kernel<<<raft::ceildiv(R_nnz, tpb), tpb, 0, stream>>>(
     R_norms.data(), R_coo_rows, R_data, R_nnz);
 
-  compute_binary(out, Q_norms.data(), R_norms.data(), m, n, expansion_func,
-                 stream);
+  compute_binary(out, Q_norms.data(), R_norms.data(), m, n, expansion_func, stream);
 }
 
 /**
@@ -109,44 +120,52 @@ void compute_bin_distance(value_t *out, const value_idx *Q_coo_rows,
 template <typename value_idx = int, typename value_t = float>
 class jaccard_expanded_distances_t : public distances_t<value_t> {
  public:
-  explicit jaccard_expanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config),
-      workspace(0, config.handle.get_stream()),
-      ip_dists(config) {}
+  explicit jaccard_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config)
+  {
+  }
 
-  void compute(value_t *out_dists) {
+  void compute(value_t* out_dists)
+  {
     ip_dists.compute(out_dists);
 
-    value_idx *b_indices = ip_dists.b_rows_coo();
-    value_t *b_data = ip_dists.b_data_coo();
+    value_idx* b_indices = ip_dists.b_rows_coo();
+    value_t* b_data      = ip_dists.b_data_coo();
 
-    rmm::device_uvector<value_idx> search_coo_rows(
-      config_->a_nnz, config_->handle.get_stream());
-    raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
-                                      search_coo_rows.data(), config_->a_nnz,
+    rmm::device_uvector<value_idx> search_coo_rows(config_->a_nnz, config_->handle.get_stream());
+    raft::sparse::convert::csr_to_coo(config_->a_indptr,
+                                      config_->a_nrows,
+                                      search_coo_rows.data(),
+                                      config_->a_nnz,
                                       config_->handle.get_stream());
 
-    compute_bin_distance(
-      out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz,
-      b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows,
-      config_->handle.get_device_allocator(), config_->handle.get_stream(),
-      [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
-        value_t q_r_union = q_norm + r_norm;
-        value_t denom = q_r_union - dot;
-
-        value_t jacc = ((denom != 0) * dot) / ((denom == 0) + denom);
-
-        // flip the similarity when both rows are 0
-        bool both_empty = q_r_union == 0;
-        return 1 - ((!both_empty * jacc) + both_empty);
-      });
+    compute_bin_distance(out_dists,
+                         search_coo_rows.data(),
+                         config_->a_data,
+                         config_->a_nnz,
+                         b_indices,
+                         b_data,
+                         config_->b_nnz,
+                         config_->a_nrows,
+                         config_->b_nrows,
+                         config_->handle.get_device_allocator(),
+                         config_->handle.get_stream(),
+                         [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
+                           value_t q_r_union = q_norm + r_norm;
+                           value_t denom     = q_r_union - dot;
+
+                           value_t jacc = ((denom != 0) * dot) / ((denom == 0) + denom);
+
+                           // flip the similarity when both rows are 0
+                           bool both_empty = q_r_union == 0;
+                           return 1 - ((!both_empty * jacc) + both_empty);
+                         });
   }
 
   ~jaccard_expanded_distances_t() = default;
 
  private:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
   rmm::device_uvector<char> workspace;
   ip_distances_t<value_idx, value_t> ip_dists;
 };
@@ -158,40 +177,48 @@ class jaccard_expanded_distances_t : public distances_t<value_t> {
 template <typename value_idx = int, typename value_t = float>
 class dice_expanded_distances_t : public distances_t<value_t> {
  public:
-  explicit dice_expanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config),
-      workspace(0, config.handle.get_stream()),
-      ip_dists(config) {}
+  explicit dice_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config)
+  {
+  }
 
-  void compute(value_t *out_dists) {
+  void compute(value_t* out_dists)
+  {
     ip_dists.compute(out_dists);
 
-    value_idx *b_indices = ip_dists.b_rows_coo();
-    value_t *b_data = ip_dists.b_data_coo();
+    value_idx* b_indices = ip_dists.b_rows_coo();
+    value_t* b_data      = ip_dists.b_data_coo();
 
-    rmm::device_uvector<value_idx> search_coo_rows(
-      config_->a_nnz, config_->handle.get_stream());
-    raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
-                                      search_coo_rows.data(), config_->a_nnz,
+    rmm::device_uvector<value_idx> search_coo_rows(config_->a_nnz, config_->handle.get_stream());
+    raft::sparse::convert::csr_to_coo(config_->a_indptr,
+                                      config_->a_nrows,
+                                      search_coo_rows.data(),
+                                      config_->a_nnz,
                                       config_->handle.get_stream());
 
-    compute_bin_distance(
-      out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz,
-      b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows,
-      config_->handle.get_device_allocator(), config_->handle.get_stream(),
-      [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
-        value_t q_r_union = q_norm + r_norm;
-        value_t dice = (2 * dot) / q_r_union;
-        bool both_empty = q_r_union == 0;
-        return 1 - ((!both_empty * dice) + both_empty);
-      });
+    compute_bin_distance(out_dists,
+                         search_coo_rows.data(),
+                         config_->a_data,
+                         config_->a_nnz,
+                         b_indices,
+                         b_data,
+                         config_->b_nnz,
+                         config_->a_nrows,
+                         config_->b_nrows,
+                         config_->handle.get_device_allocator(),
+                         config_->handle.get_stream(),
+                         [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
+                           value_t q_r_union = q_norm + r_norm;
+                           value_t dice      = (2 * dot) / q_r_union;
+                           bool both_empty   = q_r_union == 0;
+                           return 1 - ((!both_empty * dice) + both_empty);
+                         });
   }
 
   ~dice_expanded_distances_t() = default;
 
  private:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
   rmm::device_uvector<char> workspace;
   ip_distances_t<value_idx, value_t> ip_dists;
 };
diff --git a/cpp/include/raft/sparse/distance/common.h b/cpp/include/raft/sparse/distance/common.h
index 1c55412eec..29c823bcdb 100644
--- a/cpp/include/raft/sparse/distance/common.h
+++ b/cpp/include/raft/sparse/distance/common.h
@@ -24,31 +24,31 @@ namespace distance {
 
 template <typename value_idx, typename value_t>
 struct distances_config_t {
-  distances_config_t(const raft::handle_t &handle_) : handle(handle_) {}
+  distances_config_t(const raft::handle_t& handle_) : handle(handle_) {}
 
   // left side
   value_idx a_nrows;
   value_idx a_ncols;
   value_idx a_nnz;
-  value_idx *a_indptr;
-  value_idx *a_indices;
-  value_t *a_data;
+  value_idx* a_indptr;
+  value_idx* a_indices;
+  value_t* a_data;
 
   // right side
   value_idx b_nrows;
   value_idx b_ncols;
   value_idx b_nnz;
-  value_idx *b_indptr;
-  value_idx *b_indices;
-  value_t *b_data;
+  value_idx* b_indptr;
+  value_idx* b_indices;
+  value_t* b_data;
 
-  const raft::handle_t &handle;
+  const raft::handle_t& handle;
 };
 
 template <typename value_t>
 class distances_t {
  public:
-  virtual void compute(value_t *out) {}
+  virtual void compute(value_t* out) {}
   virtual ~distances_t() = default;
 };
 
diff --git a/cpp/include/raft/sparse/distance/coo_spmv.cuh b/cpp/include/raft/sparse/distance/coo_spmv.cuh
index 3a78f9ada0..cdf1be0c68 100644
--- a/cpp/include/raft/sparse/distance/coo_spmv.cuh
+++ b/cpp/include/raft/sparse/distance/coo_spmv.cuh
@@ -41,19 +41,29 @@ namespace raft {
 namespace sparse {
 namespace distance {
 
-template <typename value_idx, typename value_t, int threads_per_block = 1024,
-          typename product_f, typename accum_f, typename write_f,
+template <typename value_idx,
+          typename value_t,
+          int threads_per_block = 1024,
+          typename product_f,
+          typename accum_f,
+          typename write_f,
           typename strategy_t>
 inline void balanced_coo_pairwise_generalized_spmv(
-  value_t *out_dists, const distances_config_t<value_idx, value_t> &config_,
-  value_idx *coo_rows_b, product_f product_func, accum_f accum_func,
-  write_f write_func, strategy_t strategy, int chunk_size = 500000) {
-  CUDA_CHECK(cudaMemsetAsync(
-    out_dists, 0, sizeof(value_t) * config_.a_nrows * config_.b_nrows,
-    config_.handle.get_stream()));
-
-  strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func,
-                    chunk_size);
+  value_t* out_dists,
+  const distances_config_t<value_idx, value_t>& config_,
+  value_idx* coo_rows_b,
+  product_f product_func,
+  accum_f accum_func,
+  write_f write_func,
+  strategy_t strategy,
+  int chunk_size = 500000)
+{
+  CUDA_CHECK(cudaMemsetAsync(out_dists,
+                             0,
+                             sizeof(value_t) * config_.a_nrows * config_.b_nrows,
+                             config_.handle.get_stream()));
+
+  strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size);
 };
 
 /**
@@ -89,39 +99,55 @@ inline void balanced_coo_pairwise_generalized_spmv(
  *            this value was found through profiling and represents a reasonable
  *            setting for both large and small densities
  */
-template <typename value_idx, typename value_t, int threads_per_block = 1024,
-          typename product_f, typename accum_f, typename write_f>
+template <typename value_idx,
+          typename value_t,
+          int threads_per_block = 1024,
+          typename product_f,
+          typename accum_f,
+          typename write_f>
 inline void balanced_coo_pairwise_generalized_spmv(
-  value_t *out_dists, const distances_config_t<value_idx, value_t> &config_,
-  value_idx *coo_rows_b, product_f product_func, accum_f accum_func,
-  write_f write_func, int chunk_size = 500000) {
-  CUDA_CHECK(cudaMemsetAsync(
-    out_dists, 0, sizeof(value_t) * config_.a_nrows * config_.b_nrows,
-    config_.handle.get_stream()));
+  value_t* out_dists,
+  const distances_config_t<value_idx, value_t>& config_,
+  value_idx* coo_rows_b,
+  product_f product_func,
+  accum_f accum_func,
+  write_f write_func,
+  int chunk_size = 500000)
+{
+  CUDA_CHECK(cudaMemsetAsync(out_dists,
+                             0,
+                             sizeof(value_t) * config_.a_nrows * config_.b_nrows,
+                             config_.handle.get_stream()));
 
   int max_cols = max_cols_per_block<value_idx, value_t>();
 
   if (max_cols > config_.a_ncols) {
-    dense_smem_strategy<value_idx, value_t, threads_per_block> strategy(
-      config_);
-    strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func,
-                      write_func, chunk_size);
+    dense_smem_strategy<value_idx, value_t, threads_per_block> strategy(config_);
+    strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size);
   } else {
     hash_strategy<value_idx, value_t, threads_per_block> strategy(config_);
-    strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func,
-                      write_func, chunk_size);
+    strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size);
   }
 };
 
-template <typename value_idx, typename value_t, int threads_per_block = 1024,
-          typename product_f, typename accum_f, typename write_f,
+template <typename value_idx,
+          typename value_t,
+          int threads_per_block = 1024,
+          typename product_f,
+          typename accum_f,
+          typename write_f,
           typename strategy_t>
 inline void balanced_coo_pairwise_generalized_spmv_rev(
-  value_t *out_dists, const distances_config_t<value_idx, value_t> &config_,
-  value_idx *coo_rows_a, product_f product_func, accum_f accum_func,
-  write_f write_func, strategy_t strategy, int chunk_size = 500000) {
-  strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func,
-                        write_func, chunk_size);
+  value_t* out_dists,
+  const distances_config_t<value_idx, value_t>& config_,
+  value_idx* coo_rows_a,
+  product_f product_func,
+  accum_f accum_func,
+  write_f write_func,
+  strategy_t strategy,
+  int chunk_size = 500000)
+{
+  strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size);
 };
 
 /**
@@ -160,24 +186,30 @@ inline void balanced_coo_pairwise_generalized_spmv_rev(
  *            this value was found through profiling and represents a reasonable
  *            setting for both large and small densities
  */
-template <typename value_idx, typename value_t, int threads_per_block = 1024,
-          typename product_f, typename accum_f, typename write_f>
+template <typename value_idx,
+          typename value_t,
+          int threads_per_block = 1024,
+          typename product_f,
+          typename accum_f,
+          typename write_f>
 inline void balanced_coo_pairwise_generalized_spmv_rev(
-  value_t *out_dists, const distances_config_t<value_idx, value_t> &config_,
-  value_idx *coo_rows_a, product_f product_func, accum_f accum_func,
-  write_f write_func, int chunk_size = 500000) {
+  value_t* out_dists,
+  const distances_config_t<value_idx, value_t>& config_,
+  value_idx* coo_rows_a,
+  product_f product_func,
+  accum_f accum_func,
+  write_f write_func,
+  int chunk_size = 500000)
+{
   // try dense first
   int max_cols = max_cols_per_block<value_idx, value_t>();
 
   if (max_cols > config_.b_ncols) {
-    dense_smem_strategy<value_idx, value_t, threads_per_block> strategy(
-      config_);
-    strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func,
-                          write_func, chunk_size);
+    dense_smem_strategy<value_idx, value_t, threads_per_block> strategy(config_);
+    strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size);
   } else {
     hash_strategy<value_idx, value_t, threads_per_block> strategy(config_);
-    strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func,
-                          write_func, chunk_size);
+    strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size);
   }
 };
 
diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh
index 5ace978a23..7a83e73183 100644
--- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh
+++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh
@@ -32,58 +32,114 @@ namespace distance {
 template <typename value_idx, typename value_t, int tpb>
 class coo_spmv_strategy {
  public:
-  coo_spmv_strategy(const distances_config_t<value_idx, value_t> &config_)
-    : config(config_) {
+  coo_spmv_strategy(const distances_config_t<value_idx, value_t>& config_) : config(config_)
+  {
     smem = raft::getSharedMemPerBlock();
   }
 
-  template <typename strategy_t, typename indptr_it, typename product_f,
-            typename accum_f, typename write_f>
-  void _dispatch_base(strategy_t &strategy, int smem_dim, indptr_it &a_indptr,
-                      value_t *out_dists, value_idx *coo_rows_b,
-                      product_f product_func, accum_f accum_func,
-                      write_f write_func, int chunk_size, int n_blocks,
-                      int n_blocks_per_row) {
-    CUDA_CHECK(cudaFuncSetCacheConfig(
-      balanced_coo_generalized_spmv_kernel<strategy_t, indptr_it, value_idx,
-                                           value_t, false, tpb, product_f,
-                                           accum_f, write_f>,
-      cudaFuncCachePreferShared));
+  template <typename strategy_t,
+            typename indptr_it,
+            typename product_f,
+            typename accum_f,
+            typename write_f>
+  void _dispatch_base(strategy_t& strategy,
+                      int smem_dim,
+                      indptr_it& a_indptr,
+                      value_t* out_dists,
+                      value_idx* coo_rows_b,
+                      product_f product_func,
+                      accum_f accum_func,
+                      write_f write_func,
+                      int chunk_size,
+                      int n_blocks,
+                      int n_blocks_per_row)
+  {
+    CUDA_CHECK(cudaFuncSetCacheConfig(balanced_coo_generalized_spmv_kernel<strategy_t,
+                                                                           indptr_it,
+                                                                           value_idx,
+                                                                           value_t,
+                                                                           false,
+                                                                           tpb,
+                                                                           product_f,
+                                                                           accum_f,
+                                                                           write_f>,
+                                      cudaFuncCachePreferShared));
 
-    balanced_coo_generalized_spmv_kernel<strategy_t, indptr_it, value_idx,
-                                         value_t, false, tpb>
-      <<<n_blocks, tpb, smem, config.handle.get_stream()>>>(
-        strategy, a_indptr, config.a_indices, config.a_data, config.a_nnz,
-        coo_rows_b, config.b_indices, config.b_data, config.a_nrows,
-        config.b_nrows, smem_dim, config.b_nnz, out_dists, n_blocks_per_row,
-        chunk_size, config.b_ncols, product_func, accum_func, write_func);
+    balanced_coo_generalized_spmv_kernel<strategy_t, indptr_it, value_idx, value_t, false, tpb>
+      <<<n_blocks, tpb, smem, config.handle.get_stream()>>>(strategy,
+                                                            a_indptr,
+                                                            config.a_indices,
+                                                            config.a_data,
+                                                            config.a_nnz,
+                                                            coo_rows_b,
+                                                            config.b_indices,
+                                                            config.b_data,
+                                                            config.a_nrows,
+                                                            config.b_nrows,
+                                                            smem_dim,
+                                                            config.b_nnz,
+                                                            out_dists,
+                                                            n_blocks_per_row,
+                                                            chunk_size,
+                                                            config.b_ncols,
+                                                            product_func,
+                                                            accum_func,
+                                                            write_func);
   }
 
-  template <typename strategy_t, typename indptr_it, typename product_f,
-            typename accum_f, typename write_f>
-  void _dispatch_base_rev(strategy_t &strategy, int smem_dim,
-                          indptr_it &b_indptr, value_t *out_dists,
-                          value_idx *coo_rows_a, product_f product_func,
-                          accum_f accum_func, write_f write_func,
-                          int chunk_size, int n_blocks, int n_blocks_per_row) {
-    CUDA_CHECK(cudaFuncSetCacheConfig(
-      balanced_coo_generalized_spmv_kernel<strategy_t, indptr_it, value_idx,
-                                           value_t, true, tpb, product_f,
-                                           accum_f, write_f>,
-      cudaFuncCachePreferShared));
+  template <typename strategy_t,
+            typename indptr_it,
+            typename product_f,
+            typename accum_f,
+            typename write_f>
+  void _dispatch_base_rev(strategy_t& strategy,
+                          int smem_dim,
+                          indptr_it& b_indptr,
+                          value_t* out_dists,
+                          value_idx* coo_rows_a,
+                          product_f product_func,
+                          accum_f accum_func,
+                          write_f write_func,
+                          int chunk_size,
+                          int n_blocks,
+                          int n_blocks_per_row)
+  {
+    CUDA_CHECK(cudaFuncSetCacheConfig(balanced_coo_generalized_spmv_kernel<strategy_t,
+                                                                           indptr_it,
+                                                                           value_idx,
+                                                                           value_t,
+                                                                           true,
+                                                                           tpb,
+                                                                           product_f,
+                                                                           accum_f,
+                                                                           write_f>,
+                                      cudaFuncCachePreferShared));
 
-    balanced_coo_generalized_spmv_kernel<strategy_t, indptr_it, value_idx,
-                                         value_t, true, tpb>
-      <<<n_blocks, tpb, smem, config.handle.get_stream()>>>(
-        strategy, b_indptr, config.b_indices, config.b_data, config.b_nnz,
-        coo_rows_a, config.a_indices, config.a_data, config.b_nrows,
-        config.a_nrows, smem_dim, config.a_nnz, out_dists, n_blocks_per_row,
-        chunk_size, config.a_ncols, product_func, accum_func, write_func);
+    balanced_coo_generalized_spmv_kernel<strategy_t, indptr_it, value_idx, value_t, true, tpb>
+      <<<n_blocks, tpb, smem, config.handle.get_stream()>>>(strategy,
+                                                            b_indptr,
+                                                            config.b_indices,
+                                                            config.b_data,
+                                                            config.b_nnz,
+                                                            coo_rows_a,
+                                                            config.a_indices,
+                                                            config.a_data,
+                                                            config.b_nrows,
+                                                            config.a_nrows,
+                                                            smem_dim,
+                                                            config.a_nnz,
+                                                            out_dists,
+                                                            n_blocks_per_row,
+                                                            chunk_size,
+                                                            config.a_ncols,
+                                                            product_func,
+                                                            accum_func,
+                                                            write_func);
   }
 
  protected:
   int smem;
-  const distances_config_t<value_idx, value_t> &config;
+  const distances_config_t<value_idx, value_t>& config;
 };
 
 }  // namespace distance
diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh
index 44c3833f96..6586067b56 100644
--- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh
+++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh
@@ -29,11 +29,15 @@ namespace distance {
 template <typename value_idx>
 class mask_row_it {
  public:
-  mask_row_it(const value_idx *full_indptr_, const value_idx &n_rows_,
-              value_idx *mask_row_idx_ = NULL)
-    : full_indptr(full_indptr_), mask_row_idx(mask_row_idx_), n_rows(n_rows_) {}
+  mask_row_it(const value_idx* full_indptr_,
+              const value_idx& n_rows_,
+              value_idx* mask_row_idx_ = NULL)
+    : full_indptr(full_indptr_), mask_row_idx(mask_row_idx_), n_rows(n_rows_)
+  {
+  }
 
-  __device__ inline value_idx get_row_idx(const int &n_blocks_nnz_b) {
+  __device__ inline value_idx get_row_idx(const int& n_blocks_nnz_b)
+  {
     if (mask_row_idx != NULL) {
       return mask_row_idx[blockIdx.x / n_blocks_nnz_b];
     } else {
@@ -41,37 +45,49 @@ class mask_row_it {
     }
   }
 
-  __device__ inline void get_row_offsets(
-    const value_idx &row_idx, value_idx &start_offset, value_idx &stop_offset,
-    const value_idx &n_blocks_nnz_b, bool &first_a_chunk, bool &last_a_chunk) {
+  __device__ inline void get_row_offsets(const value_idx& row_idx,
+                                         value_idx& start_offset,
+                                         value_idx& stop_offset,
+                                         const value_idx& n_blocks_nnz_b,
+                                         bool& first_a_chunk,
+                                         bool& last_a_chunk)
+  {
     start_offset = full_indptr[row_idx];
-    stop_offset = full_indptr[row_idx + 1] - 1;
+    stop_offset  = full_indptr[row_idx + 1] - 1;
   }
 
-  __device__ constexpr inline void get_indices_boundary(
-    const value_idx *indices, value_idx &indices_len, value_idx &start_offset,
-    value_idx &stop_offset, value_idx &start_index, value_idx &stop_index,
-    bool &first_a_chunk, bool &last_a_chunk) {
+  __device__ constexpr inline void get_indices_boundary(const value_idx* indices,
+                                                        value_idx& indices_len,
+                                                        value_idx& start_offset,
+                                                        value_idx& stop_offset,
+                                                        value_idx& start_index,
+                                                        value_idx& stop_index,
+                                                        bool& first_a_chunk,
+                                                        bool& last_a_chunk)
+  {
     // do nothing;
   }
 
-  __device__ constexpr inline bool check_indices_bounds(
-    value_idx &start_index_a, value_idx &stop_index_a, value_idx &index_b) {
+  __device__ constexpr inline bool check_indices_bounds(value_idx& start_index_a,
+                                                        value_idx& stop_index_a,
+                                                        value_idx& index_b)
+  {
     return true;
   }
 
   const value_idx *full_indptr, &n_rows;
-  value_idx *mask_row_idx;
+  value_idx* mask_row_idx;
 };
 
 template <typename value_idx>
-__global__ void fill_chunk_indices_kernel(value_idx *n_chunks_per_row,
-                                          value_idx *chunk_indices,
-                                          value_idx n_rows) {
+__global__ void fill_chunk_indices_kernel(value_idx* n_chunks_per_row,
+                                          value_idx* chunk_indices,
+                                          value_idx n_rows)
+{
   auto tid = threadIdx.x + blockIdx.x * blockDim.x;
   if (tid < n_rows) {
     auto start = n_chunks_per_row[tid];
-    auto end = n_chunks_per_row[tid + 1];
+    auto end   = n_chunks_per_row[tid + 1];
 
 #pragma unroll
     for (int i = start; i < end; i++) {
@@ -83,73 +99,89 @@ __global__ void fill_chunk_indices_kernel(value_idx *n_chunks_per_row,
 template <typename value_idx>
 class chunked_mask_row_it : public mask_row_it<value_idx> {
  public:
-  chunked_mask_row_it(const value_idx *full_indptr_, const value_idx &n_rows_,
-                      value_idx *mask_row_idx_, int row_chunk_size_,
-                      const value_idx *n_chunks_per_row_,
-                      const value_idx *chunk_indices_,
+  chunked_mask_row_it(const value_idx* full_indptr_,
+                      const value_idx& n_rows_,
+                      value_idx* mask_row_idx_,
+                      int row_chunk_size_,
+                      const value_idx* n_chunks_per_row_,
+                      const value_idx* chunk_indices_,
                       const cudaStream_t stream_)
     : mask_row_it<value_idx>(full_indptr_, n_rows_, mask_row_idx_),
       row_chunk_size(row_chunk_size_),
       n_chunks_per_row(n_chunks_per_row_),
       chunk_indices(chunk_indices_),
-      stream(stream_) {}
+      stream(stream_)
+  {
+  }
 
-  static void init(const value_idx *indptr, const value_idx *mask_row_idx,
-                   const value_idx &n_rows, const int row_chunk_size,
-                   rmm::device_uvector<value_idx> &n_chunks_per_row,
-                   rmm::device_uvector<value_idx> &chunk_indices,
-                   cudaStream_t stream) {
+  static void init(const value_idx* indptr,
+                   const value_idx* mask_row_idx,
+                   const value_idx& n_rows,
+                   const int row_chunk_size,
+                   rmm::device_uvector<value_idx>& n_chunks_per_row,
+                   rmm::device_uvector<value_idx>& chunk_indices,
+                   cudaStream_t stream)
+  {
     auto policy = rmm::exec_policy(stream);
 
     constexpr value_idx first_element = 0;
     n_chunks_per_row.set_element_async(0, first_element, stream);
     n_chunks_per_row_functor chunk_functor(indptr, row_chunk_size);
-    thrust::transform(policy, mask_row_idx, mask_row_idx + n_rows,
-                      n_chunks_per_row.begin() + 1, chunk_functor);
+    thrust::transform(
+      policy, mask_row_idx, mask_row_idx + n_rows, n_chunks_per_row.begin() + 1, chunk_functor);
 
-    thrust::inclusive_scan(policy, n_chunks_per_row.begin() + 1,
-                           n_chunks_per_row.end(),
-                           n_chunks_per_row.begin() + 1);
+    thrust::inclusive_scan(
+      policy, n_chunks_per_row.begin() + 1, n_chunks_per_row.end(), n_chunks_per_row.begin() + 1);
 
-    raft::update_host(&total_row_blocks, n_chunks_per_row.data() + n_rows, 1,
-                      stream);
+    raft::update_host(&total_row_blocks, n_chunks_per_row.data() + n_rows, 1, stream);
 
     fill_chunk_indices(n_rows, n_chunks_per_row, chunk_indices, stream);
   }
 
-  __device__ inline value_idx get_row_idx(const int &n_blocks_nnz_b) {
+  __device__ inline value_idx get_row_idx(const int& n_blocks_nnz_b)
+  {
     return this->mask_row_idx[chunk_indices[blockIdx.x / n_blocks_nnz_b]];
   }
 
-  __device__ inline void get_row_offsets(
-    const value_idx &row_idx, value_idx &start_offset, value_idx &stop_offset,
-    const int &n_blocks_nnz_b, bool &first_a_chunk, bool &last_a_chunk) {
-    auto chunk_index = blockIdx.x / n_blocks_nnz_b;
-    auto chunk_val = chunk_indices[chunk_index];
-    auto prev_n_chunks = n_chunks_per_row[chunk_val];
+  __device__ inline void get_row_offsets(const value_idx& row_idx,
+                                         value_idx& start_offset,
+                                         value_idx& stop_offset,
+                                         const int& n_blocks_nnz_b,
+                                         bool& first_a_chunk,
+                                         bool& last_a_chunk)
+  {
+    auto chunk_index    = blockIdx.x / n_blocks_nnz_b;
+    auto chunk_val      = chunk_indices[chunk_index];
+    auto prev_n_chunks  = n_chunks_per_row[chunk_val];
     auto relative_chunk = chunk_index - prev_n_chunks;
-    first_a_chunk = relative_chunk == 0;
+    first_a_chunk       = relative_chunk == 0;
 
     start_offset = this->full_indptr[row_idx] + relative_chunk * row_chunk_size;
-    stop_offset = start_offset + row_chunk_size;
+    stop_offset  = start_offset + row_chunk_size;
 
     auto final_stop_offset = this->full_indptr[row_idx + 1];
 
     last_a_chunk = stop_offset >= final_stop_offset;
-    stop_offset = last_a_chunk ? final_stop_offset - 1 : stop_offset - 1;
+    stop_offset  = last_a_chunk ? final_stop_offset - 1 : stop_offset - 1;
   }
 
-  __device__ inline void get_indices_boundary(
-    const value_idx *indices, value_idx &row_idx, value_idx &start_offset,
-    value_idx &stop_offset, value_idx &start_index, value_idx &stop_index,
-    bool &first_a_chunk, bool &last_a_chunk) {
+  __device__ inline void get_indices_boundary(const value_idx* indices,
+                                              value_idx& row_idx,
+                                              value_idx& start_offset,
+                                              value_idx& stop_offset,
+                                              value_idx& start_index,
+                                              value_idx& stop_index,
+                                              bool& first_a_chunk,
+                                              bool& last_a_chunk)
+  {
     start_index = first_a_chunk ? start_index : indices[start_offset - 1] + 1;
-    stop_index = last_a_chunk ? stop_index : indices[stop_offset];
+    stop_index  = last_a_chunk ? stop_index : indices[stop_offset];
   }
 
-  __device__ inline bool check_indices_bounds(value_idx &start_index_a,
-                                              value_idx &stop_index_a,
-                                              value_idx &index_b) {
+  __device__ inline bool check_indices_bounds(value_idx& start_index_a,
+                                              value_idx& stop_index_a,
+                                              value_idx& index_b)
+  {
     return (index_b >= start_index_a && index_b <= stop_index_a);
   }
 
@@ -160,30 +192,34 @@ class chunked_mask_row_it : public mask_row_it<value_idx> {
 
   struct n_chunks_per_row_functor {
    public:
-    n_chunks_per_row_functor(const value_idx *indptr_,
-                             value_idx row_chunk_size_)
-      : indptr(indptr_), row_chunk_size(row_chunk_size_) {}
+    n_chunks_per_row_functor(const value_idx* indptr_, value_idx row_chunk_size_)
+      : indptr(indptr_), row_chunk_size(row_chunk_size_)
+    {
+    }
 
-    __host__ __device__ value_idx operator()(const value_idx &i) {
+    __host__ __device__ value_idx operator()(const value_idx& i)
+    {
       auto degree = indptr[i + 1] - indptr[i];
       return raft::ceildiv(degree, (value_idx)row_chunk_size);
     }
 
-    const value_idx *indptr;
+    const value_idx* indptr;
     value_idx row_chunk_size;
   };
 
  private:
-  static void fill_chunk_indices(
-    const value_idx &n_rows, rmm::device_uvector<value_idx> &n_chunks_per_row,
-    rmm::device_uvector<value_idx> &chunk_indices, cudaStream_t stream) {
+  static void fill_chunk_indices(const value_idx& n_rows,
+                                 rmm::device_uvector<value_idx>& n_chunks_per_row,
+                                 rmm::device_uvector<value_idx>& chunk_indices,
+                                 cudaStream_t stream)
+  {
     auto n_threads = std::min(n_rows, 256);
-    auto n_blocks = raft::ceildiv(n_rows, (value_idx)n_threads);
+    auto n_blocks  = raft::ceildiv(n_rows, (value_idx)n_threads);
 
     chunk_indices.resize(total_row_blocks, stream);
 
-    fill_chunk_indices_kernel<value_idx><<<n_blocks, n_threads, 0, stream>>>(
-      n_chunks_per_row.data(), chunk_indices.data(), n_rows);
+    fill_chunk_indices_kernel<value_idx>
+      <<<n_blocks, n_threads, 0, stream>>>(n_chunks_per_row.data(), chunk_indices.data(), n_rows);
   }
 };
 
diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/dense_smem_strategy.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/dense_smem_strategy.cuh
index c463654a3b..aac98d6b02 100644
--- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/dense_smem_strategy.cuh
+++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/dense_smem_strategy.cuh
@@ -25,71 +25,91 @@ namespace distance {
 template <typename value_idx, typename value_t, int tpb>
 class dense_smem_strategy : public coo_spmv_strategy<value_idx, value_t, tpb> {
  public:
-  using smem_type = value_t *;
+  using smem_type   = value_t*;
   using insert_type = smem_type;
-  using find_type = smem_type;
+  using find_type   = smem_type;
 
-  dense_smem_strategy(const distances_config_t<value_idx, value_t> &config_)
-    : coo_spmv_strategy<value_idx, value_t, tpb>(config_) {}
+  dense_smem_strategy(const distances_config_t<value_idx, value_t>& config_)
+    : coo_spmv_strategy<value_idx, value_t, tpb>(config_)
+  {
+  }
 
-  inline static int smem_per_block(int n_cols) {
-    return (n_cols * sizeof(value_t)) +
-           ((1024 / raft::warp_size()) * sizeof(value_t));
+  inline static int smem_per_block(int n_cols)
+  {
+    return (n_cols * sizeof(value_t)) + ((1024 / raft::warp_size()) * sizeof(value_t));
   }
 
   template <typename product_f, typename accum_f, typename write_f>
-  void dispatch(value_t *out_dists, value_idx *coo_rows_b,
-                product_f product_func, accum_f accum_func, write_f write_func,
-                int chunk_size) {
-    auto n_blocks_per_row =
-      raft::ceildiv(this->config.b_nnz, chunk_size * 1024);
-    auto n_blocks = this->config.a_nrows * n_blocks_per_row;
-
-    mask_row_it<value_idx> a_indptr(this->config.a_indptr,
-                                    this->config.a_nrows);
-
-    this->_dispatch_base(*this, this->config.b_ncols, a_indptr, out_dists,
-                         coo_rows_b, product_func, accum_func, write_func,
-                         chunk_size, n_blocks, n_blocks_per_row);
+  void dispatch(value_t* out_dists,
+                value_idx* coo_rows_b,
+                product_f product_func,
+                accum_f accum_func,
+                write_f write_func,
+                int chunk_size)
+  {
+    auto n_blocks_per_row = raft::ceildiv(this->config.b_nnz, chunk_size * 1024);
+    auto n_blocks         = this->config.a_nrows * n_blocks_per_row;
+
+    mask_row_it<value_idx> a_indptr(this->config.a_indptr, this->config.a_nrows);
+
+    this->_dispatch_base(*this,
+                         this->config.b_ncols,
+                         a_indptr,
+                         out_dists,
+                         coo_rows_b,
+                         product_func,
+                         accum_func,
+                         write_func,
+                         chunk_size,
+                         n_blocks,
+                         n_blocks_per_row);
   }
 
   template <typename product_f, typename accum_f, typename write_f>
-  void dispatch_rev(value_t *out_dists, value_idx *coo_rows_a,
-                    product_f product_func, accum_f accum_func,
-                    write_f write_func, int chunk_size) {
-    auto n_blocks_per_row =
-      raft::ceildiv(this->config.a_nnz, chunk_size * 1024);
-    auto n_blocks = this->config.b_nrows * n_blocks_per_row;
-
-    mask_row_it<value_idx> b_indptr(this->config.b_indptr,
-                                    this->config.b_nrows);
-
-    this->_dispatch_base_rev(*this, this->config.a_ncols, b_indptr, out_dists,
-                             coo_rows_a, product_func, accum_func, write_func,
-                             chunk_size, n_blocks, n_blocks_per_row);
+  void dispatch_rev(value_t* out_dists,
+                    value_idx* coo_rows_a,
+                    product_f product_func,
+                    accum_f accum_func,
+                    write_f write_func,
+                    int chunk_size)
+  {
+    auto n_blocks_per_row = raft::ceildiv(this->config.a_nnz, chunk_size * 1024);
+    auto n_blocks         = this->config.b_nrows * n_blocks_per_row;
+
+    mask_row_it<value_idx> b_indptr(this->config.b_indptr, this->config.b_nrows);
+
+    this->_dispatch_base_rev(*this,
+                             this->config.a_ncols,
+                             b_indptr,
+                             out_dists,
+                             coo_rows_a,
+                             product_func,
+                             accum_func,
+                             write_func,
+                             chunk_size,
+                             n_blocks,
+                             n_blocks_per_row);
   }
 
-  __device__ inline insert_type init_insert(smem_type cache,
-                                            const value_idx &cache_size) {
+  __device__ inline insert_type init_insert(smem_type cache, const value_idx& cache_size)
+  {
     for (int k = threadIdx.x; k < cache_size; k += blockDim.x) {
       cache[k] = 0.0;
     }
     return cache;
   }
 
-  __device__ inline void insert(insert_type cache, const value_idx &key,
-                                const value_t &value) {
+  __device__ inline void insert(insert_type cache, const value_idx& key, const value_t& value)
+  {
     cache[key] = value;
   }
 
-  __device__ inline find_type init_find(smem_type cache,
-                                        const value_idx &cache_size) {
+  __device__ inline find_type init_find(smem_type cache, const value_idx& cache_size)
+  {
     return cache;
   }
 
-  __device__ inline value_t find(find_type cache, const value_idx &key) {
-    return cache[key];
-  }
+  __device__ inline value_t find(find_type cache, const value_idx& key) { return cache[key]; }
 };
 
 }  // namespace distance
diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh
index 1295d24103..3f8f4b21ad 100644
--- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh
+++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh
@@ -1,18 +1,18 @@
 /*
-  * Copyright (c) 2021, NVIDIA CORPORATION.
-  *
-  * Licensed under the Apache License, Version 2.0 (the "License");
-  * you may not use this file except in compliance with the License.
-  * You may obtain a copy of the License at
-  *
-  *     http://www.apache.org/licenses/LICENSE-2.0
-  *
-  * Unless required by applicable law or agreed to in writing, software
-  * distributed under the License is distributed on an "AS IS" BASIS,
-  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  * See the License for the specific language governing permissions and
-  * limitations under the License.
-  */
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 
 #pragma once
 
@@ -38,177 +38,238 @@ template <typename value_idx, typename value_t, int tpb>
 class hash_strategy : public coo_spmv_strategy<value_idx, value_t, tpb> {
  public:
   using insert_type =
-    typename cuco::static_map<value_idx, value_t,
-                              cuda::thread_scope_block>::device_mutable_view;
-  using smem_type = typename insert_type::slot_type *;
+    typename cuco::static_map<value_idx, value_t, cuda::thread_scope_block>::device_mutable_view;
+  using smem_type = typename insert_type::slot_type*;
   using find_type =
-    typename cuco::static_map<value_idx, value_t,
-                              cuda::thread_scope_block>::device_view;
+    typename cuco::static_map<value_idx, value_t, cuda::thread_scope_block>::device_view;
 
-  hash_strategy(const distances_config_t<value_idx, value_t> &config_,
-                float capacity_threshold_ = 0.5, int map_size_ = get_map_size())
+  hash_strategy(const distances_config_t<value_idx, value_t>& config_,
+                float capacity_threshold_ = 0.5,
+                int map_size_             = get_map_size())
     : coo_spmv_strategy<value_idx, value_t, tpb>(config_),
       capacity_threshold(capacity_threshold_),
-      map_size(map_size_) {}
+      map_size(map_size_)
+  {
+  }
 
-  void chunking_needed(const value_idx *indptr, const value_idx n_rows,
-                       rmm::device_uvector<value_idx> &mask_indptr,
-                       std::tuple<value_idx, value_idx> &n_rows_divided,
-                       cudaStream_t stream) {
+  void chunking_needed(const value_idx* indptr,
+                       const value_idx n_rows,
+                       rmm::device_uvector<value_idx>& mask_indptr,
+                       std::tuple<value_idx, value_idx>& n_rows_divided,
+                       cudaStream_t stream)
+  {
     auto policy = rmm::exec_policy(stream);
 
-    auto less = thrust::copy_if(
-      policy, thrust::make_counting_iterator(value_idx(0)),
-      thrust::make_counting_iterator(n_rows), mask_indptr.data(),
-      fits_in_hash_table(indptr, 0, capacity_threshold * map_size));
+    auto less                   = thrust::copy_if(policy,
+                                thrust::make_counting_iterator(value_idx(0)),
+                                thrust::make_counting_iterator(n_rows),
+                                mask_indptr.data(),
+                                fits_in_hash_table(indptr, 0, capacity_threshold * map_size));
     std::get<0>(n_rows_divided) = less - mask_indptr.data();
 
     auto more = thrust::copy_if(
-      policy, thrust::make_counting_iterator(value_idx(0)),
-      thrust::make_counting_iterator(n_rows), less,
-      fits_in_hash_table(indptr, capacity_threshold * map_size,
-                         std::numeric_limits<value_idx>::max()));
+      policy,
+      thrust::make_counting_iterator(value_idx(0)),
+      thrust::make_counting_iterator(n_rows),
+      less,
+      fits_in_hash_table(
+        indptr, capacity_threshold * map_size, std::numeric_limits<value_idx>::max()));
     std::get<1>(n_rows_divided) = more - less;
   }
 
   template <typename product_f, typename accum_f, typename write_f>
-  void dispatch(value_t *out_dists, value_idx *coo_rows_b,
-                product_f product_func, accum_f accum_func, write_f write_func,
-                int chunk_size) {
+  void dispatch(value_t* out_dists,
+                value_idx* coo_rows_b,
+                product_f product_func,
+                accum_f accum_func,
+                write_f write_func,
+                int chunk_size)
+  {
     auto n_blocks_per_row = raft::ceildiv(this->config.b_nnz, chunk_size * tpb);
-    rmm::device_uvector<value_idx> mask_indptr(
-      this->config.a_nrows, this->config.handle.get_stream());
+    rmm::device_uvector<value_idx> mask_indptr(this->config.a_nrows,
+                                               this->config.handle.get_stream());
     std::tuple<value_idx, value_idx> n_rows_divided;
 
-    chunking_needed(this->config.a_indptr, this->config.a_nrows, mask_indptr,
-                    n_rows_divided, this->config.handle.get_stream());
+    chunking_needed(this->config.a_indptr,
+                    this->config.a_nrows,
+                    mask_indptr,
+                    n_rows_divided,
+                    this->config.handle.get_stream());
 
     auto less_rows = std::get<0>(n_rows_divided);
     if (less_rows > 0) {
-      mask_row_it<value_idx> less(this->config.a_indptr, less_rows,
-                                  mask_indptr.data());
+      mask_row_it<value_idx> less(this->config.a_indptr, less_rows, mask_indptr.data());
 
       auto n_less_blocks = less_rows * n_blocks_per_row;
-      this->_dispatch_base(*this, map_size, less, out_dists, coo_rows_b,
-                           product_func, accum_func, write_func, chunk_size,
-                           n_less_blocks, n_blocks_per_row);
+      this->_dispatch_base(*this,
+                           map_size,
+                           less,
+                           out_dists,
+                           coo_rows_b,
+                           product_func,
+                           accum_func,
+                           write_func,
+                           chunk_size,
+                           n_less_blocks,
+                           n_blocks_per_row);
     }
 
     auto more_rows = std::get<1>(n_rows_divided);
     if (more_rows > 0) {
-      rmm::device_uvector<value_idx> n_chunks_per_row(
-        more_rows + 1, this->config.handle.get_stream());
-      rmm::device_uvector<value_idx> chunk_indices(
-        0, this->config.handle.get_stream());
-      chunked_mask_row_it<value_idx>::init(
-        this->config.a_indptr, mask_indptr.data() + less_rows, more_rows,
-        capacity_threshold * map_size, n_chunks_per_row, chunk_indices,
-        this->config.handle.get_stream());
-
-      chunked_mask_row_it<value_idx> more(
-        this->config.a_indptr, more_rows, mask_indptr.data() + less_rows,
-        capacity_threshold * map_size, n_chunks_per_row.data(),
-        chunk_indices.data(), this->config.handle.get_stream());
+      rmm::device_uvector<value_idx> n_chunks_per_row(more_rows + 1,
+                                                      this->config.handle.get_stream());
+      rmm::device_uvector<value_idx> chunk_indices(0, this->config.handle.get_stream());
+      chunked_mask_row_it<value_idx>::init(this->config.a_indptr,
+                                           mask_indptr.data() + less_rows,
+                                           more_rows,
+                                           capacity_threshold * map_size,
+                                           n_chunks_per_row,
+                                           chunk_indices,
+                                           this->config.handle.get_stream());
+
+      chunked_mask_row_it<value_idx> more(this->config.a_indptr,
+                                          more_rows,
+                                          mask_indptr.data() + less_rows,
+                                          capacity_threshold * map_size,
+                                          n_chunks_per_row.data(),
+                                          chunk_indices.data(),
+                                          this->config.handle.get_stream());
 
       auto n_more_blocks = more.total_row_blocks * n_blocks_per_row;
-      this->_dispatch_base(*this, map_size, more, out_dists, coo_rows_b,
-                           product_func, accum_func, write_func, chunk_size,
-                           n_more_blocks, n_blocks_per_row);
+      this->_dispatch_base(*this,
+                           map_size,
+                           more,
+                           out_dists,
+                           coo_rows_b,
+                           product_func,
+                           accum_func,
+                           write_func,
+                           chunk_size,
+                           n_more_blocks,
+                           n_blocks_per_row);
     }
   }
 
   template <typename product_f, typename accum_f, typename write_f>
-  void dispatch_rev(value_t *out_dists, value_idx *coo_rows_a,
-                    product_f product_func, accum_f accum_func,
-                    write_f write_func, int chunk_size) {
+  void dispatch_rev(value_t* out_dists,
+                    value_idx* coo_rows_a,
+                    product_f product_func,
+                    accum_f accum_func,
+                    write_f write_func,
+                    int chunk_size)
+  {
     auto n_blocks_per_row = raft::ceildiv(this->config.a_nnz, chunk_size * tpb);
-    rmm::device_uvector<value_idx> mask_indptr(
-      this->config.b_nrows, this->config.handle.get_stream());
+    rmm::device_uvector<value_idx> mask_indptr(this->config.b_nrows,
+                                               this->config.handle.get_stream());
     std::tuple<value_idx, value_idx> n_rows_divided;
 
-    chunking_needed(this->config.b_indptr, this->config.b_nrows, mask_indptr,
-                    n_rows_divided, this->config.handle.get_stream());
+    chunking_needed(this->config.b_indptr,
+                    this->config.b_nrows,
+                    mask_indptr,
+                    n_rows_divided,
+                    this->config.handle.get_stream());
 
     auto less_rows = std::get<0>(n_rows_divided);
     if (less_rows > 0) {
-      mask_row_it<value_idx> less(this->config.b_indptr, less_rows,
-                                  mask_indptr.data());
+      mask_row_it<value_idx> less(this->config.b_indptr, less_rows, mask_indptr.data());
 
       auto n_less_blocks = less_rows * n_blocks_per_row;
-      this->_dispatch_base_rev(*this, map_size, less, out_dists, coo_rows_a,
-                               product_func, accum_func, write_func, chunk_size,
-                               n_less_blocks, n_blocks_per_row);
+      this->_dispatch_base_rev(*this,
+                               map_size,
+                               less,
+                               out_dists,
+                               coo_rows_a,
+                               product_func,
+                               accum_func,
+                               write_func,
+                               chunk_size,
+                               n_less_blocks,
+                               n_blocks_per_row);
     }
 
     auto more_rows = std::get<1>(n_rows_divided);
     if (more_rows > 0) {
-      rmm::device_uvector<value_idx> n_chunks_per_row(
-        more_rows + 1, this->config.handle.get_stream());
-      rmm::device_uvector<value_idx> chunk_indices(
-        0, this->config.handle.get_stream());
-      chunked_mask_row_it<value_idx>::init(
-        this->config.b_indptr, mask_indptr.data() + less_rows, more_rows,
-        capacity_threshold * map_size, n_chunks_per_row, chunk_indices,
-        this->config.handle.get_stream());
-
-      chunked_mask_row_it<value_idx> more(
-        this->config.b_indptr, more_rows, mask_indptr.data() + less_rows,
-        capacity_threshold * map_size, n_chunks_per_row.data(),
-        chunk_indices.data(), this->config.handle.get_stream());
+      rmm::device_uvector<value_idx> n_chunks_per_row(more_rows + 1,
+                                                      this->config.handle.get_stream());
+      rmm::device_uvector<value_idx> chunk_indices(0, this->config.handle.get_stream());
+      chunked_mask_row_it<value_idx>::init(this->config.b_indptr,
+                                           mask_indptr.data() + less_rows,
+                                           more_rows,
+                                           capacity_threshold * map_size,
+                                           n_chunks_per_row,
+                                           chunk_indices,
+                                           this->config.handle.get_stream());
+
+      chunked_mask_row_it<value_idx> more(this->config.b_indptr,
+                                          more_rows,
+                                          mask_indptr.data() + less_rows,
+                                          capacity_threshold * map_size,
+                                          n_chunks_per_row.data(),
+                                          chunk_indices.data(),
+                                          this->config.handle.get_stream());
 
       auto n_more_blocks = more.total_row_blocks * n_blocks_per_row;
-      this->_dispatch_base_rev(*this, map_size, more, out_dists, coo_rows_a,
-                               product_func, accum_func, write_func, chunk_size,
-                               n_more_blocks, n_blocks_per_row);
+      this->_dispatch_base_rev(*this,
+                               map_size,
+                               more,
+                               out_dists,
+                               coo_rows_a,
+                               product_func,
+                               accum_func,
+                               write_func,
+                               chunk_size,
+                               n_more_blocks,
+                               n_blocks_per_row);
     }
   }
 
-  __device__ inline insert_type init_insert(smem_type cache,
-                                            const value_idx &cache_size) {
+  __device__ inline insert_type init_insert(smem_type cache, const value_idx& cache_size)
+  {
     return insert_type::make_from_uninitialized_slots(
       cooperative_groups::this_thread_block(), cache, cache_size, -1, 0);
   }
 
-  __device__ inline void insert(insert_type cache, const value_idx &key,
-                                const value_t &value) {
+  __device__ inline void insert(insert_type cache, const value_idx& key, const value_t& value)
+  {
     auto success = cache.insert(cuco::pair<value_idx, value_t>(key, value));
   }
 
-  __device__ inline find_type init_find(smem_type cache,
-                                        const value_idx &cache_size) {
+  __device__ inline find_type init_find(smem_type cache, const value_idx& cache_size)
+  {
     return find_type(cache, cache_size, -1, 0);
   }
 
-  __device__ inline value_t find(find_type cache, const value_idx &key) {
+  __device__ inline value_t find(find_type cache, const value_idx& key)
+  {
     auto a_pair = cache.find(key);
 
     value_t a_col = 0.0;
-    if (a_pair != cache.end()) {
-      a_col = a_pair->second;
-    }
+    if (a_pair != cache.end()) { a_col = a_pair->second; }
     return a_col;
   }
 
   struct fits_in_hash_table {
    public:
-    fits_in_hash_table(const value_idx *indptr_, value_idx degree_l_,
-                       value_idx degree_r_)
-      : indptr(indptr_), degree_l(degree_l_), degree_r(degree_r_) {}
+    fits_in_hash_table(const value_idx* indptr_, value_idx degree_l_, value_idx degree_r_)
+      : indptr(indptr_), degree_l(degree_l_), degree_r(degree_r_)
+    {
+    }
 
-    __host__ __device__ bool operator()(const value_idx &i) {
+    __host__ __device__ bool operator()(const value_idx& i)
+    {
       auto degree = indptr[i + 1] - indptr[i];
 
       return degree >= degree_l && degree < degree_r;
     }
 
    private:
-    const value_idx *indptr;
+    const value_idx* indptr;
     const value_idx degree_l, degree_r;
   };
 
-  inline static int get_map_size() {
-    return (raft::getSharedMemPerBlock() -
-            ((tpb / raft::warp_size()) * sizeof(value_t))) /
+  inline static int get_map_size()
+  {
+    return (raft::getSharedMemPerBlock() - ((tpb / raft::warp_size()) * sizeof(value_t))) /
            sizeof(typename insert_type::slot_type);
   }
 
diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh
index 51f9a05394..b12252ab25 100644
--- a/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh
+++ b/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh
@@ -27,68 +27,88 @@ namespace sparse {
 namespace distance {
 
 /**
-  * Load-balanced sparse-matrix-sparse-matrix multiplication (SPMM) kernel with
-  * sparse-matrix-sparse-vector multiplication layout (SPMV).
-  * This is intended to be scheduled n_chunks_b times for each row of a.
-  * The steps are as follows:
-  *
-  * 1. Load row from A into dense vector in shared memory.
-  *    This can be further chunked in the future if necessary to support larger
-  *    column sizes.
-  * 2. Threads of block all step through chunks of B in parallel.
-  *    When a new row is encountered in row_indices_b, a segmented
-  *    reduction is performed across the warps and then across the
-  *    block and the final value written out to host memory.
-  *
-  * Reference: https://www.icl.utk.edu/files/publications/2020/icl-utk-1421-2020.pdf
-  *
-  * @tparam value_idx index type
-  * @tparam value_t value type
-  * @tparam tpb threads per block configured on launch
-  * @tparam rev if this is true, the reduce/accumulate functions are only
-  *         executed when A[col] == 0.0. when executed before/after !rev
-  *         and A & B are reversed, this allows the full symmetric difference
-  *         and intersection to be computed.
-  * @tparam kv_t data type stored in shared mem cache
-  * @tparam product_f reduce function type (semiring product() function).
-  *                  accepts two arguments of value_t and returns a value_t
-  * @tparam accum_f accumulation function type (semiring sum() function).
-  *                 accepts two arguments of value_t and returns a value_t
-  * @tparam write_f function to write value out. this should be mathematically
-  *                 equivalent to the accumulate function but implemented as
-  *                 an atomic operation on global memory. Accepts two arguments
-  *                 of value_t* and value_t and updates the value given by the
-  *                 pointer.
-  * @param[in] indptrA column pointer array for A
-  * @param[in] indicesA column indices array for A
-  * @param[in] dataA data array for A
-  * @param[in] rowsB coo row array for B
-  * @param[in] indicesB column indices array for B
-  * @param[in] dataB data array for B
-  * @param[in] m number of rows in A
-  * @param[in] n number of rows in B
-  * @param[in] dim number of features
-  * @param[in] nnz_b number of nonzeros in B
-  * @param[out] out array of size m*n
-  * @param[in] n_blocks_per_row number of blocks of B per row of A
-  * @param[in] chunk_size number of nnz for B to use for each row of A
-  * @param[in] buffer_size amount of smem to use for each row of A
-  * @param[in] product_func semiring product() function
-  * @param[in] accum_func semiring sum() function
-  * @param[in] write_func atomic semiring sum() function
-  */
-template <typename strategy_t, typename indptr_it, typename value_idx,
-          typename value_t, bool rev, int tpb, typename product_f,
-          typename accum_f, typename write_f>
-__global__ void balanced_coo_generalized_spmv_kernel(
-  strategy_t strategy, indptr_it indptrA, value_idx *indicesA, value_t *dataA,
-  value_idx nnz_a, value_idx *rowsB, value_idx *indicesB, value_t *dataB,
-  value_idx m, value_idx n, int dim, value_idx nnz_b, value_t *out,
-  int n_blocks_per_row, int chunk_size, value_idx b_ncols,
-  product_f product_func, accum_f accum_func, write_f write_func) {
+ * Load-balanced sparse-matrix-sparse-matrix multiplication (SPMM) kernel with
+ * sparse-matrix-sparse-vector multiplication layout (SPMV).
+ * This is intended to be scheduled n_chunks_b times for each row of a.
+ * The steps are as follows:
+ *
+ * 1. Load row from A into dense vector in shared memory.
+ *    This can be further chunked in the future if necessary to support larger
+ *    column sizes.
+ * 2. Threads of block all step through chunks of B in parallel.
+ *    When a new row is encountered in row_indices_b, a segmented
+ *    reduction is performed across the warps and then across the
+ *    block and the final value written out to host memory.
+ *
+ * Reference: https://www.icl.utk.edu/files/publications/2020/icl-utk-1421-2020.pdf
+ *
+ * @tparam value_idx index type
+ * @tparam value_t value type
+ * @tparam tpb threads per block configured on launch
+ * @tparam rev if this is true, the reduce/accumulate functions are only
+ *         executed when A[col] == 0.0. when executed before/after !rev
+ *         and A & B are reversed, this allows the full symmetric difference
+ *         and intersection to be computed.
+ * @tparam kv_t data type stored in shared mem cache
+ * @tparam product_f reduce function type (semiring product() function).
+ *                  accepts two arguments of value_t and returns a value_t
+ * @tparam accum_f accumulation function type (semiring sum() function).
+ *                 accepts two arguments of value_t and returns a value_t
+ * @tparam write_f function to write value out. this should be mathematically
+ *                 equivalent to the accumulate function but implemented as
+ *                 an atomic operation on global memory. Accepts two arguments
+ *                 of value_t* and value_t and updates the value given by the
+ *                 pointer.
+ * @param[in] indptrA column pointer array for A
+ * @param[in] indicesA column indices array for A
+ * @param[in] dataA data array for A
+ * @param[in] rowsB coo row array for B
+ * @param[in] indicesB column indices array for B
+ * @param[in] dataB data array for B
+ * @param[in] m number of rows in A
+ * @param[in] n number of rows in B
+ * @param[in] dim number of features
+ * @param[in] nnz_b number of nonzeros in B
+ * @param[out] out array of size m*n
+ * @param[in] n_blocks_per_row number of blocks of B per row of A
+ * @param[in] chunk_size number of nnz for B to use for each row of A
+ * @param[in] buffer_size amount of smem to use for each row of A
+ * @param[in] product_func semiring product() function
+ * @param[in] accum_func semiring sum() function
+ * @param[in] write_func atomic semiring sum() function
+ */
+template <typename strategy_t,
+          typename indptr_it,
+          typename value_idx,
+          typename value_t,
+          bool rev,
+          int tpb,
+          typename product_f,
+          typename accum_f,
+          typename write_f>
+__global__ void balanced_coo_generalized_spmv_kernel(strategy_t strategy,
+                                                     indptr_it indptrA,
+                                                     value_idx* indicesA,
+                                                     value_t* dataA,
+                                                     value_idx nnz_a,
+                                                     value_idx* rowsB,
+                                                     value_idx* indicesB,
+                                                     value_t* dataB,
+                                                     value_idx m,
+                                                     value_idx n,
+                                                     int dim,
+                                                     value_idx nnz_b,
+                                                     value_t* out,
+                                                     int n_blocks_per_row,
+                                                     int chunk_size,
+                                                     value_idx b_ncols,
+                                                     product_f product_func,
+                                                     accum_f accum_func,
+                                                     write_f write_func)
+{
   typedef cub::WarpReduce<value_t> warp_reduce;
 
-  value_idx cur_row_a = indptrA.get_row_idx(n_blocks_per_row);
+  value_idx cur_row_a        = indptrA.get_row_idx(n_blocks_per_row);
   value_idx cur_chunk_offset = blockIdx.x % n_blocks_per_row;
 
   // chunk starting offset
@@ -96,18 +116,17 @@ __global__ void balanced_coo_generalized_spmv_kernel(
   // how many total cols will be processed by this block (should be <= chunk_size * n_threads)
   value_idx active_chunk_size = min(chunk_size * tpb, nnz_b - ind_offset);
 
-  int tid = threadIdx.x;
+  int tid     = threadIdx.x;
   int warp_id = tid / raft::warp_size();
 
   // compute id relative to current warp
   unsigned int lane_id = tid & (raft::warp_size() - 1);
-  value_idx ind = ind_offset + threadIdx.x;
+  value_idx ind        = ind_offset + threadIdx.x;
 
   extern __shared__ char smem[];
 
-  typename strategy_t::smem_type A = (typename strategy_t::smem_type)(smem);
-  typename warp_reduce::TempStorage *temp_storage =
-    (typename warp_reduce::TempStorage *)(A + dim);
+  typename strategy_t::smem_type A                = (typename strategy_t::smem_type)(smem);
+  typename warp_reduce::TempStorage* temp_storage = (typename warp_reduce::TempStorage*)(A + dim);
 
   auto inserter = strategy.init_insert(A, dim);
 
@@ -115,13 +134,12 @@ __global__ void balanced_coo_generalized_spmv_kernel(
 
   value_idx start_offset_a, stop_offset_a;
   bool first_a_chunk, last_a_chunk;
-  indptrA.get_row_offsets(cur_row_a, start_offset_a, stop_offset_a,
-                          n_blocks_per_row, first_a_chunk, last_a_chunk);
+  indptrA.get_row_offsets(
+    cur_row_a, start_offset_a, stop_offset_a, n_blocks_per_row, first_a_chunk, last_a_chunk);
 
   // Convert current row vector in A to dense
   for (int i = tid; i <= (stop_offset_a - start_offset_a); i += blockDim.x) {
-    strategy.insert(inserter, indicesA[start_offset_a + i],
-                    dataA[start_offset_a + i]);
+    strategy.insert(inserter, indicesA[start_offset_a + i], dataA[start_offset_a + i]);
   }
 
   __syncthreads();
@@ -132,34 +150,36 @@ __global__ void balanced_coo_generalized_spmv_kernel(
   if (ind >= nnz_b) return;
 
   value_idx start_index_a = 0, stop_index_a = b_ncols - 1;
-  indptrA.get_indices_boundary(indicesA, cur_row_a, start_offset_a,
-                               stop_offset_a, start_index_a, stop_index_a,
-                               first_a_chunk, last_a_chunk);
+  indptrA.get_indices_boundary(indicesA,
+                               cur_row_a,
+                               start_offset_a,
+                               stop_offset_a,
+                               start_index_a,
+                               stop_index_a,
+                               first_a_chunk,
+                               last_a_chunk);
 
   value_idx cur_row_b = -1;
-  value_t c = 0.0;
+  value_t c           = 0.0;
 
   auto warp_red = warp_reduce(*(temp_storage + warp_id));
 
   if (tid < active_chunk_size) {
     cur_row_b = rowsB[ind];
 
-    auto index_b = indicesB[ind];
-    auto in_bounds =
-      indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b);
+    auto index_b   = indicesB[ind];
+    auto in_bounds = indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b);
 
     if (in_bounds) {
       value_t a_col = strategy.find(finder, index_b);
-      if (!rev || a_col == 0.0) {
-        c = product_func(a_col, dataB[ind]);
-      }
+      if (!rev || a_col == 0.0) { c = product_func(a_col, dataB[ind]); }
     }
   }
 
   // loop through chunks in parallel, reducing when a new row is
   // encountered by each thread
   for (int i = tid; i < active_chunk_size; i += blockDim.x) {
-    value_idx ind_next = ind + blockDim.x;
+    value_idx ind_next   = ind + blockDim.x;
     value_idx next_row_b = -1;
 
     if (i + blockDim.x < active_chunk_size) next_row_b = rowsB[ind_next];
@@ -170,14 +190,13 @@ __global__ void balanced_coo_generalized_spmv_kernel(
       // grab the threads currently participating in loops.
       // because any other threads should have returned already.
       unsigned int peer_group = __match_any_sync(0xffffffff, cur_row_b);
-      bool is_leader = get_lowest_peer(peer_group) == lane_id;
-      value_t v = warp_red.HeadSegmentedReduce(c, is_leader, accum_func);
+      bool is_leader          = get_lowest_peer(peer_group) == lane_id;
+      value_t v               = warp_red.HeadSegmentedReduce(c, is_leader, accum_func);
 
       // thread with lowest lane id among peers writes out
       if (is_leader && v != 0.0) {
         // this conditional should be uniform, since rev is constant
-        size_t idx = !rev ? (size_t)cur_row_a * n + cur_row_b
-                          : (size_t)cur_row_b * m + cur_row_a;
+        size_t idx = !rev ? (size_t)cur_row_a * n + cur_row_b : (size_t)cur_row_b * m + cur_row_a;
         write_func(out + idx, v);
       }
 
@@ -187,15 +206,12 @@ __global__ void balanced_coo_generalized_spmv_kernel(
     if (next_row_b != -1) {
       ind = ind_next;
 
-      auto index_b = indicesB[ind];
-      auto in_bounds =
-        indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b);
+      auto index_b   = indicesB[ind];
+      auto in_bounds = indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b);
       if (in_bounds) {
         value_t a_col = strategy.find(finder, index_b);
 
-        if (!rev || a_col == 0.0) {
-          c = accum_func(c, product_func(a_col, dataB[ind]));
-        }
+        if (!rev || a_col == 0.0) { c = accum_func(c, product_func(a_col, dataB[ind])); }
       }
 
       cur_row_b = next_row_b;
diff --git a/cpp/include/raft/sparse/distance/distance.cuh b/cpp/include/raft/sparse/distance/distance.cuh
index a1974b3666..228a62ed7a 100644
--- a/cpp/include/raft/sparse/distance/distance.cuh
+++ b/cpp/include/raft/sparse/distance/distance.cuh
@@ -74,16 +74,17 @@ static const std::unordered_set<raft::distance::DistanceType> supportedDistance{
  * @param[in] metric distance metric to use
  */
 template <typename value_idx = int, typename value_t = float>
-void pairwiseDistance(value_t *out,
+void pairwiseDistance(value_t* out,
                       distances_config_t<value_idx, value_t> input_config,
-                      raft::distance::DistanceType metric, float metric_arg) {
+                      raft::distance::DistanceType metric,
+                      float metric_arg)
+{
   switch (metric) {
     case raft::distance::DistanceType::L2Expanded:
       l2_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::L2SqrtExpanded:
-      l2_sqrt_expanded_distances_t<value_idx, value_t>(input_config)
-        .compute(out);
+      l2_sqrt_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::InnerProduct:
       ip_distances_t<value_idx, value_t>(input_config).compute(out);
@@ -92,62 +93,49 @@ void pairwiseDistance(value_t *out,
       l2_unexpanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::L2SqrtUnexpanded:
-      l2_sqrt_unexpanded_distances_t<value_idx, value_t>(input_config)
-        .compute(out);
+      l2_sqrt_unexpanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::L1:
       l1_unexpanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::LpUnexpanded:
-      lp_unexpanded_distances_t<value_idx, value_t>(input_config, metric_arg)
-        .compute(out);
+      lp_unexpanded_distances_t<value_idx, value_t>(input_config, metric_arg).compute(out);
       break;
     case raft::distance::DistanceType::Linf:
-      linf_unexpanded_distances_t<value_idx, value_t>(input_config)
-        .compute(out);
+      linf_unexpanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::Canberra:
-      canberra_unexpanded_distances_t<value_idx, value_t>(input_config)
-        .compute(out);
+      canberra_unexpanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::JaccardExpanded:
-      jaccard_expanded_distances_t<value_idx, value_t>(input_config)
-        .compute(out);
+      jaccard_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::CosineExpanded:
-      cosine_expanded_distances_t<value_idx, value_t>(input_config)
-        .compute(out);
+      cosine_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::HellingerExpanded:
-      hellinger_expanded_distances_t<value_idx, value_t>(input_config)
-        .compute(out);
+      hellinger_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::DiceExpanded:
       dice_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::CorrelationExpanded:
-      correlation_expanded_distances_t<value_idx, value_t>(input_config)
-        .compute(out);
+      correlation_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::RusselRaoExpanded:
-      russelrao_expanded_distances_t<value_idx, value_t>(input_config)
-        .compute(out);
+      russelrao_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::HammingUnexpanded:
-      hamming_unexpanded_distances_t<value_idx, value_t>(input_config)
-        .compute(out);
+      hamming_unexpanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::JensenShannon:
-      jensen_shannon_unexpanded_distances_t<value_idx, value_t>(input_config)
-        .compute(out);
+      jensen_shannon_unexpanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::KLDivergence:
-      kl_divergence_unexpanded_distances_t<value_idx, value_t>(input_config)
-        .compute(out);
+      kl_divergence_unexpanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
 
-    default:
-      THROW("Unsupported distance: %d", metric);
+    default: THROW("Unsupported distance: %d", metric);
   }
 }
 
diff --git a/cpp/include/raft/sparse/distance/ip_distance.cuh b/cpp/include/raft/sparse/distance/ip_distance.cuh
index 882ccba027..8d77f9f5b5 100644
--- a/cpp/include/raft/sparse/distance/ip_distance.cuh
+++ b/cpp/include/raft/sparse/distance/ip_distance.cuh
@@ -45,10 +45,13 @@ class ip_distances_t : public distances_t<value_t> {
    * Computes simple sparse inner product distances as sum(x_y * y_k)
    * @param[in] config specifies inputs, outputs, and sizes
    */
-  ip_distances_t(const distances_config_t<value_idx, value_t> &config)
-    : config_(&config), coo_rows_b(config.b_nnz, config.handle.get_stream()) {
-    raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows,
-                                      coo_rows_b.data(), config_->b_nnz,
+  ip_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config), coo_rows_b(config.b_nnz, config.handle.get_stream())
+  {
+    raft::sparse::convert::csr_to_coo(config_->b_indptr,
+                                      config_->b_nrows,
+                                      coo_rows_b.data(),
+                                      config_->b_nnz,
                                       config_->handle.get_stream());
   }
 
@@ -56,21 +59,21 @@ class ip_distances_t : public distances_t<value_t> {
    * Performs pairwise distance computation and computes output distances
    * @param out_distances dense output matrix (size a_nrows * b_nrows)
    */
-  void compute(value_t *out_distances) {
+  void compute(value_t* out_distances)
+  {
     /**
-	   * Compute pairwise distances and return dense matrix in row-major format
-	   */
+     * Compute pairwise distances and return dense matrix in row-major format
+     */
     balanced_coo_pairwise_generalized_spmv<value_idx, value_t>(
-      out_distances, *config_, coo_rows_b.data(), Product(), Sum(),
-      AtomicAdd());
+      out_distances, *config_, coo_rows_b.data(), Product(), Sum(), AtomicAdd());
   }
 
-  value_idx *b_rows_coo() { return coo_rows_b.data(); }
+  value_idx* b_rows_coo() { return coo_rows_b.data(); }
 
-  value_t *b_data_coo() { return config_->b_data; }
+  value_t* b_data_coo() { return config_->b_data; }
 
  private:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
   rmm::device_uvector<value_idx> coo_rows_b;
 };
 };  // END namespace distance
diff --git a/cpp/include/raft/sparse/distance/l2_distance.cuh b/cpp/include/raft/sparse/distance/l2_distance.cuh
index 8886d4c9df..a9a2d1ee91 100644
--- a/cpp/include/raft/sparse/distance/l2_distance.cuh
+++ b/cpp/include/raft/sparse/distance/l2_distance.cuh
@@ -41,35 +41,36 @@ namespace distance {
 
 // @TODO: Move this into sparse prims (coo_norm)
 template <typename value_idx, typename value_t>
-__global__ void compute_row_norm_kernel(value_t *out,
-                                        const value_idx *__restrict__ coo_rows,
-                                        const value_t *__restrict__ data,
-                                        value_idx nnz) {
+__global__ void compute_row_norm_kernel(value_t* out,
+                                        const value_idx* __restrict__ coo_rows,
+                                        const value_t* __restrict__ data,
+                                        value_idx nnz)
+{
   value_idx i = blockDim.x * blockIdx.x + threadIdx.x;
-  if (i < nnz) {
-    atomicAdd(&out[coo_rows[i]], data[i] * data[i]);
-  }
+  if (i < nnz) { atomicAdd(&out[coo_rows[i]], data[i] * data[i]); }
 }
 
 template <typename value_idx, typename value_t>
-__global__ void compute_row_sum_kernel(value_t *out,
-                                       const value_idx *__restrict__ coo_rows,
-                                       const value_t *__restrict__ data,
-                                       value_idx nnz) {
+__global__ void compute_row_sum_kernel(value_t* out,
+                                       const value_idx* __restrict__ coo_rows,
+                                       const value_t* __restrict__ data,
+                                       value_idx nnz)
+{
   value_idx i = blockDim.x * blockIdx.x + threadIdx.x;
-  if (i < nnz) {
-    atomicAdd(&out[coo_rows[i]], data[i]);
-  }
+  if (i < nnz) { atomicAdd(&out[coo_rows[i]], data[i]); }
 }
 
 template <typename value_idx, typename value_t, typename expansion_f>
-__global__ void compute_euclidean_warp_kernel(
-  value_t *__restrict__ C, const value_t *__restrict__ Q_sq_norms,
-  const value_t *__restrict__ R_sq_norms, value_idx n_rows, value_idx n_cols,
-  expansion_f expansion_func) {
+__global__ void compute_euclidean_warp_kernel(value_t* __restrict__ C,
+                                              const value_t* __restrict__ Q_sq_norms,
+                                              const value_t* __restrict__ R_sq_norms,
+                                              value_idx n_rows,
+                                              value_idx n_cols,
+                                              expansion_f expansion_func)
+{
   value_idx tid = blockDim.x * blockIdx.x + threadIdx.x;
-  value_idx i = tid / n_cols;
-  value_idx j = tid % n_cols;
+  value_idx i   = tid / n_cols;
+  value_idx j   = tid % n_cols;
 
   if (i >= n_rows || j >= n_cols) return;
 
@@ -83,25 +84,29 @@ __global__ void compute_euclidean_warp_kernel(
 }
 
 template <typename value_idx, typename value_t>
-__global__ void compute_correlation_warp_kernel(
-  value_t *__restrict__ C, const value_t *__restrict__ Q_sq_norms,
-  const value_t *__restrict__ R_sq_norms, const value_t *__restrict__ Q_norms,
-  const value_t *__restrict__ R_norms, value_idx n_rows, value_idx n_cols,
-  value_idx n) {
+__global__ void compute_correlation_warp_kernel(value_t* __restrict__ C,
+                                                const value_t* __restrict__ Q_sq_norms,
+                                                const value_t* __restrict__ R_sq_norms,
+                                                const value_t* __restrict__ Q_norms,
+                                                const value_t* __restrict__ R_norms,
+                                                value_idx n_rows,
+                                                value_idx n_cols,
+                                                value_idx n)
+{
   value_idx tid = blockDim.x * blockIdx.x + threadIdx.x;
-  value_idx i = tid / n_cols;
-  value_idx j = tid % n_cols;
+  value_idx i   = tid / n_cols;
+  value_idx j   = tid % n_cols;
 
   if (i >= n_rows || j >= n_cols) return;
 
-  value_t dot = C[(size_t)i * n_cols + j];
+  value_t dot  = C[(size_t)i * n_cols + j];
   value_t Q_l1 = Q_norms[i];
   value_t R_l1 = R_norms[j];
 
   value_t Q_l2 = Q_sq_norms[i];
   value_t R_l2 = R_sq_norms[j];
 
-  value_t numer = n * dot - (Q_l1 * R_l1);
+  value_t numer   = n * dot - (Q_l1 * R_l1);
   value_t Q_denom = n * Q_l2 - (Q_l1 * Q_l1);
   value_t R_denom = n * R_l2 - (R_l1 * R_l1);
 
@@ -111,58 +116,77 @@ __global__ void compute_correlation_warp_kernel(
   C[(size_t)i * n_cols + j] = val * (fabs(val) >= 0.0001);
 }
 
-template <typename value_idx, typename value_t, int tpb = 256,
-          typename expansion_f>
-void compute_euclidean(value_t *C, const value_t *Q_sq_norms,
-                       const value_t *R_sq_norms, value_idx n_rows,
-                       value_idx n_cols, cudaStream_t stream,
-                       expansion_f expansion_func) {
+template <typename value_idx, typename value_t, int tpb = 256, typename expansion_f>
+void compute_euclidean(value_t* C,
+                       const value_t* Q_sq_norms,
+                       const value_t* R_sq_norms,
+                       value_idx n_rows,
+                       value_idx n_cols,
+                       cudaStream_t stream,
+                       expansion_f expansion_func)
+{
   int blocks = raft::ceildiv<size_t>((size_t)n_rows * n_cols, tpb);
   compute_euclidean_warp_kernel<<<blocks, tpb, 0, stream>>>(
     C, Q_sq_norms, R_sq_norms, n_rows, n_cols, expansion_func);
 }
 
-template <typename value_idx, typename value_t, int tpb = 256,
-          typename expansion_f>
-void compute_l2(value_t *out, const value_idx *Q_coo_rows,
-                const value_t *Q_data, value_idx Q_nnz,
-                const value_idx *R_coo_rows, const value_t *R_data,
-                value_idx R_nnz, value_idx m, value_idx n,
+template <typename value_idx, typename value_t, int tpb = 256, typename expansion_f>
+void compute_l2(value_t* out,
+                const value_idx* Q_coo_rows,
+                const value_t* Q_data,
+                value_idx Q_nnz,
+                const value_idx* R_coo_rows,
+                const value_t* R_data,
+                value_idx R_nnz,
+                value_idx m,
+                value_idx n,
                 std::shared_ptr<raft::mr::device::allocator> alloc,
-                cudaStream_t stream, expansion_f expansion_func) {
+                cudaStream_t stream,
+                expansion_f expansion_func)
+{
   rmm::device_uvector<value_t> Q_sq_norms(m, stream);
   rmm::device_uvector<value_t> R_sq_norms(n, stream);
-  CUDA_CHECK(
-    cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t)));
-  CUDA_CHECK(
-    cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t)));
+  CUDA_CHECK(cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t)));
+  CUDA_CHECK(cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t)));
 
   compute_row_norm_kernel<<<raft::ceildiv(Q_nnz, tpb), tpb, 0, stream>>>(
     Q_sq_norms.data(), Q_coo_rows, Q_data, Q_nnz);
   compute_row_norm_kernel<<<raft::ceildiv(R_nnz, tpb), tpb, 0, stream>>>(
     R_sq_norms.data(), R_coo_rows, R_data, R_nnz);
 
-  compute_euclidean(out, Q_sq_norms.data(), R_sq_norms.data(), m, n, stream,
-                    expansion_func);
+  compute_euclidean(out, Q_sq_norms.data(), R_sq_norms.data(), m, n, stream, expansion_func);
 }
 
 template <typename value_idx, typename value_t, int tpb = 256>
-void compute_correlation(value_t *C, const value_t *Q_sq_norms,
-                         const value_t *R_sq_norms, const value_t *Q_norms,
-                         const value_t *R_norms, value_idx n_rows,
-                         value_idx n_cols, value_idx n, cudaStream_t stream) {
+void compute_correlation(value_t* C,
+                         const value_t* Q_sq_norms,
+                         const value_t* R_sq_norms,
+                         const value_t* Q_norms,
+                         const value_t* R_norms,
+                         value_idx n_rows,
+                         value_idx n_cols,
+                         value_idx n,
+                         cudaStream_t stream)
+{
   int blocks = raft::ceildiv<size_t>((size_t)n_rows * n_cols, tpb);
   compute_correlation_warp_kernel<<<blocks, tpb, 0, stream>>>(
     C, Q_sq_norms, R_sq_norms, Q_norms, R_norms, n_rows, n_cols, n);
 }
 
 template <typename value_idx, typename value_t, int tpb = 256>
-void compute_corr(value_t *out, const value_idx *Q_coo_rows,
-                  const value_t *Q_data, value_idx Q_nnz,
-                  const value_idx *R_coo_rows, const value_t *R_data,
-                  value_idx R_nnz, value_idx m, value_idx n, value_idx n_cols,
+void compute_corr(value_t* out,
+                  const value_idx* Q_coo_rows,
+                  const value_t* Q_data,
+                  value_idx Q_nnz,
+                  const value_idx* R_coo_rows,
+                  const value_t* R_data,
+                  value_idx R_nnz,
+                  value_idx m,
+                  value_idx n,
+                  value_idx n_cols,
                   std::shared_ptr<raft::mr::device::allocator> alloc,
-                  cudaStream_t stream) {
+                  cudaStream_t stream)
+{
   // sum_sq for std dev
   rmm::device_uvector<value_t> Q_sq_norms(m, stream);
   rmm::device_uvector<value_t> R_sq_norms(n, stream);
@@ -171,15 +195,11 @@ void compute_corr(value_t *out, const value_idx *Q_coo_rows,
   rmm::device_uvector<value_t> Q_norms(m, stream);
   rmm::device_uvector<value_t> R_norms(n, stream);
 
-  CUDA_CHECK(
-    cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t)));
-  CUDA_CHECK(
-    cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t)));
+  CUDA_CHECK(cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t)));
+  CUDA_CHECK(cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t)));
 
-  CUDA_CHECK(
-    cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t)));
-  CUDA_CHECK(
-    cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t)));
+  CUDA_CHECK(cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t)));
+  CUDA_CHECK(cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t)));
 
   compute_row_norm_kernel<<<raft::ceildiv(Q_nnz, tpb), tpb, 0, stream>>>(
     Q_sq_norms.data(), Q_coo_rows, Q_data, Q_nnz);
@@ -191,8 +211,15 @@ void compute_corr(value_t *out, const value_idx *Q_coo_rows,
   compute_row_sum_kernel<<<raft::ceildiv(R_nnz, tpb), tpb, 0, stream>>>(
     R_norms.data(), R_coo_rows, R_data, R_nnz);
 
-  compute_correlation(out, Q_sq_norms.data(), R_sq_norms.data(), Q_norms.data(),
-                      R_norms.data(), m, n, n_cols, stream);
+  compute_correlation(out,
+                      Q_sq_norms.data(),
+                      R_sq_norms.data(),
+                      Q_norms.data(),
+                      R_norms.data(),
+                      m,
+                      n,
+                      n_cols,
+                      stream);
 }
 
 /**
@@ -202,35 +229,45 @@ void compute_corr(value_t *out, const value_idx *Q_coo_rows,
 template <typename value_idx = int, typename value_t = float>
 class l2_expanded_distances_t : public distances_t<value_t> {
  public:
-  explicit l2_expanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config), ip_dists(config) {}
+  explicit l2_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config), ip_dists(config)
+  {
+  }
 
-  void compute(value_t *out_dists) {
+  void compute(value_t* out_dists)
+  {
     ip_dists.compute(out_dists);
 
-    value_idx *b_indices = ip_dists.b_rows_coo();
-    value_t *b_data = ip_dists.b_data_coo();
+    value_idx* b_indices = ip_dists.b_rows_coo();
+    value_t* b_data      = ip_dists.b_data_coo();
 
-    rmm::device_uvector<value_idx> search_coo_rows(
-      config_->a_nnz, config_->handle.get_stream());
-    raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
-                                      search_coo_rows.data(), config_->a_nnz,
+    rmm::device_uvector<value_idx> search_coo_rows(config_->a_nnz, config_->handle.get_stream());
+    raft::sparse::convert::csr_to_coo(config_->a_indptr,
+                                      config_->a_nrows,
+                                      search_coo_rows.data(),
+                                      config_->a_nnz,
                                       config_->handle.get_stream());
 
-    compute_l2(
-      out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz,
-      b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows,
-      config_->handle.get_device_allocator(), config_->handle.get_stream(),
-      [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
-        return -2 * dot + q_norm + r_norm;
-      });
+    compute_l2(out_dists,
+               search_coo_rows.data(),
+               config_->a_data,
+               config_->a_nnz,
+               b_indices,
+               b_data,
+               config_->b_nnz,
+               config_->a_nrows,
+               config_->b_nrows,
+               config_->handle.get_device_allocator(),
+               config_->handle.get_stream(),
+               [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
+                 return -2 * dot + q_norm + r_norm;
+               });
   }
 
   ~l2_expanded_distances_t() = default;
 
  protected:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
   ip_distances_t<value_idx, value_t> ip_dists;
 };
 
@@ -239,18 +276,21 @@ class l2_expanded_distances_t : public distances_t<value_t> {
  * The expanded form is more efficient for sparse data.
  */
 template <typename value_idx = int, typename value_t = float>
-class l2_sqrt_expanded_distances_t
-  : public l2_expanded_distances_t<value_idx, value_t> {
+class l2_sqrt_expanded_distances_t : public l2_expanded_distances_t<value_idx, value_t> {
  public:
-  explicit l2_sqrt_expanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : l2_expanded_distances_t<value_idx, value_t>(config) {}
+  explicit l2_sqrt_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : l2_expanded_distances_t<value_idx, value_t>(config)
+  {
+  }
 
-  void compute(value_t *out_dists) override {
+  void compute(value_t* out_dists) override
+  {
     l2_expanded_distances_t<value_idx, value_t>::compute(out_dists);
     // Sqrt Post-processing
     raft::linalg::unaryOp<value_t>(
-      out_dists, out_dists, this->config_->a_nrows * this->config_->b_nrows,
+      out_dists,
+      out_dists,
+      this->config_->a_nrows * this->config_->b_nrows,
       [] __device__(value_t input) {
         int neg = input < 0 ? -1 : 1;
         return sqrt(abs(input) * neg);
@@ -264,25 +304,35 @@ class l2_sqrt_expanded_distances_t
 template <typename value_idx, typename value_t>
 class correlation_expanded_distances_t : public distances_t<value_t> {
  public:
-  explicit correlation_expanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config), ip_dists(config) {}
+  explicit correlation_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config), ip_dists(config)
+  {
+  }
 
-  void compute(value_t *out_dists) {
+  void compute(value_t* out_dists)
+  {
     ip_dists.compute(out_dists);
 
-    value_idx *b_indices = ip_dists.b_rows_coo();
-    value_t *b_data = ip_dists.b_data_coo();
+    value_idx* b_indices = ip_dists.b_rows_coo();
+    value_t* b_data      = ip_dists.b_data_coo();
 
-    rmm::device_uvector<value_idx> search_coo_rows(
-      config_->a_nnz, config_->handle.get_stream());
-    raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
-                                      search_coo_rows.data(), config_->a_nnz,
+    rmm::device_uvector<value_idx> search_coo_rows(config_->a_nnz, config_->handle.get_stream());
+    raft::sparse::convert::csr_to_coo(config_->a_indptr,
+                                      config_->a_nrows,
+                                      search_coo_rows.data(),
+                                      config_->a_nnz,
                                       config_->handle.get_stream());
 
-    compute_corr(out_dists, search_coo_rows.data(), config_->a_data,
-                 config_->a_nnz, b_indices, b_data, config_->b_nnz,
-                 config_->a_nrows, config_->b_nrows, config_->b_ncols,
+    compute_corr(out_dists,
+                 search_coo_rows.data(),
+                 config_->a_data,
+                 config_->a_nnz,
+                 b_indices,
+                 b_data,
+                 config_->b_nnz,
+                 config_->a_nrows,
+                 config_->b_nrows,
+                 config_->b_ncols,
                  config_->handle.get_device_allocator(),
                  config_->handle.get_stream());
   }
@@ -290,54 +340,62 @@ class correlation_expanded_distances_t : public distances_t<value_t> {
   ~correlation_expanded_distances_t() = default;
 
  protected:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
   ip_distances_t<value_idx, value_t> ip_dists;
 };
 
 /**
- * Cosine distance using the expanded form: 1 - ( sum(x_k * y_k) / (sqrt(sum(x_k)^2) * sqrt(sum(y_k)^2)))
- * The expanded form is more efficient for sparse data.
+ * Cosine distance using the expanded form: 1 - ( sum(x_k * y_k) / (sqrt(sum(x_k)^2) *
+ * sqrt(sum(y_k)^2))) The expanded form is more efficient for sparse data.
  */
 template <typename value_idx = int, typename value_t = float>
 class cosine_expanded_distances_t : public distances_t<value_t> {
  public:
-  explicit cosine_expanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config),
-      workspace(0, config.handle.get_stream()),
-      ip_dists(config) {}
+  explicit cosine_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config)
+  {
+  }
 
-  void compute(value_t *out_dists) {
+  void compute(value_t* out_dists)
+  {
     ip_dists.compute(out_dists);
 
-    value_idx *b_indices = ip_dists.b_rows_coo();
-    value_t *b_data = ip_dists.b_data_coo();
+    value_idx* b_indices = ip_dists.b_rows_coo();
+    value_t* b_data      = ip_dists.b_data_coo();
 
-    rmm::device_uvector<value_idx> search_coo_rows(
-      config_->a_nnz, config_->handle.get_stream());
-    raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
-                                      search_coo_rows.data(), config_->a_nnz,
+    rmm::device_uvector<value_idx> search_coo_rows(config_->a_nnz, config_->handle.get_stream());
+    raft::sparse::convert::csr_to_coo(config_->a_indptr,
+                                      config_->a_nrows,
+                                      search_coo_rows.data(),
+                                      config_->a_nnz,
                                       config_->handle.get_stream());
 
-    compute_l2(
-      out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz,
-      b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows,
-      config_->handle.get_device_allocator(), config_->handle.get_stream(),
-      [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
-        value_t norms = sqrt(q_norm) * sqrt(r_norm);
-        // deal with potential for 0 in denominator by forcing 0/1 instead
-        value_t cos = ((norms != 0) * dot) / ((norms == 0) + norms);
-
-        // flip the similarity when both rows are 0
-        bool both_empty = (q_norm == 0) && (r_norm == 0);
-        return 1 - ((!both_empty * cos) + both_empty);
-      });
+    compute_l2(out_dists,
+               search_coo_rows.data(),
+               config_->a_data,
+               config_->a_nnz,
+               b_indices,
+               b_data,
+               config_->b_nnz,
+               config_->a_nrows,
+               config_->b_nrows,
+               config_->handle.get_device_allocator(),
+               config_->handle.get_stream(),
+               [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
+                 value_t norms = sqrt(q_norm) * sqrt(r_norm);
+                 // deal with potential for 0 in denominator by forcing 0/1 instead
+                 value_t cos = ((norms != 0) * dot) / ((norms == 0) + norms);
+
+                 // flip the similarity when both rows are 0
+                 bool both_empty = (q_norm == 0) && (r_norm == 0);
+                 return 1 - ((!both_empty * cos) + both_empty);
+               });
   }
 
   ~cosine_expanded_distances_t() = default;
 
  private:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
   rmm::device_uvector<char> workspace;
   ip_distances_t<value_idx, value_t> ip_dists;
 };
@@ -354,25 +412,34 @@ class cosine_expanded_distances_t : public distances_t<value_t> {
 template <typename value_idx = int, typename value_t = float>
 class hellinger_expanded_distances_t : public distances_t<value_t> {
  public:
-  explicit hellinger_expanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config), workspace(0, config.handle.get_stream()) {}
+  explicit hellinger_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config), workspace(0, config.handle.get_stream())
+  {
+  }
 
-  void compute(value_t *out_dists) {
+  void compute(value_t* out_dists)
+  {
     rmm::device_uvector<value_idx> coo_rows(max(config_->b_nnz, config_->a_nnz),
                                             config_->handle.get_stream());
 
-    raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows,
-                                      coo_rows.data(), config_->b_nnz,
+    raft::sparse::convert::csr_to_coo(config_->b_indptr,
+                                      config_->b_nrows,
+                                      coo_rows.data(),
+                                      config_->b_nnz,
                                       config_->handle.get_stream());
 
     balanced_coo_pairwise_generalized_spmv<value_idx, value_t>(
-      out_dists, *config_, coo_rows.data(),
-      [] __device__(value_t a, value_t b) { return sqrt(a) * sqrt(b); }, Sum(),
+      out_dists,
+      *config_,
+      coo_rows.data(),
+      [] __device__(value_t a, value_t b) { return sqrt(a) * sqrt(b); },
+      Sum(),
       AtomicAdd());
 
     raft::linalg::unaryOp<value_t>(
-      out_dists, out_dists, config_->a_nrows * config_->b_nrows,
+      out_dists,
+      out_dists,
+      config_->a_nrows * config_->b_nrows,
       [=] __device__(value_t input) {
         // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative
         bool rectifier = (1 - input) > 0;
@@ -384,42 +451,43 @@ class hellinger_expanded_distances_t : public distances_t<value_t> {
   ~hellinger_expanded_distances_t() = default;
 
  private:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
   rmm::device_uvector<char> workspace;
 };
 
 template <typename value_idx = int, typename value_t = float>
 class russelrao_expanded_distances_t : public distances_t<value_t> {
  public:
-  explicit russelrao_expanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config),
-      workspace(0, config.handle.get_stream()),
-      ip_dists(config) {}
+  explicit russelrao_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config)
+  {
+  }
 
-  void compute(value_t *out_dists) {
+  void compute(value_t* out_dists)
+  {
     ip_dists.compute(out_dists);
 
-    value_t n_cols = config_->a_ncols;
+    value_t n_cols     = config_->a_ncols;
     value_t n_cols_inv = 1.0 / n_cols;
     raft::linalg::unaryOp<value_t>(
-      out_dists, out_dists, config_->a_nrows * config_->b_nrows,
+      out_dists,
+      out_dists,
+      config_->a_nrows * config_->b_nrows,
       [=] __device__(value_t input) { return (n_cols - input) * n_cols_inv; },
       config_->handle.get_stream());
 
-    auto exec_policy = rmm::exec_policy(config_->handle.get_stream());
-    auto diags = thrust::counting_iterator<value_idx>(0);
+    auto exec_policy  = rmm::exec_policy(config_->handle.get_stream());
+    auto diags        = thrust::counting_iterator<value_idx>(0);
     value_idx b_nrows = config_->b_nrows;
-    thrust::for_each(exec_policy, diags, diags + config_->a_nrows,
-                     [=] __device__(value_idx input) {
-                       out_dists[input * b_nrows + input] = 0.0;
-                     });
+    thrust::for_each(exec_policy, diags, diags + config_->a_nrows, [=] __device__(value_idx input) {
+      out_dists[input * b_nrows + input] = 0.0;
+    });
   }
 
   ~russelrao_expanded_distances_t() = default;
 
  private:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
   rmm::device_uvector<char> workspace;
   ip_distances_t<value_idx, value_t> ip_dists;
 };
diff --git a/cpp/include/raft/sparse/distance/lp_distance.cuh b/cpp/include/raft/sparse/distance/lp_distance.cuh
index 885d55ee50..7f9511ff03 100644
--- a/cpp/include/raft/sparse/distance/lp_distance.cuh
+++ b/cpp/include/raft/sparse/distance/lp_distance.cuh
@@ -38,23 +38,33 @@ namespace raft {
 namespace sparse {
 namespace distance {
 
-template <typename value_idx = int, typename value_t = float,
-          typename product_f, typename accum_f, typename write_f>
-void unexpanded_lp_distances(
-  value_t *out_dists, const distances_config_t<value_idx, value_t> *config_,
-  product_f product_func, accum_f accum_func, write_f write_func) {
+template <typename value_idx = int,
+          typename value_t   = float,
+          typename product_f,
+          typename accum_f,
+          typename write_f>
+void unexpanded_lp_distances(value_t* out_dists,
+                             const distances_config_t<value_idx, value_t>* config_,
+                             product_f product_func,
+                             accum_f accum_func,
+                             write_f write_func)
+{
   rmm::device_uvector<value_idx> coo_rows(max(config_->b_nnz, config_->a_nnz),
                                           config_->handle.get_stream());
 
-  raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows,
-                                    coo_rows.data(), config_->b_nnz,
+  raft::sparse::convert::csr_to_coo(config_->b_indptr,
+                                    config_->b_nrows,
+                                    coo_rows.data(),
+                                    config_->b_nnz,
                                     config_->handle.get_stream());
 
   balanced_coo_pairwise_generalized_spmv<value_idx, value_t>(
     out_dists, *config_, coo_rows.data(), product_func, accum_func, write_func);
 
-  raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
-                                    coo_rows.data(), config_->a_nnz,
+  raft::sparse::convert::csr_to_coo(config_->a_indptr,
+                                    config_->a_nrows,
+                                    coo_rows.data(),
+                                    config_->a_nnz,
                                     config_->handle.get_stream());
 
   balanced_coo_pairwise_generalized_spmv_rev<value_idx, value_t>(
@@ -71,48 +81,51 @@ void unexpanded_lp_distances(
 template <typename value_idx = int, typename value_t = float>
 class l1_unexpanded_distances_t : public distances_t<value_t> {
  public:
-  l1_unexpanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config) {}
+  l1_unexpanded_distances_t(const distances_config_t<value_idx, value_t>& config) : config_(&config)
+  {
+  }
 
-  void compute(value_t *out_dists) {
-    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, AbsDiff(),
-                                                Sum(), AtomicAdd());
+  void compute(value_t* out_dists)
+  {
+    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, AbsDiff(), Sum(), AtomicAdd());
   }
 
  private:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
 };
 
 template <typename value_idx = int, typename value_t = float>
 class l2_unexpanded_distances_t : public distances_t<value_t> {
  public:
-  l2_unexpanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config) {}
+  l2_unexpanded_distances_t(const distances_config_t<value_idx, value_t>& config) : config_(&config)
+  {
+  }
 
-  void compute(value_t *out_dists) {
-    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, SqDiff(),
-                                                Sum(), AtomicAdd());
+  void compute(value_t* out_dists)
+  {
+    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, SqDiff(), Sum(), AtomicAdd());
   }
 
  protected:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
 };
 
 template <typename value_idx = int, typename value_t = float>
-class l2_sqrt_unexpanded_distances_t
-  : public l2_unexpanded_distances_t<value_idx, value_t> {
+class l2_sqrt_unexpanded_distances_t : public l2_unexpanded_distances_t<value_idx, value_t> {
  public:
-  l2_sqrt_unexpanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : l2_unexpanded_distances_t<value_idx, value_t>(config) {}
+  l2_sqrt_unexpanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : l2_unexpanded_distances_t<value_idx, value_t>(config)
+  {
+  }
 
-  void compute(value_t *out_dists) {
+  void compute(value_t* out_dists)
+  {
     l2_unexpanded_distances_t<value_idx, value_t>::compute(out_dists);
     // Sqrt Post-processing
     raft::linalg::unaryOp<value_t>(
-      out_dists, out_dists, this->config_->a_nrows * this->config_->b_nrows,
+      out_dists,
+      out_dists,
+      this->config_->a_nrows * this->config_->b_nrows,
       [] __device__(value_t input) {
         int neg = input < 0 ? -1 : 1;
         return sqrt(abs(input) * neg);
@@ -124,29 +137,33 @@ class l2_sqrt_unexpanded_distances_t
 template <typename value_idx = int, typename value_t = float>
 class linf_unexpanded_distances_t : public distances_t<value_t> {
  public:
-  explicit linf_unexpanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config) {}
+  explicit linf_unexpanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config)
+  {
+  }
 
-  void compute(value_t *out_dists) {
-    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, AbsDiff(),
-                                                Max(), AtomicMax());
+  void compute(value_t* out_dists)
+  {
+    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, AbsDiff(), Max(), AtomicMax());
   }
 
  private:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
 };
 
 template <typename value_idx = int, typename value_t = float>
 class canberra_unexpanded_distances_t : public distances_t<value_t> {
  public:
-  explicit canberra_unexpanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config) {}
+  explicit canberra_unexpanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config)
+  {
+  }
 
-  void compute(value_t *out_dists) {
+  void compute(value_t* out_dists)
+  {
     unexpanded_lp_distances<value_idx, value_t>(
-      out_dists, config_,
+      out_dists,
+      config_,
       [] __device__(value_t a, value_t b) {
         value_t d = fabs(a) + fabs(b);
 
@@ -154,70 +171,82 @@ class canberra_unexpanded_distances_t : public distances_t<value_t> {
         // forcing 1/0 instead
         return ((d != 0) * fabs(a - b)) / (d + (d == 0));
       },
-      Sum(), AtomicAdd());
+      Sum(),
+      AtomicAdd());
   }
 
  private:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
 };
 
 template <typename value_idx = int, typename value_t = float>
 class lp_unexpanded_distances_t : public distances_t<value_t> {
  public:
-  explicit lp_unexpanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config, value_t p_)
-    : config_(&config), p(p_) {}
+  explicit lp_unexpanded_distances_t(const distances_config_t<value_idx, value_t>& config,
+                                     value_t p_)
+    : config_(&config), p(p_)
+  {
+  }
 
-  void compute(value_t *out_dists) {
-    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, PDiff(p),
-                                                Sum(), AtomicAdd());
+  void compute(value_t* out_dists)
+  {
+    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, PDiff(p), Sum(), AtomicAdd());
 
     float one_over_p = 1.0f / p;
     raft::linalg::unaryOp<value_t>(
-      out_dists, out_dists, config_->a_nrows * config_->b_nrows,
+      out_dists,
+      out_dists,
+      config_->a_nrows * config_->b_nrows,
       [=] __device__(value_t input) { return pow(input, one_over_p); },
       config_->handle.get_stream());
   }
 
  private:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
   value_t p;
 };
 
 template <typename value_idx = int, typename value_t = float>
 class hamming_unexpanded_distances_t : public distances_t<value_t> {
  public:
-  explicit hamming_unexpanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config) {}
+  explicit hamming_unexpanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config)
+  {
+  }
 
-  void compute(value_t *out_dists) {
-    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, NotEqual(),
-                                                Sum(), AtomicAdd());
+  void compute(value_t* out_dists)
+  {
+    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, NotEqual(), Sum(), AtomicAdd());
 
     value_t n_cols = 1.0 / config_->a_ncols;
     raft::linalg::unaryOp<value_t>(
-      out_dists, out_dists, config_->a_nrows * config_->b_nrows,
+      out_dists,
+      out_dists,
+      config_->a_nrows * config_->b_nrows,
       [=] __device__(value_t input) { return input * n_cols; },
       config_->handle.get_stream());
   }
 
  private:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
 };
 
 template <typename value_idx = int, typename value_t = float>
 class jensen_shannon_unexpanded_distances_t : public distances_t<value_t> {
  public:
   explicit jensen_shannon_unexpanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config) {}
+    const distances_config_t<value_idx, value_t>& config)
+    : config_(&config)
+  {
+  }
 
-  void compute(value_t *out_dists) {
+  void compute(value_t* out_dists)
+  {
     unexpanded_lp_distances<value_idx, value_t>(
-      out_dists, config_,
+      out_dists,
+      config_,
       [] __device__(value_t a, value_t b) {
-        value_t m = 0.5f * (a + b);
+        value_t m   = 0.5f * (a + b);
         bool a_zero = a == 0;
         bool b_zero = b == 0;
 
@@ -227,49 +256,61 @@ class jensen_shannon_unexpanded_distances_t : public distances_t<value_t> {
         bool x_zero = x == 0;
         bool y_zero = y == 0;
 
-        return (-a * (!x_zero * log(x + x_zero))) +
-               (-b * (!y_zero * log(y + y_zero)));
+        return (-a * (!x_zero * log(x + x_zero))) + (-b * (!y_zero * log(y + y_zero)));
       },
-      Sum(), AtomicAdd());
+      Sum(),
+      AtomicAdd());
 
     raft::linalg::unaryOp<value_t>(
-      out_dists, out_dists, config_->a_nrows * config_->b_nrows,
+      out_dists,
+      out_dists,
+      config_->a_nrows * config_->b_nrows,
       [=] __device__(value_t input) { return sqrt(0.5 * input); },
       config_->handle.get_stream());
   }
 
  private:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
 };
 
 template <typename value_idx = int, typename value_t = float>
 class kl_divergence_unexpanded_distances_t : public distances_t<value_t> {
  public:
   explicit kl_divergence_unexpanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config) {}
+    const distances_config_t<value_idx, value_t>& config)
+    : config_(&config)
+  {
+  }
 
-  void compute(value_t *out_dists) {
+  void compute(value_t* out_dists)
+  {
     rmm::device_uvector<value_idx> coo_rows(max(config_->b_nnz, config_->a_nnz),
                                             config_->handle.get_stream());
 
-    raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows,
-                                      coo_rows.data(), config_->b_nnz,
+    raft::sparse::convert::csr_to_coo(config_->b_indptr,
+                                      config_->b_nrows,
+                                      coo_rows.data(),
+                                      config_->b_nnz,
                                       config_->handle.get_stream());
 
     balanced_coo_pairwise_generalized_spmv<value_idx, value_t>(
-      out_dists, *config_, coo_rows.data(),
-      [] __device__(value_t a, value_t b) { return a * log(a / b); }, Sum(),
+      out_dists,
+      *config_,
+      coo_rows.data(),
+      [] __device__(value_t a, value_t b) { return a * log(a / b); },
+      Sum(),
       AtomicAdd());
 
     raft::linalg::unaryOp<value_t>(
-      out_dists, out_dists, config_->a_nrows * config_->b_nrows,
+      out_dists,
+      out_dists,
+      config_->a_nrows * config_->b_nrows,
       [=] __device__(value_t input) { return 0.5 * input; },
       config_->handle.get_stream());
   }
 
  private:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
 };
 
 };  // END namespace distance
diff --git a/cpp/include/raft/sparse/distance/operators.cuh b/cpp/include/raft/sparse/distance/operators.cuh
index 89acda8b1a..3a9d0ba879 100644
--- a/cpp/include/raft/sparse/distance/operators.cuh
+++ b/cpp/include/raft/sparse/distance/operators.cuh
@@ -24,21 +24,24 @@ namespace distance {
 
 struct Sum {
   template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b)
+  {
     return a + b;
   }
 };
 
 struct NotEqual {
   template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b)
+  {
     return a != b;
   }
 };
 
 struct SqDiff {
   template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b)
+  {
     return (a - b) * (a - b);
   }
 };
@@ -49,44 +52,48 @@ struct PDiff {
   PDiff(float p_) : p(p_) {}
 
   template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b)
+  {
     return pow(a - b, p);
   }
 };
 
 struct Max {
   template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b)
+  {
     return fmax(a, b);
   }
 };
 
 struct AtomicAdd {
   template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t *a,
-                                                         value_t b) {
+  __host__ __device__ __forceinline__ value_t operator()(value_t* a, value_t b)
+  {
     return atomicAdd(a, b);
   }
 };
 
 struct AtomicMax {
   template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t *a,
-                                                         value_t b) {
+  __host__ __device__ __forceinline__ value_t operator()(value_t* a, value_t b)
+  {
     return atomicMax(a, b);
   }
 };
 
 struct Product {
   template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b)
+  {
     return a * b;
   }
 };
 
 struct AbsDiff {
   template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b)
+  {
     return fabs(a - b);
   }
 };
diff --git a/cpp/include/raft/sparse/distance/utils.cuh b/cpp/include/raft/sparse/distance/utils.cuh
index 6b6d77a2d5..d78b927e46 100644
--- a/cpp/include/raft/sparse/distance/utils.cuh
+++ b/cpp/include/raft/sparse/distance/utils.cuh
@@ -34,10 +34,10 @@ namespace distance {
  * @return the maximum number of columns that can be stored in smem
  */
 template <typename value_idx, typename value_t, int tpb = 1024>
-inline int max_cols_per_block() {
+inline int max_cols_per_block()
+{
   // max cols = (total smem available - cub reduction smem)
-  return (raft::getSharedMemPerBlock() -
-          ((tpb / raft::warp_size()) * sizeof(value_t))) /
+  return (raft::getSharedMemPerBlock() - ((tpb / raft::warp_size()) * sizeof(value_t))) /
          sizeof(value_t);
 }
 
diff --git a/cpp/include/raft/sparse/hierarchy/common.h b/cpp/include/raft/sparse/hierarchy/common.h
index 29f541498b..1738dd7498 100644
--- a/cpp/include/raft/sparse/hierarchy/common.h
+++ b/cpp/include/raft/sparse/hierarchy/common.h
@@ -37,13 +37,15 @@ class linkage_output {
   value_idx n_leaves;
   value_idx n_connected_components;
 
-  value_idx *labels;  // size: m
+  value_idx* labels;  // size: m
 
-  value_idx *children;  // size: (m-1, 2)
+  value_idx* children;  // size: (m-1, 2)
 };
 
-class linkage_output_int_float : public linkage_output<int, float> {};
-class linkage_output__int64_float : public linkage_output<int64_t, float> {};
+class linkage_output_int_float : public linkage_output<int, float> {
+};
+class linkage_output__int64_float : public linkage_output<int64_t, float> {
+};
 
 };  // namespace hierarchy
 };  // namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh
index 1ac075489a..95df7f4642 100644
--- a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh
+++ b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh
@@ -42,31 +42,32 @@ class UnionFind {
   value_idx n_indices;
 
   UnionFind(value_idx N_)
-    : n_indices(2 * N_ - 1),
-      parent(2 * N_ - 1, -1),
-      size(2 * N_ - 1, 1),
-      next_label(N_) {
+    : n_indices(2 * N_ - 1), parent(2 * N_ - 1, -1), size(2 * N_ - 1, 1), next_label(N_)
+  {
     memset(size.data() + N_, 0, (size.size() - N_) * sizeof(value_idx));
   }
 
-  value_idx find(value_idx n) {
+  value_idx find(value_idx n)
+  {
     value_idx p;
     p = n;
 
-    while (parent[n] != -1) n = parent[n];
+    while (parent[n] != -1)
+      n = parent[n];
 
     // path compression
     while (parent[p] != n) {
-      p = parent[p == -1 ? n_indices - 1 : p];
+      p                                   = parent[p == -1 ? n_indices - 1 : p];
       parent[p == -1 ? n_indices - 1 : p] = n;
     }
     return n;
   }
 
-  void perform_union(value_idx m, value_idx n) {
+  void perform_union(value_idx m, value_idx n)
+  {
     size[next_label] = size[m] + size[n];
-    parent[m] = next_label;
-    parent[n] = next_label;
+    parent[m]        = next_label;
+    parent[n]        = next_label;
 
     next_label += 1;
   }
@@ -95,12 +96,17 @@ class UnionFind {
  * @param[out] out_size cluster sizes of output
  */
 template <typename value_idx, typename value_t>
-void build_dendrogram_host(const handle_t &handle, const value_idx *rows,
-                           const value_idx *cols, const value_t *data,
-                           size_t nnz, value_idx *children, value_t *out_delta,
-                           value_idx *out_size) {
+void build_dendrogram_host(const handle_t& handle,
+                           const value_idx* rows,
+                           const value_idx* cols,
+                           const value_t* data,
+                           size_t nnz,
+                           value_idx* children,
+                           value_t* out_delta,
+                           value_idx* out_size)
+{
   auto d_alloc = handle.get_device_allocator();
-  auto stream = handle.get_stream();
+  auto stream  = handle.get_stream();
 
   value_idx n_edges = nnz;
 
@@ -121,8 +127,8 @@ void build_dendrogram_host(const handle_t &handle, const value_idx *rows,
   UnionFind<value_idx, value_t> U(nnz + 1);
 
   for (value_idx i = 0; i < nnz; i++) {
-    value_idx a = mst_src_h[i];
-    value_idx b = mst_dst_h[i];
+    value_idx a   = mst_src_h[i];
+    value_idx b   = mst_dst_h[i];
     value_t delta = mst_weights_h[i];
 
     value_idx aa = U.find(a);
@@ -130,10 +136,10 @@ void build_dendrogram_host(const handle_t &handle, const value_idx *rows,
 
     value_idx children_idx = i * 2;
 
-    children_h[children_idx] = aa;
+    children_h[children_idx]     = aa;
     children_h[children_idx + 1] = bb;
-    out_delta_h[i] = delta;
-    out_size_h[i] = U.size[aa] + U.size[bb];
+    out_delta_h[i]               = delta;
+    out_size_h[i]                = U.size[aa] + U.size[bb];
 
     U.perform_union(aa, bb);
   }
@@ -144,13 +150,15 @@ void build_dendrogram_host(const handle_t &handle, const value_idx *rows,
 }
 
 template <typename value_idx>
-__global__ void write_levels_kernel(const value_idx *children,
-                                    value_idx *parents, value_idx n_vertices) {
+__global__ void write_levels_kernel(const value_idx* children,
+                                    value_idx* parents,
+                                    value_idx n_vertices)
+{
   value_idx tid = blockDim.x * blockIdx.x + threadIdx.x;
   if (tid < n_vertices) {
     value_idx level = tid / 2;
     value_idx child = children[tid];
-    parents[child] = level;
+    parents[child]  = level;
   }
 }
 
@@ -166,14 +174,17 @@ __global__ void write_levels_kernel(const value_idx *children,
  * @param labels
  */
 template <typename value_idx>
-__global__ void inherit_labels(const value_idx *children,
-                               const value_idx *levels, size_t n_leaves,
-                               value_idx *labels, int cut_level,
-                               value_idx n_vertices) {
+__global__ void inherit_labels(const value_idx* children,
+                               const value_idx* levels,
+                               size_t n_leaves,
+                               value_idx* labels,
+                               int cut_level,
+                               value_idx n_vertices)
+{
   value_idx tid = blockDim.x * blockIdx.x + threadIdx.x;
 
   if (tid < n_vertices) {
-    value_idx node = children[tid];
+    value_idx node      = children[tid];
     value_idx cur_level = tid / 2;
 
     /**
@@ -183,12 +194,12 @@ __global__ void inherit_labels(const value_idx *children,
     if (cur_level > cut_level) return;
 
     value_idx cur_parent = node;
-    value_idx label = labels[cur_parent];
+    value_idx label      = labels[cur_parent];
 
     while (label == -1) {
       cur_parent = cur_level + n_leaves;
-      cur_level = levels[cur_parent];
-      label = labels[cur_parent];
+      cur_level  = levels[cur_parent];
+      label      = labels[cur_parent];
     }
 
     labels[node] = label;
@@ -197,15 +208,16 @@ __global__ void inherit_labels(const value_idx *children,
 
 template <typename value_idx>
 struct init_label_roots {
-  init_label_roots(value_idx *labels_) : labels(labels_) {}
+  init_label_roots(value_idx* labels_) : labels(labels_) {}
 
   template <typename Tuple>
-  __host__ __device__ void operator()(Tuple t) {
+  __host__ __device__ void operator()(Tuple t)
+  {
     labels[thrust::get<1>(t)] = thrust::get<0>(t);
   }
 
  private:
-  value_idx *labels;
+  value_idx* labels;
 };
 
 /**
@@ -221,11 +233,14 @@ struct init_label_roots {
  * @param n_leaves
  */
 template <typename value_idx, int tpb = 256>
-void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels,
-                                const value_idx *children, size_t n_clusters,
-                                size_t n_leaves) {
-  auto d_alloc = handle.get_device_allocator();
-  auto stream = handle.get_stream();
+void extract_flattened_clusters(const raft::handle_t& handle,
+                                value_idx* labels,
+                                const value_idx* children,
+                                size_t n_clusters,
+                                size_t n_leaves)
+{
+  auto d_alloc       = handle.get_device_allocator();
+  auto stream        = handle.get_stream();
   auto thrust_policy = rmm::exec_policy(rmm::cuda_stream_view{stream});
 
   // Handle special case where n_clusters == 1
@@ -243,10 +258,8 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels,
 
     size_t n_edges = (n_leaves - 1) * 2;
 
-    thrust::device_ptr<const value_idx> d_ptr =
-      thrust::device_pointer_cast(children);
-    value_idx n_vertices =
-      *(thrust::max_element(thrust_policy, d_ptr, d_ptr + n_edges)) + 1;
+    thrust::device_ptr<const value_idx> d_ptr = thrust::device_pointer_cast(children);
+    value_idx n_vertices = *(thrust::max_element(thrust_policy, d_ptr, d_ptr + n_edges)) + 1;
 
     // Prevent potential infinite loop from labeling disconnected
     // connectivities graph.
@@ -257,8 +270,7 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels,
     rmm::device_uvector<value_idx> levels(n_vertices, stream);
 
     value_idx n_blocks = ceildiv(n_vertices, (value_idx)tpb);
-    write_levels_kernel<<<n_blocks, tpb, 0, stream>>>(children, levels.data(),
-                                                      n_vertices);
+    write_levels_kernel<<<n_blocks, tpb, 0, stream>>>(children, levels.data(), n_vertices);
     /**
      * Step 1: Find label roots:
      *
@@ -272,27 +284,26 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels,
     rmm::device_uvector<value_idx> label_roots(child_size, stream);
 
     value_idx children_cpy_start = n_edges - child_size;
-    raft::copy_async(label_roots.data(), children + children_cpy_start,
-                     child_size, stream);
+    raft::copy_async(label_roots.data(), children + children_cpy_start, child_size, stream);
 
-    thrust::sort(thrust_policy, label_roots.data(),
+    thrust::sort(thrust_policy,
+                 label_roots.data(),
                  label_roots.data() + (child_size),
                  thrust::greater<value_idx>());
 
     rmm::device_uvector<value_idx> tmp_labels(n_vertices, stream);
 
     // Init labels to -1
-    thrust::fill(thrust_policy, tmp_labels.data(),
-                 tmp_labels.data() + n_vertices, -1);
+    thrust::fill(thrust_policy, tmp_labels.data(), tmp_labels.data() + n_vertices, -1);
 
     // Write labels for cluster roots to "labels"
     thrust::counting_iterator<uint> first(0);
 
-    auto z_iter = thrust::make_zip_iterator(thrust::make_tuple(
-      first, label_roots.data() + (label_roots.size() - n_clusters)));
+    auto z_iter = thrust::make_zip_iterator(
+      thrust::make_tuple(first, label_roots.data() + (label_roots.size() - n_clusters)));
 
-    thrust::for_each(thrust_policy, z_iter, z_iter + n_clusters,
-                     init_label_roots<value_idx>(tmp_labels.data()));
+    thrust::for_each(
+      thrust_policy, z_iter, z_iter + n_clusters, init_label_roots<value_idx>(tmp_labels.data()));
 
     /**
      * Step 2: Propagate labels by having children iterate through their parents
@@ -302,9 +313,8 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels,
      */
     value_idx cut_level = (n_edges / 2) - (n_clusters - 1);
 
-    inherit_labels<<<n_blocks, tpb, 0, stream>>>(children, levels.data(),
-                                                 n_leaves, tmp_labels.data(),
-                                                 cut_level, n_vertices);
+    inherit_labels<<<n_blocks, tpb, 0, stream>>>(
+      children, levels.data(), n_leaves, tmp_labels.data(), cut_level, n_vertices);
 
     // copy tmp labels to actual labels
     raft::copy_async(labels, tmp_labels.data(), n_leaves, stream);
diff --git a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh
index 7cf959dda6..096f1c650f 100644
--- a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh
+++ b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh
@@ -37,14 +37,17 @@ namespace raft {
 namespace hierarchy {
 namespace detail {
 
-template <raft::hierarchy::LinkageDistance dist_type, typename value_idx,
-          typename value_t>
+template <raft::hierarchy::LinkageDistance dist_type, typename value_idx, typename value_t>
 struct distance_graph_impl {
-  void run(const raft::handle_t &handle, const value_t *X, size_t m, size_t n,
+  void run(const raft::handle_t& handle,
+           const value_t* X,
+           size_t m,
+           size_t n,
            raft::distance::DistanceType metric,
-           rmm::device_uvector<value_idx> &indptr,
-           rmm::device_uvector<value_idx> &indices,
-           rmm::device_uvector<value_t> &data, int c);
+           rmm::device_uvector<value_idx>& indptr,
+           rmm::device_uvector<value_idx>& indices,
+           rmm::device_uvector<value_t>& data,
+           int c);
 };
 
 /**
@@ -53,50 +56,51 @@ struct distance_graph_impl {
  * @tparam value_t
  */
 template <typename value_idx, typename value_t>
-struct distance_graph_impl<raft::hierarchy::LinkageDistance::KNN_GRAPH,
-                           value_idx, value_t> {
-  void run(const raft::handle_t &handle, const value_t *X, size_t m, size_t n,
+struct distance_graph_impl<raft::hierarchy::LinkageDistance::KNN_GRAPH, value_idx, value_t> {
+  void run(const raft::handle_t& handle,
+           const value_t* X,
+           size_t m,
+           size_t n,
            raft::distance::DistanceType metric,
-           rmm::device_uvector<value_idx> &indptr,
-           rmm::device_uvector<value_idx> &indices,
-           rmm::device_uvector<value_t> &data, int c) {
-    auto d_alloc = handle.get_device_allocator();
-    auto stream = handle.get_stream();
+           rmm::device_uvector<value_idx>& indptr,
+           rmm::device_uvector<value_idx>& indices,
+           rmm::device_uvector<value_t>& data,
+           int c)
+  {
+    auto d_alloc     = handle.get_device_allocator();
+    auto stream      = handle.get_stream();
     auto exec_policy = rmm::exec_policy(rmm::cuda_stream_view{stream});
 
     // Need to symmetrize knn into undirected graph
     raft::sparse::COO<value_t, value_idx> knn_graph_coo(d_alloc, stream);
 
-    raft::sparse::selection::knn_graph(handle, X, m, n, metric, knn_graph_coo,
-                                       c);
+    raft::sparse::selection::knn_graph(handle, X, m, n, metric, knn_graph_coo, c);
 
     indices.resize(knn_graph_coo.nnz, stream);
     data.resize(knn_graph_coo.nnz, stream);
 
     // self-loops get max distance
-    auto transform_in = thrust::make_zip_iterator(thrust::make_tuple(
-      knn_graph_coo.rows(), knn_graph_coo.cols(), knn_graph_coo.vals()));
-
-    thrust::transform(
-      exec_policy, transform_in, transform_in + knn_graph_coo.nnz,
-      knn_graph_coo.vals(),
-      [=] __device__(const thrust::tuple<value_idx, value_idx, value_t> &tup) {
-        bool self_loop = thrust::get<0>(tup) == thrust::get<1>(tup);
-        return (self_loop * std::numeric_limits<value_t>::max()) +
-               (!self_loop * thrust::get<2>(tup));
-      });
-
-    raft::sparse::convert::sorted_coo_to_csr(knn_graph_coo.rows(),
-                                             knn_graph_coo.nnz, indptr.data(),
-                                             m + 1, d_alloc, stream);
+    auto transform_in = thrust::make_zip_iterator(
+      thrust::make_tuple(knn_graph_coo.rows(), knn_graph_coo.cols(), knn_graph_coo.vals()));
+
+    thrust::transform(exec_policy,
+                      transform_in,
+                      transform_in + knn_graph_coo.nnz,
+                      knn_graph_coo.vals(),
+                      [=] __device__(const thrust::tuple<value_idx, value_idx, value_t>& tup) {
+                        bool self_loop = thrust::get<0>(tup) == thrust::get<1>(tup);
+                        return (self_loop * std::numeric_limits<value_t>::max()) +
+                               (!self_loop * thrust::get<2>(tup));
+                      });
+
+    raft::sparse::convert::sorted_coo_to_csr(
+      knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), m + 1, d_alloc, stream);
 
     // TODO: Wouldn't need to copy here if we could compute knn
     // graph directly on the device uvectors
     // ref: https://github.com/rapidsai/raft/issues/227
-    raft::copy_async(indices.data(), knn_graph_coo.cols(), knn_graph_coo.nnz,
-                     stream);
-    raft::copy_async(data.data(), knn_graph_coo.vals(), knn_graph_coo.nnz,
-                     stream);
+    raft::copy_async(indices.data(), knn_graph_coo.cols(), knn_graph_coo.nnz, stream);
+    raft::copy_async(data.data(), knn_graph_coo.vals(), knn_graph_coo.nnz, stream);
   }
 };
 
@@ -116,13 +120,17 @@ struct distance_graph_impl<raft::hierarchy::LinkageDistance::KNN_GRAPH,
  * @param[out] c constant 'c' used for nearest neighbors-based distances
  *             which will guarantee k <= log(n) + c
  */
-template <typename value_idx, typename value_t,
-          raft::hierarchy::LinkageDistance dist_type>
-void get_distance_graph(const raft::handle_t &handle, const value_t *X,
-                        size_t m, size_t n, raft::distance::DistanceType metric,
-                        rmm::device_uvector<value_idx> &indptr,
-                        rmm::device_uvector<value_idx> &indices,
-                        rmm::device_uvector<value_t> &data, int c) {
+template <typename value_idx, typename value_t, raft::hierarchy::LinkageDistance dist_type>
+void get_distance_graph(const raft::handle_t& handle,
+                        const value_t* X,
+                        size_t m,
+                        size_t n,
+                        raft::distance::DistanceType metric,
+                        rmm::device_uvector<value_idx>& indptr,
+                        rmm::device_uvector<value_idx>& indices,
+                        rmm::device_uvector<value_t>& data,
+                        int c)
+{
   auto stream = handle.get_stream();
 
   indptr.resize(m + 1, stream);
diff --git a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh
index 765a5ad77f..f939e87484 100644
--- a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh
+++ b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh
@@ -37,9 +37,10 @@ namespace hierarchy {
 namespace detail {
 
 template <typename value_idx, typename value_t>
-void merge_msts(raft::Graph_COO<value_idx, value_idx, value_t> &coo1,
-                raft::Graph_COO<value_idx, value_idx, value_t> &coo2,
-                cudaStream_t stream) {
+void merge_msts(raft::Graph_COO<value_idx, value_idx, value_t>& coo1,
+                raft::Graph_COO<value_idx, value_idx, value_t>& coo2,
+                cudaStream_t stream)
+{
   /** Add edges to existing mst **/
   int final_nnz = coo2.n_edges + coo1.n_edges;
 
@@ -50,12 +51,9 @@ void merge_msts(raft::Graph_COO<value_idx, value_idx, value_t> &coo1,
   /**
    * Construct final edge list
    */
-  raft::copy_async(coo1.src.data() + coo1.n_edges, coo2.src.data(),
-                   coo2.n_edges, stream);
-  raft::copy_async(coo1.dst.data() + coo1.n_edges, coo2.dst.data(),
-                   coo2.n_edges, stream);
-  raft::copy_async(coo1.weights.data() + coo1.n_edges, coo2.weights.data(),
-                   coo2.n_edges, stream);
+  raft::copy_async(coo1.src.data() + coo1.n_edges, coo2.src.data(), coo2.n_edges, stream);
+  raft::copy_async(coo1.dst.data() + coo1.n_edges, coo2.dst.data(), coo2.n_edges, stream);
+  raft::copy_async(coo1.weights.data() + coo1.n_edges, coo2.weights.data(), coo2.n_edges, stream);
 
   coo1.n_edges = final_nnz;
 }
@@ -74,14 +72,18 @@ void merge_msts(raft::Graph_COO<value_idx, value_idx, value_t> &coo1,
  * @return updated MST edge list
  */
 template <typename value_idx, typename value_t, typename red_op>
-void connect_knn_graph(const raft::handle_t &handle, const value_t *X,
-                       raft::Graph_COO<value_idx, value_idx, value_t> &msf,
-                       size_t m, size_t n, value_idx *color,
-                       red_op reduction_op,
-                       raft::distance::DistanceType metric =
-                         raft::distance::DistanceType::L2SqrtExpanded) {
+void connect_knn_graph(
+  const raft::handle_t& handle,
+  const value_t* X,
+  raft::Graph_COO<value_idx, value_idx, value_t>& msf,
+  size_t m,
+  size_t n,
+  value_idx* color,
+  red_op reduction_op,
+  raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded)
+{
   auto d_alloc = handle.get_device_allocator();
-  auto stream = handle.get_stream();
+  auto stream  = handle.get_stream();
 
   raft::sparse::COO<value_t, value_idx> connected_edges(d_alloc, stream);
 
@@ -89,15 +91,21 @@ void connect_knn_graph(const raft::handle_t &handle, const value_t *X,
     handle, connected_edges, X, color, m, n, reduction_op);
 
   rmm::device_uvector<value_idx> indptr2(m + 1, stream);
-  raft::sparse::convert::sorted_coo_to_csr(connected_edges.rows(),
-                                           connected_edges.nnz, indptr2.data(),
-                                           m + 1, d_alloc, stream);
+  raft::sparse::convert::sorted_coo_to_csr(
+    connected_edges.rows(), connected_edges.nnz, indptr2.data(), m + 1, d_alloc, stream);
 
   // On the second call, we hand the MST the original colors
   // and the new set of edges and let it restart the optimization process
-  auto new_mst = raft::mst::mst<value_idx, value_idx, value_t, double>(
-    handle, indptr2.data(), connected_edges.cols(), connected_edges.vals(), m,
-    connected_edges.nnz, color, stream, false, false);
+  auto new_mst = raft::mst::mst<value_idx, value_idx, value_t, double>(handle,
+                                                                       indptr2.data(),
+                                                                       connected_edges.cols(),
+                                                                       connected_edges.vals(),
+                                                                       m,
+                                                                       connected_edges.nnz,
+                                                                       color,
+                                                                       stream,
+                                                                       false,
+                                                                       false);
 
   merge_msts<value_idx, value_t>(msf, new_mst, stream);
 }
@@ -127,29 +135,35 @@ void connect_knn_graph(const raft::handle_t &handle, const value_t *X,
  *  argument is really just a safeguard against the potential for infinite loops.
  */
 template <typename value_idx, typename value_t, typename red_op>
-void build_sorted_mst(const raft::handle_t &handle, const value_t *X,
-                      const value_idx *indptr, const value_idx *indices,
-                      const value_t *pw_dists, size_t m, size_t n,
-                      value_idx *mst_src, value_idx *mst_dst,
-                      value_t *mst_weight, value_idx *color, size_t nnz,
-                      red_op reduction_op,
-                      raft::distance::DistanceType metric =
-                        raft::distance::DistanceType::L2SqrtExpanded,
-                      int max_iter = 10) {
+void build_sorted_mst(
+  const raft::handle_t& handle,
+  const value_t* X,
+  const value_idx* indptr,
+  const value_idx* indices,
+  const value_t* pw_dists,
+  size_t m,
+  size_t n,
+  value_idx* mst_src,
+  value_idx* mst_dst,
+  value_t* mst_weight,
+  value_idx* color,
+  size_t nnz,
+  red_op reduction_op,
+  raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded,
+  int max_iter                        = 10)
+{
   auto d_alloc = handle.get_device_allocator();
-  auto stream = handle.get_stream();
+  auto stream  = handle.get_stream();
 
   // We want to have MST initialize colors on first call.
   auto mst_coo = raft::mst::mst<value_idx, value_idx, value_t, double>(
-    handle, indptr, indices, pw_dists, (value_idx)m, nnz, color, stream, false,
-    true);
+    handle, indptr, indices, pw_dists, (value_idx)m, nnz, color, stream, false, true);
 
-  int iters = 1;
+  int iters        = 1;
   int n_components = linkage::get_n_components(color, m, d_alloc, stream);
 
   while (n_components > 1 && iters < max_iter) {
-    connect_knn_graph<value_idx, value_t>(handle, X, mst_coo, m, n, color,
-                                          reduction_op);
+    connect_knn_graph<value_idx, value_t>(handle, X, mst_coo, m, n, color, reduction_op);
 
     iters++;
 
@@ -176,9 +190,8 @@ void build_sorted_mst(const raft::handle_t &handle, const value_t *X,
                " or increase 'max_iter'",
                max_iter);
 
-  raft::sparse::op::coo_sort_by_weight(mst_coo.src.data(), mst_coo.dst.data(),
-                                       mst_coo.weights.data(), mst_coo.n_edges,
-                                       stream);
+  raft::sparse::op::coo_sort_by_weight(
+    mst_coo.src.data(), mst_coo.dst.data(), mst_coo.weights.data(), mst_coo.n_edges, stream);
 
   raft::copy_async(mst_src, mst_coo.src.data(), mst_coo.n_edges, stream);
   raft::copy_async(mst_dst, mst_coo.dst.data(), mst_coo.n_edges, stream);
diff --git a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp
index 01a033945c..fe9538120f 100644
--- a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp
+++ b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp
@@ -44,20 +44,26 @@ static const size_t EMPTY = 0;
  * @param[in] n number of columns in X
  * @param[in] metric distance metrix to use when constructing connectivities graph
  * @param[out] out struct containing output dendrogram and cluster assignments
- * @param[in] c a constant used when constructing connectivities from knn graph. Allows the indirect control
+ * @param[in] c a constant used when constructing connectivities from knn graph. Allows the indirect
+ *            control
  *            of k. The algorithm will set `k = log(n) + c`
  * @param[in] n_clusters number of clusters to assign data samples
  */
-template <typename value_idx, typename value_t,
+template <typename value_idx,
+          typename value_t,
           LinkageDistance dist_type = LinkageDistance::KNN_GRAPH>
-void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m,
-                    size_t n, raft::distance::DistanceType metric,
-                    linkage_output<value_idx, value_t> *out, int c,
-                    size_t n_clusters) {
-  ASSERT(n_clusters <= m,
-         "n_clusters must be less than or equal to the number of data points");
-
-  auto stream = handle.get_stream();
+void single_linkage(const raft::handle_t& handle,
+                    const value_t* X,
+                    size_t m,
+                    size_t n,
+                    raft::distance::DistanceType metric,
+                    linkage_output<value_idx, value_t>* out,
+                    int c,
+                    size_t n_clusters)
+{
+  ASSERT(n_clusters <= m, "n_clusters must be less than or equal to the number of data points");
+
+  auto stream  = handle.get_stream();
   auto d_alloc = handle.get_device_allocator();
 
   rmm::device_uvector<value_idx> indptr(EMPTY, stream);
@@ -79,10 +85,20 @@ void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m,
    */
   rmm::device_uvector<value_idx> color(m, stream);
   raft::linkage::FixConnectivitiesRedOp<value_idx, value_t> op(color.data(), m);
-  detail::build_sorted_mst<value_idx, value_t>(
-    handle, X, indptr.data(), indices.data(), pw_dists.data(), m, n,
-    mst_rows.data(), mst_cols.data(), mst_data.data(), color.data(),
-    indices.size(), op, metric);
+  detail::build_sorted_mst<value_idx, value_t>(handle,
+                                               X,
+                                               indptr.data(),
+                                               indices.data(),
+                                               pw_dists.data(),
+                                               m,
+                                               n,
+                                               mst_rows.data(),
+                                               mst_cols.data(),
+                                               mst_data.data(),
+                                               color.data(),
+                                               indices.size(),
+                                               op,
+                                               metric);
 
   pw_dists.release();
 
@@ -94,15 +110,19 @@ void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m,
   rmm::device_uvector<value_t> out_delta(n_edges, stream);
   rmm::device_uvector<value_idx> out_size(n_edges, stream);
   // Create dendrogram
-  detail::build_dendrogram_host<value_idx, value_t>(
-    handle, mst_rows.data(), mst_cols.data(), mst_data.data(), n_edges,
-    out->children, out_delta.data(), out_size.data());
-  detail::extract_flattened_clusters(handle, out->labels, out->children,
-                                     n_clusters, m);
-
-  out->m = m;
-  out->n_clusters = n_clusters;
-  out->n_leaves = m;
+  detail::build_dendrogram_host<value_idx, value_t>(handle,
+                                                    mst_rows.data(),
+                                                    mst_cols.data(),
+                                                    mst_data.data(),
+                                                    n_edges,
+                                                    out->children,
+                                                    out_delta.data(),
+                                                    out_size.data());
+  detail::extract_flattened_clusters(handle, out->labels, out->children, n_clusters, m);
+
+  out->m                      = m;
+  out->n_clusters             = n_clusters;
+  out->n_leaves               = m;
   out->n_connected_components = 1;
 }
 
diff --git a/cpp/include/raft/sparse/linalg/add.cuh b/cpp/include/raft/sparse/linalg/add.cuh
index 47b1ba6e41..01735a102d 100644
--- a/cpp/include/raft/sparse/linalg/add.cuh
+++ b/cpp/include/raft/sparse/linalg/add.cuh
@@ -40,40 +40,47 @@ namespace sparse {
 namespace linalg {
 
 template <typename T, int TPB_X = 128>
-__global__ void csr_add_calc_row_counts_kernel(
-  const int *a_ind, const int *a_indptr, const T *a_val, int nnz1,
-  const int *b_ind, const int *b_indptr, const T *b_val, int nnz2, int m,
-  int *out_rowcounts) {
+__global__ void csr_add_calc_row_counts_kernel(const int* a_ind,
+                                               const int* a_indptr,
+                                               const T* a_val,
+                                               int nnz1,
+                                               const int* b_ind,
+                                               const int* b_indptr,
+                                               const T* b_val,
+                                               int nnz2,
+                                               int m,
+                                               int* out_rowcounts)
+{
   // loop through columns in each set of rows and
   // calculate number of unique cols across both rows
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
 
   if (row < m) {
     int a_start_idx = a_ind[row];
-    int a_stop_idx = get_stop_idx(row, m, nnz1, a_ind);
+    int a_stop_idx  = get_stop_idx(row, m, nnz1, a_ind);
 
     int b_start_idx = b_ind[row];
-    int b_stop_idx = get_stop_idx(row, m, nnz2, b_ind);
+    int b_stop_idx  = get_stop_idx(row, m, nnz2, b_ind);
 
     /**
-         * Union of columns within each row of A and B so that we can scan through
-         * them, adding their values together.
-         */
+     * Union of columns within each row of A and B so that we can scan through
+     * them, adding their values together.
+     */
     int max_size = (a_stop_idx - a_start_idx) + (b_stop_idx - b_start_idx);
 
-    int *arr = new int[max_size];
+    int* arr        = new int[max_size];
     int cur_arr_idx = 0;
     for (int j = a_start_idx; j < a_stop_idx; j++) {
       arr[cur_arr_idx] = a_indptr[j];
       cur_arr_idx++;
     }
 
-    int arr_size = cur_arr_idx;
+    int arr_size   = cur_arr_idx;
     int final_size = arr_size;
 
     for (int j = b_start_idx; j < b_stop_idx; j++) {
       int cur_col = b_indptr[j];
-      bool found = false;
+      bool found  = false;
       for (int k = 0; k < arr_size; k++) {
         if (arr[k] == cur_col) {
           found = true;
@@ -81,9 +88,7 @@ __global__ void csr_add_calc_row_counts_kernel(
         }
       }
 
-      if (!found) {
-        final_size++;
-      }
+      if (!found) { final_size++; }
     }
 
     out_rowcounts[row] = final_size;
@@ -94,11 +99,19 @@ __global__ void csr_add_calc_row_counts_kernel(
 }
 
 template <typename T, int TPB_X = 128>
-__global__ void csr_add_kernel(const int *a_ind, const int *a_indptr,
-                               const T *a_val, int nnz1, const int *b_ind,
-                               const int *b_indptr, const T *b_val, int nnz2,
-                               int m, int *out_ind, int *out_indptr,
-                               T *out_val) {
+__global__ void csr_add_kernel(const int* a_ind,
+                               const int* a_indptr,
+                               const T* a_val,
+                               int nnz1,
+                               const int* b_ind,
+                               const int* b_indptr,
+                               const T* b_val,
+                               int nnz2,
+                               int m,
+                               int* out_ind,
+                               int* out_indptr,
+                               T* out_val)
+{
   // 1 thread per row
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
 
@@ -109,21 +122,21 @@ __global__ void csr_add_kernel(const int *a_ind, const int *a_indptr,
     int a_stop_idx = get_stop_idx(row, m, nnz1, a_ind);
 
     int b_start_idx = b_ind[row];
-    int b_stop_idx = get_stop_idx(row, m, nnz2, b_ind);
+    int b_stop_idx  = get_stop_idx(row, m, nnz2, b_ind);
 
     int o_idx = out_ind[row];
 
     int cur_o_idx = o_idx;
     for (int j = a_start_idx; j < a_stop_idx; j++) {
       out_indptr[cur_o_idx] = a_indptr[j];
-      out_val[cur_o_idx] = a_val[j];
+      out_val[cur_o_idx]    = a_val[j];
       cur_o_idx++;
     }
 
     int arr_size = cur_o_idx - o_idx;
     for (int j = b_start_idx; j < b_stop_idx; j++) {
       int cur_col = b_indptr[j];
-      bool found = false;
+      bool found  = false;
       for (int k = o_idx; k < o_idx + arr_size; k++) {
         // If we found a match, sum the two values
         if (out_indptr[k] == cur_col) {
@@ -136,7 +149,7 @@ __global__ void csr_add_kernel(const int *a_ind, const int *a_indptr,
       // if we didn't find a match, add the value for b
       if (!found) {
         out_indptr[o_idx + arr_size] = cur_col;
-        out_val[o_idx + arr_size] = b_val[j];
+        out_val[o_idx + arr_size]    = b_val[j];
         arr_size++;
       }
     }
@@ -160,32 +173,36 @@ __global__ void csr_add_kernel(const int *a_ind, const int *a_indptr,
  * @param stream: cuda stream to use
  */
 template <typename T, int TPB_X = 128>
-size_t csr_add_calc_inds(const int *a_ind, const int *a_indptr, const T *a_val,
-                         int nnz1, const int *b_ind, const int *b_indptr,
-                         const T *b_val, int nnz2, int m, int *out_ind,
+size_t csr_add_calc_inds(const int* a_ind,
+                         const int* a_indptr,
+                         const T* a_val,
+                         int nnz1,
+                         const int* b_ind,
+                         const int* b_indptr,
+                         const T* b_val,
+                         int nnz2,
+                         int m,
+                         int* out_ind,
                          std::shared_ptr<raft::mr::device::allocator> d_alloc,
-                         cudaStream_t stream) {
+                         cudaStream_t stream)
+{
   dim3 grid(raft::ceildiv(m, TPB_X), 1, 1);
   dim3 blk(TPB_X, 1, 1);
 
   raft::mr::device::buffer<int> row_counts(d_alloc, stream, m + 1);
-  CUDA_CHECK(
-    cudaMemsetAsync(row_counts.data(), 0, (m + 1) * sizeof(int), stream));
+  CUDA_CHECK(cudaMemsetAsync(row_counts.data(), 0, (m + 1) * sizeof(int), stream));
 
-  csr_add_calc_row_counts_kernel<T, TPB_X>
-    <<<grid, blk, 0, stream>>>(a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr,
-                               b_val, nnz2, m, row_counts.data());
+  csr_add_calc_row_counts_kernel<T, TPB_X><<<grid, blk, 0, stream>>>(
+    a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, b_val, nnz2, m, row_counts.data());
 
   int cnnz = 0;
   raft::update_host(&cnnz, row_counts.data() + m, 1, stream);
   CUDA_CHECK(cudaStreamSynchronize(stream));
 
   // create csr compressed row index from row counts
-  thrust::device_ptr<int> row_counts_d =
-    thrust::device_pointer_cast(row_counts.data());
-  thrust::device_ptr<int> c_ind_d = thrust::device_pointer_cast(out_ind);
-  exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m,
-                 c_ind_d);
+  thrust::device_ptr<int> row_counts_d = thrust::device_pointer_cast(row_counts.data());
+  thrust::device_ptr<int> c_ind_d      = thrust::device_pointer_cast(out_ind);
+  exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m, c_ind_d);
 
   return cnnz;
 }
@@ -208,16 +225,25 @@ size_t csr_add_calc_inds(const int *a_ind, const int *a_indptr, const T *a_val,
  * @param stream: cuda stream to use
  */
 template <typename T, int TPB_X = 128>
-void csr_add_finalize(const int *a_ind, const int *a_indptr, const T *a_val,
-                      int nnz1, const int *b_ind, const int *b_indptr,
-                      const T *b_val, int nnz2, int m, int *c_ind,
-                      int *c_indptr, T *c_val, cudaStream_t stream) {
+void csr_add_finalize(const int* a_ind,
+                      const int* a_indptr,
+                      const T* a_val,
+                      int nnz1,
+                      const int* b_ind,
+                      const int* b_indptr,
+                      const T* b_val,
+                      int nnz2,
+                      int m,
+                      int* c_ind,
+                      int* c_indptr,
+                      T* c_val,
+                      cudaStream_t stream)
+{
   dim3 grid(raft::ceildiv(m, TPB_X), 1, 1);
   dim3 blk(TPB_X, 1, 1);
 
-  csr_add_kernel<T, TPB_X>
-    <<<grid, blk, 0, stream>>>(a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr,
-                               b_val, nnz2, m, c_ind, c_indptr, c_val);
+  csr_add_kernel<T, TPB_X><<<grid, blk, 0, stream>>>(
+    a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, b_val, nnz2, m, c_ind, c_indptr, c_val);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
diff --git a/cpp/include/raft/sparse/linalg/degree.cuh b/cpp/include/raft/sparse/linalg/degree.cuh
index 9bd322c90a..77a9445ab1 100644
--- a/cpp/include/raft/sparse/linalg/degree.cuh
+++ b/cpp/include/raft/sparse/linalg/degree.cuh
@@ -44,11 +44,10 @@ namespace linalg {
  * @param results array to place results
  */
 template <int TPB_X = 64>
-__global__ void coo_degree_kernel(const int *rows, int nnz, int *results) {
+__global__ void coo_degree_kernel(const int* rows, int nnz, int* results)
+{
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
-  if (row < nnz) {
-    raft::myAtomicAdd(results + rows[row], 1);
-  }
+  if (row < nnz) { raft::myAtomicAdd(results + rows[row], 1); }
 }
 
 /**
@@ -60,7 +59,8 @@ __global__ void coo_degree_kernel(const int *rows, int nnz, int *results) {
  * @param stream: cuda stream to use
  */
 template <int TPB_X = 64>
-void coo_degree(const int *rows, int nnz, int *results, cudaStream_t stream) {
+void coo_degree(const int* rows, int nnz, int* results, cudaStream_t stream)
+{
   dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1);
   dim3 blk_rc(TPB_X, 1, 1);
 
@@ -77,31 +77,28 @@ void coo_degree(const int *rows, int nnz, int *results, cudaStream_t stream) {
  * @param stream: cuda stream to use
  */
 template <int TPB_X = 64, typename T>
-void coo_degree(COO<T> *in, int *results, cudaStream_t stream) {
+void coo_degree(COO<T>* in, int* results, cudaStream_t stream)
+{
   dim3 grid_rc(raft::ceildiv(in->nnz, TPB_X), 1, 1);
   dim3 blk_rc(TPB_X, 1, 1);
 
-  coo_degree_kernel<TPB_X>
-    <<<grid_rc, blk_rc, 0, stream>>>(in->rows(), in->nnz, results);
+  coo_degree_kernel<TPB_X><<<grid_rc, blk_rc, 0, stream>>>(in->rows(), in->nnz, results);
   CUDA_CHECK(cudaGetLastError());
 }
 
 template <int TPB_X = 64, typename T>
-__global__ void coo_degree_nz_kernel(const int *rows, const T *vals, int nnz,
-                                     int *results) {
+__global__ void coo_degree_nz_kernel(const int* rows, const T* vals, int nnz, int* results)
+{
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
-  if (row < nnz && vals[row] != 0.0) {
-    raft::myAtomicAdd(results + rows[row], 1);
-  }
+  if (row < nnz && vals[row] != 0.0) { raft::myAtomicAdd(results + rows[row], 1); }
 }
 
 template <int TPB_X = 64, typename T>
-__global__ void coo_degree_scalar_kernel(const int *rows, const T *vals,
-                                         int nnz, T scalar, int *results) {
+__global__ void coo_degree_scalar_kernel(
+  const int* rows, const T* vals, int nnz, T scalar, int* results)
+{
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
-  if (row < nnz && vals[row] != scalar) {
-    raft::myAtomicAdd(results + rows[row], 1);
-  }
+  if (row < nnz && vals[row] != scalar) { raft::myAtomicAdd(results + rows[row], 1); }
 }
 
 /**
@@ -114,12 +111,12 @@ __global__ void coo_degree_scalar_kernel(const int *rows, const T *vals,
  * @param stream: cuda stream to use
  */
 template <int TPB_X = 64, typename T>
-void coo_degree_scalar(COO<T> *in, T scalar, int *results,
-                       cudaStream_t stream) {
+void coo_degree_scalar(COO<T>* in, T scalar, int* results, cudaStream_t stream)
+{
   dim3 grid_rc(raft::ceildiv(in->nnz, TPB_X), 1, 1);
   dim3 blk_rc(TPB_X, 1, 1);
-  coo_degree_scalar_kernel<TPB_X, T><<<grid_rc, blk_rc, 0, stream>>>(
-    in->rows(), in->vals(), in->nnz, scalar, results);
+  coo_degree_scalar_kernel<TPB_X, T>
+    <<<grid_rc, blk_rc, 0, stream>>>(in->rows(), in->vals(), in->nnz, scalar, results);
   CUDA_CHECK(cudaGetLastError());
 }
 
@@ -135,8 +132,9 @@ void coo_degree_scalar(COO<T> *in, T scalar, int *results,
  * @param stream: cuda stream to use
  */
 template <int TPB_X = 64, typename T>
-void coo_degree_scalar(const int *rows, const T *vals, int nnz, T scalar,
-                       int *results, cudaStream_t stream = 0) {
+void coo_degree_scalar(
+  const int* rows, const T* vals, int nnz, T scalar, int* results, cudaStream_t stream = 0)
+{
   dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1);
   dim3 blk_rc(TPB_X, 1, 1);
   coo_degree_scalar_kernel<TPB_X, T>
@@ -154,12 +152,11 @@ void coo_degree_scalar(const int *rows, const T *vals, int nnz, T scalar,
  * @param stream: cuda stream to use
  */
 template <int TPB_X = 64, typename T>
-void coo_degree_nz(const int *rows, const T *vals, int nnz, int *results,
-                   cudaStream_t stream) {
+void coo_degree_nz(const int* rows, const T* vals, int nnz, int* results, cudaStream_t stream)
+{
   dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1);
   dim3 blk_rc(TPB_X, 1, 1);
-  coo_degree_nz_kernel<TPB_X, T>
-    <<<grid_rc, blk_rc, 0, stream>>>(rows, vals, nnz, results);
+  coo_degree_nz_kernel<TPB_X, T><<<grid_rc, blk_rc, 0, stream>>>(rows, vals, nnz, results);
 }
 
 /**
@@ -171,7 +168,8 @@ void coo_degree_nz(const int *rows, const T *vals, int nnz, int *results,
  * @param stream: cuda stream to use
  */
 template <int TPB_X = 64, typename T>
-void coo_degree_nz(COO<T> *in, int *results, cudaStream_t stream) {
+void coo_degree_nz(COO<T>* in, int* results, cudaStream_t stream)
+{
   dim3 grid_rc(raft::ceildiv(in->nnz, TPB_X), 1, 1);
   dim3 blk_rc(TPB_X, 1, 1);
 
diff --git a/cpp/include/raft/sparse/linalg/norm.cuh b/cpp/include/raft/sparse/linalg/norm.cuh
index bfcd3fd592..59dc5ff3e4 100644
--- a/cpp/include/raft/sparse/linalg/norm.cuh
+++ b/cpp/include/raft/sparse/linalg/norm.cuh
@@ -41,10 +41,12 @@ __global__ void csr_row_normalize_l1_kernel(
   // @TODO: This can be done much more parallel by
   // having threads in a warp compute the sum in parallel
   // over each row and then divide the values in parallel.
-  const int *ia,           // csr row ex_scan (sorted by row)
-  const T *vals, int nnz,  // array of values and number of non-zeros
-  int m,                   // num rows in csr
-  T *result) {             // output array
+  const int* ia,  // csr row ex_scan (sorted by row)
+  const T* vals,
+  int nnz,  // array of values and number of non-zeros
+  int m,    // num rows in csr
+  T* result)
+{  // output array
 
   // row-based matrix 1 thread per row
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
@@ -52,7 +54,7 @@ __global__ void csr_row_normalize_l1_kernel(
   // sum all vals_arr for row and divide each val by sum
   if (row < m) {
     int start_idx = ia[row];
-    int stop_idx = 0;
+    int stop_idx  = 0;
     if (row < m - 1) {
       stop_idx = ia[row + 1];
     } else
@@ -65,7 +67,7 @@ __global__ void csr_row_normalize_l1_kernel(
 
     for (int j = start_idx; j < stop_idx; j++) {
       if (sum != 0.0) {
-        T val = vals[j];
+        T val     = vals[j];
         result[j] = val / sum;
       } else {
         result[j] = 0.0;
@@ -85,18 +87,18 @@ __global__ void csr_row_normalize_l1_kernel(
  * @param stream: cuda stream to use
  */
 template <int TPB_X = 64, typename T>
-void csr_row_normalize_l1(const int *ia,  // csr row ex_scan (sorted by row)
-                          const T *vals,
+void csr_row_normalize_l1(const int* ia,  // csr row ex_scan (sorted by row)
+                          const T* vals,
                           int nnz,  // array of values and number of non-zeros
                           int m,    // num rows in csr
-                          T *result,
-                          cudaStream_t stream) {  // output array
+                          T* result,
+                          cudaStream_t stream)
+{  // output array
 
   dim3 grid(raft::ceildiv(m, TPB_X), 1, 1);
   dim3 blk(TPB_X, 1, 1);
 
-  csr_row_normalize_l1_kernel<TPB_X, T>
-    <<<grid, blk, 0, stream>>>(ia, vals, nnz, m, result);
+  csr_row_normalize_l1_kernel<TPB_X, T><<<grid, blk, 0, stream>>>(ia, vals, nnz, m, result);
   CUDA_CHECK(cudaGetLastError());
 }
 
@@ -105,10 +107,12 @@ __global__ void csr_row_normalize_max_kernel(
   // @TODO: This can be done much more parallel by
   // having threads in a warp compute the sum in parallel
   // over each row and then divide the values in parallel.
-  const int *ia,           // csr row ind array (sorted by row)
-  const T *vals, int nnz,  // array of values and number of non-zeros
-  int m,                   // num total rows in csr
-  T *result) {             // output array
+  const int* ia,  // csr row ind array (sorted by row)
+  const T* vals,
+  int nnz,  // array of values and number of non-zeros
+  int m,    // num total rows in csr
+  T* result)
+{  // output array
 
   // row-based matrix 1 thread per row
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
@@ -116,7 +120,7 @@ __global__ void csr_row_normalize_max_kernel(
   // find max across columns and divide
   if (row < m) {
     int start_idx = ia[row];
-    int stop_idx = 0;
+    int stop_idx  = 0;
     if (row < m - 1) {
       stop_idx = ia[row + 1];
     } else
@@ -130,7 +134,7 @@ __global__ void csr_row_normalize_max_kernel(
     // divide nonzeros in current row by max
     for (int j = start_idx; j < stop_idx; j++) {
       if (max != 0.0 && max > std::numeric_limits<float>::min()) {
-        T val = vals[j];
+        T val     = vals[j];
         result[j] = val / max;
       } else {
         result[j] = 0.0;
@@ -151,16 +155,17 @@ __global__ void csr_row_normalize_max_kernel(
  */
 
 template <int TPB_X = 64, typename T>
-void csr_row_normalize_max(const int *ia,  // csr row ind array (sorted by row)
-                           const T *vals,
+void csr_row_normalize_max(const int* ia,  // csr row ind array (sorted by row)
+                           const T* vals,
                            int nnz,  // array of values and number of non-zeros
                            int m,    // num total rows in csr
-                           T *result, cudaStream_t stream) {
+                           T* result,
+                           cudaStream_t stream)
+{
   dim3 grid(raft::ceildiv(m, TPB_X), 1, 1);
   dim3 blk(TPB_X, 1, 1);
 
-  csr_row_normalize_max_kernel<TPB_X, T>
-    <<<grid, blk, 0, stream>>>(ia, vals, nnz, m, result);
+  csr_row_normalize_max_kernel<TPB_X, T><<<grid, blk, 0, stream>>>(ia, vals, nnz, m, result);
   CUDA_CHECK(cudaGetLastError());
 }
 
diff --git a/cpp/include/raft/sparse/linalg/spectral.cuh b/cpp/include/raft/sparse/linalg/spectral.cuh
index 15302f3b74..3b609d994f 100644
--- a/cpp/include/raft/sparse/linalg/spectral.cuh
+++ b/cpp/include/raft/sparse/linalg/spectral.cuh
@@ -31,16 +31,23 @@ namespace sparse {
 namespace spectral {
 
 template <typename T>
-void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals,
-                   int nnz, int n, int n_components, T *out,
-                   unsigned long long seed = 1234567) {
-  auto stream = handle.get_stream();
+void fit_embedding(const raft::handle_t& handle,
+                   int* rows,
+                   int* cols,
+                   T* vals,
+                   int nnz,
+                   int n,
+                   int n_components,
+                   T* out,
+                   unsigned long long seed = 1234567)
+{
+  auto stream  = handle.get_stream();
   auto d_alloc = handle.get_device_allocator();
   raft::mr::device::buffer<int> src_offsets(d_alloc, stream, n + 1);
   raft::mr::device::buffer<int> dst_cols(d_alloc, stream, nnz);
   raft::mr::device::buffer<T> dst_vals(d_alloc, stream, nnz);
-  convert::coo_to_csr(handle, rows, cols, vals, nnz, n, src_offsets.data(),
-                      dst_cols.data(), dst_vals.data());
+  convert::coo_to_csr(
+    handle, rows, cols, vals, nnz, n, src_offsets.data(), dst_cols.data(), dst_vals.data());
 
   raft::mr::device::buffer<T> eigVals(d_alloc, stream, n_components + 1);
   raft::mr::device::buffer<T> eigVecs(d_alloc, stream, n * (n_components + 1));
@@ -54,48 +61,53 @@ void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals,
   using index_type = int;
   using value_type = T;
 
-  index_type *ro = src_offsets.data();
-  index_type *ci = dst_cols.data();
-  value_type *vs = dst_vals.data();
+  index_type* ro = src_offsets.data();
+  index_type* ci = dst_cols.data();
+  value_type* vs = dst_vals.data();
 
-  raft::matrix::sparse_matrix_t<index_type, value_type> const r_csr_m{
-    handle, ro, ci, vs, n, nnz};
+  raft::matrix::sparse_matrix_t<index_type, value_type> const r_csr_m{handle, ro, ci, vs, n, nnz};
 
-  index_type neigvs = n_components + 1;
-  index_type maxiter = 4000;  //default reset value (when set to 0);
-  value_type tol = 0.01;
-  index_type restart_iter = 15 + neigvs;  //what cugraph is using
-  auto t_exe_p = thrust::cuda::par.on(stream);
+  index_type neigvs         = n_components + 1;
+  index_type maxiter        = 4000;  // default reset value (when set to 0);
+  value_type tol            = 0.01;
+  index_type restart_iter   = 15 + neigvs;  // what cugraph is using
+  auto t_exe_p              = thrust::cuda::par.on(stream);
   using thrust_exe_policy_t = decltype(t_exe_p);
 
-  raft::eigen_solver_config_t<index_type, value_type> cfg{neigvs, maxiter,
-                                                          restart_iter, tol};
+  raft::eigen_solver_config_t<index_type, value_type> cfg{neigvs, maxiter, restart_iter, tol};
 
   cfg.seed = seed;
 
   raft::lanczos_solver_t<index_type, value_type> eig_solver{cfg};
 
-  //cluster computation here is irrelevant,
-  //hence define a no-op such solver to
-  //feed partition():
+  // cluster computation here is irrelevant,
+  // hence define a no-op such solver to
+  // feed partition():
   //
   struct no_op_cluster_solver_t {
     using index_type_t = index_type;
-    using size_type_t = index_type;
+    using size_type_t  = index_type;
     using value_type_t = value_type;
 
-    std::pair<value_type_t, index_type_t> solve(
-      handle_t const &handle, thrust_exe_policy_t t_exe_policy,
-      size_type_t n_obs_vecs, size_type_t dim,
-      value_type_t const *__restrict__ obs,
-      index_type_t *__restrict__ codes) const {
+    std::pair<value_type_t, index_type_t> solve(handle_t const& handle,
+                                                thrust_exe_policy_t t_exe_policy,
+                                                size_type_t n_obs_vecs,
+                                                size_type_t dim,
+                                                value_type_t const* __restrict__ obs,
+                                                index_type_t* __restrict__ codes) const
+    {
       return std::make_pair<value_type_t, index_type_t>(0, 0);
     }
   };
 
-  raft::spectral::partition(handle, t_exe_p, r_csr_m, eig_solver,
-                            no_op_cluster_solver_t{}, labels.data(),
-                            eigVals.data(), eigVecs.data());
+  raft::spectral::partition(handle,
+                            t_exe_p,
+                            r_csr_m,
+                            eig_solver,
+                            no_op_cluster_solver_t{},
+                            labels.data(),
+                            eigVals.data(),
+                            eigVecs.data());
 
   raft::copy<T>(out, eigVecs.data() + n, n * n_components, stream);
 
diff --git a/cpp/include/raft/sparse/linalg/symmetrize.cuh b/cpp/include/raft/sparse/linalg/symmetrize.cuh
index 5c2c78f0c3..b9426c284a 100644
--- a/cpp/include/raft/sparse/linalg/symmetrize.cuh
+++ b/cpp/include/raft/sparse/linalg/symmetrize.cuh
@@ -49,26 +49,34 @@ namespace linalg {
 // TODO: value_idx param needs to be used for this once FAISS is updated to use float32
 // for indices so that the index types can be uniform
 template <int TPB_X = 128, typename T, typename Lambda>
-__global__ void coo_symmetrize_kernel(int *row_ind, int *rows, int *cols,
-                                      T *vals, int *orows, int *ocols, T *ovals,
-                                      int n, int cnnz, Lambda reduction_op) {
+__global__ void coo_symmetrize_kernel(int* row_ind,
+                                      int* rows,
+                                      int* cols,
+                                      T* vals,
+                                      int* orows,
+                                      int* ocols,
+                                      T* ovals,
+                                      int n,
+                                      int cnnz,
+                                      Lambda reduction_op)
+{
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
 
   if (row < n) {
     int start_idx = row_ind[row];  // each thread processes one row
-    int stop_idx = get_stop_idx(row, n, cnnz, row_ind);
+    int stop_idx  = get_stop_idx(row, n, cnnz, row_ind);
 
-    int row_nnz = 0;
+    int row_nnz       = 0;
     int out_start_idx = start_idx * 2;
 
     for (int idx = 0; idx < stop_idx - start_idx; idx++) {
       int cur_row = rows[idx + start_idx];
       int cur_col = cols[idx + start_idx];
-      T cur_val = vals[idx + start_idx];
+      T cur_val   = vals[idx + start_idx];
 
       int lookup_row = cur_col;
-      int t_start = row_ind[lookup_row];  // Start at
-      int t_stop = get_stop_idx(lookup_row, n, cnnz, row_ind);
+      int t_start    = row_ind[lookup_row];  // Start at
+      int t_stop     = get_stop_idx(lookup_row, n, cnnz, row_ind);
 
       T transpose = 0.0;
 
@@ -79,7 +87,7 @@ __global__ void coo_symmetrize_kernel(int *row_ind, int *rows, int *cols,
         // done in a different thread.
         if (cols[t_idx] == cur_row && rows[t_idx] == cur_col) {
           // If it exists already, set transposed value to existing value
-          transpose = vals[t_idx];
+          transpose   = vals[t_idx];
           found_match = true;
           break;
         }
@@ -126,10 +134,12 @@ __global__ void coo_symmetrize_kernel(int *row_ind, int *rows, int *cols,
  * @param stream: cuda stream to use
  */
 template <int TPB_X = 128, typename T, typename Lambda>
-void coo_symmetrize(COO<T> *in, COO<T> *out,
+void coo_symmetrize(COO<T>* in,
+                    COO<T>* out,
                     Lambda reduction_op,  // two-argument reducer
                     std::shared_ptr<raft::mr::device::allocator> d_alloc,
-                    cudaStream_t stream) {
+                    cudaStream_t stream)
+{
   dim3 grid(raft::ceildiv(in->n_rows, TPB_X), 1, 1);
   dim3 blk(TPB_X, 1, 1);
 
@@ -141,9 +151,16 @@ void coo_symmetrize(COO<T> *in, COO<T> *out,
 
   out->allocate(in->nnz * 2, in->n_rows, in->n_cols, true, stream);
 
-  coo_symmetrize_kernel<TPB_X, T><<<grid, blk, 0, stream>>>(
-    in_row_ind.data(), in->rows(), in->cols(), in->vals(), out->rows(),
-    out->cols(), out->vals(), in->n_rows, in->nnz, reduction_op);
+  coo_symmetrize_kernel<TPB_X, T><<<grid, blk, 0, stream>>>(in_row_ind.data(),
+                                                            in->rows(),
+                                                            in->cols(),
+                                                            in->vals(),
+                                                            out->rows(),
+                                                            out->cols(),
+                                                            out->vals(),
+                                                            in->n_rows,
+                                                            in->nnz,
+                                                            reduction_op);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -159,14 +176,15 @@ void coo_symmetrize(COO<T> *in, COO<T> *out,
  * @param row_sizes2: Input empty row sum 2 array(n) for faster reduction
  */
 template <typename value_idx = int64_t, typename value_t = float>
-__global__ static void symmetric_find_size(const value_t *restrict data,
-                                           const value_idx *restrict indices,
-                                           const value_idx n, const int k,
-                                           value_idx *restrict row_sizes,
-                                           value_idx *restrict row_sizes2) {
+__global__ static void symmetric_find_size(const value_t* restrict data,
+                                           const value_idx* restrict indices,
+                                           const value_idx n,
+                                           const int k,
+                                           value_idx* restrict row_sizes,
+                                           value_idx* restrict row_sizes2)
+{
   const auto row = blockIdx.x * blockDim.x + threadIdx.x;  // for every row
-  const auto j =
-    blockIdx.y * blockDim.y + threadIdx.y;  // for every item in row
+  const auto j   = blockIdx.y * blockDim.y + threadIdx.y;  // for every item in row
   if (row >= n || j >= k) return;
 
   const auto col = indices[row * k + j];
@@ -186,9 +204,11 @@ __global__ static void symmetric_find_size(const value_t *restrict data,
  * @param row_sizes2: Input row sum 2 array(n) for faster reduction
  */
 template <typename value_idx>
-__global__ static void reduce_find_size(const value_idx n, const int k,
-                                        value_idx *restrict row_sizes,
-                                        const value_idx *restrict row_sizes2) {
+__global__ static void reduce_find_size(const value_idx n,
+                                        const int k,
+                                        value_idx* restrict row_sizes,
+                                        const value_idx* restrict row_sizes2)
+{
   const auto i = (blockIdx.x * blockDim.x) + threadIdx.x;
   if (i >= n) return;
   row_sizes[i] += (row_sizes2[i] + k);
@@ -209,20 +229,21 @@ __global__ static void reduce_find_size(const value_idx n, const int k,
  * @param k: Number of n_neighbors
  */
 template <typename value_idx = int64_t, typename value_t = float>
-__global__ static void symmetric_sum(value_idx *restrict edges,
-                                     const value_t *restrict data,
-                                     const value_idx *restrict indices,
-                                     value_t *restrict VAL,
-                                     value_idx *restrict COL,
-                                     value_idx *restrict ROW, const value_idx n,
-                                     const int k) {
+__global__ static void symmetric_sum(value_idx* restrict edges,
+                                     const value_t* restrict data,
+                                     const value_idx* restrict indices,
+                                     value_t* restrict VAL,
+                                     value_idx* restrict COL,
+                                     value_idx* restrict ROW,
+                                     const value_idx n,
+                                     const int k)
+{
   const auto row = blockIdx.x * blockDim.x + threadIdx.x;  // for every row
-  const auto j =
-    blockIdx.y * blockDim.y + threadIdx.y;  // for every item in row
+  const auto j   = blockIdx.y * blockDim.y + threadIdx.y;  // for every item in row
   if (row >= n || j >= k) return;
 
-  const auto col = indices[row * k + j];
-  const auto original = atomicAdd(&edges[row], value_idx(1));
+  const auto col       = indices[row * k + j];
+  const auto original  = atomicAdd(&edges[row], value_idx(1));
   const auto transpose = atomicAdd(&edges[col], value_idx(1));
 
   VAL[transpose] = VAL[original] = data[row * k + j];
@@ -252,26 +273,26 @@ __global__ static void symmetric_sum(value_idx *restrict edges,
  * @param stream: Input cuda stream
  * @param d_alloc device allocator for temporary buffers
  */
-template <typename value_idx = int64_t, typename value_t = float,
-          int TPB_X = 32, int TPB_Y = 32>
-void from_knn_symmetrize_matrix(
-  const value_idx *restrict knn_indices, const value_t *restrict knn_dists,
-  const value_idx n, const int k, COO<value_t, value_idx> *out,
-  cudaStream_t stream, std::shared_ptr<raft::mr::device::allocator> d_alloc) {
+template <typename value_idx = int64_t, typename value_t = float, int TPB_X = 32, int TPB_Y = 32>
+void from_knn_symmetrize_matrix(const value_idx* restrict knn_indices,
+                                const value_t* restrict knn_dists,
+                                const value_idx n,
+                                const int k,
+                                COO<value_t, value_idx>* out,
+                                cudaStream_t stream,
+                                std::shared_ptr<raft::mr::device::allocator> d_alloc)
+{
   // (1) Find how much space needed in each row
   // We look through all datapoints and increment the count for each row.
   const dim3 threadsPerBlock(TPB_X, TPB_Y);
-  const dim3 numBlocks(raft::ceildiv(n, (value_idx)TPB_X),
-                       raft::ceildiv(k, TPB_Y));
+  const dim3 numBlocks(raft::ceildiv(n, (value_idx)TPB_X), raft::ceildiv(k, TPB_Y));
 
   // Notice n+1 since we can reuse these arrays for transpose_edges, original_edges in step (4)
   raft::mr::device::buffer<value_idx> row_sizes(d_alloc, stream, n);
-  CUDA_CHECK(
-    cudaMemsetAsync(row_sizes.data(), 0, sizeof(value_idx) * n, stream));
+  CUDA_CHECK(cudaMemsetAsync(row_sizes.data(), 0, sizeof(value_idx) * n, stream));
 
   raft::mr::device::buffer<value_idx> row_sizes2(d_alloc, stream, n);
-  CUDA_CHECK(
-    cudaMemsetAsync(row_sizes2.data(), 0, sizeof(value_idx) * n, stream));
+  CUDA_CHECK(cudaMemsetAsync(row_sizes2.data(), 0, sizeof(value_idx) * n, stream));
 
   symmetric_find_size<<<numBlocks, threadsPerBlock, 0, stream>>>(
     knn_dists, knn_indices, n, k, row_sizes.data(), row_sizes2.data());
@@ -292,14 +313,12 @@ void from_knn_symmetrize_matrix(
   // This mirrors CSR matrix's row Pointer, were maximum bounds for each row
   // are calculated as the cumulative rolling sum of the previous rows.
   // Notice reusing old row_sizes2 memory
-  value_idx *edges = row_sizes2.data();
-  thrust::device_ptr<value_idx> __edges = thrust::device_pointer_cast(edges);
-  thrust::device_ptr<value_idx> __row_sizes =
-    thrust::device_pointer_cast(row_sizes.data());
+  value_idx* edges                          = row_sizes2.data();
+  thrust::device_ptr<value_idx> __edges     = thrust::device_pointer_cast(edges);
+  thrust::device_ptr<value_idx> __row_sizes = thrust::device_pointer_cast(row_sizes.data());
 
   // Rolling cumulative sum
-  thrust::exclusive_scan(thrust::cuda::par.on(stream), __row_sizes,
-                         __row_sizes + n, __edges);
+  thrust::exclusive_scan(thrust::cuda::par.on(stream), __row_sizes, __row_sizes + n, __edges);
 
   // (5) Perform final data + data.T operation in tandem with memcpying
   symmetric_sum<<<numBlocks, threadsPerBlock, 0, stream>>>(
@@ -311,11 +330,17 @@ void from_knn_symmetrize_matrix(
  * Symmetrizes a COO matrix
  */
 template <typename value_idx, typename value_t>
-void symmetrize(const raft::handle_t &handle, const value_idx *rows,
-                const value_idx *cols, const value_t *vals, size_t m, size_t n,
-                size_t nnz, raft::sparse::COO<value_t, value_idx> &out) {
+void symmetrize(const raft::handle_t& handle,
+                const value_idx* rows,
+                const value_idx* cols,
+                const value_t* vals,
+                size_t m,
+                size_t n,
+                size_t nnz,
+                raft::sparse::COO<value_t, value_idx>& out)
+{
   auto d_alloc = handle.get_device_allocator();
-  auto stream = handle.get_stream();
+  auto stream  = handle.get_stream();
 
   // copy rows to cols and cols to rows
   rmm::device_uvector<value_idx> symm_rows(nnz * 2, stream);
@@ -331,13 +356,17 @@ void symmetrize(const raft::handle_t &handle, const value_idx *rows,
   raft::copy_async(symm_vals.data() + nnz, vals, nnz, stream);
 
   // sort COO
-  raft::sparse::op::coo_sort((value_idx)m, (value_idx)n, (value_idx)nnz * 2,
-                             symm_rows.data(), symm_cols.data(),
-                             symm_vals.data(), d_alloc, stream);
-
-  raft::sparse::op::max_duplicates(handle, out, symm_rows.data(),
-                                   symm_cols.data(), symm_vals.data(), nnz * 2,
-                                   m, n);
+  raft::sparse::op::coo_sort((value_idx)m,
+                             (value_idx)n,
+                             (value_idx)nnz * 2,
+                             symm_rows.data(),
+                             symm_cols.data(),
+                             symm_vals.data(),
+                             d_alloc,
+                             stream);
+
+  raft::sparse::op::max_duplicates(
+    handle, out, symm_rows.data(), symm_cols.data(), symm_vals.data(), nnz * 2, m, n);
 }
 
 };  // end NAMESPACE linalg
diff --git a/cpp/include/raft/sparse/linalg/transpose.h b/cpp/include/raft/sparse/linalg/transpose.h
index 6afe4ca8f6..ce90eb6702 100644
--- a/cpp/include/raft/sparse/linalg/transpose.h
+++ b/cpp/include/raft/sparse/linalg/transpose.h
@@ -57,29 +57,55 @@ namespace linalg {
  * @param[in] stream : Cuda stream for ordering events
  */
 template <typename value_idx, typename value_t>
-void csr_transpose(cusparseHandle_t handle, const value_idx *csr_indptr,
-                   const value_idx *csr_indices, const value_t *csr_data,
-                   value_idx *csc_indptr, value_idx *csc_indices,
-                   value_t *csc_data, value_idx csr_nrows, value_idx csr_ncols,
+void csr_transpose(cusparseHandle_t handle,
+                   const value_idx* csr_indptr,
+                   const value_idx* csr_indices,
+                   const value_t* csr_data,
+                   value_idx* csc_indptr,
+                   value_idx* csc_indices,
+                   value_t* csc_data,
+                   value_idx csr_nrows,
+                   value_idx csr_ncols,
                    value_idx nnz,
                    std::shared_ptr<raft::mr::device::allocator> allocator,
-                   cudaStream_t stream) {
+                   cudaStream_t stream)
+{
   size_t convert_csc_workspace_size = 0;
 
-  CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc_bufferSize(
-    handle, csr_nrows, csr_ncols, nnz, csr_data, csr_indptr, csr_indices,
-    csc_data, csc_indptr, csc_indices, CUSPARSE_ACTION_NUMERIC,
-    CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1,
-    &convert_csc_workspace_size, stream));
+  CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc_bufferSize(handle,
+                                                          csr_nrows,
+                                                          csr_ncols,
+                                                          nnz,
+                                                          csr_data,
+                                                          csr_indptr,
+                                                          csr_indices,
+                                                          csc_data,
+                                                          csc_indptr,
+                                                          csc_indices,
+                                                          CUSPARSE_ACTION_NUMERIC,
+                                                          CUSPARSE_INDEX_BASE_ZERO,
+                                                          CUSPARSE_CSR2CSC_ALG1,
+                                                          &convert_csc_workspace_size,
+                                                          stream));
 
   raft::mr::device::buffer<char> convert_csc_workspace(
     allocator, stream, convert_csc_workspace_size);
 
-  CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc(
-    handle, csr_nrows, csr_ncols, nnz, csr_data, csr_indptr, csr_indices,
-    csc_data, csc_indptr, csc_indices, CUSPARSE_ACTION_NUMERIC,
-    CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1,
-    convert_csc_workspace.data(), stream));
+  CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc(handle,
+                                               csr_nrows,
+                                               csr_ncols,
+                                               nnz,
+                                               csr_data,
+                                               csr_indptr,
+                                               csr_indices,
+                                               csc_data,
+                                               csc_indptr,
+                                               csc_indices,
+                                               CUSPARSE_ACTION_NUMERIC,
+                                               CUSPARSE_INDEX_BASE_ZERO,
+                                               CUSPARSE_CSR2CSC_ALG1,
+                                               convert_csc_workspace.data(),
+                                               stream));
 }
 
 };  // end NAMESPACE linalg
diff --git a/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh b/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh
index f0d30b0cb7..36d426029b 100644
--- a/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh
+++ b/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh
@@ -28,10 +28,16 @@ namespace mst {
 namespace detail {
 
 template <typename vertex_t, typename edge_t, typename alteration_t>
-__global__ void kernel_min_edge_per_vertex(
-  const edge_t* offsets, const vertex_t* indices, const alteration_t* weights,
-  const vertex_t* color, const vertex_t* color_index, edge_t* new_mst_edge,
-  const bool* mst_edge, alteration_t* min_edge_color, const vertex_t v) {
+__global__ void kernel_min_edge_per_vertex(const edge_t* offsets,
+                                           const vertex_t* indices,
+                                           const alteration_t* weights,
+                                           const vertex_t* color,
+                                           const vertex_t* color_index,
+                                           edge_t* new_mst_edge,
+                                           const bool* mst_edge,
+                                           alteration_t* min_edge_color,
+                                           const vertex_t v)
+{
   edge_t tid = threadIdx.x + blockIdx.x * blockDim.x;
 
   unsigned warp_id = tid / 32;
@@ -41,14 +47,14 @@ __global__ void kernel_min_edge_per_vertex(
   __shared__ alteration_t min_edge_weight[32];
   __shared__ vertex_t min_color[32];
 
-  min_edge_index[lane_id] = std::numeric_limits<edge_t>::max();
+  min_edge_index[lane_id]  = std::numeric_limits<edge_t>::max();
   min_edge_weight[lane_id] = std::numeric_limits<alteration_t>::max();
-  min_color[lane_id] = std::numeric_limits<vertex_t>::max();
+  min_color[lane_id]       = std::numeric_limits<vertex_t>::max();
 
   __syncthreads();
 
   vertex_t self_color_idx = color_index[warp_id];
-  vertex_t self_color = color[self_color_idx];
+  vertex_t self_color     = color[self_color_idx];
 
   // find the minimum edge associated per row
   // each thread in warp holds the minimum edge for
@@ -56,20 +62,20 @@ __global__ void kernel_min_edge_per_vertex(
   if (warp_id < v) {
     // one row is associated with one warp
     edge_t row_start = offsets[warp_id];
-    edge_t row_end = offsets[warp_id + 1];
+    edge_t row_end   = offsets[warp_id + 1];
 
     // assuming one warp per row
     // find min for each thread in warp
     for (edge_t e = row_start + lane_id; e < row_end; e += 32) {
       alteration_t curr_edge_weight = weights[e];
-      vertex_t successor_color_idx = color_index[indices[e]];
-      vertex_t successor_color = color[successor_color_idx];
+      vertex_t successor_color_idx  = color_index[indices[e]];
+      vertex_t successor_color      = color[successor_color_idx];
 
       if (!mst_edge[e] && self_color != successor_color) {
         if (curr_edge_weight < min_edge_weight[lane_id]) {
-          min_color[lane_id] = successor_color;
+          min_color[lane_id]       = successor_color;
           min_edge_weight[lane_id] = curr_edge_weight;
-          min_edge_index[lane_id] = e;
+          min_edge_index[lane_id]  = e;
         }
       }
     }
@@ -82,9 +88,9 @@ __global__ void kernel_min_edge_per_vertex(
   for (int offset = 16; offset > 0; offset >>= 1) {
     if (lane_id < offset) {
       if (min_edge_weight[lane_id] > min_edge_weight[lane_id + offset]) {
-        min_color[lane_id] = min_color[lane_id + offset];
+        min_color[lane_id]       = min_color[lane_id + offset];
         min_edge_weight[lane_id] = min_edge_weight[lane_id + offset];
-        min_edge_index[lane_id] = min_edge_index[lane_id + offset];
+        min_edge_index[lane_id]  = min_edge_index[lane_id + offset];
       }
     }
     __syncthreads();
@@ -102,19 +108,26 @@ __global__ void kernel_min_edge_per_vertex(
   }
 }
 
-template <typename vertex_t, typename edge_t, typename weight_t,
-          typename alteration_t>
-__global__ void min_edge_per_supervertex(
-  const vertex_t* color, const vertex_t* color_index, edge_t* new_mst_edge,
-  bool* mst_edge, const vertex_t* indices, const weight_t* weights,
-  const alteration_t* altered_weights, vertex_t* temp_src, vertex_t* temp_dst,
-  weight_t* temp_weights, const alteration_t* min_edge_color, const vertex_t v,
-  bool symmetrize_output) {
+template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
+__global__ void min_edge_per_supervertex(const vertex_t* color,
+                                         const vertex_t* color_index,
+                                         edge_t* new_mst_edge,
+                                         bool* mst_edge,
+                                         const vertex_t* indices,
+                                         const weight_t* weights,
+                                         const alteration_t* altered_weights,
+                                         vertex_t* temp_src,
+                                         vertex_t* temp_dst,
+                                         weight_t* temp_weights,
+                                         const alteration_t* min_edge_color,
+                                         const vertex_t v,
+                                         bool symmetrize_output)
+{
   auto tid = get_1D_idx<vertex_t>();
   if (tid < v) {
     vertex_t vertex_color_idx = color_index[tid];
-    vertex_t vertex_color = color[vertex_color_idx];
-    edge_t edge_idx = new_mst_edge[tid];
+    vertex_t vertex_color     = color[vertex_color_idx];
+    edge_t edge_idx           = new_mst_edge[tid];
 
     // check if valid outgoing edge was found
     // find minimum edge is same as minimum edge of whole supervertex
@@ -129,32 +142,27 @@ __global__ void min_edge_per_supervertex(
         auto dst = indices[edge_idx];
         if (!symmetrize_output) {
           auto dst_edge_idx = new_mst_edge[dst];
-          auto dst_color = color[color_index[dst]];
+          auto dst_color    = color[color_index[dst]];
 
           // vertices added each other
           // only if destination has found an edge
           // the edge points back to source
           // the edge is minimum edge found for dst color
-          if (dst_edge_idx != std::numeric_limits<edge_t>::max() &&
-              indices[dst_edge_idx] == tid &&
+          if (dst_edge_idx != std::numeric_limits<edge_t>::max() && indices[dst_edge_idx] == tid &&
               min_edge_color[dst_color] == altered_weights[dst_edge_idx]) {
-            if (vertex_color > dst_color) {
-              add_edge = false;
-            }
+            if (vertex_color > dst_color) { add_edge = false; }
           }
         }
 
         if (add_edge) {
-          temp_src[tid] = tid;
-          temp_dst[tid] = dst;
-          temp_weights[tid] = weights[edge_idx];
+          temp_src[tid]      = tid;
+          temp_dst[tid]      = dst;
+          temp_weights[tid]  = weights[edge_idx];
           mst_edge[edge_idx] = true;
         }
       }
 
-      if (!add_edge) {
-        new_mst_edge[tid] = std::numeric_limits<edge_t>::max();
-      }
+      if (!add_edge) { new_mst_edge[tid] = std::numeric_limits<edge_t>::max(); }
     }
   }
 }
@@ -162,9 +170,13 @@ __global__ void min_edge_per_supervertex(
 template <typename vertex_t, typename edge_t, typename weight_t>
 __global__ void add_reverse_edge(const edge_t* new_mst_edge,
                                  const vertex_t* indices,
-                                 const weight_t* weights, vertex_t* temp_src,
-                                 vertex_t* temp_dst, weight_t* temp_weights,
-                                 const vertex_t v, bool symmetrize_output) {
+                                 const weight_t* weights,
+                                 vertex_t* temp_src,
+                                 vertex_t* temp_dst,
+                                 weight_t* temp_weights,
+                                 const vertex_t v,
+                                 bool symmetrize_output)
+{
   auto tid = get_1D_idx<vertex_t>();
 
   if (tid < v) {
@@ -186,9 +198,7 @@ __global__ void add_reverse_edge(const edge_t* new_mst_edge,
 
           // if vertices did not pick each other
           // add a reverse edge
-          if (tid != neighbor_vertex_neighbor) {
-            reverse_needed = true;
-          }
+          if (tid != neighbor_vertex_neighbor) { reverse_needed = true; }
         }
       }
 
@@ -197,8 +207,8 @@ __global__ void add_reverse_edge(const edge_t* new_mst_edge,
         // it is assumed the each vertex only picks one valid min edge
         // per cycle
         // hence, we store at index tid + v for the reverse edge scenario
-        temp_src[tid + v] = neighbor_vertex;
-        temp_dst[tid + v] = tid;
+        temp_src[tid + v]     = neighbor_vertex;
+        temp_dst[tid + v]     = tid;
         temp_weights[tid + v] = weights[edge_idx];
       }
     }
@@ -207,11 +217,13 @@ __global__ void add_reverse_edge(const edge_t* new_mst_edge,
 
 // executes for newly added mst edges and updates the colors of both vertices to the lower color
 template <typename vertex_t, typename edge_t>
-__global__ void min_pair_colors(const vertex_t v, const vertex_t* indices,
+__global__ void min_pair_colors(const vertex_t v,
+                                const vertex_t* indices,
                                 const edge_t* new_mst_edge,
                                 const vertex_t* color,
                                 const vertex_t* color_index,
-                                vertex_t* next_color) {
+                                vertex_t* next_color)
+{
   auto i = get_1D_idx<vertex_t>();
 
   if (i < v) {
@@ -220,9 +232,9 @@ __global__ void min_pair_colors(const vertex_t v, const vertex_t* indices,
     if (edge_idx != std::numeric_limits<edge_t>::max()) {
       vertex_t neighbor_vertex = indices[edge_idx];
       // vertex_t self_color = color[i];
-      vertex_t self_color_idx = color_index[i];
-      vertex_t self_color = color[self_color_idx];
-      vertex_t neighbor_color_idx = color_index[neighbor_vertex];
+      vertex_t self_color_idx       = color_index[i];
+      vertex_t self_color           = color[self_color_idx];
+      vertex_t neighbor_color_idx   = color_index[neighbor_vertex];
       vertex_t neighbor_super_color = color[neighbor_color_idx];
 
       // update my own color as source of edge
@@ -238,33 +250,36 @@ __global__ void min_pair_colors(const vertex_t v, const vertex_t* indices,
 
 // for each vertex, update color if it was changed in min_pair_colors kernel
 template <typename vertex_t>
-__global__ void update_colors(const vertex_t v, vertex_t* color,
+__global__ void update_colors(const vertex_t v,
+                              vertex_t* color,
                               const vertex_t* color_index,
-                              const vertex_t* next_color, bool* done) {
+                              const vertex_t* next_color,
+                              bool* done)
+{
   auto i = get_1D_idx<vertex_t>();
 
   if (i < v) {
-    vertex_t self_color = color[i];
+    vertex_t self_color     = color[i];
     vertex_t self_color_idx = color_index[i];
-    vertex_t new_color = next_color[self_color_idx];
+    vertex_t new_color      = next_color[self_color_idx];
 
     // update self color to new smaller color
     if (self_color > new_color) {
       color[i] = new_color;
-      *done = false;
+      *done    = false;
     }
   }
 }
 
 // point vertices to their final color index
 template <typename vertex_t>
-__global__ void final_color_indices(const vertex_t v, const vertex_t* color,
-                                    vertex_t* color_index) {
+__global__ void final_color_indices(const vertex_t v, const vertex_t* color, vertex_t* color_index)
+{
   auto i = get_1D_idx<vertex_t>();
 
   if (i < v) {
     vertex_t self_color_idx = color_index[i];
-    vertex_t self_color = color[self_color_idx];
+    vertex_t self_color     = color[self_color_idx];
 
     // if self color is not equal to self color index,
     // it means self is not supervertex
@@ -272,7 +287,7 @@ __global__ void final_color_indices(const vertex_t v, const vertex_t* color,
     // parent supervertex
     while (self_color_idx != self_color) {
       self_color_idx = color_index[self_color];
-      self_color = color[self_color_idx];
+      self_color     = color[self_color_idx];
     }
 
     // point to new supervertex
@@ -282,22 +297,23 @@ __global__ void final_color_indices(const vertex_t v, const vertex_t* color,
 
 // Alterate the weights, make all undirected edge weight unique while keeping Wuv == Wvu
 // Consider using curand device API instead of precomputed random_values array
-template <typename vertex_t, typename edge_t, typename weight_t,
-          typename alteration_t>
-__global__ void alteration_kernel(const vertex_t v, const edge_t e,
+template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
+__global__ void alteration_kernel(const vertex_t v,
+                                  const edge_t e,
                                   const edge_t* offsets,
                                   const vertex_t* indices,
-                                  const weight_t* weights, alteration_t max,
+                                  const weight_t* weights,
+                                  alteration_t max,
                                   alteration_t* random_values,
-                                  alteration_t* altered_weights) {
+                                  alteration_t* altered_weights)
+{
   auto row = get_1D_idx<vertex_t>();
   if (row < v) {
     auto row_begin = offsets[row];
-    auto row_end = offsets[row + 1];
+    auto row_end   = offsets[row + 1];
     for (auto i = row_begin; i < row_end; i++) {
-      auto column = indices[i];
-      altered_weights[i] =
-        weights[i] + max * (random_values[row] + random_values[column]);
+      auto column        = indices[i];
+      altered_weights[i] = weights[i] + max * (random_values[row] + random_values[column]);
     }
   }
 }
@@ -305,17 +321,15 @@ __global__ void alteration_kernel(const vertex_t v, const edge_t e,
 template <typename vertex_t, typename edge_t>
 __global__ void kernel_count_new_mst_edges(const vertex_t* mst_src,
                                            edge_t* mst_edge_count,
-                                           const vertex_t v) {
+                                           const vertex_t v)
+{
   auto tid = get_1D_idx<vertex_t>();
 
   // count number of new mst edges added
-  bool predicate =
-    tid < v && (mst_src[tid] != std::numeric_limits<vertex_t>::max());
+  bool predicate       = tid < v && (mst_src[tid] != std::numeric_limits<vertex_t>::max());
   vertex_t block_count = __syncthreads_count(predicate);
 
-  if (threadIdx.x == 0 && block_count > 0) {
-    atomicAdd(mst_edge_count, block_count);
-  }
+  if (threadIdx.x == 0 && block_count > 0) { atomicAdd(mst_edge_count, block_count); }
 }
 
 }  // namespace detail
diff --git a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh
index c5ba4fcb4f..158f4cc314 100644
--- a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh
+++ b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh
@@ -46,21 +46,30 @@ typedef std::chrono::high_resolution_clock Clock;
 
 // curand generator uniform
 inline curandStatus_t curand_generate_uniformX(curandGenerator_t generator,
-                                               float* outputPtr, size_t n) {
+                                               float* outputPtr,
+                                               size_t n)
+{
   return curandGenerateUniform(generator, outputPtr, n);
 }
 inline curandStatus_t curand_generate_uniformX(curandGenerator_t generator,
-                                               double* outputPtr, size_t n) {
+                                               double* outputPtr,
+                                               size_t n)
+{
   return curandGenerateUniformDouble(generator, outputPtr, n);
 }
 
-template <typename vertex_t, typename edge_t, typename weight_t,
-          typename alteration_t>
-MST_solver<vertex_t, edge_t, weight_t, alteration_t>::MST_solver(
-  const raft::handle_t& handle_, const edge_t* offsets_,
-  const vertex_t* indices_, const weight_t* weights_, const vertex_t v_,
-  const edge_t e_, vertex_t* color_, cudaStream_t stream_,
-  bool symmetrize_output_, bool initialize_colors_, int iterations_)
+template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
+MST_solver<vertex_t, edge_t, weight_t, alteration_t>::MST_solver(const raft::handle_t& handle_,
+                                                                 const edge_t* offsets_,
+                                                                 const vertex_t* indices_,
+                                                                 const weight_t* weights_,
+                                                                 const vertex_t v_,
+                                                                 const edge_t e_,
+                                                                 vertex_t* color_,
+                                                                 cudaStream_t stream_,
+                                                                 bool symmetrize_output_,
+                                                                 bool initialize_colors_,
+                                                                 int iterations_)
   : handle(handle_),
     offsets(offsets_),
     indices(indices_),
@@ -82,12 +91,13 @@ MST_solver<vertex_t, edge_t, weight_t, alteration_t>::MST_solver(
     stream(stream_),
     symmetrize_output(symmetrize_output_),
     initialize_colors(initialize_colors_),
-    iterations(iterations_) {
-  max_blocks = handle_.get_device_properties().maxGridSize[0];
+    iterations(iterations_)
+{
+  max_blocks  = handle_.get_device_properties().maxGridSize[0];
   max_threads = handle_.get_device_properties().maxThreadsPerBlock;
-  sm_count = handle_.get_device_properties().multiProcessorCount;
+  sm_count    = handle_.get_device_properties().multiProcessorCount;
 
-  //Initially, color holds the vertex id as color
+  // Initially, color holds the vertex id as color
   auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream});
   if (initialize_colors_) {
     thrust::sequence(policy, color.begin(), color.end(), 0);
@@ -98,10 +108,10 @@ MST_solver<vertex_t, edge_t, weight_t, alteration_t>::MST_solver(
   thrust::sequence(policy, next_color.begin(), next_color.end(), 0);
 }
 
-template <typename vertex_t, typename edge_t, typename weight_t,
-          typename alteration_t>
+template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
 raft::Graph_COO<vertex_t, edge_t, weight_t>
-MST_solver<vertex_t, edge_t, weight_t, alteration_t>::solve() {
+MST_solver<vertex_t, edge_t, weight_t, alteration_t>::solve()
+{
   RAFT_EXPECTS(v > 0, "0 vertices");
   RAFT_EXPECTS(e > 0, "0 edges");
   RAFT_EXPECTS(offsets != nullptr, "Null offsets.");
@@ -114,12 +124,13 @@ MST_solver<vertex_t, edge_t, weight_t, alteration_t>::solve() {
 
   // Alterating the weights
   // this is done by identifying the lowest cost edge weight gap that is not 0, call this theta.
-  // For each edge, add noise that is less than theta. That is, generate a random number in the range [0.0, theta) and add it to each edge weight.
+  // For each edge, add noise that is less than theta. That is, generate a random number in the
+  // range [0.0, theta) and add it to each edge weight.
   alteration();
 
 #ifdef MST_TIME
   auto stop = Clock::now();
-  timer0 = duration_us(stop - start);
+  timer0    = duration_us(stop - start);
 #endif
 
   auto max_mst_edges = symmetrize_output ? 2 * v - 2 : v - 1;
@@ -168,8 +179,8 @@ MST_solver<vertex_t, edge_t, weight_t, alteration_t>::solve() {
     if (curr_mst_edge_count == prev_mst_edge_count[0]) {
 #ifdef MST_TIME
       std::cout << "Iterations: " << i << std::endl;
-      std::cout << timer0 << "," << timer1 << "," << timer2 << "," << timer3
-                << "," << timer4 << "," << timer5 << std::endl;
+      std::cout << timer0 << "," << timer1 << "," << timer2 << "," << timer3 << "," << timer4 << ","
+                << timer5 << std::endl;
 #endif
       // exit here when reaching steady state
       break;
@@ -179,8 +190,7 @@ MST_solver<vertex_t, edge_t, weight_t, alteration_t>::solve() {
     start = Clock::now();
 #endif
     // append the newly found MST edges to the final output
-    append_src_dst_pair(mst_result.src.data(), mst_result.dst.data(),
-                        mst_result.weights.data());
+    append_src_dst_pair(mst_result.src.data(), mst_result.dst.data(), mst_result.weights.data());
 #ifdef MST_TIME
     stop = Clock::now();
     timer4 += duration_us(stop - start);
@@ -201,7 +211,7 @@ MST_solver<vertex_t, edge_t, weight_t, alteration_t>::solve() {
 
   // result packaging
   thrust::host_vector<edge_t> host_mst_edge_count = mst_edge_count;
-  mst_result.n_edges = host_mst_edge_count[0];
+  mst_result.n_edges                              = host_mst_edge_count[0];
   mst_result.src.resize(mst_result.n_edges, stream);
   mst_result.dst.resize(mst_result.n_edges, stream);
   mst_result.weights.resize(mst_result.n_edges, stream);
@@ -212,50 +222,46 @@ MST_solver<vertex_t, edge_t, weight_t, alteration_t>::solve() {
 // ||y|-|x||
 template <typename weight_t>
 struct alteration_functor {
-  __host__ __device__ weight_t
-  operator()(const thrust::tuple<weight_t, weight_t>& t) {
+  __host__ __device__ weight_t operator()(const thrust::tuple<weight_t, weight_t>& t)
+  {
     auto x = thrust::get<0>(t);
     auto y = thrust::get<1>(t);
-    x = x < 0 ? -x : x;
-    y = y < 0 ? -y : y;
+    x      = x < 0 ? -x : x;
+    y      = y < 0 ? -y : y;
     return x < y ? y - x : x - y;
   }
 };
 
 // Compute the uper bound for the alteration
-template <typename vertex_t, typename edge_t, typename weight_t,
-          typename alteration_t>
-alteration_t
-MST_solver<vertex_t, edge_t, weight_t, alteration_t>::alteration_max() {
+template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
+alteration_t MST_solver<vertex_t, edge_t, weight_t, alteration_t>::alteration_max()
+{
   auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream});
   rmm::device_vector<weight_t> tmp(e);
   thrust::device_ptr<const weight_t> weights_ptr(weights);
   thrust::copy(policy, weights_ptr, weights_ptr + e, tmp.begin());
-  //sort tmp weights
+  // sort tmp weights
   thrust::sort(policy, tmp.begin(), tmp.end());
 
-  //remove duplicates
+  // remove duplicates
   auto new_end = thrust::unique(policy, tmp.begin(), tmp.end());
 
-  //min(a[i+1]-a[i])/2
-  auto begin =
-    thrust::make_zip_iterator(thrust::make_tuple(tmp.begin(), tmp.begin() + 1));
-  auto end =
-    thrust::make_zip_iterator(thrust::make_tuple(new_end - 1, new_end));
-  auto init = tmp[1] - tmp[0];
-  auto max =
-    thrust::transform_reduce(policy, begin, end, alteration_functor<weight_t>(),
-                             init, thrust::minimum<weight_t>());
+  // min(a[i+1]-a[i])/2
+  auto begin = thrust::make_zip_iterator(thrust::make_tuple(tmp.begin(), tmp.begin() + 1));
+  auto end   = thrust::make_zip_iterator(thrust::make_tuple(new_end - 1, new_end));
+  auto init  = tmp[1] - tmp[0];
+  auto max   = thrust::transform_reduce(
+    policy, begin, end, alteration_functor<weight_t>(), init, thrust::minimum<weight_t>());
   return max / static_cast<alteration_t>(2);
 }
 
 // Compute the alteration to make all undirected edge weight unique
 // Preserves weights order
-template <typename vertex_t, typename edge_t, typename weight_t,
-          typename alteration_t>
-void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::alteration() {
+template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
+void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::alteration()
+{
   auto nthreads = std::min(v, max_threads);
-  auto nblocks = std::min((v + nthreads - 1) / nthreads, max_blocks);
+  auto nblocks  = std::min((v + nthreads - 1) / nthreads, max_blocks);
 
   // maximum alteration that does not change realtive weights order
   alteration_t max = alteration_max();
@@ -269,35 +275,32 @@ void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::alteration() {
   curandSetPseudoRandomGeneratorSeed(randGen, 1234567);
 
   // Initialize rand values
-  auto curand_status =
-    curand_generate_uniformX(randGen, rand_values.data().get(), v);
+  auto curand_status = curand_generate_uniformX(randGen, rand_values.data().get(), v);
   RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS, "MST: CURAND failed");
   curand_status = curandDestroyGenerator(randGen);
-  RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS,
-               "MST: CURAND cleanup failed");
+  RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS, "MST: CURAND cleanup failed");
 
-  //Alterate the weights, make all undirected edge weight unique while keeping Wuv == Wvu
+  // Alterate the weights, make all undirected edge weight unique while keeping Wuv == Wvu
   detail::alteration_kernel<<<nblocks, nthreads, 0, stream>>>(
-    v, e, offsets, indices, weights, max, rand_values.data().get(),
-    altered_weights.data().get());
+    v, e, offsets, indices, weights, max, rand_values.data().get(), altered_weights.data().get());
 }
 
 // updates colors of vertices by propagating the lower color to the higher
-template <typename vertex_t, typename edge_t, typename weight_t,
-          typename alteration_t>
-void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::label_prop(
-  vertex_t* mst_src, vertex_t* mst_dst) {
+template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
+void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::label_prop(vertex_t* mst_src,
+                                                                      vertex_t* mst_dst)
+{
   // update the colors of both ends its until there is no change in colors
   thrust::host_vector<edge_t> curr_mst_edge_count = mst_edge_count;
 
   auto min_pair_nthreads = std::min(v, (vertex_t)max_threads);
-  auto min_pair_nblocks = std::min(
-    (v + min_pair_nthreads - 1) / min_pair_nthreads, (vertex_t)max_blocks);
+  auto min_pair_nblocks =
+    std::min((v + min_pair_nthreads - 1) / min_pair_nthreads, (vertex_t)max_blocks);
 
   rmm::device_vector<bool> done(1, false);
 
   edge_t* new_mst_edge_ptr = new_mst_edge.data().get();
-  vertex_t* color_ptr = color.data().get();
+  vertex_t* color_ptr      = color.data().get();
   vertex_t* next_color_ptr = next_color.data().get();
 
   bool* done_ptr = done.data().get();
@@ -314,84 +317,99 @@ void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::label_prop(
     i++;
   }
 
-  detail::
-    final_color_indices<<<min_pair_nblocks, min_pair_nthreads, 0, stream>>>(
-      v, color_ptr, color_index);
+  detail::final_color_indices<<<min_pair_nblocks, min_pair_nthreads, 0, stream>>>(
+    v, color_ptr, color_index);
 #ifdef MST_TIME
   std::cout << "Label prop iterations: " << i << std::endl;
 #endif
 }
 
 // Finds the minimum edge from each vertex to the lowest color
-template <typename vertex_t, typename edge_t, typename weight_t,
-          typename alteration_t>
-void MST_solver<vertex_t, edge_t, weight_t,
-                alteration_t>::min_edge_per_vertex() {
+template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
+void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::min_edge_per_vertex()
+{
   auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream});
-  thrust::fill(policy, min_edge_color.begin(), min_edge_color.end(),
-               std::numeric_limits<alteration_t>::max());
-  thrust::fill(policy, new_mst_edge.begin(), new_mst_edge.end(),
-               std::numeric_limits<weight_t>::max());
+  thrust::fill(
+    policy, min_edge_color.begin(), min_edge_color.end(), std::numeric_limits<alteration_t>::max());
+  thrust::fill(
+    policy, new_mst_edge.begin(), new_mst_edge.end(), std::numeric_limits<weight_t>::max());
 
   int n_threads = 32;
 
-  vertex_t* color_ptr = color.data().get();
-  edge_t* new_mst_edge_ptr = new_mst_edge.data().get();
-  bool* mst_edge_ptr = mst_edge.data().get();
-  alteration_t* min_edge_color_ptr = min_edge_color.data().get();
+  vertex_t* color_ptr               = color.data().get();
+  edge_t* new_mst_edge_ptr          = new_mst_edge.data().get();
+  bool* mst_edge_ptr                = mst_edge.data().get();
+  alteration_t* min_edge_color_ptr  = min_edge_color.data().get();
   alteration_t* altered_weights_ptr = altered_weights.data().get();
 
-  detail::kernel_min_edge_per_vertex<<<v, n_threads, 0, stream>>>(
-    offsets, indices, altered_weights_ptr, color_ptr, color_index,
-    new_mst_edge_ptr, mst_edge_ptr, min_edge_color_ptr, v);
+  detail::kernel_min_edge_per_vertex<<<v, n_threads, 0, stream>>>(offsets,
+                                                                  indices,
+                                                                  altered_weights_ptr,
+                                                                  color_ptr,
+                                                                  color_index,
+                                                                  new_mst_edge_ptr,
+                                                                  mst_edge_ptr,
+                                                                  min_edge_color_ptr,
+                                                                  v);
 }
 
 // Finds the minimum edge from each supervertex to the lowest color
-template <typename vertex_t, typename edge_t, typename weight_t,
-          typename alteration_t>
-void MST_solver<vertex_t, edge_t, weight_t,
-                alteration_t>::min_edge_per_supervertex() {
+template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
+void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::min_edge_per_supervertex()
+{
   auto nthreads = std::min(v, max_threads);
-  auto nblocks = std::min((v + nthreads - 1) / nthreads, max_blocks);
+  auto nblocks  = std::min((v + nthreads - 1) / nthreads, max_blocks);
 
   auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream});
-  thrust::fill(policy, temp_src.begin(), temp_src.end(),
-               std::numeric_limits<vertex_t>::max());
+  thrust::fill(policy, temp_src.begin(), temp_src.end(), std::numeric_limits<vertex_t>::max());
 
-  vertex_t* color_ptr = color.data().get();
-  edge_t* new_mst_edge_ptr = new_mst_edge.data().get();
-  bool* mst_edge_ptr = mst_edge.data().get();
-  alteration_t* min_edge_color_ptr = min_edge_color.data().get();
+  vertex_t* color_ptr               = color.data().get();
+  edge_t* new_mst_edge_ptr          = new_mst_edge.data().get();
+  bool* mst_edge_ptr                = mst_edge.data().get();
+  alteration_t* min_edge_color_ptr  = min_edge_color.data().get();
   alteration_t* altered_weights_ptr = altered_weights.data().get();
-  vertex_t* temp_src_ptr = temp_src.data().get();
-  vertex_t* temp_dst_ptr = temp_dst.data().get();
-  weight_t* temp_weights_ptr = temp_weights.data().get();
-
-  detail::min_edge_per_supervertex<<<nblocks, nthreads, 0, stream>>>(
-    color_ptr, color_index, new_mst_edge_ptr, mst_edge_ptr, indices, weights,
-    altered_weights_ptr, temp_src_ptr, temp_dst_ptr, temp_weights_ptr,
-    min_edge_color_ptr, v, symmetrize_output);
+  vertex_t* temp_src_ptr            = temp_src.data().get();
+  vertex_t* temp_dst_ptr            = temp_dst.data().get();
+  weight_t* temp_weights_ptr        = temp_weights.data().get();
+
+  detail::min_edge_per_supervertex<<<nblocks, nthreads, 0, stream>>>(color_ptr,
+                                                                     color_index,
+                                                                     new_mst_edge_ptr,
+                                                                     mst_edge_ptr,
+                                                                     indices,
+                                                                     weights,
+                                                                     altered_weights_ptr,
+                                                                     temp_src_ptr,
+                                                                     temp_dst_ptr,
+                                                                     temp_weights_ptr,
+                                                                     min_edge_color_ptr,
+                                                                     v,
+                                                                     symmetrize_output);
 
   // the above kernel only adds directed mst edges in the case where
   // a pair of vertices don't pick the same min edge between them
   // so, now we add the reverse edge to make it undirected
   if (symmetrize_output) {
-    detail::add_reverse_edge<<<nblocks, nthreads, 0, stream>>>(
-      new_mst_edge_ptr, indices, weights, temp_src_ptr, temp_dst_ptr,
-      temp_weights_ptr, v, symmetrize_output);
+    detail::add_reverse_edge<<<nblocks, nthreads, 0, stream>>>(new_mst_edge_ptr,
+                                                               indices,
+                                                               weights,
+                                                               temp_src_ptr,
+                                                               temp_dst_ptr,
+                                                               temp_weights_ptr,
+                                                               v,
+                                                               symmetrize_output);
   }
 }
 
-template <typename vertex_t, typename edge_t, typename weight_t,
-          typename alteration_t>
-void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::check_termination() {
+template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
+void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::check_termination()
+{
   vertex_t nthreads = std::min(2 * v, (vertex_t)max_threads);
-  vertex_t nblocks =
-    std::min((2 * v + nthreads - 1) / nthreads, (vertex_t)max_blocks);
+  vertex_t nblocks  = std::min((2 * v + nthreads - 1) / nthreads, (vertex_t)max_blocks);
 
   // count number of new mst edges
   edge_t* mst_edge_count_ptr = mst_edge_count.data().get();
-  vertex_t* temp_src_ptr = temp_src.data().get();
+  vertex_t* temp_src_ptr     = temp_src.data().get();
 
   detail::kernel_count_new_mst_edges<<<nblocks, nthreads, 0, stream>>>(
     temp_src_ptr, mst_edge_count_ptr, 2 * v);
@@ -399,36 +417,40 @@ void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::check_termination() {
 
 template <typename vertex_t, typename weight_t>
 struct new_edges_functor {
-  __host__ __device__ bool operator()(
-    const thrust::tuple<vertex_t, vertex_t, weight_t>& t) {
+  __host__ __device__ bool operator()(const thrust::tuple<vertex_t, vertex_t, weight_t>& t)
+  {
     auto src = thrust::get<0>(t);
 
     return src != std::numeric_limits<vertex_t>::max() ? true : false;
   }
 };
 
-template <typename vertex_t, typename edge_t, typename weight_t,
-          typename alteration_t>
+template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
 void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::append_src_dst_pair(
-  vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights) {
+  vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights)
+{
   auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream});
 
   auto curr_mst_edge_count = prev_mst_edge_count[0];
 
   // iterator to end of mst edges added to final output in previous iteration
-  auto src_dst_zip_end = thrust::make_zip_iterator(thrust::make_tuple(
-    mst_src + curr_mst_edge_count, mst_dst + curr_mst_edge_count,
-    mst_weights + curr_mst_edge_count));
+  auto src_dst_zip_end =
+    thrust::make_zip_iterator(thrust::make_tuple(mst_src + curr_mst_edge_count,
+                                                 mst_dst + curr_mst_edge_count,
+                                                 mst_weights + curr_mst_edge_count));
 
   // iterator to new mst edges found
-  auto temp_src_dst_zip_begin = thrust::make_zip_iterator(thrust::make_tuple(
-    temp_src.begin(), temp_dst.begin(), temp_weights.begin()));
+  auto temp_src_dst_zip_begin = thrust::make_zip_iterator(
+    thrust::make_tuple(temp_src.begin(), temp_dst.begin(), temp_weights.begin()));
   auto temp_src_dst_zip_end = thrust::make_zip_iterator(
     thrust::make_tuple(temp_src.end(), temp_dst.end(), temp_weights.end()));
 
   // copy new mst edges to final output
-  thrust::copy_if(policy, temp_src_dst_zip_begin, temp_src_dst_zip_end,
-                  src_dst_zip_end, new_edges_functor<vertex_t, weight_t>());
+  thrust::copy_if(policy,
+                  temp_src_dst_zip_begin,
+                  temp_src_dst_zip_end,
+                  src_dst_zip_end,
+                  new_edges_functor<vertex_t, weight_t>());
 }
 
 }  // namespace mst
diff --git a/cpp/include/raft/sparse/mst/detail/utils.cuh b/cpp/include/raft/sparse/mst/detail/utils.cuh
index 8f755de459..24127c993f 100644
--- a/cpp/include/raft/sparse/mst/detail/utils.cuh
+++ b/cpp/include/raft/sparse/mst/detail/utils.cuh
@@ -26,32 +26,29 @@ namespace mst {
 namespace detail {
 
 template <typename idx_t>
-__device__ idx_t get_1D_idx() {
+__device__ idx_t get_1D_idx()
+{
   return blockIdx.x * blockDim.x + threadIdx.x;
 }
 
 // somewhat smart vector print
 template <typename T>
-void printv(rmm::device_vector<T>& vec, const std::string& name = "",
-            const size_t displ = 5) {
+void printv(rmm::device_vector<T>& vec, const std::string& name = "", const size_t displ = 5)
+{
 #ifdef MST_TIME
   std::cout.precision(15);
   std::cout << name << " size = " << vec.size() << std::endl;
   if (displ < vec.size()) {
-    thrust::copy(vec.begin(), vec.begin() + displ,
-                 std::ostream_iterator<T>(std::cout, " "));
+    thrust::copy(vec.begin(), vec.begin() + displ, std::ostream_iterator<T>(std::cout, " "));
     std::cout << " ... ";
-    thrust::copy(vec.end() - displ, vec.end(),
-                 std::ostream_iterator<T>(std::cout, " "));
+    thrust::copy(vec.end() - displ, vec.end(), std::ostream_iterator<T>(std::cout, " "));
   } else {
-    thrust::copy(vec.begin(), vec.end(),
-                 std::ostream_iterator<T>(std::cout, " "));
+    thrust::copy(vec.begin(), vec.end(), std::ostream_iterator<T>(std::cout, " "));
   }
   std::cout << std::endl << std::endl;
 #endif
 }
-#define duration_us(a) \
-  std::chrono::duration_cast<std::chrono::microseconds>(a).count()
+#define duration_us(a) std::chrono::duration_cast<std::chrono::microseconds>(a).count()
 
 }  // namespace detail
 }  // namespace mst
diff --git a/cpp/include/raft/sparse/mst/mst.cuh b/cpp/include/raft/sparse/mst/mst.cuh
index 10c981445e..b49003467b 100644
--- a/cpp/include/raft/sparse/mst/mst.cuh
+++ b/cpp/include/raft/sparse/mst/mst.cuh
@@ -22,16 +22,30 @@
 namespace raft {
 namespace mst {
 
-template <typename vertex_t, typename edge_t, typename weight_t,
-          typename alteration_t = weight_t>
-raft::Graph_COO<vertex_t, edge_t, weight_t> mst(
-  const raft::handle_t& handle, edge_t const* offsets, vertex_t const* indices,
-  weight_t const* weights, vertex_t const v, edge_t const e, vertex_t* color,
-  cudaStream_t stream, bool symmetrize_output = true,
-  bool initialize_colors = true, int iterations = 0) {
-  MST_solver<vertex_t, edge_t, weight_t, alteration_t> mst_solver(
-    handle, offsets, indices, weights, v, e, color, stream, symmetrize_output,
-    initialize_colors, iterations);
+template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t = weight_t>
+raft::Graph_COO<vertex_t, edge_t, weight_t> mst(const raft::handle_t& handle,
+                                                edge_t const* offsets,
+                                                vertex_t const* indices,
+                                                weight_t const* weights,
+                                                vertex_t const v,
+                                                edge_t const e,
+                                                vertex_t* color,
+                                                cudaStream_t stream,
+                                                bool symmetrize_output = true,
+                                                bool initialize_colors = true,
+                                                int iterations         = 0)
+{
+  MST_solver<vertex_t, edge_t, weight_t, alteration_t> mst_solver(handle,
+                                                                  offsets,
+                                                                  indices,
+                                                                  weights,
+                                                                  v,
+                                                                  e,
+                                                                  color,
+                                                                  stream,
+                                                                  symmetrize_output,
+                                                                  initialize_colors,
+                                                                  iterations);
   return mst_solver.solve();
 }
 
diff --git a/cpp/include/raft/sparse/mst/mst_solver.cuh b/cpp/include/raft/sparse/mst/mst_solver.cuh
index 833882ea0d..e32bcfacac 100644
--- a/cpp/include/raft/sparse/mst/mst_solver.cuh
+++ b/cpp/include/raft/sparse/mst/mst_solver.cuh
@@ -31,20 +31,27 @@ struct Graph_COO {
   edge_t n_edges;
 
   Graph_COO(vertex_t size, cudaStream_t stream)
-    : src(size, stream), dst(size, stream), weights(size, stream) {}
+    : src(size, stream), dst(size, stream), weights(size, stream)
+  {
+  }
 };
 
 namespace mst {
 
-template <typename vertex_t, typename edge_t, typename weight_t,
-          typename alteration_t>
+template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
 class MST_solver {
  public:
-  MST_solver(const raft::handle_t& handle_, const edge_t* offsets_,
-             const vertex_t* indices_, const weight_t* weights_,
-             const vertex_t v_, const edge_t e_, vertex_t* color_,
-             cudaStream_t stream_, bool symmetrize_output_,
-             bool initialize_colors_, int iterations_);
+  MST_solver(const raft::handle_t& handle_,
+             const edge_t* offsets_,
+             const vertex_t* indices_,
+             const weight_t* weights_,
+             const vertex_t v_,
+             const edge_t e_,
+             vertex_t* color_,
+             cudaStream_t stream_,
+             bool symmetrize_output_,
+             bool initialize_colors_,
+             int iterations_);
 
   raft::Graph_COO<vertex_t, edge_t, weight_t> solve();
 
@@ -56,7 +63,7 @@ class MST_solver {
   bool symmetrize_output, initialize_colors;
   int iterations;
 
-  //CSR
+  // CSR
   const edge_t* offsets;
   const vertex_t* indices;
   const weight_t* weights;
@@ -67,20 +74,16 @@ class MST_solver {
   vertex_t max_threads;
   vertex_t sm_count;
 
-  vertex_t* color_index;  // represent each supervertex as a color
-  rmm::device_vector<alteration_t>
-    min_edge_color;  // minimum incident edge weight per color
-  rmm::device_vector<edge_t> new_mst_edge;  // new minimum edge per vertex
-  rmm::device_vector<alteration_t>
-    altered_weights;  // weights to be used for mst
+  vertex_t* color_index;                             // represent each supervertex as a color
+  rmm::device_vector<alteration_t> min_edge_color;   // minimum incident edge weight per color
+  rmm::device_vector<edge_t> new_mst_edge;           // new minimum edge per vertex
+  rmm::device_vector<alteration_t> altered_weights;  // weights to be used for mst
+  rmm::device_vector<edge_t> mst_edge_count;  // total number of edges added after every iteration
   rmm::device_vector<edge_t>
-    mst_edge_count;  // total number of edges added after every iteration
-  rmm::device_vector<edge_t>
-    prev_mst_edge_count;  // total number of edges up to the previous iteration
-  rmm::device_vector<bool>
-    mst_edge;  // mst output -  true if the edge belongs in mst
+    prev_mst_edge_count;                    // total number of edges up to the previous iteration
+  rmm::device_vector<bool> mst_edge;        // mst output -  true if the edge belongs in mst
   rmm::device_vector<vertex_t> next_color;  //  next iteration color
-  rmm::device_vector<vertex_t> color;  // index of color that vertex points to
+  rmm::device_vector<vertex_t> color;       // index of color that vertex points to
 
   // new src-dst pairs found per iteration
   rmm::device_vector<vertex_t> temp_src;
@@ -93,8 +96,7 @@ class MST_solver {
   void check_termination();
   void alteration();
   alteration_t alteration_max();
-  void append_src_dst_pair(vertex_t* mst_src, vertex_t* mst_dst,
-                           weight_t* mst_weights);
+  void append_src_dst_pair(vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights);
 };
 
 }  // namespace mst
diff --git a/cpp/include/raft/sparse/op/filter.cuh b/cpp/include/raft/sparse/op/filter.cuh
index 562d506cfe..397fecaaea 100644
--- a/cpp/include/raft/sparse/op/filter.cuh
+++ b/cpp/include/raft/sparse/op/filter.cuh
@@ -42,15 +42,23 @@ namespace sparse {
 namespace op {
 
 template <int TPB_X, typename T>
-__global__ void coo_remove_scalar_kernel(const int *rows, const int *cols,
-                                         const T *vals, int nnz, int *crows,
-                                         int *ccols, T *cvals, int *ex_scan,
-                                         int *cur_ex_scan, int m, T scalar) {
+__global__ void coo_remove_scalar_kernel(const int* rows,
+                                         const int* cols,
+                                         const T* vals,
+                                         int nnz,
+                                         int* crows,
+                                         int* ccols,
+                                         T* cvals,
+                                         int* ex_scan,
+                                         int* cur_ex_scan,
+                                         int m,
+                                         T scalar)
+{
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
 
   if (row < m) {
-    int start = cur_ex_scan[row];
-    int stop = get_stop_idx(row, m, nnz, cur_ex_scan);
+    int start       = cur_ex_scan[row];
+    int stop        = get_stop_idx(row, m, nnz, cur_ex_scan);
     int cur_out_idx = ex_scan[row];
 
     for (int idx = start; idx < stop; idx++) {
@@ -82,37 +90,51 @@ __global__ void coo_remove_scalar_kernel(const int *rows, const int *cols,
  * @param stream: cuda stream to use
  */
 template <int TPB_X, typename T>
-void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz,
-                       int *crows, int *ccols, T *cvals, int *cnnz,
-                       int *cur_cnnz, T scalar, int n,
+void coo_remove_scalar(const int* rows,
+                       const int* cols,
+                       const T* vals,
+                       int nnz,
+                       int* crows,
+                       int* ccols,
+                       T* cvals,
+                       int* cnnz,
+                       int* cur_cnnz,
+                       T scalar,
+                       int n,
                        std::shared_ptr<raft::mr::device::allocator> d_alloc,
-                       cudaStream_t stream) {
+                       cudaStream_t stream)
+{
   raft::mr::device::buffer<int> ex_scan(d_alloc, stream, n);
   raft::mr::device::buffer<int> cur_ex_scan(d_alloc, stream, n);
 
   CUDA_CHECK(cudaMemsetAsync(ex_scan.data(), 0, n * sizeof(int), stream));
   CUDA_CHECK(cudaMemsetAsync(cur_ex_scan.data(), 0, n * sizeof(int), stream));
 
-  thrust::device_ptr<int> dev_cnnz = thrust::device_pointer_cast(cnnz);
-  thrust::device_ptr<int> dev_ex_scan =
-    thrust::device_pointer_cast(ex_scan.data());
-  thrust::exclusive_scan(thrust::cuda::par.on(stream), dev_cnnz, dev_cnnz + n,
-                         dev_ex_scan);
+  thrust::device_ptr<int> dev_cnnz    = thrust::device_pointer_cast(cnnz);
+  thrust::device_ptr<int> dev_ex_scan = thrust::device_pointer_cast(ex_scan.data());
+  thrust::exclusive_scan(thrust::cuda::par.on(stream), dev_cnnz, dev_cnnz + n, dev_ex_scan);
   CUDA_CHECK(cudaPeekAtLastError());
 
-  thrust::device_ptr<int> dev_cur_cnnz = thrust::device_pointer_cast(cur_cnnz);
-  thrust::device_ptr<int> dev_cur_ex_scan =
-    thrust::device_pointer_cast(cur_ex_scan.data());
-  thrust::exclusive_scan(thrust::cuda::par.on(stream), dev_cur_cnnz,
-                         dev_cur_cnnz + n, dev_cur_ex_scan);
+  thrust::device_ptr<int> dev_cur_cnnz    = thrust::device_pointer_cast(cur_cnnz);
+  thrust::device_ptr<int> dev_cur_ex_scan = thrust::device_pointer_cast(cur_ex_scan.data());
+  thrust::exclusive_scan(
+    thrust::cuda::par.on(stream), dev_cur_cnnz, dev_cur_cnnz + n, dev_cur_ex_scan);
   CUDA_CHECK(cudaPeekAtLastError());
 
   dim3 grid(raft::ceildiv(n, TPB_X), 1, 1);
   dim3 blk(TPB_X, 1, 1);
 
-  coo_remove_scalar_kernel<TPB_X><<<grid, blk, 0, stream>>>(
-    rows, cols, vals, nnz, crows, ccols, cvals, dev_ex_scan.get(),
-    dev_cur_ex_scan.get(), n, scalar);
+  coo_remove_scalar_kernel<TPB_X><<<grid, blk, 0, stream>>>(rows,
+                                                            cols,
+                                                            vals,
+                                                            nnz,
+                                                            crows,
+                                                            ccols,
+                                                            cvals,
+                                                            dev_ex_scan.get(),
+                                                            dev_cur_ex_scan.get(),
+                                                            n,
+                                                            scalar);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -126,35 +148,44 @@ void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz,
  * @param stream: cuda stream to use
  */
 template <int TPB_X, typename T>
-void coo_remove_scalar(COO<T> *in, COO<T> *out, T scalar,
+void coo_remove_scalar(COO<T>* in,
+                       COO<T>* out,
+                       T scalar,
                        std::shared_ptr<raft::mr::device::allocator> d_alloc,
-                       cudaStream_t stream) {
+                       cudaStream_t stream)
+{
   raft::mr::device::buffer<int> row_count_nz(d_alloc, stream, in->n_rows);
   raft::mr::device::buffer<int> row_count(d_alloc, stream, in->n_rows);
 
-  CUDA_CHECK(
-    cudaMemsetAsync(row_count_nz.data(), 0, in->n_rows * sizeof(int), stream));
-  CUDA_CHECK(
-    cudaMemsetAsync(row_count.data(), 0, in->n_rows * sizeof(int), stream));
+  CUDA_CHECK(cudaMemsetAsync(row_count_nz.data(), 0, in->n_rows * sizeof(int), stream));
+  CUDA_CHECK(cudaMemsetAsync(row_count.data(), 0, in->n_rows * sizeof(int), stream));
 
   linalg::coo_degree<TPB_X>(in->rows(), in->nnz, row_count.data(), stream);
   CUDA_CHECK(cudaPeekAtLastError());
 
-  linalg::coo_degree_scalar<TPB_X>(in->rows(), in->vals(), in->nnz, scalar,
-                                   row_count_nz.data(), stream);
+  linalg::coo_degree_scalar<TPB_X>(
+    in->rows(), in->vals(), in->nnz, scalar, row_count_nz.data(), stream);
   CUDA_CHECK(cudaPeekAtLastError());
 
-  thrust::device_ptr<int> d_row_count_nz =
-    thrust::device_pointer_cast(row_count_nz.data());
-  int out_nnz = thrust::reduce(thrust::cuda::par.on(stream), d_row_count_nz,
-                               d_row_count_nz + in->n_rows);
+  thrust::device_ptr<int> d_row_count_nz = thrust::device_pointer_cast(row_count_nz.data());
+  int out_nnz =
+    thrust::reduce(thrust::cuda::par.on(stream), d_row_count_nz, d_row_count_nz + in->n_rows);
 
   out->allocate(out_nnz, in->n_rows, in->n_cols, false, stream);
 
-  coo_remove_scalar<TPB_X, T>(in->rows(), in->cols(), in->vals(), in->nnz,
-                              out->rows(), out->cols(), out->vals(),
-                              row_count_nz.data(), row_count.data(), scalar,
-                              in->n_rows, d_alloc, stream);
+  coo_remove_scalar<TPB_X, T>(in->rows(),
+                              in->cols(),
+                              in->vals(),
+                              in->nnz,
+                              out->rows(),
+                              out->cols(),
+                              out->vals(),
+                              row_count_nz.data(),
+                              row_count.data(),
+                              scalar,
+                              in->n_rows,
+                              d_alloc,
+                              stream);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -167,9 +198,11 @@ void coo_remove_scalar(COO<T> *in, COO<T> *out, T scalar,
  * @param stream: cuda stream to use
  */
 template <int TPB_X, typename T>
-void coo_remove_zeros(COO<T> *in, COO<T> *out,
+void coo_remove_zeros(COO<T>* in,
+                      COO<T>* out,
                       std::shared_ptr<raft::mr::device::allocator> d_alloc,
-                      cudaStream_t stream) {
+                      cudaStream_t stream)
+{
   coo_remove_scalar<TPB_X, T>(in, out, T(0.0), d_alloc, stream);
 }
 
diff --git a/cpp/include/raft/sparse/op/reduce.cuh b/cpp/include/raft/sparse/op/reduce.cuh
index 53c9f89074..bc4d7bace5 100644
--- a/cpp/include/raft/sparse/op/reduce.cuh
+++ b/cpp/include/raft/sparse/op/reduce.cuh
@@ -46,25 +46,29 @@ namespace sparse {
 namespace op {
 
 template <typename value_idx>
-__global__ void compute_duplicates_diffs_kernel(const value_idx *rows,
-                                                const value_idx *cols,
-                                                value_idx *diff, size_t nnz) {
+__global__ void compute_duplicates_diffs_kernel(const value_idx* rows,
+                                                const value_idx* cols,
+                                                value_idx* diff,
+                                                size_t nnz)
+{
   size_t tid = blockDim.x * blockIdx.x + threadIdx.x;
   if (tid >= nnz) return;
 
   value_idx d = 1;
-  if (tid == 0 || (rows[tid - 1] == rows[tid] && cols[tid - 1] == cols[tid]))
-    d = 0;
+  if (tid == 0 || (rows[tid - 1] == rows[tid] && cols[tid - 1] == cols[tid])) d = 0;
   diff[tid] = d;
 }
 
 template <typename value_idx, typename value_t>
-__global__ void max_duplicates_kernel(const value_idx *src_rows,
-                                      const value_idx *src_cols,
-                                      const value_t *src_vals,
-                                      const value_idx *index,
-                                      value_idx *out_rows, value_idx *out_cols,
-                                      value_t *out_vals, size_t nnz) {
+__global__ void max_duplicates_kernel(const value_idx* src_rows,
+                                      const value_idx* src_cols,
+                                      const value_t* src_vals,
+                                      const value_idx* index,
+                                      value_idx* out_rows,
+                                      value_idx* out_cols,
+                                      value_t* out_vals,
+                                      size_t nnz)
+{
   size_t tid = blockDim.x * blockIdx.x + threadIdx.x;
 
   if (tid < nnz) {
@@ -96,13 +100,13 @@ __global__ void max_duplicates_kernel(const value_idx *src_rows,
  * @param[in] stream cuda ops will be ordered wrt this stream
  */
 template <typename value_idx>
-void compute_duplicates_mask(value_idx *mask, const value_idx *rows,
-                             const value_idx *cols, size_t nnz,
-                             cudaStream_t stream) {
+void compute_duplicates_mask(
+  value_idx* mask, const value_idx* rows, const value_idx* cols, size_t nnz, cudaStream_t stream)
+{
   CUDA_CHECK(cudaMemsetAsync(mask, 0, nnz * sizeof(value_idx), stream));
 
-  compute_duplicates_diffs_kernel<<<raft::ceildiv(nnz, (size_t)256), 256, 0,
-                                    stream>>>(rows, cols, mask, nnz);
+  compute_duplicates_diffs_kernel<<<raft::ceildiv(nnz, (size_t)256), 256, 0, stream>>>(
+    rows, cols, mask, nnz);
 }
 
 /**
@@ -122,12 +126,17 @@ void compute_duplicates_mask(value_idx *mask, const value_idx *rows,
  * @param[in] stream cuda ops will be ordered wrt this stream
  */
 template <typename value_idx, typename value_t>
-void max_duplicates(const raft::handle_t &handle,
-                    raft::sparse::COO<value_t, value_idx> &out,
-                    const value_idx *rows, const value_idx *cols,
-                    const value_t *vals, size_t nnz, size_t m, size_t n) {
+void max_duplicates(const raft::handle_t& handle,
+                    raft::sparse::COO<value_t, value_idx>& out,
+                    const value_idx* rows,
+                    const value_idx* cols,
+                    const value_t* vals,
+                    size_t nnz,
+                    size_t m,
+                    size_t n)
+{
   auto d_alloc = handle.get_device_allocator();
-  auto stream = handle.get_stream();
+  auto stream  = handle.get_stream();
 
   auto exec_policy = rmm::exec_policy(rmm::cuda_stream_view{stream});
 
@@ -136,8 +145,8 @@ void max_duplicates(const raft::handle_t &handle,
 
   compute_duplicates_mask(diff.data(), rows, cols, nnz, stream);
 
-  thrust::exclusive_scan(thrust::cuda::par.on(stream), diff.data(),
-                         diff.data() + diff.size(), diff.data());
+  thrust::exclusive_scan(
+    thrust::cuda::par.on(stream), diff.data(), diff.data() + diff.size(), diff.data());
 
   // compute final size
   value_idx size = 0;
diff --git a/cpp/include/raft/sparse/op/row_op.cuh b/cpp/include/raft/sparse/op/row_op.cuh
index 9e5034dc28..194a878ac1 100644
--- a/cpp/include/raft/sparse/op/row_op.cuh
+++ b/cpp/include/raft/sparse/op/row_op.cuh
@@ -38,12 +38,12 @@ namespace sparse {
 namespace op {
 
 template <typename T, int TPB_X = 256, typename Lambda = auto(T, T, T)->void>
-__global__ void csr_row_op_kernel(const T *row_ind, T n_rows, T nnz,
-                                  Lambda op) {
+__global__ void csr_row_op_kernel(const T* row_ind, T n_rows, T nnz, Lambda op)
+{
   T row = blockIdx.x * TPB_X + threadIdx.x;
   if (row < n_rows) {
     T start_idx = row_ind[row];
-    T stop_idx = row < n_rows - 1 ? row_ind[row + 1] : nnz;
+    T stop_idx  = row < n_rows - 1 ? row_ind[row + 1] : nnz;
     op(row, start_idx, stop_idx);
   }
 }
@@ -59,14 +59,12 @@ __global__ void csr_row_op_kernel(const T *row_ind, T n_rows, T nnz,
  * @param op custom row operation functor accepting the row and beginning index.
  * @param stream cuda stream to use
  */
-template <typename Index_, int TPB_X = 256,
-          typename Lambda = auto(Index_, Index_, Index_)->void>
-void csr_row_op(const Index_ *row_ind, Index_ n_rows, Index_ nnz, Lambda op,
-                cudaStream_t stream) {
+template <typename Index_, int TPB_X = 256, typename Lambda = auto(Index_, Index_, Index_)->void>
+void csr_row_op(const Index_* row_ind, Index_ n_rows, Index_ nnz, Lambda op, cudaStream_t stream)
+{
   dim3 grid(raft::ceildiv(n_rows, Index_(TPB_X)), 1, 1);
   dim3 blk(TPB_X, 1, 1);
-  csr_row_op_kernel<Index_, TPB_X>
-    <<<grid, blk, 0, stream>>>(row_ind, n_rows, nnz, op);
+  csr_row_op_kernel<Index_, TPB_X><<<grid, blk, 0, stream>>>(row_ind, n_rows, nnz, op);
 
   CUDA_CHECK(cudaPeekAtLastError());
 }
diff --git a/cpp/include/raft/sparse/op/slice.h b/cpp/include/raft/sparse/op/slice.h
index 46f4f41879..9bbe04cf34 100644
--- a/cpp/include/raft/sparse/op/slice.h
+++ b/cpp/include/raft/sparse/op/slice.h
@@ -50,10 +50,14 @@ namespace op {
  * @param[in] stream : cuda stream for ordering events
  */
 template <typename value_idx>
-void csr_row_slice_indptr(value_idx start_row, value_idx stop_row,
-                          const value_idx *indptr, value_idx *indptr_out,
-                          value_idx *start_offset, value_idx *stop_offset,
-                          cudaStream_t stream) {
+void csr_row_slice_indptr(value_idx start_row,
+                          value_idx stop_row,
+                          const value_idx* indptr,
+                          value_idx* indptr_out,
+                          value_idx* start_offset,
+                          value_idx* stop_offset,
+                          cudaStream_t stream)
+{
   raft::update_host(start_offset, indptr + start_row, 1, stream);
   raft::update_host(stop_offset, indptr + stop_row + 1, 1, stream);
 
@@ -63,11 +67,12 @@ void csr_row_slice_indptr(value_idx start_row, value_idx stop_row,
 
   // 0-based indexing so we need to add 1 to stop row. Because we want n_rows+1,
   // we add another 1 to stop row.
-  raft::copy_async(indptr_out, indptr + start_row, (stop_row + 2) - start_row,
-                   stream);
+  raft::copy_async(indptr_out, indptr + start_row, (stop_row + 2) - start_row, stream);
 
   raft::linalg::unaryOp<value_idx>(
-    indptr_out, indptr_out, (stop_row + 2) - start_row,
+    indptr_out,
+    indptr_out,
+    (stop_row + 2) - start_row,
     [s_offset] __device__(value_idx input) { return input - s_offset; },
     stream);
 }
@@ -85,12 +90,15 @@ void csr_row_slice_indptr(value_idx start_row, value_idx stop_row,
  * @param[in] stream : cuda stream for ordering events
  */
 template <typename value_idx, typename value_t>
-void csr_row_slice_populate(value_idx start_offset, value_idx stop_offset,
-                            const value_idx *indices, const value_t *data,
-                            value_idx *indices_out, value_t *data_out,
-                            cudaStream_t stream) {
-  raft::copy(indices_out, indices + start_offset, stop_offset - start_offset,
-             stream);
+void csr_row_slice_populate(value_idx start_offset,
+                            value_idx stop_offset,
+                            const value_idx* indices,
+                            const value_t* data,
+                            value_idx* indices_out,
+                            value_t* data_out,
+                            cudaStream_t stream)
+{
+  raft::copy(indices_out, indices + start_offset, stop_offset - start_offset, stream);
   raft::copy(data_out, data + start_offset, stop_offset - start_offset, stream);
 }
 
diff --git a/cpp/include/raft/sparse/op/sort.h b/cpp/include/raft/sparse/op/sort.h
index 9dbe2b67c5..3cab24fc09 100644
--- a/cpp/include/raft/sparse/op/sort.h
+++ b/cpp/include/raft/sparse/op/sort.h
@@ -42,7 +42,8 @@ namespace op {
 
 struct TupleComp {
   template <typename one, typename two>
-  __host__ __device__ bool operator()(const one &t1, const two &t2) {
+  __host__ __device__ bool operator()(const one& t1, const two& t2)
+  {
     // sort first by each sample's color,
     if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
     if (thrust::get<0>(t1) > thrust::get<0>(t2)) return false;
@@ -66,15 +67,21 @@ struct TupleComp {
  * @param stream: cuda stream to use
  */
 template <typename T>
-void coo_sort(int m, int n, int nnz, int *rows, int *cols, T *vals,
+void coo_sort(int m,
+              int n,
+              int nnz,
+              int* rows,
+              int* cols,
+              T* vals,
               // TODO: Remove this
               std::shared_ptr<raft::mr::device::allocator> d_alloc,
-              cudaStream_t stream) {
+              cudaStream_t stream)
+{
   auto coo_indices = thrust::make_zip_iterator(thrust::make_tuple(rows, cols));
 
   // get all the colors in contiguous locations so we can map them to warps.
-  thrust::sort_by_key(thrust::cuda::par.on(stream), coo_indices,
-                      coo_indices + nnz, vals, TupleComp());
+  thrust::sort_by_key(
+    thrust::cuda::par.on(stream), coo_indices, coo_indices + nnz, vals, TupleComp());
 }
 
 /**
@@ -85,12 +92,12 @@ void coo_sort(int m, int n, int nnz, int *rows, int *cols, T *vals,
  * @param stream: the cuda stream to use
  */
 template <typename T>
-void coo_sort(COO<T> *const in,
+void coo_sort(COO<T>* const in,
               // TODO: Remove this
               std::shared_ptr<raft::mr::device::allocator> d_alloc,
-              cudaStream_t stream) {
-  coo_sort<T>(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(),
-              in->vals(), d_alloc, stream);
+              cudaStream_t stream)
+{
+  coo_sort<T>(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), in->vals(), d_alloc, stream);
 }
 
 /**
@@ -104,16 +111,16 @@ void coo_sort(COO<T> *const in,
  * @param[in] stream cuda stream for which to order cuda operations
  */
 template <typename value_idx, typename value_t>
-void coo_sort_by_weight(value_idx *rows, value_idx *cols, value_t *data,
-                        value_idx nnz, cudaStream_t stream) {
+void coo_sort_by_weight(
+  value_idx* rows, value_idx* cols, value_t* data, value_idx nnz, cudaStream_t stream)
+{
   thrust::device_ptr<value_idx> t_rows = thrust::device_pointer_cast(rows);
   thrust::device_ptr<value_idx> t_cols = thrust::device_pointer_cast(cols);
-  thrust::device_ptr<value_t> t_data = thrust::device_pointer_cast(data);
+  thrust::device_ptr<value_t> t_data   = thrust::device_pointer_cast(data);
 
   auto first = thrust::make_zip_iterator(thrust::make_tuple(rows, cols));
 
-  thrust::sort_by_key(thrust::cuda::par.on(stream), t_data, t_data + nnz,
-                      first);
+  thrust::sort_by_key(thrust::cuda::par.on(stream), t_data, t_data + nnz, first);
 }
 };  // namespace op
 };  // end NAMESPACE sparse
diff --git a/cpp/include/raft/sparse/selection/connect_components.cuh b/cpp/include/raft/sparse/selection/connect_components.cuh
index 8aae90f1d8..ec8bec6eb3 100644
--- a/cpp/include/raft/sparse/selection/connect_components.cuh
+++ b/cpp/include/raft/sparse/selection/connect_components.cuh
@@ -59,17 +59,20 @@ struct KeyValuePair {
   __host__ __device__ __forceinline__ KeyValuePair() {}
 
   /// Copy Constructor
-  __host__ __device__ __forceinline__
-  KeyValuePair(cub::KeyValuePair<_Key, _Value> kvp)
-    : key(kvp.key), value(kvp.value) {}
+  __host__ __device__ __forceinline__ KeyValuePair(cub::KeyValuePair<_Key, _Value> kvp)
+    : key(kvp.key), value(kvp.value)
+  {
+  }
 
   /// Constructor
-  __host__ __device__ __forceinline__ KeyValuePair(Key const &key,
-                                                   Value const &value)
-    : key(key), value(value) {}
+  __host__ __device__ __forceinline__ KeyValuePair(Key const& key, Value const& value)
+    : key(key), value(value)
+  {
+  }
 
   /// Inequality operator
-  __host__ __device__ __forceinline__ bool operator!=(const KeyValuePair &b) {
+  __host__ __device__ __forceinline__ bool operator!=(const KeyValuePair& b)
+  {
     return (value != b.value) || (key != b.key);
   }
 };
@@ -83,31 +86,32 @@ struct KeyValuePair {
  */
 template <typename value_idx, typename value_t>
 struct FixConnectivitiesRedOp {
-  value_idx *colors;
+  value_idx* colors;
   value_idx m;
 
-  FixConnectivitiesRedOp(value_idx *colors_, value_idx m_)
-    : colors(colors_), m(m_){};
+  FixConnectivitiesRedOp(value_idx* colors_, value_idx m_) : colors(colors_), m(m_){};
 
   typedef typename cub::KeyValuePair<value_idx, value_t> KVP;
-  DI void operator()(value_idx rit, KVP *out, const KVP &other) {
-    if (rit < m && other.value < out->value &&
-        colors[rit] != colors[other.key]) {
-      out->key = other.key;
+  DI void operator()(value_idx rit, KVP* out, const KVP& other)
+  {
+    if (rit < m && other.value < out->value && colors[rit] != colors[other.key]) {
+      out->key   = other.key;
       out->value = other.value;
     }
   }
 
-  DI KVP operator()(value_idx rit, const KVP &a, const KVP &b) {
+  DI KVP operator()(value_idx rit, const KVP& a, const KVP& b)
+  {
     if (rit < m && a.value < b.value && colors[rit] != colors[a.key]) {
       return a;
     } else
       return b;
   }
 
-  DI void init(value_t *out, value_t maxVal) { *out = maxVal; }
-  DI void init(KVP *out, value_t maxVal) {
-    out->key = -1;
+  DI void init(value_t* out, value_t maxVal) { *out = maxVal; }
+  DI void init(KVP* out, value_t maxVal)
+  {
+    out->key   = -1;
     out->value = maxVal;
   }
 };
@@ -119,7 +123,8 @@ struct FixConnectivitiesRedOp {
  */
 struct TupleComp {
   template <typename one, typename two>
-  __host__ __device__ bool operator()(const one &t1, const two &t2) {
+  __host__ __device__ bool operator()(const one& t1, const two& t2)
+  {
     // sort first by each sample's color,
     if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
     if (thrust::get<0>(t1) > thrust::get<0>(t2)) return false;
@@ -137,13 +142,9 @@ template <typename LabelT, typename DataT>
 struct CubKVPMinReduce {
   typedef cub::KeyValuePair<LabelT, DataT> KVP;
 
-  DI KVP operator()(LabelT rit, const KVP &a, const KVP &b) {
-    return b.value < a.value ? b : a;
-  }
+  DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { return b.value < a.value ? b : a; }
 
-  DI KVP operator()(const KVP &a, const KVP &b) {
-    return b.value < a.value ? b : a;
-  }
+  DI KVP operator()(const KVP& a, const KVP& b) { return b.value < a.value ? b : a; }
 
 };  // KVPMinReduce
 
@@ -158,13 +159,14 @@ struct CubKVPMinReduce {
  * @return total number of components
  */
 template <typename value_idx>
-value_idx get_n_components(value_idx *colors, size_t n_rows,
+value_idx get_n_components(value_idx* colors,
+                           size_t n_rows,
                            std::shared_ptr<raft::mr::device::allocator> d_alloc,
-                           cudaStream_t stream) {
-  value_idx *map_ids;
+                           cudaStream_t stream)
+{
+  value_idx* map_ids;
   int num_clusters;
-  raft::label::getUniquelabels(colors, n_rows, &map_ids, &num_clusters, stream,
-                               d_alloc);
+  raft::label::getUniquelabels(colors, n_rows, &map_ids, &num_clusters, stream, d_alloc);
   d_alloc->deallocate(map_ids, num_clusters * sizeof(value_idx), stream);
 
   return num_clusters;
@@ -177,11 +179,12 @@ value_idx get_n_components(value_idx *colors, size_t n_rows,
  */
 template <typename value_idx, typename value_t>
 struct LookupColorOp {
-  value_idx *colors;
+  value_idx* colors;
 
-  LookupColorOp(value_idx *colors_) : colors(colors_) {}
+  LookupColorOp(value_idx* colors_) : colors(colors_) {}
 
-  DI value_idx operator()(const cub::KeyValuePair<value_idx, value_t> &kvp) {
+  DI value_idx operator()(const cub::KeyValuePair<value_idx, value_t>& kvp)
+  {
     return colors[kvp.key];
   }
 };
@@ -191,7 +194,8 @@ struct LookupColorOp {
  * the given array of components
  * @tparam value_idx
  * @tparam value_t
- * @param[out] kvp mapping of closest neighbor vertex and distance for each vertex in the given array of components
+ * @param[out] kvp mapping of closest neighbor vertex and distance for each vertex in the given
+ * array of components
  * @param[out] nn_colors components of nearest neighbors for each vertex
  * @param[in] colors components of each vertex
  * @param[in] X original dense data
@@ -201,25 +205,39 @@ struct LookupColorOp {
  * @param[in] stream cuda stream for which to order cuda operations
  */
 template <typename value_idx, typename value_t, typename red_op>
-void perform_1nn(cub::KeyValuePair<value_idx, value_t> *kvp,
-                 value_idx *nn_colors, value_idx *colors, const value_t *X,
-                 size_t n_rows, size_t n_cols,
+void perform_1nn(cub::KeyValuePair<value_idx, value_t>* kvp,
+                 value_idx* nn_colors,
+                 value_idx* colors,
+                 const value_t* X,
+                 size_t n_rows,
+                 size_t n_cols,
                  std::shared_ptr<raft::mr::device::allocator> d_alloc,
-                 cudaStream_t stream, red_op reduction_op) {
+                 cudaStream_t stream,
+                 red_op reduction_op)
+{
   rmm::device_uvector<int> workspace(n_rows, stream);
   rmm::device_uvector<value_t> x_norm(n_rows, stream);
 
-  raft::linalg::rowNorm(x_norm.data(), X, n_cols, n_rows, raft::linalg::L2Norm,
-                        true, stream);
-
-  raft::distance::fusedL2NN<value_t, cub::KeyValuePair<value_idx, value_t>,
-                            value_idx>(
-    kvp, X, X, x_norm.data(), x_norm.data(), n_rows, n_rows, n_cols,
-    workspace.data(), reduction_op, reduction_op, true, true, stream);
+  raft::linalg::rowNorm(x_norm.data(), X, n_cols, n_rows, raft::linalg::L2Norm, true, stream);
+
+  raft::distance::fusedL2NN<value_t, cub::KeyValuePair<value_idx, value_t>, value_idx>(
+    kvp,
+    X,
+    X,
+    x_norm.data(),
+    x_norm.data(),
+    n_rows,
+    n_rows,
+    n_cols,
+    workspace.data(),
+    reduction_op,
+    reduction_op,
+    true,
+    true,
+    stream);
 
   LookupColorOp<value_idx, value_t> extract_colors_op(colors);
-  thrust::transform(thrust::cuda::par.on(stream), kvp, kvp + n_rows, nn_colors,
-                    extract_colors_op);
+  thrust::transform(thrust::cuda::par.on(stream), kvp, kvp + n_rows, nn_colors, extract_colors_op);
 }
 
 /**
@@ -235,27 +253,33 @@ void perform_1nn(cub::KeyValuePair<value_idx, value_t> *kvp,
  * @param stream stream for which to order CUDA operations
  */
 template <typename value_idx, typename value_t>
-void sort_by_color(value_idx *colors, value_idx *nn_colors,
-                   cub::KeyValuePair<value_idx, value_t> *kvp,
-                   value_idx *src_indices, size_t n_rows, cudaStream_t stream) {
+void sort_by_color(value_idx* colors,
+                   value_idx* nn_colors,
+                   cub::KeyValuePair<value_idx, value_t>* kvp,
+                   value_idx* src_indices,
+                   size_t n_rows,
+                   cudaStream_t stream)
+{
   thrust::counting_iterator<value_idx> arg_sort_iter(0);
-  thrust::copy(thrust::cuda::par.on(stream), arg_sort_iter,
-               arg_sort_iter + n_rows, src_indices);
+  thrust::copy(thrust::cuda::par.on(stream), arg_sort_iter, arg_sort_iter + n_rows, src_indices);
 
-  auto keys = thrust::make_zip_iterator(thrust::make_tuple(
-    colors, nn_colors, (raft::linkage::KeyValuePair<value_idx, value_t> *)kvp));
+  auto keys = thrust::make_zip_iterator(
+    thrust::make_tuple(colors, nn_colors, (raft::linkage::KeyValuePair<value_idx, value_t>*)kvp));
   auto vals = thrust::make_zip_iterator(thrust::make_tuple(src_indices));
 
   // get all the colors in contiguous locations so we can map them to warps.
-  thrust::sort_by_key(thrust::cuda::par.on(stream), keys, keys + n_rows, vals,
-                      TupleComp());
+  thrust::sort_by_key(thrust::cuda::par.on(stream), keys, keys + n_rows, vals, TupleComp());
 }
 
 template <typename value_idx, typename value_t>
-__global__ void min_components_by_color_kernel(
-  value_idx *out_rows, value_idx *out_cols, value_t *out_vals,
-  const value_idx *out_index, const value_idx *indices,
-  const cub::KeyValuePair<value_idx, value_t> *kvp, size_t nnz) {
+__global__ void min_components_by_color_kernel(value_idx* out_rows,
+                                               value_idx* out_cols,
+                                               value_t* out_vals,
+                                               const value_idx* out_index,
+                                               const value_idx* indices,
+                                               const cub::KeyValuePair<value_idx, value_t>* kvp,
+                                               size_t nnz)
+{
   size_t tid = blockDim.x * blockIdx.x + threadIdx.x;
 
   if (tid >= nnz) return;
@@ -284,19 +308,20 @@ __global__ void min_components_by_color_kernel(
  * @param[in] stream cuda stream for which to order cuda operations
  */
 template <typename value_idx, typename value_t>
-void min_components_by_color(raft::sparse::COO<value_t, value_idx> &coo,
-                             const value_idx *out_index,
-                             const value_idx *indices,
-                             const cub::KeyValuePair<value_idx, value_t> *kvp,
-                             size_t nnz, cudaStream_t stream) {
+void min_components_by_color(raft::sparse::COO<value_t, value_idx>& coo,
+                             const value_idx* out_index,
+                             const value_idx* indices,
+                             const cub::KeyValuePair<value_idx, value_t>* kvp,
+                             size_t nnz,
+                             cudaStream_t stream)
+{
   /**
    * Arrays should be ordered by: colors_indptr->colors_n->kvp.value
    * so the last element of each column in the input CSR should be
    * the min.
    */
-  min_components_by_color_kernel<<<raft::ceildiv(nnz, (size_t)256), 256, 0,
-                                   stream>>>(coo.rows(), coo.cols(), coo.vals(),
-                                             out_index, indices, kvp, nnz);
+  min_components_by_color_kernel<<<raft::ceildiv(nnz, (size_t)256), 256, 0, stream>>>(
+    coo.rows(), coo.cols(), coo.vals(), out_index, indices, kvp, nnz);
 }
 
 /**
@@ -318,14 +343,18 @@ void min_components_by_color(raft::sparse::COO<value_t, value_idx> &coo,
  * @param[in] n_cols number of cols in X
  */
 template <typename value_idx, typename value_t, typename red_op>
-void connect_components(const raft::handle_t &handle,
-                        raft::sparse::COO<value_t, value_idx> &out,
-                        const value_t *X, const value_idx *orig_colors,
-                        size_t n_rows, size_t n_cols, red_op reduction_op,
-                        raft::distance::DistanceType metric =
-                          raft::distance::DistanceType::L2SqrtExpanded) {
+void connect_components(
+  const raft::handle_t& handle,
+  raft::sparse::COO<value_t, value_idx>& out,
+  const value_t* X,
+  const value_idx* orig_colors,
+  size_t n_rows,
+  size_t n_cols,
+  red_op reduction_op,
+  raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded)
+{
   auto d_alloc = handle.get_device_allocator();
-  auto stream = handle.get_stream();
+  auto stream  = handle.get_stream();
 
   RAFT_EXPECTS(metric == raft::distance::DistanceType::L2SqrtExpanded,
                "Fixing connectivities for an unconnected k-NN graph only "
@@ -335,47 +364,52 @@ void connect_components(const raft::handle_t &handle,
   raft::copy_async(colors.data(), orig_colors, n_rows, stream);
 
   // Normalize colors so they are drawn from a monotonically increasing set
-  raft::label::make_monotonic(colors.data(), colors.data(), n_rows, stream,
-                              d_alloc, true);
+  raft::label::make_monotonic(colors.data(), colors.data(), n_rows, stream, d_alloc, true);
 
-  value_idx n_components =
-    get_n_components(colors.data(), n_rows, d_alloc, stream);
+  value_idx n_components = get_n_components(colors.data(), n_rows, d_alloc, stream);
 
   /**
    * First compute 1-nn for all colors where the color of each data point
    * is guaranteed to be != color of its nearest neighbor.
    */
   rmm::device_uvector<value_idx> nn_colors(n_rows, stream);
-  rmm::device_uvector<cub::KeyValuePair<value_idx, value_t>> temp_inds_dists(
-    n_rows, stream);
+  rmm::device_uvector<cub::KeyValuePair<value_idx, value_t>> temp_inds_dists(n_rows, stream);
   rmm::device_uvector<value_idx> src_indices(n_rows, stream);
 
-  perform_1nn(temp_inds_dists.data(), nn_colors.data(), colors.data(), X,
-              n_rows, n_cols, d_alloc, stream, reduction_op);
+  perform_1nn(temp_inds_dists.data(),
+              nn_colors.data(),
+              colors.data(),
+              X,
+              n_rows,
+              n_cols,
+              d_alloc,
+              stream,
+              reduction_op);
 
   /**
    * Sort data points by color (neighbors are not sorted)
    */
   // max_color + 1 = number of connected components
   // sort nn_colors by key w/ original colors
-  sort_by_color(colors.data(), nn_colors.data(), temp_inds_dists.data(),
-                src_indices.data(), n_rows, stream);
+  sort_by_color(
+    colors.data(), nn_colors.data(), temp_inds_dists.data(), src_indices.data(), n_rows, stream);
 
   /**
    * Take the min for any duplicate colors
    */
   // Compute mask of duplicates
   rmm::device_uvector<value_idx> out_index(n_rows + 1, stream);
-  raft::sparse::op::compute_duplicates_mask(out_index.data(), colors.data(),
-                                            nn_colors.data(), n_rows, stream);
+  raft::sparse::op::compute_duplicates_mask(
+    out_index.data(), colors.data(), nn_colors.data(), n_rows, stream);
 
-  thrust::exclusive_scan(thrust::cuda::par.on(stream), out_index.data(),
-                         out_index.data() + out_index.size(), out_index.data());
+  thrust::exclusive_scan(thrust::cuda::par.on(stream),
+                         out_index.data(),
+                         out_index.data() + out_index.size(),
+                         out_index.data());
 
   // compute final size
   value_idx size = 0;
-  raft::update_host(&size, out_index.data() + (out_index.size() - 1), 1,
-                    stream);
+  raft::update_host(&size, out_index.data() + (out_index.size() - 1), 1, stream);
   CUDA_CHECK(cudaStreamSynchronize(stream));
 
   size++;
@@ -383,14 +417,14 @@ void connect_components(const raft::handle_t &handle,
   raft::sparse::COO<value_t, value_idx> min_edges(d_alloc, stream);
   min_edges.allocate(size, n_rows, n_rows, true, stream);
 
-  min_components_by_color(min_edges, out_index.data(), src_indices.data(),
-                          temp_inds_dists.data(), n_rows, stream);
+  min_components_by_color(
+    min_edges, out_index.data(), src_indices.data(), temp_inds_dists.data(), n_rows, stream);
 
   /**
    * Symmetrize resulting edge list
    */
-  raft::sparse::linalg::symmetrize(handle, min_edges.rows(), min_edges.cols(),
-                                   min_edges.vals(), n_rows, n_rows, size, out);
+  raft::sparse::linalg::symmetrize(
+    handle, min_edges.rows(), min_edges.cols(), min_edges.vals(), n_rows, n_rows, size, out);
 }
 
 };  // end namespace linkage
diff --git a/cpp/include/raft/sparse/selection/knn.cuh b/cpp/include/raft/sparse/selection/knn.cuh
index 71fbb8ab3d..dbb24ee334 100644
--- a/cpp/include/raft/sparse/selection/knn.cuh
+++ b/cpp/include/raft/sparse/selection/knn.cuh
@@ -49,9 +49,11 @@ namespace selection {
 
 template <typename value_idx, typename value_t>
 struct csr_batcher_t {
-  csr_batcher_t(value_idx batch_size, value_idx n_rows,
-                const value_idx *csr_indptr, const value_idx *csr_indices,
-                const value_t *csr_data)
+  csr_batcher_t(value_idx batch_size,
+                value_idx n_rows,
+                const value_idx* csr_indptr,
+                const value_idx* csr_indices,
+                const value_t* csr_data)
     : batch_start_(0),
       batch_stop_(0),
       batch_rows_(0),
@@ -61,32 +63,42 @@ struct csr_batcher_t {
       csr_indices_(csr_indices),
       csr_data_(csr_data),
       batch_csr_start_offset_(0),
-      batch_csr_stop_offset_(0) {}
+      batch_csr_stop_offset_(0)
+  {
+  }
 
-  void set_batch(int batch_num) {
+  void set_batch(int batch_num)
+  {
     batch_start_ = batch_num * batch_size_;
-    batch_stop_ = batch_start_ + batch_size_ - 1;  // zero-based indexing
+    batch_stop_  = batch_start_ + batch_size_ - 1;  // zero-based indexing
 
-    if (batch_stop_ >= total_rows_)
-      batch_stop_ = total_rows_ - 1;  // zero-based indexing
+    if (batch_stop_ >= total_rows_) batch_stop_ = total_rows_ - 1;  // zero-based indexing
 
     batch_rows_ = (batch_stop_ - batch_start_) + 1;
   }
 
-  value_idx get_batch_csr_indptr_nnz(value_idx *batch_indptr,
-                                     cudaStream_t stream) {
-    raft::sparse::op::csr_row_slice_indptr(
-      batch_start_, batch_stop_, csr_indptr_, batch_indptr,
-      &batch_csr_start_offset_, &batch_csr_stop_offset_, stream);
+  value_idx get_batch_csr_indptr_nnz(value_idx* batch_indptr, cudaStream_t stream)
+  {
+    raft::sparse::op::csr_row_slice_indptr(batch_start_,
+                                           batch_stop_,
+                                           csr_indptr_,
+                                           batch_indptr,
+                                           &batch_csr_start_offset_,
+                                           &batch_csr_stop_offset_,
+                                           stream);
 
     return batch_csr_stop_offset_ - batch_csr_start_offset_;
   }
 
-  void get_batch_csr_indices_data(value_idx *csr_indices, value_t *csr_data,
-                                  cudaStream_t stream) {
-    raft::sparse::op::csr_row_slice_populate(
-      batch_csr_start_offset_, batch_csr_stop_offset_, csr_indices_, csr_data_,
-      csr_indices, csr_data, stream);
+  void get_batch_csr_indices_data(value_idx* csr_indices, value_t* csr_data, cudaStream_t stream)
+  {
+    raft::sparse::op::csr_row_slice_populate(batch_csr_start_offset_,
+                                             batch_csr_stop_offset_,
+                                             csr_indices_,
+                                             csr_data_,
+                                             csr_indices,
+                                             csr_data,
+                                             stream);
   }
 
   value_idx batch_rows() const { return batch_rows_; }
@@ -103,9 +115,9 @@ struct csr_batcher_t {
 
   value_idx total_rows_;
 
-  const value_idx *csr_indptr_;
-  const value_idx *csr_indices_;
-  const value_t *csr_data_;
+  const value_idx* csr_indptr_;
+  const value_idx* csr_indices_;
+  const value_t* csr_data_;
 
   value_idx batch_csr_start_offset_;
   value_idx batch_csr_stop_offset_;
@@ -114,18 +126,26 @@ struct csr_batcher_t {
 template <typename value_idx, typename value_t>
 class sparse_knn_t {
  public:
-  sparse_knn_t(const value_idx *idxIndptr_, const value_idx *idxIndices_,
-               const value_t *idxData_, size_t idxNNZ_, int n_idx_rows_,
-               int n_idx_cols_, const value_idx *queryIndptr_,
-               const value_idx *queryIndices_, const value_t *queryData_,
-               size_t queryNNZ_, int n_query_rows_, int n_query_cols_,
-               value_idx *output_indices_, value_t *output_dists_, int k_,
-               const raft::handle_t &handle_,
-               size_t batch_size_index_ = 2 << 14,  // approx 1M
-               size_t batch_size_query_ = 2 << 14,
-               raft::distance::DistanceType metric_ =
-                 raft::distance::DistanceType::L2Expanded,
-               float metricArg_ = 0)
+  sparse_knn_t(const value_idx* idxIndptr_,
+               const value_idx* idxIndices_,
+               const value_t* idxData_,
+               size_t idxNNZ_,
+               int n_idx_rows_,
+               int n_idx_cols_,
+               const value_idx* queryIndptr_,
+               const value_idx* queryIndices_,
+               const value_t* queryData_,
+               size_t queryNNZ_,
+               int n_query_rows_,
+               int n_query_cols_,
+               value_idx* output_indices_,
+               value_t* output_dists_,
+               int k_,
+               const raft::handle_t& handle_,
+               size_t batch_size_index_             = 2 << 14,  // approx 1M
+               size_t batch_size_query_             = 2 << 14,
+               raft::distance::DistanceType metric_ = raft::distance::DistanceType::L2Expanded,
+               float metricArg_                     = 0)
     : idxIndptr(idxIndptr_),
       idxIndices(idxIndices_),
       idxData(idxData_),
@@ -145,9 +165,12 @@ class sparse_knn_t {
       batch_size_index(batch_size_index_),
       batch_size_query(batch_size_query_),
       metric(metric_),
-      metricArg(metricArg_) {}
+      metricArg(metricArg_)
+  {
+  }
 
-  void run() {
+  void run()
+  {
     using namespace raft::sparse;
 
     int n_batches_query = raft::ceildiv((size_t)n_query_rows, batch_size_query);
@@ -158,37 +181,33 @@ class sparse_knn_t {
 
     for (int i = 0; i < n_batches_query; i++) {
       /**
-        * Compute index batch info
-        */
+       * Compute index batch info
+       */
       query_batcher.set_batch(i);
 
       /**
-        * Slice CSR to rows in batch
-        */
+       * Slice CSR to rows in batch
+       */
 
-      rmm::device_uvector<value_idx> query_batch_indptr(
-        query_batcher.batch_rows() + 1, handle.get_stream());
+      rmm::device_uvector<value_idx> query_batch_indptr(query_batcher.batch_rows() + 1,
+                                                        handle.get_stream());
 
-      value_idx n_query_batch_nnz = query_batcher.get_batch_csr_indptr_nnz(
-        query_batch_indptr.data(), handle.get_stream());
+      value_idx n_query_batch_nnz =
+        query_batcher.get_batch_csr_indptr_nnz(query_batch_indptr.data(), handle.get_stream());
 
-      rmm::device_uvector<value_idx> query_batch_indices(n_query_batch_nnz,
-                                                         handle.get_stream());
-      rmm::device_uvector<value_t> query_batch_data(n_query_batch_nnz,
-                                                    handle.get_stream());
+      rmm::device_uvector<value_idx> query_batch_indices(n_query_batch_nnz, handle.get_stream());
+      rmm::device_uvector<value_t> query_batch_data(n_query_batch_nnz, handle.get_stream());
 
-      query_batcher.get_batch_csr_indices_data(query_batch_indices.data(),
-                                               query_batch_data.data(),
-                                               handle.get_stream());
+      query_batcher.get_batch_csr_indices_data(
+        query_batch_indices.data(), query_batch_data.data(), handle.get_stream());
 
       // A 3-partition temporary merge space to scale the batching. 2 parts for subsequent
       // batches and 1 space for the results of the merge, which get copied back to the top
-      rmm::device_uvector<value_idx> merge_buffer_indices(0,
-                                                          handle.get_stream());
+      rmm::device_uvector<value_idx> merge_buffer_indices(0, handle.get_stream());
       rmm::device_uvector<value_t> merge_buffer_dists(0, handle.get_stream());
 
-      value_t *dists_merge_buffer_ptr;
-      value_idx *indices_merge_buffer_ptr;
+      value_t* dists_merge_buffer_ptr;
+      value_idx* indices_merge_buffer_ptr;
 
       int n_batches_idx = raft::ceildiv((size_t)n_idx_rows, batch_size_index);
       csr_batcher_t<value_idx, value_t> idx_batcher(
@@ -197,22 +216,19 @@ class sparse_knn_t {
       for (int j = 0; j < n_batches_idx; j++) {
         idx_batcher.set_batch(j);
 
-        merge_buffer_indices.resize(query_batcher.batch_rows() * k * 3,
-                                    handle.get_stream());
-        merge_buffer_dists.resize(query_batcher.batch_rows() * k * 3,
-                                  handle.get_stream());
+        merge_buffer_indices.resize(query_batcher.batch_rows() * k * 3, handle.get_stream());
+        merge_buffer_dists.resize(query_batcher.batch_rows() * k * 3, handle.get_stream());
 
         /**
-          * Slice CSR to rows in batch
-        */
-        rmm::device_uvector<value_idx> idx_batch_indptr(
-          idx_batcher.batch_rows() + 1, handle.get_stream());
-        rmm::device_uvector<value_idx> idx_batch_indices(0,
-                                                         handle.get_stream());
+         * Slice CSR to rows in batch
+         */
+        rmm::device_uvector<value_idx> idx_batch_indptr(idx_batcher.batch_rows() + 1,
+                                                        handle.get_stream());
+        rmm::device_uvector<value_idx> idx_batch_indices(0, handle.get_stream());
         rmm::device_uvector<value_t> idx_batch_data(0, handle.get_stream());
 
-        value_idx idx_batch_nnz = idx_batcher.get_batch_csr_indptr_nnz(
-          idx_batch_indptr.data(), handle.get_stream());
+        value_idx idx_batch_nnz =
+          idx_batcher.get_batch_csr_indptr_nnz(idx_batch_indptr.data(), handle.get_stream());
 
         idx_batch_indices.resize(idx_batch_nnz, handle.get_stream());
         idx_batch_data.resize(idx_batch_nnz, handle.get_stream());
@@ -221,111 +237,126 @@ class sparse_knn_t {
           idx_batch_indices.data(), idx_batch_data.data(), handle.get_stream());
 
         /**
-           * Compute distances
-           */
-        size_t dense_size =
-          idx_batcher.batch_rows() * query_batcher.batch_rows();
-        rmm::device_uvector<value_t> batch_dists(dense_size,
-                                                 handle.get_stream());
-
-        CUDA_CHECK(cudaMemset(batch_dists.data(), 0,
-                              batch_dists.size() * sizeof(value_t)));
-
-        compute_distances(idx_batcher, query_batcher, idx_batch_nnz,
-                          n_query_batch_nnz, idx_batch_indptr.data(),
-                          idx_batch_indices.data(), idx_batch_data.data(),
-                          query_batch_indptr.data(), query_batch_indices.data(),
-                          query_batch_data.data(), batch_dists.data());
+         * Compute distances
+         */
+        size_t dense_size = idx_batcher.batch_rows() * query_batcher.batch_rows();
+        rmm::device_uvector<value_t> batch_dists(dense_size, handle.get_stream());
+
+        CUDA_CHECK(cudaMemset(batch_dists.data(), 0, batch_dists.size() * sizeof(value_t)));
+
+        compute_distances(idx_batcher,
+                          query_batcher,
+                          idx_batch_nnz,
+                          n_query_batch_nnz,
+                          idx_batch_indptr.data(),
+                          idx_batch_indices.data(),
+                          idx_batch_data.data(),
+                          query_batch_indptr.data(),
+                          query_batch_indices.data(),
+                          query_batch_data.data(),
+                          batch_dists.data());
 
         // Build batch indices array
-        rmm::device_uvector<value_idx> batch_indices(batch_dists.size(),
-                                                     handle.get_stream());
+        rmm::device_uvector<value_idx> batch_indices(batch_dists.size(), handle.get_stream());
 
         // populate batch indices array
-        value_idx batch_rows = query_batcher.batch_rows(),
-                  batch_cols = idx_batcher.batch_rows();
+        value_idx batch_rows = query_batcher.batch_rows(), batch_cols = idx_batcher.batch_rows();
 
-        iota_fill(batch_indices.data(), batch_rows, batch_cols,
-                  handle.get_stream());
+        iota_fill(batch_indices.data(), batch_rows, batch_cols, handle.get_stream());
 
         /**
          * Perform k-selection on batch & merge with other k-selections
          */
         size_t merge_buffer_offset = batch_rows * k;
-        dists_merge_buffer_ptr =
-          merge_buffer_dists.data() + merge_buffer_offset;
-        indices_merge_buffer_ptr =
-          merge_buffer_indices.data() + merge_buffer_offset;
-
-        perform_k_selection(idx_batcher, query_batcher, batch_dists.data(),
-                            batch_indices.data(), dists_merge_buffer_ptr,
+        dists_merge_buffer_ptr     = merge_buffer_dists.data() + merge_buffer_offset;
+        indices_merge_buffer_ptr   = merge_buffer_indices.data() + merge_buffer_offset;
+
+        perform_k_selection(idx_batcher,
+                            query_batcher,
+                            batch_dists.data(),
+                            batch_indices.data(),
+                            dists_merge_buffer_ptr,
                             indices_merge_buffer_ptr);
 
-        value_t *dists_merge_buffer_tmp_ptr = dists_merge_buffer_ptr;
-        value_idx *indices_merge_buffer_tmp_ptr = indices_merge_buffer_ptr;
+        value_t* dists_merge_buffer_tmp_ptr     = dists_merge_buffer_ptr;
+        value_idx* indices_merge_buffer_tmp_ptr = indices_merge_buffer_ptr;
 
         // Merge results of difference batches if necessary
         if (idx_batcher.batch_start() > 0) {
-          size_t merge_buffer_tmp_out = batch_rows * k * 2;
-          dists_merge_buffer_tmp_ptr =
-            merge_buffer_dists.data() + merge_buffer_tmp_out;
-          indices_merge_buffer_tmp_ptr =
-            merge_buffer_indices.data() + merge_buffer_tmp_out;
-
-          merge_batches(idx_batcher, query_batcher, merge_buffer_dists.data(),
-                        merge_buffer_indices.data(), dists_merge_buffer_tmp_ptr,
+          size_t merge_buffer_tmp_out  = batch_rows * k * 2;
+          dists_merge_buffer_tmp_ptr   = merge_buffer_dists.data() + merge_buffer_tmp_out;
+          indices_merge_buffer_tmp_ptr = merge_buffer_indices.data() + merge_buffer_tmp_out;
+
+          merge_batches(idx_batcher,
+                        query_batcher,
+                        merge_buffer_dists.data(),
+                        merge_buffer_indices.data(),
+                        dists_merge_buffer_tmp_ptr,
                         indices_merge_buffer_tmp_ptr);
         }
 
         // copy merged output back into merge buffer partition for next iteration
         raft::copy_async<value_idx>(merge_buffer_indices.data(),
                                     indices_merge_buffer_tmp_ptr,
-                                    batch_rows * k, handle.get_stream());
+                                    batch_rows * k,
+                                    handle.get_stream());
         raft::copy_async<value_t>(merge_buffer_dists.data(),
-                                  dists_merge_buffer_tmp_ptr, batch_rows * k,
+                                  dists_merge_buffer_tmp_ptr,
+                                  batch_rows * k,
                                   handle.get_stream());
       }
 
       // Copy final merged batch to output array
-      raft::copy_async<value_idx>(
-        output_indices + (rows_processed * k), merge_buffer_indices.data(),
-        query_batcher.batch_rows() * k, handle.get_stream());
-      raft::copy_async<value_t>(
-        output_dists + (rows_processed * k), merge_buffer_dists.data(),
-        query_batcher.batch_rows() * k, handle.get_stream());
+      raft::copy_async<value_idx>(output_indices + (rows_processed * k),
+                                  merge_buffer_indices.data(),
+                                  query_batcher.batch_rows() * k,
+                                  handle.get_stream());
+      raft::copy_async<value_t>(output_dists + (rows_processed * k),
+                                merge_buffer_dists.data(),
+                                query_batcher.batch_rows() * k,
+                                handle.get_stream());
 
       rows_processed += query_batcher.batch_rows();
     }
   }
 
  private:
-  void merge_batches(csr_batcher_t<value_idx, value_t> &idx_batcher,
-                     csr_batcher_t<value_idx, value_t> &query_batcher,
-                     value_t *merge_buffer_dists,
-                     value_idx *merge_buffer_indices, value_t *out_dists,
-                     value_idx *out_indices) {
+  void merge_batches(csr_batcher_t<value_idx, value_t>& idx_batcher,
+                     csr_batcher_t<value_idx, value_t>& query_batcher,
+                     value_t* merge_buffer_dists,
+                     value_idx* merge_buffer_indices,
+                     value_t* out_dists,
+                     value_idx* out_indices)
+  {
     // build translation buffer to shift resulting indices by the batch
     std::vector<value_idx> id_ranges;
     id_ranges.push_back(0);
     id_ranges.push_back(idx_batcher.batch_start());
 
     rmm::device_uvector<value_idx> trans(id_ranges.size(), handle.get_stream());
-    raft::update_device(trans.data(), id_ranges.data(), id_ranges.size(),
-                        handle.get_stream());
+    raft::update_device(trans.data(), id_ranges.data(), id_ranges.size(), handle.get_stream());
 
     // combine merge buffers only if there's more than 1 partition to combine
-    raft::spatial::knn::knn_merge_parts(
-      merge_buffer_dists, merge_buffer_indices, out_dists, out_indices,
-      query_batcher.batch_rows(), 2, k, handle.get_stream(), trans.data());
+    raft::spatial::knn::knn_merge_parts(merge_buffer_dists,
+                                        merge_buffer_indices,
+                                        out_dists,
+                                        out_indices,
+                                        query_batcher.batch_rows(),
+                                        2,
+                                        k,
+                                        handle.get_stream(),
+                                        trans.data());
   }
 
   void perform_k_selection(csr_batcher_t<value_idx, value_t> idx_batcher,
                            csr_batcher_t<value_idx, value_t> query_batcher,
-                           value_t *batch_dists, value_idx *batch_indices,
-                           value_t *out_dists, value_idx *out_indices) {
+                           value_t* batch_dists,
+                           value_idx* batch_indices,
+                           value_t* out_dists,
+                           value_idx* out_indices)
+  {
     // populate batch indices array
-    value_idx batch_rows = query_batcher.batch_rows(),
-              batch_cols = idx_batcher.batch_rows();
+    value_idx batch_rows = query_batcher.batch_rows(), batch_cols = idx_batcher.batch_rows();
 
     // build translation buffer to shift resulting indices by the batch
     std::vector<value_idx> id_ranges;
@@ -340,51 +371,60 @@ class sparse_knn_t {
     if (metric == raft::distance::DistanceType::InnerProduct) ascending = false;
 
     // kernel to slice first (min) k cols and copy into batched merge buffer
-    select_k(batch_dists, batch_indices, batch_rows, batch_cols, out_dists,
-             out_indices, ascending, n_neighbors, handle.get_stream());
+    select_k(batch_dists,
+             batch_indices,
+             batch_rows,
+             batch_cols,
+             out_dists,
+             out_indices,
+             ascending,
+             n_neighbors,
+             handle.get_stream());
   }
 
-  void compute_distances(csr_batcher_t<value_idx, value_t> &idx_batcher,
-                         csr_batcher_t<value_idx, value_t> &query_batcher,
-                         size_t idx_batch_nnz, size_t query_batch_nnz,
-                         value_idx *idx_batch_indptr,
-                         value_idx *idx_batch_indices, value_t *idx_batch_data,
-                         value_idx *query_batch_indptr,
-                         value_idx *query_batch_indices,
-                         value_t *query_batch_data, value_t *batch_dists) {
+  void compute_distances(csr_batcher_t<value_idx, value_t>& idx_batcher,
+                         csr_batcher_t<value_idx, value_t>& query_batcher,
+                         size_t idx_batch_nnz,
+                         size_t query_batch_nnz,
+                         value_idx* idx_batch_indptr,
+                         value_idx* idx_batch_indices,
+                         value_t* idx_batch_data,
+                         value_idx* query_batch_indptr,
+                         value_idx* query_batch_indices,
+                         value_t* query_batch_data,
+                         value_t* batch_dists)
+  {
     /**
      * Compute distances
      */
-    raft::sparse::distance::distances_config_t<value_idx, value_t> dist_config(
-      handle);
+    raft::sparse::distance::distances_config_t<value_idx, value_t> dist_config(handle);
     dist_config.b_nrows = idx_batcher.batch_rows();
     dist_config.b_ncols = n_idx_cols;
-    dist_config.b_nnz = idx_batch_nnz;
+    dist_config.b_nnz   = idx_batch_nnz;
 
-    dist_config.b_indptr = idx_batch_indptr;
+    dist_config.b_indptr  = idx_batch_indptr;
     dist_config.b_indices = idx_batch_indices;
-    dist_config.b_data = idx_batch_data;
+    dist_config.b_data    = idx_batch_data;
 
     dist_config.a_nrows = query_batcher.batch_rows();
     dist_config.a_ncols = n_query_cols;
-    dist_config.a_nnz = query_batch_nnz;
+    dist_config.a_nnz   = query_batch_nnz;
 
-    dist_config.a_indptr = query_batch_indptr;
+    dist_config.a_indptr  = query_batch_indptr;
     dist_config.a_indices = query_batch_indices;
-    dist_config.a_data = query_batch_data;
+    dist_config.a_data    = query_batch_data;
 
     if (raft::sparse::distance::supportedDistance.find(metric) ==
         raft::sparse::distance::supportedDistance.end())
       THROW("DistanceType not supported: %d", metric);
 
-    raft::sparse::distance::pairwiseDistance(batch_dists, dist_config, metric,
-                                             metricArg);
+    raft::sparse::distance::pairwiseDistance(batch_dists, dist_config, metric, metricArg);
   }
 
   const value_idx *idxIndptr, *idxIndices, *queryIndptr, *queryIndices;
-  value_idx *output_indices;
+  value_idx* output_indices;
   const value_t *idxData, *queryData;
-  value_t *output_dists;
+  value_t* output_dists;
 
   size_t idxNNZ, queryNNZ, batch_size_index, batch_size_query;
 
@@ -394,52 +434,76 @@ class sparse_knn_t {
 
   int n_idx_rows, n_idx_cols, n_query_rows, n_query_cols, k;
 
-  const raft::handle_t &handle;
+  const raft::handle_t& handle;
 };
 
 /**
-   * Search the sparse kNN for the k-nearest neighbors of a set of sparse query vectors
-   * using some distance implementation
-   * @param[in] idxIndptr csr indptr of the index matrix (size n_idx_rows + 1)
-   * @param[in] idxIndices csr column indices array of the index matrix (size n_idx_nnz)
-   * @param[in] idxData csr data array of the index matrix (size idxNNZ)
-   * @param[in] idxNNA number of non-zeros for sparse index matrix
-   * @param[in] n_idx_rows number of data samples in index matrix
-   * @param[in] queryIndptr csr indptr of the query matrix (size n_query_rows + 1)
-   * @param[in] queryIndices csr indices array of the query matrix (size queryNNZ)
-   * @param[in] queryData csr data array of the query matrix (size queryNNZ)
-   * @param[in] queryNNZ number of non-zeros for sparse query matrix
-   * @param[in] n_query_rows number of data samples in query matrix
-   * @param[in] n_query_cols number of features in query matrix
-   * @param[out] output_indices dense matrix for output indices (size n_query_rows * k)
-   * @param[out] output_dists dense matrix for output distances (size n_query_rows * k)
-   * @param[in] k the number of neighbors to query
-   * @param[in] cusparseHandle the initialized cusparseHandle instance to use
-   * @param[in] allocator device allocator instance to use
-   * @param[in] handle.get_stream() CUDA handle.get_stream() to order operations with respect to
-   * @param[in] batch_size_index maximum number of rows to use from index matrix per batch
-   * @param[in] batch_size_query maximum number of rows to use from query matrix per batch
-   * @param[in] metric distance metric/measure to use
-   * @param[in] metricArg potential argument for metric (currently unused)
-   */
+ * Search the sparse kNN for the k-nearest neighbors of a set of sparse query vectors
+ * using some distance implementation
+ * @param[in] idxIndptr csr indptr of the index matrix (size n_idx_rows + 1)
+ * @param[in] idxIndices csr column indices array of the index matrix (size n_idx_nnz)
+ * @param[in] idxData csr data array of the index matrix (size idxNNZ)
+ * @param[in] idxNNZ number of non-zeros for sparse index matrix
+ * @param[in] n_idx_rows number of data samples in index matrix
+ * @param[in] queryIndptr csr indptr of the query matrix (size n_query_rows + 1)
+ * @param[in] queryIndices csr indices array of the query matrix (size queryNNZ)
+ * @param[in] queryData csr data array of the query matrix (size queryNNZ)
+ * @param[in] queryNNZ number of non-zeros for sparse query matrix
+ * @param[in] n_query_rows number of data samples in query matrix
+ * @param[in] n_query_cols number of features in query matrix
+ * @param[out] output_indices dense matrix for output indices (size n_query_rows * k)
+ * @param[out] output_dists dense matrix for output distances (size n_query_rows * k)
+ * @param[in] k the number of neighbors to query
+ * @param[in] cusparseHandle the initialized cusparseHandle instance to use
+ * @param[in] allocator device allocator instance to use
+ * @param[in] handle raft handle whose CUDA stream is used to order operations
+ * @param[in] batch_size_index maximum number of rows to use from index matrix per batch
+ * @param[in] batch_size_query maximum number of rows to use from query matrix per batch
+ * @param[in] metric distance metric/measure to use
+ * @param[in] metricArg potential argument for metric (currently unused)
+ */
 template <typename value_idx = int, typename value_t = float, int TPB_X = 32>
-void brute_force_knn(const value_idx *idxIndptr, const value_idx *idxIndices,
-                     const value_t *idxData, size_t idxNNZ, int n_idx_rows,
-                     int n_idx_cols, const value_idx *queryIndptr,
-                     const value_idx *queryIndices, const value_t *queryData,
-                     size_t queryNNZ, int n_query_rows, int n_query_cols,
-                     value_idx *output_indices, value_t *output_dists, int k,
-                     const raft::handle_t &handle,
-                     size_t batch_size_index = 2 << 14,  // approx 1M
-                     size_t batch_size_query = 2 << 14,
-                     raft::distance::DistanceType metric =
-                       raft::distance::DistanceType::L2Expanded,
-                     float metricArg = 0) {
-  sparse_knn_t<value_idx, value_t>(
-    idxIndptr, idxIndices, idxData, idxNNZ, n_idx_rows, n_idx_cols, queryIndptr,
-    queryIndices, queryData, queryNNZ, n_query_rows, n_query_cols,
-    output_indices, output_dists, k, handle, batch_size_index, batch_size_query,
-    metric, metricArg)
+void brute_force_knn(const value_idx* idxIndptr,
+                     const value_idx* idxIndices,
+                     const value_t* idxData,
+                     size_t idxNNZ,
+                     int n_idx_rows,
+                     int n_idx_cols,
+                     const value_idx* queryIndptr,
+                     const value_idx* queryIndices,
+                     const value_t* queryData,
+                     size_t queryNNZ,
+                     int n_query_rows,
+                     int n_query_cols,
+                     value_idx* output_indices,
+                     value_t* output_dists,
+                     int k,
+                     const raft::handle_t& handle,
+                     size_t batch_size_index             = 2 << 14,  // approx 1M
+                     size_t batch_size_query             = 2 << 14,
+                     raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded,
+                     float metricArg                     = 0)
+{
+  sparse_knn_t<value_idx, value_t>(idxIndptr,
+                                   idxIndices,
+                                   idxData,
+                                   idxNNZ,
+                                   n_idx_rows,
+                                   n_idx_cols,
+                                   queryIndptr,
+                                   queryIndices,
+                                   queryData,
+                                   queryNNZ,
+                                   n_query_rows,
+                                   n_query_cols,
+                                   output_indices,
+                                   output_dists,
+                                   k,
+                                   handle,
+                                   batch_size_index,
+                                   batch_size_query,
+                                   metric,
+                                   metricArg)
     .run();
 }
 
diff --git a/cpp/include/raft/sparse/selection/knn_graph.cuh b/cpp/include/raft/sparse/selection/knn_graph.cuh
index 1cf225087a..1308f5ce02 100644
--- a/cpp/include/raft/sparse/selection/knn_graph.cuh
+++ b/cpp/include/raft/sparse/selection/knn_graph.cuh
@@ -45,31 +45,34 @@ namespace selection {
  * @param m
  */
 template <typename value_idx>
-__global__ void fill_indices(value_idx *indices, size_t m, size_t nnz) {
+__global__ void fill_indices(value_idx* indices, size_t m, size_t nnz)
+{
   value_idx tid = (blockIdx.x * blockDim.x) + threadIdx.x;
   if (tid >= nnz) return;
-  value_idx v = tid / m;
+  value_idx v  = tid / m;
   indices[tid] = v;
 }
 
 template <typename value_idx>
-value_idx build_k(value_idx n_samples, int c) {
+value_idx build_k(value_idx n_samples, int c)
+{
   // from "kNN-MST-Agglomerative: A fast & scalable graph-based data clustering
   // approach on GPU"
-  return min(n_samples,
-             max((value_idx)2, (value_idx)floor(log2(n_samples)) + c));
+  return min(n_samples, max((value_idx)2, (value_idx)floor(log2(n_samples)) + c));
 }
 
 template <typename in_t, typename out_t>
-__global__ void conv_indices_kernel(in_t *inds, out_t *out, size_t nnz) {
+__global__ void conv_indices_kernel(in_t* inds, out_t* out, size_t nnz)
+{
   size_t tid = blockDim.x * blockIdx.x + threadIdx.x;
   if (tid >= nnz) return;
-  out_t v = inds[tid];
+  out_t v  = inds[tid];
   out[tid] = v;
 }
 
 template <typename in_t, typename out_t, int tpb = 256>
-void conv_indices(in_t *inds, out_t *out, size_t size, cudaStream_t stream) {
+void conv_indices(in_t* inds, out_t* out, size_t size, cudaStream_t stream)
+{
   size_t blocks = ceildiv(size, (size_t)tpb);
   conv_indices_kernel<<<blocks, tpb, 0, stream>>>(inds, out, size);
 }
@@ -91,13 +94,18 @@ void conv_indices(in_t *inds, out_t *out, size_t size, cudaStream_t stream) {
  * @param c
  */
 template <typename value_idx = int, typename value_t = float>
-void knn_graph(const handle_t &handle, const value_t *X, size_t m, size_t n,
+void knn_graph(const handle_t& handle,
+               const value_t* X,
+               size_t m,
+               size_t n,
                distance::DistanceType metric,
-               raft::sparse::COO<value_t, value_idx> &out, int c = 15) {
+               raft::sparse::COO<value_t, value_idx>& out,
+               int c = 15)
+{
   int k = build_k(m, c);
 
   auto d_alloc = handle.get_device_allocator();
-  auto stream = handle.get_stream();
+  auto stream  = handle.get_stream();
 
   size_t nnz = m * k;
 
@@ -108,8 +116,8 @@ void knn_graph(const handle_t &handle, const value_t *X, size_t m, size_t n,
   size_t blocks = ceildiv(nnz, (size_t)256);
   fill_indices<value_idx><<<blocks, 256, 0, stream>>>(rows.data(), k, nnz);
 
-  std::vector<value_t *> inputs;
-  inputs.push_back(const_cast<value_t *>(X));
+  std::vector<value_t*> inputs;
+  inputs.push_back(const_cast<value_t*>(X));
 
   std::vector<int> sizes;
   sizes.push_back(m);
@@ -119,15 +127,25 @@ void knn_graph(const handle_t &handle, const value_t *X, size_t m, size_t n,
   rmm::device_uvector<int64_t> int64_indices(nnz, stream);
 
   uint32_t knn_start = curTimeMillis();
-  raft::spatial::knn::brute_force_knn(
-    handle, inputs, sizes, n, const_cast<value_t *>(X), m, int64_indices.data(),
-    data.data(), k, true, true, nullptr, metric);
+  raft::spatial::knn::brute_force_knn(handle,
+                                      inputs,
+                                      sizes,
+                                      n,
+                                      const_cast<value_t*>(X),
+                                      m,
+                                      int64_indices.data(),
+                                      data.data(),
+                                      k,
+                                      true,
+                                      true,
+                                      nullptr,
+                                      metric);
 
   // convert from current knn's 64-bit to 32-bit.
   conv_indices(int64_indices.data(), indices.data(), nnz, stream);
 
-  raft::sparse::linalg::symmetrize(handle, rows.data(), indices.data(),
-                                   data.data(), m, k, nnz, out);
+  raft::sparse::linalg::symmetrize(
+    handle, rows.data(), indices.data(), data.data(), m, k, nnz, out);
 }
 
 };  // namespace selection
diff --git a/cpp/include/raft/sparse/selection/selection.cuh b/cpp/include/raft/sparse/selection/selection.cuh
index 6066a36289..190e06b2cd 100644
--- a/cpp/include/raft/sparse/selection/selection.cuh
+++ b/cpp/include/raft/sparse/selection/selection.cuh
@@ -39,27 +39,33 @@ namespace raft {
 namespace sparse {
 namespace selection {
 
-template <typename K, typename IndexType, bool select_min, int warp_q,
-          int thread_q, int tpb>
-__global__ void select_k_kernel(K *inK, IndexType *inV, size_t n_rows,
-                                size_t n_cols, K *outK, IndexType *outV,
-                                K initK, IndexType initV, int k) {
+template <typename K, typename IndexType, bool select_min, int warp_q, int thread_q, int tpb>
+__global__ void select_k_kernel(K* inK,
+                                IndexType* inV,
+                                size_t n_rows,
+                                size_t n_cols,
+                                K* outK,
+                                IndexType* outV,
+                                K initK,
+                                IndexType initV,
+                                int k)
+{
   constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize;
 
   __shared__ K smemK[kNumWarps * warp_q];
   __shared__ IndexType smemV[kNumWarps * warp_q];
 
-  faiss::gpu::BlockSelect<K, IndexType, select_min, faiss::gpu::Comparator<K>,
-                          warp_q, thread_q, tpb>
-    heap(initK, initV, smemK, smemV, k);
+  faiss::gpu::
+    BlockSelect<K, IndexType, select_min, faiss::gpu::Comparator<K>, warp_q, thread_q, tpb>
+      heap(initK, initV, smemK, smemV, k);
 
   // Grid is exactly sized to rows available
   int row = blockIdx.x;
-  int i = threadIdx.x;
+  int i   = threadIdx.x;
 
-  int idx = row * n_cols;
-  K *inKStart = inK + idx + i;
-  IndexType *inVStart = inV + idx + i;
+  int idx             = row * n_cols;
+  K* inKStart         = inK + idx + i;
+  IndexType* inVStart = inV + idx + i;
 
   // Whole warps must participate in the selection
   int limit = faiss::gpu::utils::roundDown(n_cols, faiss::gpu::kWarpSize);
@@ -86,27 +92,31 @@ __global__ void select_k_kernel(K *inK, IndexType *inV, size_t n_rows,
   }
 }
 
-template <typename value_idx = int, typename value_t = float, int warp_q,
-          int thread_q>
-inline void select_k_impl(value_t *inK, value_idx *inV, size_t n_rows,
-                          size_t n_cols, value_t *outK, value_idx *outV,
-                          bool select_min, int k, cudaStream_t stream) {
+template <typename value_idx = int, typename value_t = float, int warp_q, int thread_q>
+inline void select_k_impl(value_t* inK,
+                          value_idx* inV,
+                          size_t n_rows,
+                          size_t n_cols,
+                          value_t* outK,
+                          value_idx* outV,
+                          bool select_min,
+                          int k,
+                          cudaStream_t stream)
+{
   auto grid = dim3(n_rows);
 
   constexpr int n_threads = (warp_q <= 1024) ? 128 : 64;
-  auto block = dim3(n_threads);
+  auto block              = dim3(n_threads);
 
-  auto kInit = select_min ? faiss::gpu::Limits<value_t>::getMax()
-                          : faiss::gpu::Limits<value_t>::getMin();
+  auto kInit =
+    select_min ? faiss::gpu::Limits<value_t>::getMax() : faiss::gpu::Limits<value_t>::getMin();
   auto vInit = -1;
   if (select_min) {
     select_k_kernel<value_t, value_idx, false, warp_q, thread_q, n_threads>
-      <<<grid, block, 0, stream>>>(inK, inV, n_rows, n_cols, outK, outV, kInit,
-                                   vInit, k);
+      <<<grid, block, 0, stream>>>(inK, inV, n_rows, n_cols, outK, outV, kInit, vInit, k);
   } else {
     select_k_kernel<value_t, value_idx, true, warp_q, thread_q, n_threads>
-      <<<grid, block, 0, stream>>>(inK, inV, n_rows, n_cols, outK, outV, kInit,
-                                   vInit, k);
+      <<<grid, block, 0, stream>>>(inK, inV, n_rows, n_cols, outK, outV, kInit, vInit, k);
   }
   CUDA_CHECK(cudaGetLastError());
 }
@@ -126,30 +136,37 @@ inline void select_k_impl(value_t *inK, value_idx *inV, size_t n_rows,
  * @param[in] stream CUDA stream to use
  */
 template <typename value_idx = int, typename value_t = float>
-inline void select_k(value_t *inK, value_idx *inV, size_t n_rows, size_t n_cols,
-                     value_t *outK, value_idx *outV, bool select_min, int k,
-                     cudaStream_t stream) {
+inline void select_k(value_t* inK,
+                     value_idx* inV,
+                     size_t n_rows,
+                     size_t n_cols,
+                     value_t* outK,
+                     value_idx* outV,
+                     bool select_min,
+                     int k,
+                     cudaStream_t stream)
+{
   if (k == 1)
-    select_k_impl<value_idx, value_t, 1, 1>(inK, inV, n_rows, n_cols, outK,
-                                            outV, select_min, k, stream);
+    select_k_impl<value_idx, value_t, 1, 1>(
+      inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream);
   else if (k <= 32)
-    select_k_impl<value_idx, value_t, 32, 2>(inK, inV, n_rows, n_cols, outK,
-                                             outV, select_min, k, stream);
+    select_k_impl<value_idx, value_t, 32, 2>(
+      inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream);
   else if (k <= 64)
-    select_k_impl<value_idx, value_t, 64, 3>(inK, inV, n_rows, n_cols, outK,
-                                             outV, select_min, k, stream);
+    select_k_impl<value_idx, value_t, 64, 3>(
+      inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream);
   else if (k <= 128)
-    select_k_impl<value_idx, value_t, 128, 3>(inK, inV, n_rows, n_cols, outK,
-                                              outV, select_min, k, stream);
+    select_k_impl<value_idx, value_t, 128, 3>(
+      inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream);
   else if (k <= 256)
-    select_k_impl<value_idx, value_t, 256, 4>(inK, inV, n_rows, n_cols, outK,
-                                              outV, select_min, k, stream);
+    select_k_impl<value_idx, value_t, 256, 4>(
+      inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream);
   else if (k <= 512)
-    select_k_impl<value_idx, value_t, 512, 8>(inK, inV, n_rows, n_cols, outK,
-                                              outV, select_min, k, stream);
+    select_k_impl<value_idx, value_t, 512, 8>(
+      inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream);
   else if (k <= 1024)
-    select_k_impl<value_idx, value_t, 1024, 8>(inK, inV, n_rows, n_cols, outK,
-                                               outV, select_min, k, stream);
+    select_k_impl<value_idx, value_t, 1024, 8>(
+      inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream);
 }
 
 };  // namespace selection
diff --git a/cpp/include/raft/sparse/utils.h b/cpp/include/raft/sparse/utils.h
index 63578bf1f3..56e8832e0a 100644
--- a/cpp/include/raft/sparse/utils.h
+++ b/cpp/include/raft/sparse/utils.h
@@ -26,7 +26,8 @@ namespace sparse {
  * @param[in] ncols number of blocks to quantize
  */
 template <typename value_idx>
-inline int block_dim(value_idx ncols) {
+inline int block_dim(value_idx ncols)
+{
   int blockdim;
   if (ncols <= 32)
     blockdim = 32;
@@ -54,9 +55,9 @@ inline int block_dim(value_idx ncols) {
  * @return
  */
 template <typename G>
-__device__ __inline__ unsigned int __match_any_sync(unsigned int init_mask,
-                                                    G key) {
-  unsigned int mask = __ballot_sync(init_mask, true);
+__device__ __inline__ unsigned int __match_any_sync(unsigned int init_mask, G key)
+{
+  unsigned int mask       = __ballot_sync(init_mask, true);
   unsigned int peer_group = 0;
   bool is_peer;
 
@@ -77,12 +78,14 @@ __device__ __inline__ unsigned int __match_any_sync(unsigned int init_mask,
 }
 #endif
 
-__device__ __inline__ unsigned int get_lowest_peer(unsigned int peer_group) {
+__device__ __inline__ unsigned int get_lowest_peer(unsigned int peer_group)
+{
   return __ffs(peer_group) - 1;
 }
 
 template <typename value_idx>
-__global__ void iota_fill_block_kernel(value_idx *indices, value_idx ncols) {
+__global__ void iota_fill_block_kernel(value_idx* indices, value_idx ncols)
+{
   int row = blockIdx.x;
   int tid = threadIdx.x;
 
@@ -92,15 +95,16 @@ __global__ void iota_fill_block_kernel(value_idx *indices, value_idx ncols) {
 }
 
 template <typename value_idx>
-void iota_fill(value_idx *indices, value_idx nrows, value_idx ncols,
-               cudaStream_t stream) {
+void iota_fill(value_idx* indices, value_idx nrows, value_idx ncols, cudaStream_t stream)
+{
   int blockdim = block_dim(ncols);
 
   iota_fill_block_kernel<<<nrows, blockdim, 0, stream>>>(indices, ncols);
 }
 
 template <typename T>
-__device__ int get_stop_idx(T row, T m, T nnz, const T *ind) {
+__device__ int get_stop_idx(T row, T m, T nnz, const T* ind)
+{
   int stop_idx = 0;
   if (row < (m - 1))
     stop_idx = ind[row + 1];
diff --git a/cpp/include/raft/spatial/knn/ann.hpp b/cpp/include/raft/spatial/knn/ann.hpp
index 77d7831b4a..f77a56164d 100644
--- a/cpp/include/raft/spatial/knn/ann.hpp
+++ b/cpp/include/raft/spatial/knn/ann.hpp
@@ -45,14 +45,16 @@ using deviceAllocator = raft::mr::device::allocator;
  * @param[in] D the dimensionality of the index array
  */
 template <typename value_idx = int>
-inline void approx_knn_build_index(raft::handle_t &handle,
-                                   raft::spatial::knn::knnIndex *index,
-                                   knnIndexParam *params,
+inline void approx_knn_build_index(raft::handle_t& handle,
+                                   raft::spatial::knn::knnIndex* index,
+                                   knnIndexParam* params,
                                    raft::distance::DistanceType metric,
-                                   float metricArg, float *index_array,
-                                   value_idx n, value_idx D) {
-  detail::approx_knn_build_index(handle, index, params, metric, metricArg,
-                                 index_array, n, D);
+                                   float metricArg,
+                                   float* index_array,
+                                   value_idx n,
+                                   value_idx D)
+{
+  detail::approx_knn_build_index(handle, index, params, metric, metricArg, index_array, n, D);
 }
 
 /**
@@ -69,12 +71,15 @@ inline void approx_knn_build_index(raft::handle_t &handle,
  * @param[in] n number of rows in the query array
  */
 template <typename value_idx = int>
-inline void approx_knn_search(raft::handle_t &handle, float *distances,
-                              int64_t *indices,
-                              raft::spatial::knn::knnIndex *index, value_idx k,
-                              float *query_array, value_idx n) {
-  detail::approx_knn_search(handle, distances, indices, index, k, query_array,
-                            n);
+inline void approx_knn_search(raft::handle_t& handle,
+                              float* distances,
+                              int64_t* indices,
+                              raft::spatial::knn::knnIndex* index,
+                              value_idx k,
+                              float* query_array,
+                              value_idx n)
+{
+  detail::approx_knn_search(handle, distances, indices, index, k, query_array, n);
 }
 
 }  // namespace knn
diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.h
index 6a6c7751c2..573a23181d 100644
--- a/cpp/include/raft/spatial/knn/ann_common.h
+++ b/cpp/include/raft/spatial/knn/ann_common.h
@@ -26,13 +26,14 @@ namespace spatial {
 namespace knn {
 
 struct knnIndex {
-  faiss::gpu::GpuIndex *index;
+  faiss::gpu::GpuIndex* index;
   raft::distance::DistanceType metric;
   float metricArg;
 
-  faiss::gpu::StandardGpuResources *gpu_res;
+  faiss::gpu::StandardGpuResources* gpu_res;
   int device;
-  ~knnIndex() {
+  ~knnIndex()
+  {
     delete index;
     delete gpu_res;
   }
@@ -57,7 +58,8 @@ struct IVFParam : knnIndexParam {
   int nprobe;
 };
 
-struct IVFFlatParam : IVFParam {};
+struct IVFFlatParam : IVFParam {
+};
 
 struct IVFPQParam : IVFParam {
   int M;
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
index 6e4c99b646..7eb439c78b 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
@@ -56,115 +56,107 @@ namespace spatial {
 namespace knn {
 namespace detail {
 
-inline faiss::ScalarQuantizer::QuantizerType build_faiss_qtype(
-  QuantizerType qtype) {
+inline faiss::ScalarQuantizer::QuantizerType build_faiss_qtype(QuantizerType qtype)
+{
   switch (qtype) {
-    case QuantizerType::QT_8bit:
-      return faiss::ScalarQuantizer::QuantizerType::QT_8bit;
+    case QuantizerType::QT_8bit: return faiss::ScalarQuantizer::QuantizerType::QT_8bit;
     case QuantizerType::QT_8bit_uniform:
       return faiss::ScalarQuantizer::QuantizerType::QT_8bit_uniform;
     case QuantizerType::QT_4bit_uniform:
       return faiss::ScalarQuantizer::QuantizerType::QT_4bit_uniform;
-    case QuantizerType::QT_fp16:
-      return faiss::ScalarQuantizer::QuantizerType::QT_fp16;
+    case QuantizerType::QT_fp16: return faiss::ScalarQuantizer::QuantizerType::QT_fp16;
     case QuantizerType::QT_8bit_direct:
       return faiss::ScalarQuantizer::QuantizerType::QT_8bit_direct;
-    case QuantizerType::QT_6bit:
-      return faiss::ScalarQuantizer::QuantizerType::QT_6bit;
-    default:
-      return (faiss::ScalarQuantizer::QuantizerType)qtype;
+    case QuantizerType::QT_6bit: return faiss::ScalarQuantizer::QuantizerType::QT_6bit;
+    default: return (faiss::ScalarQuantizer::QuantizerType)qtype;
   }
 }
 
 template <typename IntType = int>
-void approx_knn_ivfflat_build_index(knnIndex *index, IVFParam *params,
-                                    raft::distance::DistanceType metric,
-                                    IntType n, IntType D) {
+void approx_knn_ivfflat_build_index(
+  knnIndex* index, IVFParam* params, raft::distance::DistanceType metric, IntType n, IntType D)
+{
   faiss::gpu::GpuIndexIVFFlatConfig config;
-  config.device = index->device;
+  config.device                  = index->device;
   faiss::MetricType faiss_metric = build_faiss_metric(metric);
-  faiss::gpu::GpuIndexIVFFlat *faiss_index = new faiss::gpu::GpuIndexIVFFlat(
-    index->gpu_res, D, params->nlist, faiss_metric, config);
+  faiss::gpu::GpuIndexIVFFlat* faiss_index =
+    new faiss::gpu::GpuIndexIVFFlat(index->gpu_res, D, params->nlist, faiss_metric, config);
   faiss_index->setNumProbes(params->nprobe);
   index->index = faiss_index;
 }
 
 template <typename IntType = int>
-void approx_knn_ivfpq_build_index(knnIndex *index, IVFPQParam *params,
-                                  raft::distance::DistanceType metric,
-                                  IntType n, IntType D) {
+void approx_knn_ivfpq_build_index(
+  knnIndex* index, IVFPQParam* params, raft::distance::DistanceType metric, IntType n, IntType D)
+{
   faiss::gpu::GpuIndexIVFPQConfig config;
-  config.device = index->device;
-  config.usePrecomputedTables = params->usePrecomputedTables;
-  config.interleavedLayout = params->n_bits != 8;
-  faiss::MetricType faiss_metric = build_faiss_metric(metric);
-  faiss::gpu::GpuIndexIVFPQ *faiss_index =
-    new faiss::gpu::GpuIndexIVFPQ(index->gpu_res, D, params->nlist, params->M,
-                                  params->n_bits, faiss_metric, config);
+  config.device                          = index->device;
+  config.usePrecomputedTables            = params->usePrecomputedTables;
+  config.interleavedLayout               = params->n_bits != 8;
+  faiss::MetricType faiss_metric         = build_faiss_metric(metric);
+  faiss::gpu::GpuIndexIVFPQ* faiss_index = new faiss::gpu::GpuIndexIVFPQ(
+    index->gpu_res, D, params->nlist, params->M, params->n_bits, faiss_metric, config);
   faiss_index->setNumProbes(params->nprobe);
   index->index = faiss_index;
 }
 
 template <typename IntType = int>
-void approx_knn_ivfsq_build_index(knnIndex *index, IVFSQParam *params,
-                                  raft::distance::DistanceType metric,
-                                  IntType n, IntType D) {
+void approx_knn_ivfsq_build_index(
+  knnIndex* index, IVFSQParam* params, raft::distance::DistanceType metric, IntType n, IntType D)
+{
   faiss::gpu::GpuIndexIVFScalarQuantizerConfig config;
-  config.device = index->device;
-  faiss::MetricType faiss_metric = build_faiss_metric(metric);
-  faiss::ScalarQuantizer::QuantizerType faiss_qtype =
-    build_faiss_qtype(params->qtype);
-  faiss::gpu::GpuIndexIVFScalarQuantizer *faiss_index =
-    new faiss::gpu::GpuIndexIVFScalarQuantizer(index->gpu_res, D, params->nlist,
-                                               faiss_qtype, faiss_metric,
-                                               params->encodeResidual);
+  config.device                                       = index->device;
+  faiss::MetricType faiss_metric                      = build_faiss_metric(metric);
+  faiss::ScalarQuantizer::QuantizerType faiss_qtype   = build_faiss_qtype(params->qtype);
+  faiss::gpu::GpuIndexIVFScalarQuantizer* faiss_index = new faiss::gpu::GpuIndexIVFScalarQuantizer(
+    index->gpu_res, D, params->nlist, faiss_qtype, faiss_metric, params->encodeResidual);
   faiss_index->setNumProbes(params->nprobe);
   index->index = faiss_index;
 }
 
 template <typename IntType = int>
-void approx_knn_build_index(raft::handle_t &handle,
-                            raft::spatial::knn::knnIndex *index,
-                            raft::spatial::knn::knnIndexParam *params,
+void approx_knn_build_index(raft::handle_t& handle,
+                            raft::spatial::knn::knnIndex* index,
+                            raft::spatial::knn::knnIndexParam* params,
                             raft::distance::DistanceType metric,
-                            float metricArg, float *index_array, IntType n,
-                            IntType D) {
+                            float metricArg,
+                            float* index_array,
+                            IntType n,
+                            IntType D)
+{
   int device;
   CUDA_CHECK(cudaGetDevice(&device));
 
-  faiss::gpu::StandardGpuResources *gpu_res =
-    new faiss::gpu::StandardGpuResources();
+  faiss::gpu::StandardGpuResources* gpu_res = new faiss::gpu::StandardGpuResources();
   gpu_res->noTempMemory();
   gpu_res->setDefaultStream(device, handle.get_stream());
-  index->gpu_res = gpu_res;
-  index->device = device;
-  index->index = nullptr;
-  index->metric = metric;
+  index->gpu_res   = gpu_res;
+  index->device    = device;
+  index->index     = nullptr;
+  index->metric    = metric;
   index->metricArg = metricArg;
 
   // perform preprocessing
   // k set to 0 (unused during preprocessing / revertion)
-  std::unique_ptr<MetricProcessor<float>> query_metric_processor =
-    create_processor<float>(metric, n, D, 0, false, handle.get_stream(),
-                            handle.get_device_allocator());
+  std::unique_ptr<MetricProcessor<float>> query_metric_processor = create_processor<float>(
+    metric, n, D, 0, false, handle.get_stream(), handle.get_device_allocator());
 
   query_metric_processor->preprocess(index_array);
 
-  if (dynamic_cast<IVFFlatParam *>(params)) {
-    IVFFlatParam *IVFFlat_param = dynamic_cast<IVFFlatParam *>(params);
+  if (dynamic_cast<IVFFlatParam*>(params)) {
+    IVFFlatParam* IVFFlat_param = dynamic_cast<IVFFlatParam*>(params);
     approx_knn_ivfflat_build_index(index, IVFFlat_param, metric, n, D);
     std::vector<float> h_index_array(n * D);
-    raft::update_host(h_index_array.data(), index_array, h_index_array.size(),
-                      handle.get_stream());
+    raft::update_host(h_index_array.data(), index_array, h_index_array.size(), handle.get_stream());
     query_metric_processor->revert(index_array);
     index->index->train(n, h_index_array.data());
     index->index->add(n, h_index_array.data());
   } else {
-    if (dynamic_cast<IVFPQParam *>(params)) {
-      IVFPQParam *IVFPQ_param = dynamic_cast<IVFPQParam *>(params);
+    if (dynamic_cast<IVFPQParam*>(params)) {
+      IVFPQParam* IVFPQ_param = dynamic_cast<IVFPQParam*>(params);
       approx_knn_ivfpq_build_index(index, IVFPQ_param, metric, n, D);
-    } else if (dynamic_cast<IVFSQParam *>(params)) {
-      IVFSQParam *IVFSQ_param = dynamic_cast<IVFSQParam *>(params);
+    } else if (dynamic_cast<IVFSQParam*>(params)) {
+      IVFSQParam* IVFSQ_param = dynamic_cast<IVFSQParam*>(params);
       approx_knn_ivfsq_build_index(index, IVFSQ_param, metric, n, D);
     } else {
       ASSERT(index->index, "KNN index could not be initialized");
@@ -177,13 +169,23 @@ void approx_knn_build_index(raft::handle_t &handle,
 }
 
 template <typename IntType = int>
-void approx_knn_search(raft::handle_t &handle, float *distances,
-                       int64_t *indices, raft::spatial::knn::knnIndex *index,
-                       IntType k, float *query_array, IntType n) {
+void approx_knn_search(raft::handle_t& handle,
+                       float* distances,
+                       int64_t* indices,
+                       raft::spatial::knn::knnIndex* index,
+                       IntType k,
+                       float* query_array,
+                       IntType n)
+{
   // perform preprocessing
   std::unique_ptr<MetricProcessor<float>> query_metric_processor =
-    create_processor<float>(index->metric, n, index->index->d, k, false,
-                            handle.get_stream(), handle.get_device_allocator());
+    create_processor<float>(index->metric,
+                            n,
+                            index->index->d,
+                            k,
+                            false,
+                            handle.get_stream(),
+                            handle.get_device_allocator());
 
   query_metric_processor->preprocess(query_array);
   index->index->search(n, query_array, k, distances, indices);
@@ -194,13 +196,14 @@ void approx_knn_search(raft::handle_t &handle, float *distances,
       index->metric == raft::distance::DistanceType::L2SqrtUnexpanded ||
       index->metric == raft::distance::DistanceType::LpUnexpanded) {
     /**
-  * post-processing
-  */
+     * post-processing
+     */
     float p = 0.5;  // standard l2
-    if (index->metric == raft::distance::DistanceType::LpUnexpanded)
-      p = 1.0 / index->metricArg;
+    if (index->metric == raft::distance::DistanceType::LpUnexpanded) p = 1.0 / index->metricArg;
     raft::linalg::unaryOp<float>(
-      distances, distances, n * k,
+      distances,
+      distances,
+      n * k,
       [p] __device__(float input) { return powf(input, p); },
       handle.get_stream());
   }
diff --git a/cpp/include/raft/spatial/knn/detail/common_faiss.h b/cpp/include/raft/spatial/knn/detail/common_faiss.h
index 0c0398a336..5618186dfc 100644
--- a/cpp/include/raft/spatial/knn/detail/common_faiss.h
+++ b/cpp/include/raft/spatial/knn/detail/common_faiss.h
@@ -27,37 +27,26 @@ namespace spatial {
 namespace knn {
 namespace detail {
 
-inline faiss::MetricType build_faiss_metric(
-  raft::distance::DistanceType metric) {
+inline faiss::MetricType build_faiss_metric(raft::distance::DistanceType metric)
+{
   switch (metric) {
     case raft::distance::DistanceType::CosineExpanded:
       return faiss::MetricType::METRIC_INNER_PRODUCT;
     case raft::distance::DistanceType::CorrelationExpanded:
       return faiss::MetricType::METRIC_INNER_PRODUCT;
-    case raft::distance::DistanceType::L2Expanded:
-      return faiss::MetricType::METRIC_L2;
-    case raft::distance::DistanceType::L2Unexpanded:
-      return faiss::MetricType::METRIC_L2;
-    case raft::distance::DistanceType::L2SqrtExpanded:
-      return faiss::MetricType::METRIC_L2;
-    case raft::distance::DistanceType::L2SqrtUnexpanded:
-      return faiss::MetricType::METRIC_L2;
-    case raft::distance::DistanceType::L1:
-      return faiss::MetricType::METRIC_L1;
-    case raft::distance::DistanceType::InnerProduct:
-      return faiss::MetricType::METRIC_INNER_PRODUCT;
-    case raft::distance::DistanceType::LpUnexpanded:
-      return faiss::MetricType::METRIC_Lp;
-    case raft::distance::DistanceType::Linf:
-      return faiss::MetricType::METRIC_Linf;
-    case raft::distance::DistanceType::Canberra:
-      return faiss::MetricType::METRIC_Canberra;
-    case raft::distance::DistanceType::BrayCurtis:
-      return faiss::MetricType::METRIC_BrayCurtis;
+    case raft::distance::DistanceType::L2Expanded: return faiss::MetricType::METRIC_L2;
+    case raft::distance::DistanceType::L2Unexpanded: return faiss::MetricType::METRIC_L2;
+    case raft::distance::DistanceType::L2SqrtExpanded: return faiss::MetricType::METRIC_L2;
+    case raft::distance::DistanceType::L2SqrtUnexpanded: return faiss::MetricType::METRIC_L2;
+    case raft::distance::DistanceType::L1: return faiss::MetricType::METRIC_L1;
+    case raft::distance::DistanceType::InnerProduct: return faiss::MetricType::METRIC_INNER_PRODUCT;
+    case raft::distance::DistanceType::LpUnexpanded: return faiss::MetricType::METRIC_Lp;
+    case raft::distance::DistanceType::Linf: return faiss::MetricType::METRIC_Linf;
+    case raft::distance::DistanceType::Canberra: return faiss::MetricType::METRIC_Canberra;
+    case raft::distance::DistanceType::BrayCurtis: return faiss::MetricType::METRIC_BrayCurtis;
     case raft::distance::DistanceType::JensenShannon:
       return faiss::MetricType::METRIC_JensenShannon;
-    default:
-      THROW("MetricType not supported: %d", metric);
+    default: THROW("MetricType not supported: %d", metric);
   }
 }
 
diff --git a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh
index 7d87254cb6..049c11514c 100644
--- a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh
+++ b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh
@@ -35,7 +35,8 @@ namespace knn {
 namespace detail {
 
 template <typename value_t>
-DI value_t compute_haversine(value_t x1, value_t y1, value_t x2, value_t y2) {
+DI value_t compute_haversine(value_t x1, value_t y1, value_t x2, value_t y2)
+{
   value_t sin_0 = sin(0.5 * (x1 - y1));
   value_t sin_1 = sin(0.5 * (x2 - y2));
   value_t rdist = sin_0 * sin_0 + cos(x1) * cos(y1) * sin_1 * sin_1;
@@ -56,34 +57,36 @@ DI value_t compute_haversine(value_t x1, value_t y1, value_t x2, value_t y2) {
  * @param[in] n_index_rows number of rows in index array
  * @param[in] k number of closest neighbors to return
  */
-template <typename value_idx, typename value_t, int warp_q = 1024,
-          int thread_q = 8, int tpb = 128>
-__global__ void haversine_knn_kernel(value_idx *out_inds, value_t *out_dists,
-                                     const value_t *index, const value_t *query,
-                                     size_t n_index_rows, int k) {
+template <typename value_idx, typename value_t, int warp_q = 1024, int thread_q = 8, int tpb = 128>
+__global__ void haversine_knn_kernel(value_idx* out_inds,
+                                     value_t* out_dists,
+                                     const value_t* index,
+                                     const value_t* query,
+                                     size_t n_index_rows,
+                                     int k)
+{
   constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize;
 
   __shared__ value_t smemK[kNumWarps * warp_q];
   __shared__ value_idx smemV[kNumWarps * warp_q];
 
-  faiss::gpu::BlockSelect<value_t, value_idx, false,
-                          faiss::gpu::Comparator<value_t>, warp_q, thread_q,
-                          tpb>
-    heap(faiss::gpu::Limits<value_t>::getMax(), -1, smemK, smemV, k);
+  faiss::gpu::
+    BlockSelect<value_t, value_idx, false, faiss::gpu::Comparator<value_t>, warp_q, thread_q, tpb>
+      heap(faiss::gpu::Limits<value_t>::getMax(), -1, smemK, smemV, k);
 
   // Grid is exactly sized to rows available
   int limit = faiss::gpu::utils::roundDown(n_index_rows, faiss::gpu::kWarpSize);
 
-  const value_t *query_ptr = query + (blockIdx.x * 2);
-  value_t x1 = query_ptr[0];
-  value_t x2 = query_ptr[1];
+  const value_t* query_ptr = query + (blockIdx.x * 2);
+  value_t x1               = query_ptr[0];
+  value_t x2               = query_ptr[1];
 
   int i = threadIdx.x;
 
   for (; i < limit; i += tpb) {
-    const value_t *idx_ptr = index + (i * 2);
-    value_t y1 = idx_ptr[0];
-    value_t y2 = idx_ptr[1];
+    const value_t* idx_ptr = index + (i * 2);
+    value_t y1             = idx_ptr[0];
+    value_t y2             = idx_ptr[1];
 
     value_t dist = compute_haversine(x1, y1, x2, y2);
 
@@ -92,9 +95,9 @@ __global__ void haversine_knn_kernel(value_idx *out_inds, value_t *out_dists,
 
   // Handle last remainder fraction of a warp of elements
   if (i < n_index_rows) {
-    const value_t *idx_ptr = index + (i * 2);
-    value_t y1 = idx_ptr[0];
-    value_t y2 = idx_ptr[1];
+    const value_t* idx_ptr = index + (i * 2);
+    value_t y1             = idx_ptr[0];
+    value_t y2             = idx_ptr[1];
 
     value_t dist = compute_haversine(x1, y1, x2, y2);
 
@@ -105,7 +108,7 @@ __global__ void haversine_knn_kernel(value_idx *out_inds, value_t *out_dists,
 
   for (int i = threadIdx.x; i < k; i += tpb) {
     out_dists[blockIdx.x * k + i] = smemK[i];
-    out_inds[blockIdx.x * k + i] = smemV[i];
+    out_inds[blockIdx.x * k + i]  = smemV[i];
   }
 }
 
@@ -126,10 +129,15 @@ __global__ void haversine_knn_kernel(value_idx *out_inds, value_t *out_dists,
  * @param[in] stream stream to order kernel launch
  */
 template <typename value_idx, typename value_t>
-void haversine_knn(value_idx *out_inds, value_t *out_dists,
-                   const value_t *index, const value_t *query,
-                   size_t n_index_rows, size_t n_query_rows, int k,
-                   cudaStream_t stream) {
+void haversine_knn(value_idx* out_inds,
+                   value_t* out_dists,
+                   const value_t* index,
+                   const value_t* query,
+                   size_t n_index_rows,
+                   size_t n_query_rows,
+                   int k,
+                   cudaStream_t stream)
+{
   haversine_knn_kernel<<<n_query_rows, 128, 0, stream>>>(
     out_inds, out_dists, index, query, n_index_rows, k);
 }
diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
index 09494e9eb1..a276ae45ad 100644
--- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
@@ -43,13 +43,18 @@ namespace spatial {
 namespace knn {
 namespace detail {
 
-template <typename value_idx = int64_t, typename value_t = float, int warp_q,
-          int thread_q, int tpb>
-__global__ void knn_merge_parts_kernel(value_t *inK, value_idx *inV,
-                                       value_t *outK, value_idx *outV,
-                                       size_t n_samples, int n_parts,
-                                       value_t initK, value_idx initV, int k,
-                                       value_idx *translations) {
+template <typename value_idx = int64_t, typename value_t = float, int warp_q, int thread_q, int tpb>
+__global__ void knn_merge_parts_kernel(value_t* inK,
+                                       value_idx* inV,
+                                       value_t* outK,
+                                       value_idx* outV,
+                                       size_t n_samples,
+                                       int n_parts,
+                                       value_t initK,
+                                       value_idx initV,
+                                       int k,
+                                       value_idx* translations)
+{
   constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize;
 
   __shared__ value_t smemK[kNumWarps * warp_q];
@@ -58,34 +63,33 @@ __global__ void knn_merge_parts_kernel(value_t *inK, value_idx *inV,
   /**
    * Uses shared memory
    */
-  faiss::gpu::BlockSelect<value_t, value_idx, false,
-                          faiss::gpu::Comparator<value_t>, warp_q, thread_q,
-                          tpb>
-    heap(initK, initV, smemK, smemV, k);
+  faiss::gpu::
+    BlockSelect<value_t, value_idx, false, faiss::gpu::Comparator<value_t>, warp_q, thread_q, tpb>
+      heap(initK, initV, smemK, smemV, k);
 
   // Grid is exactly sized to rows available
-  int row = blockIdx.x;
+  int row     = blockIdx.x;
   int total_k = k * n_parts;
 
   int i = threadIdx.x;
 
   // Get starting pointers for cols in current thread
-  int part = i / k;
+  int part       = i / k;
   size_t row_idx = (row * k) + (part * n_samples * k);
 
   int col = i % k;
 
-  value_t *inKStart = inK + (row_idx + col);
-  value_idx *inVStart = inV + (row_idx + col);
+  value_t* inKStart   = inK + (row_idx + col);
+  value_idx* inVStart = inV + (row_idx + col);
 
-  int limit = faiss::gpu::utils::roundDown(total_k, faiss::gpu::kWarpSize);
+  int limit             = faiss::gpu::utils::roundDown(total_k, faiss::gpu::kWarpSize);
   value_idx translation = 0;
 
   for (; i < limit; i += tpb) {
     translation = translations[part];
     heap.add(*inKStart, (*inVStart) + translation);
 
-    part = (i + tpb) / k;
+    part    = (i + tpb) / k;
     row_idx = (row * k) + (part * n_samples * k);
 
     col = (i + tpb) % k;
@@ -108,22 +112,27 @@ __global__ void knn_merge_parts_kernel(value_t *inK, value_idx *inV,
   }
 }
 
-template <typename value_idx = int64_t, typename value_t = float, int warp_q,
-          int thread_q>
-inline void knn_merge_parts_impl(value_t *inK, value_idx *inV, value_t *outK,
-                                 value_idx *outV, size_t n_samples, int n_parts,
-                                 int k, cudaStream_t stream,
-                                 value_idx *translations) {
+template <typename value_idx = int64_t, typename value_t = float, int warp_q, int thread_q>
+inline void knn_merge_parts_impl(value_t* inK,
+                                 value_idx* inV,
+                                 value_t* outK,
+                                 value_idx* outV,
+                                 size_t n_samples,
+                                 int n_parts,
+                                 int k,
+                                 cudaStream_t stream,
+                                 value_idx* translations)
+{
   auto grid = dim3(n_samples);
 
   constexpr int n_threads = (warp_q <= 1024) ? 128 : 64;
-  auto block = dim3(n_threads);
+  auto block              = dim3(n_threads);
 
   auto kInit = faiss::gpu::Limits<value_t>::getMax();
   auto vInit = -1;
   knn_merge_parts_kernel<value_idx, value_t, warp_q, thread_q, n_threads>
-    <<<grid, block, 0, stream>>>(inK, inV, outK, outV, n_samples, n_parts,
-                                 kInit, vInit, k, translations);
+    <<<grid, block, 0, stream>>>(
+      inK, inV, outK, outV, n_samples, n_parts, kInit, vInit, k, translations);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -142,10 +151,16 @@ inline void knn_merge_parts_impl(value_t *inK, value_idx *inV, value_t *outK,
  * @param translations mapping of index offsets for each partition
  */
 template <typename value_idx = int64_t, typename value_t = float>
-inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK,
-                            value_idx *outV, size_t n_samples, int n_parts,
-                            int k, cudaStream_t stream,
-                            value_idx *translations) {
+inline void knn_merge_parts(value_t* inK,
+                            value_idx* inV,
+                            value_t* outK,
+                            value_idx* outV,
+                            size_t n_samples,
+                            int n_parts,
+                            int k,
+                            cudaStream_t stream,
+                            value_idx* translations)
+{
   if (k == 1)
     knn_merge_parts_impl<value_idx, value_t, 1, 1>(
       inK, inV, outK, outV, n_samples, n_parts, k, stream, translations);
@@ -195,27 +210,33 @@ inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK,
  * @param[in] metricArg metric argument to use. Corresponds to the p arg for lp norm
  */
 template <typename IntType = int>
-void brute_force_knn_impl(std::vector<float *> &input, std::vector<int> &sizes,
-                          IntType D, float *search_items, IntType n,
-                          int64_t *res_I, float *res_D, IntType k,
-                          std::shared_ptr<deviceAllocator> allocator,
-                          cudaStream_t userStream,
-                          cudaStream_t *internalStreams = nullptr,
-                          int n_int_streams = 0, bool rowMajorIndex = true,
-                          bool rowMajorQuery = true,
-                          std::vector<int64_t> *translations = nullptr,
-                          raft::distance::DistanceType metric =
-                            raft::distance::DistanceType::L2Expanded,
-                          float metricArg = 0) {
-  ASSERT(input.size() == sizes.size(),
-         "input and sizes vectors should be the same size");
-
-  std::vector<int64_t> *id_ranges;
+void brute_force_knn_impl(
+  std::vector<float*>& input,
+  std::vector<int>& sizes,
+  IntType D,
+  float* search_items,
+  IntType n,
+  int64_t* res_I,
+  float* res_D,
+  IntType k,
+  std::shared_ptr<deviceAllocator> allocator,
+  cudaStream_t userStream,
+  cudaStream_t* internalStreams       = nullptr,
+  int n_int_streams                   = 0,
+  bool rowMajorIndex                  = true,
+  bool rowMajorQuery                  = true,
+  std::vector<int64_t>* translations  = nullptr,
+  raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded,
+  float metricArg                     = 0)
+{
+  ASSERT(input.size() == sizes.size(), "input and sizes vectors should be the same size");
+
+  std::vector<int64_t>* id_ranges;
   if (translations == nullptr) {
     // If we don't have explicit translations
     // for offsets of the indices, build them
     // from the local partitions
-    id_ranges = new std::vector<int64_t>();
+    id_ranges       = new std::vector<int64_t>();
     int64_t total_n = 0;
     for (size_t i = 0; i < input.size(); i++) {
       id_ranges->push_back(total_n);
@@ -228,31 +249,27 @@ void brute_force_knn_impl(std::vector<float *> &input, std::vector<int> &sizes,
 
   // perform preprocessing
   std::unique_ptr<MetricProcessor<float>> query_metric_processor =
-    create_processor<float>(metric, n, D, k, rowMajorQuery, userStream,
-                            allocator);
+    create_processor<float>(metric, n, D, k, rowMajorQuery, userStream, allocator);
   query_metric_processor->preprocess(search_items);
 
-  std::vector<std::unique_ptr<MetricProcessor<float>>> metric_processors(
-    input.size());
+  std::vector<std::unique_ptr<MetricProcessor<float>>> metric_processors(input.size());
   for (size_t i = 0; i < input.size(); i++) {
-    metric_processors[i] = create_processor<float>(
-      metric, sizes[i], D, k, rowMajorQuery, userStream, allocator);
+    metric_processors[i] =
+      create_processor<float>(metric, sizes[i], D, k, rowMajorQuery, userStream, allocator);
     metric_processors[i]->preprocess(input[i]);
   }
 
   int device;
   CUDA_CHECK(cudaGetDevice(&device));
 
-  raft::mr::device::buffer<int64_t> trans(allocator, userStream,
-                                          id_ranges->size());
-  raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(),
-                      userStream);
+  raft::mr::device::buffer<int64_t> trans(allocator, userStream, id_ranges->size());
+  raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(), userStream);
 
   raft::mr::device::buffer<float> all_D(allocator, userStream, 0);
   raft::mr::device::buffer<int64_t> all_I(allocator, userStream, 0);
 
-  float *out_D = res_D;
-  int64_t *out_I = res_I;
+  float* out_D   = res_D;
+  int64_t* out_I = res_I;
 
   if (input.size() > 1) {
     all_D.resize(input.size() * k * n, userStream);
@@ -266,11 +283,10 @@ void brute_force_knn_impl(std::vector<float *> &input, std::vector<int> &sizes,
   if (n_int_streams > 0) CUDA_CHECK(cudaStreamSynchronize(userStream));
 
   for (size_t i = 0; i < input.size(); i++) {
-    float *out_d_ptr = out_D + (i * k * n);
-    int64_t *out_i_ptr = out_I + (i * k * n);
+    float* out_d_ptr   = out_D + (i * k * n);
+    int64_t* out_i_ptr = out_I + (i * k * n);
 
-    cudaStream_t stream =
-      raft::select_stream(userStream, internalStreams, n_int_streams, i);
+    cudaStream_t stream = raft::select_stream(userStream, internalStreams, n_int_streams, i);
 
     switch (metric) {
       case raft::distance::DistanceType::Haversine:
@@ -279,8 +295,7 @@ void brute_force_knn_impl(std::vector<float *> &input, std::vector<int> &sizes,
                "Haversine distance requires 2 dimensions "
                "(latitude / longitude).");
 
-        haversine_knn(out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], n,
-                      k, stream);
+        haversine_knn(out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], n, k, stream);
         break;
       default:
         faiss::MetricType m = build_faiss_metric(metric);
@@ -291,18 +306,18 @@ void brute_force_knn_impl(std::vector<float *> &input, std::vector<int> &sizes,
         gpu_res.setDefaultStream(device, stream);
 
         faiss::gpu::GpuDistanceParams args;
-        args.metric = m;
-        args.metricArg = metricArg;
-        args.k = k;
-        args.dims = D;
-        args.vectors = input[i];
+        args.metric          = m;
+        args.metricArg       = metricArg;
+        args.k               = k;
+        args.dims            = D;
+        args.vectors         = input[i];
         args.vectorsRowMajor = rowMajorIndex;
-        args.numVectors = sizes[i];
-        args.queries = search_items;
+        args.numVectors      = sizes[i];
+        args.queries         = search_items;
         args.queriesRowMajor = rowMajorQuery;
-        args.numQueries = n;
-        args.outDistances = out_d_ptr;
-        args.outIndices = out_i_ptr;
+        args.numQueries      = n;
+        args.outDistances    = out_d_ptr;
+        args.outIndices      = out_i_ptr;
 
         /**
          * @todo: Until FAISS supports pluggable allocation strategies,
@@ -325,8 +340,7 @@ void brute_force_knn_impl(std::vector<float *> &input, std::vector<int> &sizes,
   if (input.size() > 1 || translations != nullptr) {
     // This is necessary for proper index translations. If there are
     // no translations or partitions to combine, it can be skipped.
-    knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream,
-                    trans.data());
+    knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream, trans.data());
   }
 
   // Perform necessary post-processing
@@ -334,14 +348,12 @@ void brute_force_knn_impl(std::vector<float *> &input, std::vector<int> &sizes,
       metric == raft::distance::DistanceType::L2SqrtUnexpanded ||
       metric == raft::distance::DistanceType::LpUnexpanded) {
     /**
-	* post-processing
-	*/
+     * post-processing
+     */
     float p = 0.5;  // standard l2
-    if (metric == raft::distance::DistanceType::LpUnexpanded)
-      p = 1.0 / metricArg;
+    if (metric == raft::distance::DistanceType::LpUnexpanded) p = 1.0 / metricArg;
     raft::linalg::unaryOp<float>(
-      res_D, res_D, n * k,
-      [p] __device__(float input) { return powf(input, p); }, userStream);
+      res_D, res_D, n * k, [p] __device__(float input) { return powf(input, p); }, userStream);
   }
 
   query_metric_processor->revert(search_items);
diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp
index a645412c2f..6e983d1f42 100644
--- a/cpp/include/raft/spatial/knn/detail/processing.hpp
+++ b/cpp/include/raft/spatial/knn/detail/processing.hpp
@@ -39,11 +39,11 @@ using deviceAllocator = raft::mr::device::allocator;
 template <typename math_t>
 class MetricProcessor {
  public:
-  virtual void preprocess(math_t *data) {}
+  virtual void preprocess(math_t* data) {}
 
-  virtual void revert(math_t *data) {}
+  virtual void revert(math_t* data) {}
 
-  virtual void postprocess(math_t *data) {}
+  virtual void postprocess(math_t* data) {}
 
   virtual ~MetricProcessor() = default;
 };
@@ -60,7 +60,10 @@ class CosineMetricProcessor : public MetricProcessor<math_t> {
   raft::mr::device::buffer<math_t> colsums_;
 
  public:
-  CosineMetricProcessor(size_t n_rows, size_t n_cols, int k, bool row_major,
+  CosineMetricProcessor(size_t n_rows,
+                        size_t n_cols,
+                        int k,
+                        bool row_major,
                         cudaStream_t stream,
                         std::shared_ptr<deviceAllocator> allocator)
     : device_allocator_(allocator),
@@ -69,30 +72,51 @@ class CosineMetricProcessor : public MetricProcessor<math_t> {
       n_cols_(n_cols),
       n_rows_(n_rows),
       row_major_(row_major),
-      k_(k) {}
+      k_(k)
+  {
+  }
 
-  void preprocess(math_t *data) {
-    raft::linalg::rowNorm(colsums_.data(), data, n_cols_, n_rows_,
-                          raft::linalg::NormType::L2Norm, row_major_, stream_,
+  void preprocess(math_t* data)
+  {
+    raft::linalg::rowNorm(colsums_.data(),
+                          data,
+                          n_cols_,
+                          n_rows_,
+                          raft::linalg::NormType::L2Norm,
+                          row_major_,
+                          stream_,
                           [] __device__(math_t in) { return sqrtf(in); });
 
     raft::linalg::matrixVectorOp(
-      data, data, colsums_.data(), n_cols_, n_rows_, row_major_, false,
+      data,
+      data,
+      colsums_.data(),
+      n_cols_,
+      n_rows_,
+      row_major_,
+      false,
       [] __device__(math_t mat_in, math_t vec_in) { return mat_in / vec_in; },
       stream_);
   }
 
-  void revert(math_t *data) {
+  void revert(math_t* data)
+  {
     raft::linalg::matrixVectorOp(
-      data, data, colsums_.data(), n_cols_, n_rows_, row_major_, false,
+      data,
+      data,
+      colsums_.data(),
+      n_cols_,
+      n_rows_,
+      row_major_,
+      false,
       [] __device__(math_t mat_in, math_t vec_in) { return mat_in * vec_in; },
       stream_);
   }
 
-  void postprocess(math_t *data) {
+  void postprocess(math_t* data)
+  {
     raft::linalg::unaryOp(
-      data, data, k_ * n_rows_, [] __device__(math_t in) { return 1 - in; },
-      stream_);
+      data, data, k_ * n_rows_, [] __device__(math_t in) { return 1 - in; }, stream_);
   }
 
   ~CosineMetricProcessor() = default;
@@ -103,43 +127,64 @@ class CorrelationMetricProcessor : public CosineMetricProcessor<math_t> {
   using cosine = CosineMetricProcessor<math_t>;
 
  public:
-  CorrelationMetricProcessor(size_t n_rows, size_t n_cols, int k,
-                             bool row_major, cudaStream_t stream,
+  CorrelationMetricProcessor(size_t n_rows,
+                             size_t n_cols,
+                             int k,
+                             bool row_major,
+                             cudaStream_t stream,
                              std::shared_ptr<deviceAllocator> allocator)
-    : CosineMetricProcessor<math_t>(n_rows, n_cols, k, row_major, stream,
-                                    allocator),
-      means_(allocator, stream, n_rows) {}
+    : CosineMetricProcessor<math_t>(n_rows, n_cols, k, row_major, stream, allocator),
+      means_(allocator, stream, n_rows)
+  {
+  }
 
-  void preprocess(math_t *data) {
+  void preprocess(math_t* data)
+  {
     math_t normalizer_const = 1.0 / (math_t)cosine::n_cols_;
 
-    raft::linalg::reduce(means_.data(), data, cosine::n_cols_, cosine::n_rows_,
-                         (math_t)0.0, cosine::row_major_, true,
+    raft::linalg::reduce(means_.data(),
+                         data,
+                         cosine::n_cols_,
+                         cosine::n_rows_,
+                         (math_t)0.0,
+                         cosine::row_major_,
+                         true,
                          cosine::stream_);
 
     raft::linalg::unaryOp(
-      means_.data(), means_.data(), cosine::n_rows_,
+      means_.data(),
+      means_.data(),
+      cosine::n_rows_,
       [=] __device__(math_t in) { return in * normalizer_const; },
       cosine::stream_);
 
-    raft::stats::meanCenter(data, data, means_.data(), cosine::n_cols_,
-                            cosine::n_rows_, cosine::row_major_, false,
+    raft::stats::meanCenter(data,
+                            data,
+                            means_.data(),
+                            cosine::n_cols_,
+                            cosine::n_rows_,
+                            cosine::row_major_,
+                            false,
                             cosine::stream_);
 
     CosineMetricProcessor<math_t>::preprocess(data);
   }
 
-  void revert(math_t *data) {
+  void revert(math_t* data)
+  {
     CosineMetricProcessor<math_t>::revert(data);
 
-    raft::stats::meanAdd(data, data, means_.data(), cosine::n_cols_,
-                         cosine::n_rows_, cosine::row_major_, false,
+    raft::stats::meanAdd(data,
+                         data,
+                         means_.data(),
+                         cosine::n_cols_,
+                         cosine::n_rows_,
+                         cosine::row_major_,
+                         false,
                          cosine::stream_);
   }
 
-  void postprocess(math_t *data) {
-    CosineMetricProcessor<math_t>::postprocess(data);
-  }
+  void postprocess(math_t* data) { CosineMetricProcessor<math_t>::postprocess(data); }
 
   ~CorrelationMetricProcessor() = default;
 
@@ -149,33 +194,36 @@ class CorrelationMetricProcessor : public CosineMetricProcessor<math_t> {
 template <typename math_t>
 class DefaultMetricProcessor : public MetricProcessor<math_t> {
  public:
-  void preprocess(math_t *data) {}
+  void preprocess(math_t* data) {}
 
-  void revert(math_t *data) {}
+  void revert(math_t* data) {}
 
-  void postprocess(math_t *data) {}
+  void postprocess(math_t* data) {}
 
   ~DefaultMetricProcessor() = default;
 };
 
 template <typename math_t>
 inline std::unique_ptr<MetricProcessor<math_t>> create_processor(
-  distance::DistanceType metric, int n, int D, int k, bool rowMajorQuery,
-  cudaStream_t userStream, std::shared_ptr<deviceAllocator> allocator) {
-  MetricProcessor<math_t> *mp = nullptr;
+  distance::DistanceType metric,
+  int n,
+  int D,
+  int k,
+  bool rowMajorQuery,
+  cudaStream_t userStream,
+  std::shared_ptr<deviceAllocator> allocator)
+{
+  MetricProcessor<math_t>* mp = nullptr;
 
   switch (metric) {
     case distance::DistanceType::CosineExpanded:
-      mp = new CosineMetricProcessor<math_t>(n, D, k, rowMajorQuery, userStream,
-                                             allocator);
+      mp = new CosineMetricProcessor<math_t>(n, D, k, rowMajorQuery, userStream, allocator);
       break;
 
     case distance::DistanceType::CorrelationExpanded:
-      mp = new CorrelationMetricProcessor<math_t>(n, D, k, rowMajorQuery,
-                                                  userStream, allocator);
+      mp = new CorrelationMetricProcessor<math_t>(n, D, k, rowMajorQuery, userStream, allocator);
       break;
-    default:
-      mp = new DefaultMetricProcessor<math_t>();
+    default: mp = new DefaultMetricProcessor<math_t>();
   }
 
   return std::unique_ptr<MetricProcessor<math_t>>(mp);
diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp
index a3a1972c13..42ee11ba5b 100644
--- a/cpp/include/raft/spatial/knn/knn.hpp
+++ b/cpp/include/raft/spatial/knn/knn.hpp
@@ -28,12 +28,17 @@ namespace knn {
 using deviceAllocator = raft::mr::device::allocator;
 
 template <typename value_idx = int64_t, typename value_t = float>
-inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK,
-                            value_idx *outV, size_t n_samples, int n_parts,
-                            int k, cudaStream_t stream,
-                            value_idx *translations) {
-  detail::knn_merge_parts(inK, inV, outK, outV, n_samples, n_parts, k, stream,
-                          translations);
+inline void knn_merge_parts(value_t* inK,
+                            value_idx* inV,
+                            value_t* outK,
+                            value_idx* outV,
+                            size_t n_samples,
+                            int n_parts,
+                            int k,
+                            cudaStream_t stream,
+                            value_idx* translations)
+{
+  detail::knn_merge_parts(inK, inV, outK, outV, n_samples, n_parts, k, stream, translations);
 }
 
 /**
@@ -59,23 +64,42 @@ inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK,
  * @param[in] expanded should lp-based distances be returned in their expanded
  * 					 form (e.g., without raising to the 1/p power).
  */
-inline void brute_force_knn(
-  raft::handle_t const &handle, std::vector<float *> &input,
-  std::vector<int> &sizes, int D, float *search_items, int n, int64_t *res_I,
-  float *res_D, int k, bool rowMajorIndex = true, bool rowMajorQuery = true,
-  std::vector<int64_t> *translations = nullptr,
-  distance::DistanceType metric = distance::DistanceType::L2Unexpanded,
-  float metric_arg = 2.0f) {
-  ASSERT(input.size() == sizes.size(),
-         "input and sizes vectors must be the same size");
+inline void brute_force_knn(raft::handle_t const& handle,
+                            std::vector<float*>& input,
+                            std::vector<int>& sizes,
+                            int D,
+                            float* search_items,
+                            int n,
+                            int64_t* res_I,
+                            float* res_D,
+                            int k,
+                            bool rowMajorIndex                 = true,
+                            bool rowMajorQuery                 = true,
+                            std::vector<int64_t>* translations = nullptr,
+                            distance::DistanceType metric = distance::DistanceType::L2Unexpanded,
+                            float metric_arg              = 2.0f)
+{
+  ASSERT(input.size() == sizes.size(), "input and sizes vectors must be the same size");
 
   std::vector<cudaStream_t> int_streams = handle.get_internal_streams();
 
-  detail::brute_force_knn_impl(input, sizes, D, search_items, n, res_I, res_D,
-                               k, handle.get_device_allocator(),
-                               handle.get_stream(), int_streams.data(),
-                               handle.get_num_internal_streams(), rowMajorIndex,
-                               rowMajorQuery, translations, metric, metric_arg);
+  detail::brute_force_knn_impl(input,
+                               sizes,
+                               D,
+                               search_items,
+                               n,
+                               res_I,
+                               res_D,
+                               k,
+                               handle.get_device_allocator(),
+                               handle.get_stream(),
+                               int_streams.data(),
+                               handle.get_num_internal_streams(),
+                               rowMajorIndex,
+                               rowMajorQuery,
+                               translations,
+                               metric,
+                               metric_arg);
 }
 
 }  // namespace knn
diff --git a/cpp/include/raft/spectral/cluster_solvers.hpp b/cpp/include/raft/spectral/cluster_solvers.hpp
index 922ae7cfab..7032a0009e 100644
--- a/cpp/include/raft/spectral/cluster_solvers.hpp
+++ b/cpp/include/raft/spectral/cluster_solvers.hpp
@@ -24,8 +24,7 @@ using namespace matrix;
 
 // aggregate of control params for Eigen Solver:
 //
-template <typename index_type_t, typename value_type_t,
-          typename size_type_t = index_type_t>
+template <typename index_type_t, typename value_type_t, typename size_type_t = index_type_t>
 struct cluster_solver_config_t {
   size_type_t n_clusters;
   size_type_t maxIter;
@@ -35,25 +34,37 @@ struct cluster_solver_config_t {
   unsigned long long seed{123456};
 };
 
-template <typename index_type_t, typename value_type_t,
-          typename size_type_t = index_type_t>
+template <typename index_type_t, typename value_type_t, typename size_type_t = index_type_t>
 struct kmeans_solver_t {
-  explicit kmeans_solver_t(cluster_solver_config_t<index_type_t, value_type_t,
-                                                   size_type_t> const& config)
-    : config_(config) {}
+  explicit kmeans_solver_t(
+    cluster_solver_config_t<index_type_t, value_type_t, size_type_t> const& config)
+    : config_(config)
+  {
+  }
 
   template <typename thrust_exe_policy_t>
-  std::pair<value_type_t, index_type_t> solve(
-    handle_t const& handle, thrust_exe_policy_t t_exe_policy,
-    size_type_t n_obs_vecs, size_type_t dim,
-    value_type_t const* __restrict__ obs,
-    index_type_t* __restrict__ codes) const {
+  std::pair<value_type_t, index_type_t> solve(handle_t const& handle,
+                                              thrust_exe_policy_t t_exe_policy,
+                                              size_type_t n_obs_vecs,
+                                              size_type_t dim,
+                                              value_type_t const* __restrict__ obs,
+                                              index_type_t* __restrict__ codes) const
+  {
     RAFT_EXPECTS(obs != nullptr, "Null obs buffer.");
     RAFT_EXPECTS(codes != nullptr, "Null codes buffer.");
     value_type_t residual{};
     index_type_t iters{};
-    kmeans(handle, t_exe_policy, n_obs_vecs, dim, config_.n_clusters,
-           config_.tol, config_.maxIter, obs, codes, residual, iters,
+    kmeans(handle,
+           t_exe_policy,
+           n_obs_vecs,
+           dim,
+           config_.n_clusters,
+           config_.tol,
+           config_.maxIter,
+           obs,
+           codes,
+           residual,
+           iters,
            config_.seed);
     return std::make_pair(residual, iters);
   }
diff --git a/cpp/include/raft/spectral/eigen_solvers.hpp b/cpp/include/raft/spectral/eigen_solvers.hpp
index e36dca2e0c..156b996586 100644
--- a/cpp/include/raft/spectral/eigen_solvers.hpp
+++ b/cpp/include/raft/spectral/eigen_solvers.hpp
@@ -23,8 +23,7 @@ using namespace matrix;
 
 // aggregate of control params for Eigen Solver:
 //
-template <typename index_type_t, typename value_type_t,
-          typename size_type_t = index_type_t>
+template <typename index_type_t, typename value_type_t, typename size_type_t = index_type_t>
 struct eigen_solver_config_t {
   size_type_t n_eigVecs;
   size_type_t maxIter;
@@ -34,42 +33,59 @@ struct eigen_solver_config_t {
 
   bool reorthogonalize{false};
   unsigned long long seed{
-    1234567};  // CAVEAT: this default value is now common to all instances of using seed in Lanczos; was not the case before: there were places where a default seed = 123456 was used; this may trigger slightly different # solver iterations
+    1234567};  // CAVEAT: this default value is now common to all instances of using seed in
+               // Lanczos; was not the case before: there were places where a default seed = 123456
+               // was used; this may trigger slightly different # solver iterations
 };
 
-template <typename index_type_t, typename value_type_t,
-          typename size_type_t = index_type_t>
+template <typename index_type_t, typename value_type_t, typename size_type_t = index_type_t>
 struct lanczos_solver_t {
-  explicit lanczos_solver_t(eigen_solver_config_t<index_type_t, value_type_t,
-                                                  size_type_t> const& config)
-    : config_(config) {}
+  explicit lanczos_solver_t(
+    eigen_solver_config_t<index_type_t, value_type_t, size_type_t> const& config)
+    : config_(config)
+  {
+  }
 
-  index_type_t solve_smallest_eigenvectors(
-    handle_t const& handle,
-    sparse_matrix_t<index_type_t, value_type_t> const& A,
-    value_type_t* __restrict__ eigVals,
-    value_type_t* __restrict__ eigVecs) const {
+  index_type_t solve_smallest_eigenvectors(handle_t const& handle,
+                                           sparse_matrix_t<index_type_t, value_type_t> const& A,
+                                           value_type_t* __restrict__ eigVals,
+                                           value_type_t* __restrict__ eigVecs) const
+  {
     RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer.");
     RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer.");
     index_type_t iters{};
-    computeSmallestEigenvectors(handle, A, config_.n_eigVecs, config_.maxIter,
-                                config_.restartIter, config_.tol,
-                                config_.reorthogonalize, iters, eigVals,
-                                eigVecs, config_.seed);
+    computeSmallestEigenvectors(handle,
+                                A,
+                                config_.n_eigVecs,
+                                config_.maxIter,
+                                config_.restartIter,
+                                config_.tol,
+                                config_.reorthogonalize,
+                                iters,
+                                eigVals,
+                                eigVecs,
+                                config_.seed);
     return iters;
   }
 
-  index_type_t solve_largest_eigenvectors(
-    handle_t const& handle,
-    sparse_matrix_t<index_type_t, value_type_t> const& A,
-    value_type_t* __restrict__ eigVals,
-    value_type_t* __restrict__ eigVecs) const {
+  index_type_t solve_largest_eigenvectors(handle_t const& handle,
+                                          sparse_matrix_t<index_type_t, value_type_t> const& A,
+                                          value_type_t* __restrict__ eigVals,
+                                          value_type_t* __restrict__ eigVecs) const
+  {
     RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer.");
     RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer.");
     index_type_t iters{};
-    computeLargestEigenvectors(handle, A, config_.n_eigVecs, config_.maxIter,
-                               config_.restartIter, config_.tol,
-                               config_.reorthogonalize, iters, eigVals, eigVecs,
+    computeLargestEigenvectors(handle,
+                               A,
+                               config_.n_eigVecs,
+                               config_.maxIter,
+                               config_.restartIter,
+                               config_.tol,
+                               config_.reorthogonalize,
+                               iters,
+                               eigVals,
+                               eigVecs,
                                config_.seed);
     return iters;
   }
diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp
index fb05bff3e2..e0c3565b77 100644
--- a/cpp/include/raft/spectral/kmeans.hpp
+++ b/cpp/include/raft/spectral/kmeans.hpp
@@ -44,15 +44,15 @@ using namespace raft::linalg;
 // Useful grid settings
 // =========================================================
 
-constexpr unsigned int BLOCK_SIZE = 1024;
-constexpr unsigned int WARP_SIZE = 32;
+constexpr unsigned int BLOCK_SIZE      = 1024;
+constexpr unsigned int WARP_SIZE       = 32;
 constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE);
 
 // =========================================================
 // CUDA kernels
 // =========================================================
 
-/** 
+/**
  *  @brief Compute distances between observation vectors and centroids
  *    Block dimensions should be (warpSize, 1,
  *    blockSize/warpSize). Ideally, the grid is large enough so there
@@ -76,11 +76,13 @@ constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE);
  *    initialized to zero.
  */
 template <typename index_type_t, typename value_type_t>
-static __global__ void computeDistances(
-  index_type_t n, index_type_t d, index_type_t k,
-  const value_type_t* __restrict__ obs,
-  const value_type_t* __restrict__ centroids,
-  value_type_t* __restrict__ dists) {
+static __global__ void computeDistances(index_type_t n,
+                                        index_type_t d,
+                                        index_type_t k,
+                                        const value_type_t* __restrict__ obs,
+                                        const value_type_t* __restrict__ centroids,
+                                        value_type_t* __restrict__ dists)
+{
   // Loop index
   index_type_t i;
 
@@ -115,12 +117,10 @@ static __global__ void computeDistances(
 
         // Perform reduction on warp
         for (i = WARP_SIZE / 2; i > 0; i /= 2)
-          dist_private +=
-            __shfl_down_sync(warp_full_mask(), dist_private, i, 2 * i);
+          dist_private += __shfl_down_sync(warp_full_mask(), dist_private, i, 2 * i);
 
         // Write result to global memory
-        if (threadIdx.x == 0)
-          atomicAdd(dists + IDX(gidz, gidy, n), dist_private);
+        if (threadIdx.x == 0) atomicAdd(dists + IDX(gidz, gidy, n), dist_private);
 
         // Move to another observation vector
         gidz += blockDim.z * gridDim.z;
@@ -135,8 +135,8 @@ static __global__ void computeDistances(
   }
 }
 
-/** 
- *  @brief Find closest centroid to observation vectors. 
+/**
+ *  @brief Find closest centroid to observation vectors.
  *    Block and grid dimensions should be 1-dimensional. Ideally the
  *    grid is large enough so there are n threads.
  *  @tparam index_type_t the type of data used for indexing.
@@ -157,10 +157,12 @@ static __global__ void computeDistances(
  *    cluster. Entries must be initialized to zero.
  */
 template <typename index_type_t, typename value_type_t>
-static __global__ void minDistances(index_type_t n, index_type_t k,
+static __global__ void minDistances(index_type_t n,
+                                    index_type_t k,
                                     value_type_t* __restrict__ dists,
                                     index_type_t* __restrict__ codes,
-                                    index_type_t* __restrict__ clusterSizes) {
+                                    index_type_t* __restrict__ clusterSizes)
+{
   // Loop index
   index_type_t i, j;
 
@@ -179,8 +181,8 @@ static __global__ void minDistances(index_type_t n, index_type_t k,
     dist_min = dists[IDX(i, 0, n)];
     for (j = 1; j < k; ++j) {
       dist_curr = dists[IDX(i, j, n)];
-      code_min = (dist_curr < dist_min) ? j : code_min;
-      dist_min = (dist_curr < dist_min) ? dist_curr : dist_min;
+      code_min  = (dist_curr < dist_min) ? j : code_min;
+      dist_min  = (dist_curr < dist_min) ? dist_curr : dist_min;
     }
 
     // Transfer result to global memory
@@ -195,8 +197,8 @@ static __global__ void minDistances(index_type_t n, index_type_t k,
   }
 }
 
-/** 
- *  @brief Check if newly computed distances are smaller than old distances. 
+/**
+ *  @brief Check if newly computed distances are smaller than old distances.
  *    Block and grid dimensions should be 1-dimensional. Ideally the
  *    grid is large enough so there are n threads.
  *  @tparam index_type_t the type of data used for indexing.
@@ -219,7 +221,8 @@ static __global__ void minDistances2(index_type_t n,
                                      value_type_t* __restrict__ dists_old,
                                      const value_type_t* __restrict__ dists_new,
                                      index_type_t* __restrict__ codes_old,
-                                     index_type_t code_new) {
+                                     index_type_t code_new)
+{
   // Loop index
   index_type_t i = threadIdx.x + blockIdx.x * blockDim.x;
 
@@ -244,7 +247,7 @@ static __global__ void minDistances2(index_type_t n,
   }
 }
 
-/** 
+/**
  *  @brief Compute size of k-means clusters.
  *    Block and grid dimensions should be 1-dimensional. Ideally the
  *    grid is large enough so there are n threads.
@@ -256,9 +259,11 @@ static __global__ void minDistances2(index_type_t n,
  *    cluster. Entries must be initialized to zero.
  */
 template <typename index_type_t>
-static __global__ void computeClusterSizes(
-  index_type_t n, index_type_t k, const index_type_t* __restrict__ codes,
-  index_type_t* __restrict__ clusterSizes) {
+static __global__ void computeClusterSizes(index_type_t n,
+                                           index_type_t k,
+                                           const index_type_t* __restrict__ codes,
+                                           index_type_t* __restrict__ clusterSizes)
+{
   index_type_t i = threadIdx.x + blockIdx.x * blockDim.x;
   while (i < n) {
     atomicAdd(clusterSizes + codes[i], 1);
@@ -266,8 +271,8 @@ static __global__ void computeClusterSizes(
   }
 }
 
-/** 
- *  @brief Divide rows of centroid matrix by cluster sizes. 
+/**
+ *  @brief Divide rows of centroid matrix by cluster sizes.
  *    Divides the ith column of the sum matrix by the size of the ith
  *    cluster. If the sum matrix has been initialized so that the ith
  *    row is the sum of all observation vectors in the ith cluster,
@@ -288,9 +293,11 @@ static __global__ void computeClusterSizes(
  *    column is the mean position of a cluster).
  */
 template <typename index_type_t, typename value_type_t>
-static __global__ void divideCentroids(
-  index_type_t d, index_type_t k, const index_type_t* __restrict__ clusterSizes,
-  value_type_t* __restrict__ centroids) {
+static __global__ void divideCentroids(index_type_t d,
+                                       index_type_t k,
+                                       const index_type_t* __restrict__ clusterSizes,
+                                       value_type_t* __restrict__ centroids)
+{
   // Global indices
   index_type_t gidx, gidy;
 
@@ -341,15 +348,17 @@ static __global__ void divideCentroids(
  *    coordinates.
  *  @return Zero if successful. Otherwise non-zero.
  */
-template <typename index_type_t, typename value_type_t,
-          typename thrust_exe_pol_t>
+template <typename index_type_t, typename value_type_t, typename thrust_exe_pol_t>
 static int chooseNewCentroid(handle_t const& handle,
                              thrust_exe_pol_t thrust_exec_policy,
-                             index_type_t n, index_type_t d, index_type_t k,
+                             index_type_t n,
+                             index_type_t d,
+                             index_type_t k,
                              value_type_t rand,
                              const value_type_t* __restrict__ obs,
                              value_type_t* __restrict__ dists,
-                             value_type_t* __restrict__ centroid) {
+                             value_type_t* __restrict__ centroid)
+{
   // Cumulative sum of distances
   value_type_t* distsCumSum = dists + n;
   // Residual sum of squares
@@ -358,43 +367,43 @@ static int chooseNewCentroid(handle_t const& handle,
   index_type_t obsIndex;
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream = handle.get_stream();
+  auto stream   = handle.get_stream();
 
   // Compute cumulative sum of distances
-  thrust::inclusive_scan(thrust_exec_policy, thrust::device_pointer_cast(dists),
+  thrust::inclusive_scan(thrust_exec_policy,
+                         thrust::device_pointer_cast(dists),
                          thrust::device_pointer_cast(dists + n),
                          thrust::device_pointer_cast(distsCumSum));
   CHECK_CUDA(stream);
-  CUDA_TRY(cudaMemcpyAsync(&distsSum, distsCumSum + n - 1, sizeof(value_type_t),
-                           cudaMemcpyDeviceToHost, stream));
+  CUDA_TRY(cudaMemcpyAsync(
+    &distsSum, distsCumSum + n - 1, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream));
 
   // Randomly choose observation vector
   //   Probabilities are proportional to square of distance to closest
   //   centroid (see k-means++ algorithm)
   //
-  //seg-faults due to Thrust bug
-  //on binary-search-like algorithms
-  //when run with stream dependent
-  //execution policies; fixed on Thrust GitHub
-  //hence replace w/ linear interpolation,
-  //until the Thrust issue gets resolved:
+  // seg-faults due to Thrust bug
+  // on binary-search-like algorithms
+  // when run with stream dependent
+  // execution policies; fixed on Thrust GitHub
+  // hence replace w/ linear interpolation,
+  // until the Thrust issue gets resolved:
   //
   // obsIndex = (thrust::lower_bound(
   //               thrust_exec_policy, thrust::device_pointer_cast(distsCumSum),
   //               thrust::device_pointer_cast(distsCumSum + n), distsSum * rand) -
   //             thrust::device_pointer_cast(distsCumSum));
   //
-  //linear interpolation logic:
+  // linear interpolation logic:
   //{
   value_type_t minSum{0};
-  CUDA_TRY(cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t),
-                           cudaMemcpyDeviceToHost, stream));
+  CUDA_TRY(
+    cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream));
   CHECK_CUDA(stream);
 
   if (distsSum > minSum) {
     value_type_t vIndex = static_cast<value_type_t>(n - 1);
-    obsIndex = static_cast<index_type_t>(vIndex * (distsSum * rand - minSum) /
-                                         (distsSum - minSum));
+    obsIndex = static_cast<index_type_t>(vIndex * (distsSum * rand - minSum) / (distsSum - minSum));
   } else {
     obsIndex = 0;
   }
@@ -405,21 +414,23 @@ static int chooseNewCentroid(handle_t const& handle,
   obsIndex = min(obsIndex, n - 1);
 
   // Record new centroid position
-  CUDA_TRY(cudaMemcpyAsync(centroid, obs + IDX(0, obsIndex, d),
-                           d * sizeof(value_type_t), cudaMemcpyDeviceToDevice,
+  CUDA_TRY(cudaMemcpyAsync(centroid,
+                           obs + IDX(0, obsIndex, d),
+                           d * sizeof(value_type_t),
+                           cudaMemcpyDeviceToDevice,
                            stream));
 
   return 0;
 }
 
 /**
- *  @brief Choose initial cluster centroids for k-means algorithm.  
+ *  @brief Choose initial cluster centroids for k-means algorithm.
  *    Centroids are randomly chosen with k-means++ algorithm
  *  @tparam index_type_t the type of data used for indexing.
  *  @tparam value_type_t the type of data used for weights, distances.
  *  @tparam thrust_exe_pol_t the type of thrust execution policy.
  *  @param handle the raft handle.
- *  @param  thrust_exec_policy thrust execution policy 
+ *  @param  thrust_exec_policy thrust execution policy
  *    (assumed to have same stream as handle.stream).
  *  @param n Number of observation vectors.
  *  @param d Dimension of observation vectors.
@@ -439,14 +450,19 @@ static int chooseNewCentroid(handle_t const& handle,
  *    distance between observation vectors and the closest centroid.
  *  @return Zero if successful. Otherwise non-zero.
  */
-template <typename index_type_t, typename value_type_t,
-          typename thrust_exe_pol_t>
-static int initializeCentroids(
-  handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, index_type_t n,
-  index_type_t d, index_type_t k, const value_type_t* __restrict__ obs,
-  value_type_t* __restrict__ centroids, index_type_t* __restrict__ codes,
-  index_type_t* __restrict__ clusterSizes, value_type_t* __restrict__ dists,
-  unsigned long long seed) {
+template <typename index_type_t, typename value_type_t, typename thrust_exe_pol_t>
+static int initializeCentroids(handle_t const& handle,
+                               thrust_exe_pol_t thrust_exec_policy,
+                               index_type_t n,
+                               index_type_t d,
+                               index_type_t k,
+                               const value_type_t* __restrict__ obs,
+                               value_type_t* __restrict__ centroids,
+                               index_type_t* __restrict__ codes,
+                               index_type_t* __restrict__ clusterSizes,
+                               value_type_t* __restrict__ dists,
+                               unsigned long long seed)
+{
   // -------------------------------------------------------
   // Variable declarations
   // -------------------------------------------------------
@@ -459,7 +475,7 @@ static int initializeCentroids(
   thrust::uniform_real_distribution<value_type_t> uniformDist(0, 1);
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream = handle.get_stream();
+  auto stream   = handle.get_stream();
 
   constexpr index_type_t grid_lower_bound{65535};
 
@@ -471,36 +487,43 @@ static int initializeCentroids(
   dim3 blockDim_warp{WARP_SIZE, 1, BSIZE_DIV_WSIZE};
 
   // CUDA grid dimensions
-  dim3 gridDim_warp{
-    min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), 1,
-    min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound)};
+  dim3 gridDim_warp{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound),
+                    1,
+                    min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound)};
 
   // CUDA grid dimensions
-  dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound),
-                     1, 1};
+  dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound), 1, 1};
 
   // Assign observation vectors to code 0
   CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream));
 
   // Choose first centroid
-  thrust::fill(thrust_exec_policy, thrust::device_pointer_cast(dists),
-               thrust::device_pointer_cast(dists + n), 1);
+  thrust::fill(thrust_exec_policy,
+               thrust::device_pointer_cast(dists),
+               thrust::device_pointer_cast(dists + n),
+               1);
   CHECK_CUDA(stream);
-  if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng),
-                        obs, dists, centroids))
+  if (chooseNewCentroid(
+        handle, thrust_exec_policy, n, d, k, uniformDist(rng), obs, dists, centroids))
     WARNING("error in k-means++ (could not pick centroid)");
 
   // Compute distances from first centroid
   CUDA_TRY(cudaMemsetAsync(dists, 0, n * sizeof(value_type_t), stream));
-  computeDistances<<<gridDim_warp, blockDim_warp, 0, stream>>>(
-    n, d, 1, obs, centroids, dists);
+  computeDistances<<<gridDim_warp, blockDim_warp, 0, stream>>>(n, d, 1, obs, centroids, dists);
   CHECK_CUDA(stream);
 
   // Choose remaining centroids
   for (i = 1; i < k; ++i) {
     // Choose ith centroid
-    if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng),
-                          obs, dists, centroids + IDX(0, i, d)))
+    if (chooseNewCentroid(handle,
+                          thrust_exec_policy,
+                          n,
+                          d,
+                          k,
+                          uniformDist(rng),
+                          obs,
+                          dists,
+                          centroids + IDX(0, i, d)))
       WARNING("error in k-means++ (could not pick centroid)");
 
     // Compute distances from ith centroid
@@ -510,22 +533,20 @@ static int initializeCentroids(
     CHECK_CUDA(stream);
 
     // Recompute minimum distances
-    minDistances2<<<gridDim_block, BLOCK_SIZE, 0, stream>>>(n, dists, dists + n,
-                                                            codes, i);
+    minDistances2<<<gridDim_block, BLOCK_SIZE, 0, stream>>>(n, dists, dists + n, codes, i);
     CHECK_CUDA(stream);
   }
 
   // Compute cluster sizes
   CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream));
-  computeClusterSizes<<<gridDim_block, BLOCK_SIZE, 0, stream>>>(n, k, codes,
-                                                                clusterSizes);
+  computeClusterSizes<<<gridDim_block, BLOCK_SIZE, 0, stream>>>(n, k, codes, clusterSizes);
   CHECK_CUDA(stream);
 
   return 0;
 }
 
-/** 
- *  @brief Find cluster centroids closest to observation vectors. 
+/**
+ *  @brief Find cluster centroids closest to observation vectors.
  *    Distance is measured with Euclidean norm.
  *  @tparam index_type_t the type of data used for indexing.
  *  @tparam value_type_t the type of data used for weights, distances.
@@ -553,16 +574,21 @@ static int initializeCentroids(
  *    of squares of assignment.
  *  @return Zero if successful. Otherwise non-zero.
  */
-template <typename index_type_t, typename value_type_t,
-          typename thrust_exe_pol_t>
-static int assignCentroids(
-  handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, index_type_t n,
-  index_type_t d, index_type_t k, const value_type_t* __restrict__ obs,
-  const value_type_t* __restrict__ centroids, value_type_t* __restrict__ dists,
-  index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes,
-  value_type_t* residual_host) {
+template <typename index_type_t, typename value_type_t, typename thrust_exe_pol_t>
+static int assignCentroids(handle_t const& handle,
+                           thrust_exe_pol_t thrust_exec_policy,
+                           index_type_t n,
+                           index_type_t d,
+                           index_type_t k,
+                           const value_type_t* __restrict__ obs,
+                           const value_type_t* __restrict__ centroids,
+                           value_type_t* __restrict__ dists,
+                           index_type_t* __restrict__ codes,
+                           index_type_t* __restrict__ clusterSizes,
+                           value_type_t* residual_host)
+{
   auto cublas_h = handle.get_cublas_handle();
-  auto stream = handle.get_stream();
+  auto stream   = handle.get_stream();
 
   // Compute distance between centroids and observation vectors
   CUDA_TRY(cudaMemsetAsync(dists, 0, n * k * sizeof(value_type_t), stream));
@@ -574,11 +600,9 @@ static int assignCentroids(
   constexpr index_type_t grid_lower_bound{65535};
   gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound);
   gridDim.y = min(k, grid_lower_bound);
-  gridDim.z =
-    min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound);
+  gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound);
 
-  computeDistances<<<gridDim, blockDim, 0, stream>>>(n, d, k, obs, centroids,
-                                                     dists);
+  computeDistances<<<gridDim, blockDim, 0, stream>>>(n, d, k, obs, centroids, dists);
   CHECK_CUDA(stream);
 
   // Find centroid closest to each observation vector
@@ -586,23 +610,21 @@ static int assignCentroids(
   blockDim.x = BLOCK_SIZE;
   blockDim.y = 1;
   blockDim.z = 1;
-  gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound);
-  gridDim.y = 1;
-  gridDim.z = 1;
-  minDistances<<<gridDim, blockDim, 0, stream>>>(n, k, dists, codes,
-                                                 clusterSizes);
+  gridDim.x  = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound);
+  gridDim.y  = 1;
+  gridDim.z  = 1;
+  minDistances<<<gridDim, blockDim, 0, stream>>>(n, k, dists, codes, clusterSizes);
   CHECK_CUDA(stream);
 
   // Compute residual sum of squares
-  *residual_host =
-    thrust::reduce(thrust_exec_policy, thrust::device_pointer_cast(dists),
-                   thrust::device_pointer_cast(dists + n));
+  *residual_host = thrust::reduce(
+    thrust_exec_policy, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n));
 
   return 0;
 }
 
-/** 
- *  @brief Update cluster centroids for k-means algorithm. 
+/**
+ *  @brief Update cluster centroids for k-means algorithm.
  *    All clusters are assumed to be non-empty.
  *  @tparam index_type_t the type of data used for indexing.
  *  @tparam value_type_t the type of data used for weights, distances.
@@ -628,29 +650,31 @@ static int assignCentroids(
  *    Workspace.
  *  @return Zero if successful. Otherwise non-zero.
  */
-template <typename index_type_t, typename value_type_t,
-          typename thrust_exe_pol_t>
+template <typename index_type_t, typename value_type_t, typename thrust_exe_pol_t>
 static int updateCentroids(handle_t const& handle,
-                           thrust_exe_pol_t thrust_exec_policy, index_type_t n,
-                           index_type_t d, index_type_t k,
+                           thrust_exe_pol_t thrust_exec_policy,
+                           index_type_t n,
+                           index_type_t d,
+                           index_type_t k,
                            const value_type_t* __restrict__ obs,
                            const index_type_t* __restrict__ codes,
                            const index_type_t* __restrict__ clusterSizes,
                            value_type_t* __restrict__ centroids,
                            value_type_t* __restrict__ work,
-                           index_type_t* __restrict__ work_int) {
+                           index_type_t* __restrict__ work_int)
+{
   // -------------------------------------------------------
   // Variable declarations
   // -------------------------------------------------------
 
   // Useful constants
-  const value_type_t one = 1;
+  const value_type_t one  = 1;
   const value_type_t zero = 0;
 
   constexpr index_type_t grid_lower_bound{65535};
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream = handle.get_stream();
+  auto stream   = handle.get_stream();
 
   // Device memory
   thrust::device_ptr<value_type_t> obs_copy(work);
@@ -658,34 +682,56 @@ static int updateCentroids(handle_t const& handle,
   thrust::device_ptr<index_type_t> rows(work_int + d * n);
 
   // Take transpose of observation matrix
-  CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, n, d, &one, obs,
-                          d, &zero, (value_type_t*)NULL, n,
-                          thrust::raw_pointer_cast(obs_copy), n, stream));
+  CUBLAS_CHECK(cublasgeam(cublas_h,
+                          CUBLAS_OP_T,
+                          CUBLAS_OP_N,
+                          n,
+                          d,
+                          &one,
+                          obs,
+                          d,
+                          &zero,
+                          (value_type_t*)NULL,
+                          n,
+                          thrust::raw_pointer_cast(obs_copy),
+                          n,
+                          stream));
 
   // Cluster assigned to each observation matrix entry
   thrust::sequence(thrust_exec_policy, rows, rows + d * n);
   CHECK_CUDA(stream);
-  thrust::transform(thrust_exec_policy, rows, rows + d * n,
-                    thrust::make_constant_iterator<index_type_t>(n), rows,
+  thrust::transform(thrust_exec_policy,
+                    rows,
+                    rows + d * n,
+                    thrust::make_constant_iterator<index_type_t>(n),
+                    rows,
                     thrust::modulus<index_type_t>());
   CHECK_CUDA(stream);
-  thrust::gather(thrust_exec_policy, rows, rows + d * n,
-                 thrust::device_pointer_cast(codes), codes_copy);
+  thrust::gather(
+    thrust_exec_policy, rows, rows + d * n, thrust::device_pointer_cast(codes), codes_copy);
   CHECK_CUDA(stream);
 
   // Row associated with each observation matrix entry
   thrust::sequence(thrust_exec_policy, rows, rows + d * n);
   CHECK_CUDA(stream);
-  thrust::transform(thrust_exec_policy, rows, rows + d * n,
-                    thrust::make_constant_iterator<index_type_t>(n), rows,
+  thrust::transform(thrust_exec_policy,
+                    rows,
+                    rows + d * n,
+                    thrust::make_constant_iterator<index_type_t>(n),
+                    rows,
                     thrust::divides<index_type_t>());
   CHECK_CUDA(stream);
 
   // Sort and reduce to add observation vectors in same cluster
-  thrust::stable_sort_by_key(thrust_exec_policy, codes_copy, codes_copy + d * n,
+  thrust::stable_sort_by_key(thrust_exec_policy,
+                             codes_copy,
+                             codes_copy + d * n,
                              make_zip_iterator(make_tuple(obs_copy, rows)));
   CHECK_CUDA(stream);
-  thrust::reduce_by_key(thrust_exec_policy, rows, rows + d * n, obs_copy,
+  thrust::reduce_by_key(thrust_exec_policy,
+                        rows,
+                        rows + d * n,
+                        obs_copy,
                         codes_copy,  // Output to codes_copy is ignored
                         thrust::device_pointer_cast(centroids));
   CHECK_CUDA(stream);
@@ -696,12 +742,11 @@ static int updateCentroids(handle_t const& handle,
   dim3 blockDim{WARP_SIZE, BLOCK_SIZE / WARP_SIZE, 1};
 
   // CUDA grid dimensions
-  dim3 gridDim{
-    min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound),
-    min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound), 1};
+  dim3 gridDim{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound),
+               min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound),
+               1};
 
-  divideCentroids<<<gridDim, blockDim, 0, stream>>>(d, k, clusterSizes,
-                                                    centroids);
+  divideCentroids<<<gridDim, blockDim, 0, stream>>>(d, k, clusterSizes, centroids);
   CHECK_CUDA(stream);
 
   return 0;
@@ -715,8 +760,8 @@ namespace raft {
 // k-means algorithm
 // =========================================================
 
-/** 
- *  @brief Find clusters with k-means algorithm. 
+/**
+ *  @brief Find clusters with k-means algorithm.
  *    Initial centroids are chosen with k-means++ algorithm. Empty
  *    clusters are reinitialized by choosing new centroids with
  *    k-means++ algorithm.
@@ -754,17 +799,24 @@ namespace raft {
  *  @param seed random seed to be used.
  *  @return error flag.
  */
-template <typename index_type_t, typename value_type_t,
-          typename thrust_exe_pol_t>
-int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy,
-           index_type_t n, index_type_t d, index_type_t k, value_type_t tol,
-           index_type_t maxiter, const value_type_t* __restrict__ obs,
+template <typename index_type_t, typename value_type_t, typename thrust_exe_pol_t>
+int kmeans(handle_t const& handle,
+           thrust_exe_pol_t thrust_exec_policy,
+           index_type_t n,
+           index_type_t d,
+           index_type_t k,
+           value_type_t tol,
+           index_type_t maxiter,
+           const value_type_t* __restrict__ obs,
            index_type_t* __restrict__ codes,
            index_type_t* __restrict__ clusterSizes,
            value_type_t* __restrict__ centroids,
-           value_type_t* __restrict__ work, index_type_t* __restrict__ work_int,
-           value_type_t* residual_host, index_type_t* iters_host,
-           unsigned long long seed) {
+           value_type_t* __restrict__ work,
+           index_type_t* __restrict__ work_int,
+           value_type_t* residual_host,
+           index_type_t* iters_host,
+           unsigned long long seed)
+{
   // -------------------------------------------------------
   // Variable declarations
   // -------------------------------------------------------
@@ -786,100 +838,120 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy,
   // -------------------------------------------------------
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream = handle.get_stream();
+  auto stream   = handle.get_stream();
 
   // Trivial cases
   if (k == 1) {
     CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream));
-    CUDA_TRY(cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t),
-                             cudaMemcpyHostToDevice, stream));
-    if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes,
-                        clusterSizes, centroids, work, work_int))
+    CUDA_TRY(
+      cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), cudaMemcpyHostToDevice, stream));
+    if (updateCentroids(
+          handle, thrust_exec_policy, n, d, k, obs, codes, clusterSizes, centroids, work, work_int))
       WARNING("could not compute k-means centroids");
 
     dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE};
 
     dim3 gridDim{
-      min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), 1,
-      min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE),
-          grid_lower_bound)};
+      min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound),
+      1,
+      min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), grid_lower_bound)};
 
     CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream));
-    computeDistances<<<gridDim, blockDim, 0, stream>>>(n, d, 1, obs, centroids,
-                                                       work);
+    computeDistances<<<gridDim, blockDim, 0, stream>>>(n, d, 1, obs, centroids, work);
     CHECK_CUDA(stream);
-    *residual_host =
-      thrust::reduce(thrust_exec_policy, thrust::device_pointer_cast(work),
-                     thrust::device_pointer_cast(work + n));
+    *residual_host = thrust::reduce(
+      thrust_exec_policy, thrust::device_pointer_cast(work), thrust::device_pointer_cast(work + n));
     CHECK_CUDA(stream);
     return 0;
   }
   if (n <= k) {
-    thrust::sequence(thrust_exec_policy, thrust::device_pointer_cast(codes),
+    thrust::sequence(thrust_exec_policy,
+                     thrust::device_pointer_cast(codes),
                      thrust::device_pointer_cast(codes + n));
     CHECK_CUDA(stream);
-    thrust::fill_n(thrust_exec_policy,
-                   thrust::device_pointer_cast(clusterSizes), n, 1);
+    thrust::fill_n(thrust_exec_policy, thrust::device_pointer_cast(clusterSizes), n, 1);
     CHECK_CUDA(stream);
 
     if (n < k)
-      CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0,
-                               (k - n) * sizeof(index_type_t), stream));
-    CUDA_TRY(cudaMemcpyAsync(centroids, obs, d * n * sizeof(value_type_t),
-                             cudaMemcpyDeviceToDevice, stream));
+      CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0, (k - n) * sizeof(index_type_t), stream));
+    CUDA_TRY(cudaMemcpyAsync(
+      centroids, obs, d * n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream));
     *residual_host = 0;
     return 0;
   }
 
   // Initialize cuBLAS
-  CUBLAS_CHECK(
-    linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+  CUBLAS_CHECK(linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
 
   // -------------------------------------------------------
   // k-means++ algorithm
   // -------------------------------------------------------
 
   // Choose initial cluster centroids
-  if (initializeCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids,
-                          codes, clusterSizes, work, seed))
+  if (initializeCentroids(
+        handle, thrust_exec_policy, n, d, k, obs, centroids, codes, clusterSizes, work, seed))
     WARNING("could not initialize k-means centroids");
 
   // Apply k-means iteration until convergence
   for (iter = 0; iter < maxiter; ++iter) {
     // Update cluster centroids
-    if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes,
-                        clusterSizes, centroids, work, work_int))
+    if (updateCentroids(
+          handle, thrust_exec_policy, n, d, k, obs, codes, clusterSizes, centroids, work, work_int))
       WARNING("could not update k-means centroids");
 
     // Determine centroid closest to each observation
     residualPrev = *residual_host;
-    if (assignCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids,
-                        work, codes, clusterSizes, residual_host))
+    if (assignCentroids(handle,
+                        thrust_exec_policy,
+                        n,
+                        d,
+                        k,
+                        obs,
+                        centroids,
+                        work,
+                        codes,
+                        clusterSizes,
+                        residual_host))
       WARNING("could not assign observation vectors to k-means clusters");
 
     // Reinitialize empty clusters with new centroids
-    index_type_t emptyCentroid =
-      (thrust::find(thrust_exec_policy,
-                    thrust::device_pointer_cast(clusterSizes),
-                    thrust::device_pointer_cast(clusterSizes + k), 0) -
-       thrust::device_pointer_cast(clusterSizes));
+    index_type_t emptyCentroid = (thrust::find(thrust_exec_policy,
+                                               thrust::device_pointer_cast(clusterSizes),
+                                               thrust::device_pointer_cast(clusterSizes + k),
+                                               0) -
+                                  thrust::device_pointer_cast(clusterSizes));
 
     // FIXME: emptyCentroid never reaches k (infinite loop) under certain
     // conditions, such as if obs is corrupt (as seen as a result of a
     // DataFrame column of NULL edge vals used to create the Graph)
     while (emptyCentroid < k) {
-      if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k,
-                            uniformDist(rng), obs, work,
+      if (chooseNewCentroid(handle,
+                            thrust_exec_policy,
+                            n,
+                            d,
+                            k,
+                            uniformDist(rng),
+                            obs,
+                            work,
                             centroids + IDX(0, emptyCentroid, d)))
         WARNING("could not replace empty centroid");
-      if (assignCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids,
-                          work, codes, clusterSizes, residual_host))
+      if (assignCentroids(handle,
+                          thrust_exec_policy,
+                          n,
+                          d,
+                          k,
+                          obs,
+                          centroids,
+                          work,
+                          codes,
+                          clusterSizes,
+                          residual_host))
         WARNING("could not assign observation vectors to k-means clusters");
-      emptyCentroid =
-        (thrust::find(thrust_exec_policy,
-                      thrust::device_pointer_cast(clusterSizes),
-                      thrust::device_pointer_cast(clusterSizes + k), 0) -
-         thrust::device_pointer_cast(clusterSizes));
+      emptyCentroid = (thrust::find(thrust_exec_policy,
+                                    thrust::device_pointer_cast(clusterSizes),
+                                    thrust::device_pointer_cast(clusterSizes + k),
+                                    0) -
+                       thrust::device_pointer_cast(clusterSizes));
       CHECK_CUDA(stream);
     }
 
@@ -891,14 +963,13 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy,
   }
 
   // Warning if k-means has failed to converge
-  if (std::fabs(residualPrev - (*residual_host)) / n >= tol)
-    WARNING("k-means failed to converge");
+  if (std::fabs(residualPrev - (*residual_host)) / n >= tol) WARNING("k-means failed to converge");
 
   *iters_host = iter;
   return 0;
 }
 
-/** 
+/**
  *  @brief Find clusters with k-means algorithm.
  *    Initial centroids are chosen with k-means++ algorithm. Empty
  *    clusters are reinitialized by choosing new centroids with
@@ -926,13 +997,20 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy,
  *  @param seed random seed to be used.
  *  @return error flag
  */
-template <typename index_type_t, typename value_type_t,
-          typename thrust_exe_pol_t>
-int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy,
-           index_type_t n, index_type_t d, index_type_t k, value_type_t tol,
-           index_type_t maxiter, const value_type_t* __restrict__ obs,
-           index_type_t* __restrict__ codes, value_type_t& residual,
-           index_type_t& iters, unsigned long long seed = 123456) {
+template <typename index_type_t, typename value_type_t, typename thrust_exe_pol_t>
+int kmeans(handle_t const& handle,
+           thrust_exe_pol_t thrust_exec_policy,
+           index_type_t n,
+           index_type_t d,
+           index_type_t k,
+           value_type_t tol,
+           index_type_t maxiter,
+           const value_type_t* __restrict__ obs,
+           index_type_t* __restrict__ codes,
+           value_type_t& residual,
+           index_type_t& iters,
+           unsigned long long seed = 123456)
+{
   using namespace matrix;
 
   // Check that parameters are valid
@@ -949,10 +1027,22 @@ int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy,
   vector_t<index_type_t> work_int(handle, 2 * d * n);
 
   // Perform k-means
-  return kmeans<index_type_t, value_type_t>(
-    handle, thrust_exec_policy, n, d, k, tol, maxiter, obs, codes,
-    clusterSizes.raw(), centroids.raw(), work.raw(), work_int.raw(), &residual,
-    &iters, seed);
+  return kmeans<index_type_t, value_type_t>(handle,
+                                            thrust_exec_policy,
+                                            n,
+                                            d,
+                                            k,
+                                            tol,
+                                            maxiter,
+                                            obs,
+                                            codes,
+                                            clusterSizes.raw(),
+                                            centroids.raw(),
+                                            work.raw(),
+                                            work_int.raw(),
+                                            &residual,
+                                            &iters,
+                                            seed);
 }
 
 }  // namespace raft
diff --git a/cpp/include/raft/spectral/lapack.hpp b/cpp/include/raft/spectral/lapack.hpp
index d14bf05f37..35fc22c770 100644
--- a/cpp/include/raft/spectral/lapack.hpp
+++ b/cpp/include/raft/spectral/lapack.hpp
@@ -21,66 +21,125 @@
 #include <raft/linalg/cusolver_wrappers.h>
 #include <raft/error.hpp>
 
-//for now; TODO: check if/where this `define` should be;
+// for now; TODO: check if/where this `define` should be;
 //
 #define USE_LAPACK
 
 namespace raft {
 
-#define lapackCheckError(status)                        \
-  {                                                     \
-    if (status < 0) {                                   \
-      std::stringstream ss;                             \
-      ss << "Lapack error: argument number " << -status \
-         << " had an illegal value.";                   \
-      throw exception(ss.str());                        \
-    } else if (status > 0)                              \
-      RAFT_FAIL("Lapack error: internal error.");       \
+#define lapackCheckError(status)                                                     \
+  {                                                                                  \
+    if (status < 0) {                                                                \
+      std::stringstream ss;                                                          \
+      ss << "Lapack error: argument number " << -status << " had an illegal value."; \
+      throw exception(ss.str());                                                     \
+    } else if (status > 0)                                                           \
+      RAFT_FAIL("Lapack error: internal error.");                                    \
   }
 
-extern "C" void sgeqrf_(int *m, int *n, float *a, int *lda, float *tau,
-                        float *work, int *lwork, int *info);
-extern "C" void dgeqrf_(int *m, int *n, double *a, int *lda, double *tau,
-                        double *work, int *lwork, int *info);
-extern "C" void sormqr_(char *side, char *trans, int *m, int *n, int *k,
-                        float *a, int *lda, const float *tau, float *c,
-                        int *ldc, float *work, int *lwork, int *info);
-extern "C" void dormqr_(char *side, char *trans, int *m, int *n, int *k,
-                        double *a, int *lda, const double *tau, double *c,
-                        int *ldc, double *work, int *lwork, int *info);
-extern "C" int dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda,
-                      double *wr, double *wi, double *vl, int *ldvl, double *vr,
-                      int *ldvr, double *work, int *lwork, int *info);
-
-extern "C" int sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda,
-                      float *wr, float *wi, float *vl, int *ldvl, float *vr,
-                      int *ldvr, float *work, int *lwork, int *info);
-
-extern "C" cusolverStatus_t cusolverDnSgemmHost(
-  cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
-  const float *alpha, const float *A, int lda, const float *B, int ldb,
-  const float *beta, float *C, int ldc);
-
-extern "C" cusolverStatus_t cusolverDnDgemmHost(
-  cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
-  const double *alpha, const double *A, int lda, const double *B, int ldb,
-  const double *beta, double *C, int ldc);
-
-extern "C" cusolverStatus_t cusolverDnSsterfHost(int n, float *d, float *e,
-                                                 int *info);
-
-extern "C" cusolverStatus_t cusolverDnDsterfHost(int n, double *d, double *e,
-                                                 int *info);
-
-extern "C" cusolverStatus_t cusolverDnSsteqrHost(const signed char *compz,
-                                                 int n, float *d, float *e,
-                                                 float *z, int ldz, float *work,
-                                                 int *info);
-
-extern "C" cusolverStatus_t cusolverDnDsteqrHost(const signed char *compz,
-                                                 int n, double *d, double *e,
-                                                 double *z, int ldz,
-                                                 double *work, int *info);
+extern "C" void sgeqrf_(
+  int* m, int* n, float* a, int* lda, float* tau, float* work, int* lwork, int* info);
+extern "C" void dgeqrf_(
+  int* m, int* n, double* a, int* lda, double* tau, double* work, int* lwork, int* info);
+extern "C" void sormqr_(char* side,
+                        char* trans,
+                        int* m,
+                        int* n,
+                        int* k,
+                        float* a,
+                        int* lda,
+                        const float* tau,
+                        float* c,
+                        int* ldc,
+                        float* work,
+                        int* lwork,
+                        int* info);
+extern "C" void dormqr_(char* side,
+                        char* trans,
+                        int* m,
+                        int* n,
+                        int* k,
+                        double* a,
+                        int* lda,
+                        const double* tau,
+                        double* c,
+                        int* ldc,
+                        double* work,
+                        int* lwork,
+                        int* info);
+extern "C" int dgeev_(char* jobvl,
+                      char* jobvr,
+                      int* n,
+                      double* a,
+                      int* lda,
+                      double* wr,
+                      double* wi,
+                      double* vl,
+                      int* ldvl,
+                      double* vr,
+                      int* ldvr,
+                      double* work,
+                      int* lwork,
+                      int* info);
+
+extern "C" int sgeev_(char* jobvl,
+                      char* jobvr,
+                      int* n,
+                      float* a,
+                      int* lda,
+                      float* wr,
+                      float* wi,
+                      float* vl,
+                      int* ldvl,
+                      float* vr,
+                      int* ldvr,
+                      float* work,
+                      int* lwork,
+                      int* info);
+
+extern "C" cusolverStatus_t cusolverDnSgemmHost(cublasOperation_t transa,
+                                                cublasOperation_t transb,
+                                                int m,
+                                                int n,
+                                                int k,
+                                                const float* alpha,
+                                                const float* A,
+                                                int lda,
+                                                const float* B,
+                                                int ldb,
+                                                const float* beta,
+                                                float* C,
+                                                int ldc);
+
+extern "C" cusolverStatus_t cusolverDnDgemmHost(cublasOperation_t transa,
+                                                cublasOperation_t transb,
+                                                int m,
+                                                int n,
+                                                int k,
+                                                const double* alpha,
+                                                const double* A,
+                                                int lda,
+                                                const double* B,
+                                                int ldb,
+                                                const double* beta,
+                                                double* C,
+                                                int ldc);
+
+extern "C" cusolverStatus_t cusolverDnSsterfHost(int n, float* d, float* e, int* info);
+
+extern "C" cusolverStatus_t cusolverDnDsterfHost(int n, double* d, double* e, int* info);
+
+extern "C" cusolverStatus_t cusolverDnSsteqrHost(
+  const signed char* compz, int n, float* d, float* e, float* z, int ldz, float* work, int* info);
+
+extern "C" cusolverStatus_t cusolverDnDsteqrHost(const signed char* compz,
+                                                 int n,
+                                                 double* d,
+                                                 double* e,
+                                                 double* z,
+                                                 int ldz,
+                                                 double* work,
+                                                 int* info);
 
 template <typename T>
 class Lapack {
@@ -91,182 +150,339 @@ class Lapack {
  public:
   static void check_lapack_enabled();
 
-  static void gemm(bool transa, bool transb, int m, int n, int k, T alpha,
-                   const T *A, int lda, const T *B, int ldb, T beta, T *C,
+  static void gemm(bool transa,
+                   bool transb,
+                   int m,
+                   int n,
+                   int k,
+                   T alpha,
+                   const T* A,
+                   int lda,
+                   const T* B,
+                   int ldb,
+                   T beta,
+                   T* C,
                    int ldc);
 
   // special QR for lanczos
-  static void sterf(int n, T *d, T *e);
-  static void steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work);
+  static void sterf(int n, T* d, T* e);
+  static void steqr(char compz, int n, T* d, T* e, T* z, int ldz, T* work);
 
   // QR
   // computes the QR factorization of a general matrix
-  static void geqrf(int m, int n, T *a, int lda, T *tau, T *work, int *lwork);
+  static void geqrf(int m, int n, T* a, int lda, T* tau, T* work, int* lwork);
   // Generates the real orthogonal matrix Q of the QR factorization formed by geqrf.
 
   // multiply C by implicit Q
-  static void ormqr(bool right_side, bool transq, int m, int n, int k, T *a,
-                    int lda, T *tau, T *c, int ldc, T *work, int *lwork);
-
-  static void geev(T *A, T *eigenvalues, int dim, int lda);
-  static void geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda,
+  static void ormqr(bool right_side,
+                    bool transq,
+                    int m,
+                    int n,
+                    int k,
+                    T* a,
+                    int lda,
+                    T* tau,
+                    T* c,
+                    int ldc,
+                    T* work,
+                    int* lwork);
+
+  static void geev(T* A, T* eigenvalues, int dim, int lda);
+  static void geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr);
+  static void geev(T* A,
+                   T* eigenvalues_r,
+                   T* eigenvalues_i,
+                   T* eigenvectors_r,
+                   T* eigenvectors_i,
+                   int dim,
+                   int lda,
                    int ldvr);
-  static void geev(T *A, T *eigenvalues_r, T *eigenvalues_i, T *eigenvectors_r,
-                   T *eigenvectors_i, int dim, int lda, int ldvr);
 
  private:
-  static void lapack_gemm(const char transa, const char transb, int m, int n,
-                          int k, float alpha, const float *a, int lda,
-                          const float *b, int ldb, float beta, float *c,
-                          int ldc) {
-    cublasOperation_t cublas_transa =
-      (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
-    cublasOperation_t cublas_transb =
-      (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
-    cusolverDnSgemmHost(cublas_transa, cublas_transb, m, n, k, &alpha,
-                        (float *)a, lda, (float *)b, ldb, &beta, c, ldc);
+  static void lapack_gemm(const char transa,
+                          const char transb,
+                          int m,
+                          int n,
+                          int k,
+                          float alpha,
+                          const float* a,
+                          int lda,
+                          const float* b,
+                          int ldb,
+                          float beta,
+                          float* c,
+                          int ldc)
+  {
+    cublasOperation_t cublas_transa = (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
+    cublasOperation_t cublas_transb = (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
+    cusolverDnSgemmHost(
+      cublas_transa, cublas_transb, m, n, k, &alpha, (float*)a, lda, (float*)b, ldb, &beta, c, ldc);
   }
 
-  static void lapack_gemm(const signed char transa, const signed char transb,
-                          int m, int n, int k, double alpha, const double *a,
-                          int lda, const double *b, int ldb, double beta,
-                          double *c, int ldc) {
-    cublasOperation_t cublas_transa =
-      (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
-    cublasOperation_t cublas_transb =
-      (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
-    cusolverDnDgemmHost(cublas_transa, cublas_transb, m, n, k, &alpha,
-                        (double *)a, lda, (double *)b, ldb, &beta, c, ldc);
+  static void lapack_gemm(const signed char transa,
+                          const signed char transb,
+                          int m,
+                          int n,
+                          int k,
+                          double alpha,
+                          const double* a,
+                          int lda,
+                          const double* b,
+                          int ldb,
+                          double beta,
+                          double* c,
+                          int ldc)
+  {
+    cublasOperation_t cublas_transa = (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
+    cublasOperation_t cublas_transb = (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
+    cusolverDnDgemmHost(cublas_transa,
+                        cublas_transb,
+                        m,
+                        n,
+                        k,
+                        &alpha,
+                        (double*)a,
+                        lda,
+                        (double*)b,
+                        ldb,
+                        &beta,
+                        c,
+                        ldc);
   }
 
-  static void lapack_sterf(int n, float *d, float *e, int *info) {
+  static void lapack_sterf(int n, float* d, float* e, int* info)
+  {
     cusolverDnSsterfHost(n, d, e, info);
   }
 
-  static void lapack_sterf(int n, double *d, double *e, int *info) {
+  static void lapack_sterf(int n, double* d, double* e, int* info)
+  {
     cusolverDnDsterfHost(n, d, e, info);
   }
 
-  static void lapack_steqr(const signed char compz, int n, float *d, float *e,
-                           float *z, int ldz, float *work, int *info) {
+  static void lapack_steqr(
+    const signed char compz, int n, float* d, float* e, float* z, int ldz, float* work, int* info)
+  {
     cusolverDnSsteqrHost(&compz, n, d, e, z, ldz, work, info);
   }
 
-  static void lapack_steqr(const signed char compz, int n, double *d, double *e,
-                           double *z, int ldz, double *work, int *info) {
+  static void lapack_steqr(const signed char compz,
+                           int n,
+                           double* d,
+                           double* e,
+                           double* z,
+                           int ldz,
+                           double* work,
+                           int* info)
+  {
     cusolverDnDsteqrHost(&compz, n, d, e, z, ldz, work, info);
   }
 
-  static void lapack_geqrf(int m, int n, float *a, int lda, float *tau,
-                           float *work, int *lwork, int *info) {
+  static void lapack_geqrf(
+    int m, int n, float* a, int lda, float* tau, float* work, int* lwork, int* info)
+  {
     sgeqrf_(&m, &n, a, &lda, tau, work, lwork, info);
   }
 
-  static void lapack_geqrf(int m, int n, double *a, int lda, double *tau,
-                           double *work, int *lwork, int *info) {
+  static void lapack_geqrf(
+    int m, int n, double* a, int lda, double* tau, double* work, int* lwork, int* info)
+  {
     dgeqrf_(&m, &n, a, &lda, tau, work, lwork, info);
   }
 
-  static void lapack_ormqr(char side, char trans, int m, int n, int k, float *a,
-                           int lda, float *tau, float *c, int ldc, float *work,
-                           int *lwork, int *info) {
-    sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork,
-            info);
+  static void lapack_ormqr(char side,
+                           char trans,
+                           int m,
+                           int n,
+                           int k,
+                           float* a,
+                           int lda,
+                           float* tau,
+                           float* c,
+                           int ldc,
+                           float* work,
+                           int* lwork,
+                           int* info)
+  {
+    sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info);
   }
 
-  static void lapack_ormqr(char side, char trans, int m, int n, int k,
-                           double *a, int lda, double *tau, double *c, int ldc,
-                           double *work, int *lwork, int *info) {
-    dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork,
-            info);
+  static void lapack_ormqr(char side,
+                           char trans,
+                           int m,
+                           int n,
+                           int k,
+                           double* a,
+                           int lda,
+                           double* tau,
+                           double* c,
+                           int ldc,
+                           double* work,
+                           int* lwork,
+                           int* info)
+  {
+    dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info);
   }
 
-  static int lapack_geev_dispatch(char *jobvl, char *jobvr, int *n, double *a,
-                                  int *lda, double *wr, double *wi, double *vl,
-                                  int *ldvl, double *vr, int *ldvr,
-                                  double *work, int *lwork, int *info) {
-    return dgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work,
-                  lwork, info);
+  static int lapack_geev_dispatch(char* jobvl,
+                                  char* jobvr,
+                                  int* n,
+                                  double* a,
+                                  int* lda,
+                                  double* wr,
+                                  double* wi,
+                                  double* vl,
+                                  int* ldvl,
+                                  double* vr,
+                                  int* ldvr,
+                                  double* work,
+                                  int* lwork,
+                                  int* info)
+  {
+    return dgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, lwork, info);
   }
 
-  static int lapack_geev_dispatch(char *jobvl, char *jobvr, int *n, float *a,
-                                  int *lda, float *wr, float *wi, float *vl,
-                                  int *ldvl, float *vr, int *ldvr, float *work,
-                                  int *lwork, int *info) {
-    return sgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work,
-                  lwork, info);
+  static int lapack_geev_dispatch(char* jobvl,
+                                  char* jobvr,
+                                  int* n,
+                                  float* a,
+                                  int* lda,
+                                  float* wr,
+                                  float* wi,
+                                  float* vl,
+                                  int* ldvl,
+                                  float* vr,
+                                  int* ldvr,
+                                  float* work,
+                                  int* lwork,
+                                  int* info)
+  {
+    return sgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, lwork, info);
   }
 
   // real eigenvalues
-  static void lapack_geev(T *A, T *eigenvalues, int dim, int lda) {
+  static void lapack_geev(T* A, T* eigenvalues, int dim, int lda)
+  {
     char job = 'N';
     std::vector<T> WI(dim);
-    int ldv = 1;
-    T *vl = 0;
+    int ldv       = 1;
+    T* vl         = 0;
     int work_size = 6 * dim;
     std::vector<T> work(work_size);
     int info;
-    lapack_geev_dispatch(&job, &job, &dim, A, &lda, eigenvalues, WI.data(), vl,
-                         &ldv, vl, &ldv, work.data(), &work_size, &info);
+    lapack_geev_dispatch(&job,
+                         &job,
+                         &dim,
+                         A,
+                         &lda,
+                         eigenvalues,
+                         WI.data(),
+                         vl,
+                         &ldv,
+                         vl,
+                         &ldv,
+                         work.data(),
+                         &work_size,
+                         &info);
     lapackCheckError(info);
   }
 
   // real eigenpairs
-  static void lapack_geev(T *A, T *eigenvalues, T *eigenvectors, int dim,
-                          int lda, int ldvr) {
+  static void lapack_geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr)
+  {
     char jobvl = 'N';
     char jobvr = 'V';
     std::vector<T> WI(dim);
     int work_size = 6 * dim;
-    T *vl = 0;
-    int ldvl = 1;
+    T* vl         = 0;
+    int ldvl      = 1;
     std::vector<T> work(work_size);
     int info;
-    lapack_geev_dispatch(&jobvl, &jobvr, &dim, A, &lda, eigenvalues, WI.data(),
-                         vl, &ldvl, eigenvectors, &ldvr, work.data(),
-                         &work_size, &info);
+    lapack_geev_dispatch(&jobvl,
+                         &jobvr,
+                         &dim,
+                         A,
+                         &lda,
+                         eigenvalues,
+                         WI.data(),
+                         vl,
+                         &ldvl,
+                         eigenvectors,
+                         &ldvr,
+                         work.data(),
+                         &work_size,
+                         &info);
     lapackCheckError(info);
   }
 
   // complex eigenpairs
-  static void lapack_geev(T *A, T *eigenvalues_r, T *eigenvalues_i,
-                          T *eigenvectors_r, T *eigenvectors_i, int dim,
-                          int lda, int ldvr) {
-    char jobvl = 'N';
-    char jobvr = 'V';
+  static void lapack_geev(T* A,
+                          T* eigenvalues_r,
+                          T* eigenvalues_i,
+                          T* eigenvectors_r,
+                          T* eigenvectors_i,
+                          int dim,
+                          int lda,
+                          int ldvr)
+  {
+    char jobvl    = 'N';
+    char jobvr    = 'V';
     int work_size = 8 * dim;
-    int ldvl = 1;
+    int ldvl      = 1;
     std::vector<T> work(work_size);
     int info;
-    lapack_geev_dispatch(&jobvl, &jobvr, &dim, A, &lda, eigenvalues_r,
-                         eigenvalues_i, 0, &ldvl, eigenvectors_r, &ldvr,
-                         work.data(), &work_size, &info);
+    lapack_geev_dispatch(&jobvl,
+                         &jobvr,
+                         &dim,
+                         A,
+                         &lda,
+                         eigenvalues_r,
+                         eigenvalues_i,
+                         0,
+                         &ldvl,
+                         eigenvectors_r,
+                         &ldvr,
+                         work.data(),
+                         &work_size,
+                         &info);
     lapackCheckError(info);
   }
 };
 
 template <typename T>
-void Lapack<T>::check_lapack_enabled() {
+void Lapack<T>::check_lapack_enabled()
+{
 #ifndef USE_LAPACK
   RAFT_FAIL("Error: LAPACK not enabled.");
 #endif
 }
 
 template <typename T>
-void Lapack<T>::gemm(bool transa, bool transb, int m, int n, int k, T alpha,
-                     const T *A, int lda, const T *B, int ldb, T beta, T *C,
-                     int ldc) {
+void Lapack<T>::gemm(bool transa,
+                     bool transb,
+                     int m,
+                     int n,
+                     int k,
+                     T alpha,
+                     const T* A,
+                     int lda,
+                     const T* B,
+                     int ldb,
+                     T beta,
+                     T* C,
+                     int ldc)
+{
   // check_lapack_enabled();
   //#ifdef NVGRAPH_USE_LAPACK
   const char transA_char = transa ? 'T' : 'N';
   const char transB_char = transb ? 'T' : 'N';
-  lapack_gemm(transA_char, transB_char, m, n, k, alpha, A, lda, B, ldb, beta, C,
-              ldc);
+  lapack_gemm(transA_char, transB_char, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
   //#endif
 }
 
 template <typename T>
-void Lapack<T>::sterf(int n, T *d, T *e) {
+void Lapack<T>::sterf(int n, T* d, T* e)
+{
   //    check_lapack_enabled();
   //#ifdef NVGRAPH_USE_LAPACK
   int info;
@@ -276,7 +492,8 @@ void Lapack<T>::sterf(int n, T *d, T *e) {
 }
 
 template <typename T>
-void Lapack<T>::steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work) {
+void Lapack<T>::steqr(char compz, int n, T* d, T* e, T* z, int ldz, T* work)
+{
   //    check_lapack_enabled();
   //#ifdef NVGRAPH_USE_LAPACK
   int info;
@@ -286,8 +503,8 @@ void Lapack<T>::steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work) {
 }
 
 template <typename T>
-void Lapack<T>::geqrf(int m, int n, T *a, int lda, T *tau, T *work,
-                      int *lwork) {
+void Lapack<T>::geqrf(int m, int n, T* a, int lda, T* tau, T* work, int* lwork)
+{
   check_lapack_enabled();
 #ifdef USE_LAPACK
   int info;
@@ -296,11 +513,22 @@ void Lapack<T>::geqrf(int m, int n, T *a, int lda, T *tau, T *work,
 #endif
 }
 template <typename T>
-void Lapack<T>::ormqr(bool right_side, bool transq, int m, int n, int k, T *a,
-                      int lda, T *tau, T *c, int ldc, T *work, int *lwork) {
+void Lapack<T>::ormqr(bool right_side,
+                      bool transq,
+                      int m,
+                      int n,
+                      int k,
+                      T* a,
+                      int lda,
+                      T* tau,
+                      T* c,
+                      int ldc,
+                      T* work,
+                      int* lwork)
+{
   check_lapack_enabled();
 #ifdef USE_LAPACK
-  char side = right_side ? 'R' : 'L';
+  char side  = right_side ? 'R' : 'L';
   char trans = transq ? 'T' : 'N';
   int info;
   lapack_ormqr(side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, &info);
@@ -310,7 +538,8 @@ void Lapack<T>::ormqr(bool right_side, bool transq, int m, int n, int k, T *a,
 
 // real eigenvalues
 template <typename T>
-void Lapack<T>::geev(T *A, T *eigenvalues, int dim, int lda) {
+void Lapack<T>::geev(T* A, T* eigenvalues, int dim, int lda)
+{
   check_lapack_enabled();
 #ifdef USE_LAPACK
   lapack_geev(A, eigenvalues, dim, lda);
@@ -318,8 +547,8 @@ void Lapack<T>::geev(T *A, T *eigenvalues, int dim, int lda) {
 }
 // real eigenpairs
 template <typename T>
-void Lapack<T>::geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda,
-                     int ldvr) {
+void Lapack<T>::geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr)
+{
   check_lapack_enabled();
 #ifdef USE_LAPACK
   lapack_geev(A, eigenvalues, eigenvectors, dim, lda, ldvr);
@@ -327,13 +556,18 @@ void Lapack<T>::geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda,
 }
 // complex eigenpairs
 template <typename T>
-void Lapack<T>::geev(T *A, T *eigenvalues_r, T *eigenvalues_i,
-                     T *eigenvectors_r, T *eigenvectors_i, int dim, int lda,
-                     int ldvr) {
+void Lapack<T>::geev(T* A,
+                     T* eigenvalues_r,
+                     T* eigenvalues_i,
+                     T* eigenvectors_r,
+                     T* eigenvectors_i,
+                     int dim,
+                     int lda,
+                     int ldvr)
+{
   check_lapack_enabled();
 #ifdef USE_LAPACK
-  lapack_geev(A, eigenvalues_r, eigenvalues_i, eigenvectors_r, eigenvectors_i,
-              dim, lda, ldvr);
+  lapack_geev(A, eigenvalues_r, eigenvalues_i, eigenvectors_r, eigenvectors_i, dim, lda, ldvr);
 #endif
 }
 
diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp
index c43154d17a..89d2b7e8ec 100644
--- a/cpp/include/raft/spectral/matrix_wrappers.hpp
+++ b/cpp/include/raft/spectral/matrix_wrappers.hpp
@@ -40,10 +40,12 @@ using size_type = int;  // for now; TODO: move it in appropriate header
 // Apply diagonal matrix to vector:
 //
 template <typename IndexType_, typename ValueType_>
-static __global__ void diagmv(IndexType_ n, ValueType_ alpha,
+static __global__ void diagmv(IndexType_ n,
+                              ValueType_ alpha,
                               const ValueType_* __restrict__ D,
                               const ValueType_* __restrict__ x,
-                              ValueType_* __restrict__ y) {
+                              ValueType_* __restrict__ y)
+{
   IndexType_ i = threadIdx.x + blockIdx.x * blockDim.x;
   while (i < n) {
     y[i] += alpha * D[i] * x[i];
@@ -58,7 +60,7 @@ enum struct sparse_mv_alg_t : int {
   SPARSE_MV_UNDEFINED = -1,
   SPARSE_MV_ALG_DEFAULT,  // generic, for any sparse matrix
   SPARSE_MV_ALG1,         // typical for CSR
-  SPARSE_MV_ALG2  // may provide better performamce for irregular sparse matrices
+  SPARSE_MV_ALG2          // may provide better performance for irregular sparse matrices
 };
 
 // Vector "view"-like aggregate for linear algebra purposes
@@ -68,21 +70,22 @@ struct vector_view_t {
   value_type* buffer_;
   size_type size_;
 
-  vector_view_t(value_type* buffer, size_type sz)
-    : buffer_(buffer), size_(sz) {}
+  vector_view_t(value_type* buffer, size_type sz) : buffer_(buffer), size_(sz) {}
 
-  vector_view_t(vector_view_t&& other)
-    : buffer_(other.buffer_), size_(other.size_) {
+  vector_view_t(vector_view_t&& other) : buffer_(other.buffer_), size_(other.size_)
+  {
     other.buffer_ = nullptr;
-    other.size_ = 0;
+    other.size_   = 0;
   }
 
-  vector_view_t& operator=(vector_view_t&& other) {
+  vector_view_t& operator=(vector_view_t&& other)
+  {
     buffer_ = other.buffer_;
-    size_ = other.size_;
+    size_   = other.size_;
 
     other.buffer_ = nullptr;
-    other.size_ = 0;
+    other.size_   = 0;
+    return *this;
   }
 };
 
@@ -98,15 +100,16 @@ class vector_t {
  public:
   vector_t(handle_t const& raft_handle, size_type sz)
     : handle_(raft_handle),
-      buffer_(
-        static_cast<value_type*>(raft_handle.get_device_allocator()->allocate(
-          sz * sizeof(value_type), raft_handle.get_stream()))),
+      buffer_(static_cast<value_type*>(raft_handle.get_device_allocator()->allocate(
+        sz * sizeof(value_type), raft_handle.get_stream()))),
       size_(sz),
-      stream_(raft_handle.get_stream()) {}
+      stream_(raft_handle.get_stream())
+  {
+  }
 
-  ~vector_t(void) {
-    handle_.get_device_allocator()->deallocate(
-      buffer_, size_ * sizeof(value_type), stream_);
+  ~vector_t(void)
+  {
+    handle_.get_device_allocator()->deallocate(buffer_, size_ * sizeof(value_type), stream_);
   }
 
   size_type size(void) const { return size_; }
@@ -116,26 +119,31 @@ class vector_t {
   value_type const* raw(void) const { return buffer_; }
 
   template <typename ThrustExecPolicy>
-  value_type nrm1(ThrustExecPolicy t_exe_pol) const {
-    return thrust::reduce(t_exe_pol, buffer_, buffer_ + size_, value_type{0},
-                          [] __device__(auto left, auto right) {
-                            auto abs_left = left > 0 ? left : -left;
-                            auto abs_right = right > 0 ? right : -right;
-                            return abs_left + abs_right;
-                          });
+  value_type nrm1(ThrustExecPolicy t_exe_pol) const
+  {
+    return thrust::reduce(
+      t_exe_pol, buffer_, buffer_ + size_, value_type{0}, [] __device__(auto left, auto right) {
+        auto abs_left  = left > 0 ? left : -left;
+        auto abs_right = right > 0 ? right : -right;
+        return abs_left + abs_right;
+      });
   }
 
   template <typename ThrustExecPolicy>
-  void fill(ThrustExecPolicy t_exe_pol, value_type value) {
+  void fill(ThrustExecPolicy t_exe_pol, value_type value)
+  {
     thrust::fill_n(t_exe_pol, buffer_, size_, value);
   }
 };
 
 template <typename index_type, typename value_type>
 struct sparse_matrix_t {
-  sparse_matrix_t(handle_t const& raft_handle, index_type const* row_offsets,
-                  index_type const* col_indices, value_type const* values,
-                  index_type const nrows, index_type const ncols,
+  sparse_matrix_t(handle_t const& raft_handle,
+                  index_type const* row_offsets,
+                  index_type const* col_indices,
+                  value_type const* values,
+                  index_type const nrows,
+                  index_type const ncols,
                   index_type const nnz)
     : handle_(raft_handle),
       row_offsets_(row_offsets),
@@ -143,18 +151,25 @@ struct sparse_matrix_t {
       values_(values),
       nrows_(nrows),
       ncols_(ncols),
-      nnz_(nnz) {}
+      nnz_(nnz)
+  {
+  }
 
-  sparse_matrix_t(handle_t const& raft_handle, index_type const* row_offsets,
-                  index_type const* col_indices, value_type const* values,
-                  index_type const nrows, index_type const nnz)
+  sparse_matrix_t(handle_t const& raft_handle,
+                  index_type const* row_offsets,
+                  index_type const* col_indices,
+                  value_type const* values,
+                  index_type const nrows,
+                  index_type const nnz)
     : handle_(raft_handle),
       row_offsets_(row_offsets),
       col_indices_(col_indices),
       values_(values),
       nrows_(nrows),
       ncols_(nrows),
-      nnz_(nnz) {}
+      nnz_(nnz)
+  {
+  }
 
   template <typename CSRView>
   sparse_matrix_t(handle_t const& raft_handle, CSRView const& csr_view)
@@ -164,7 +179,9 @@ struct sparse_matrix_t {
       values_(csr_view.edge_data),
       nrows_(csr_view.number_of_vertices),
       ncols_(csr_view.number_of_vertices),
-      nnz_(csr_view.number_of_edges) {}
+      nnz_(csr_view.number_of_edges)
+  {
+  }
 
   virtual ~sparse_matrix_t(void) =
     default;  // virtual because used as base for following matrix types
@@ -174,21 +191,24 @@ struct sparse_matrix_t {
   // descriptor creation works with non-const, and const-casting
   // down is dangerous)
   //
-  virtual void mv(value_type alpha, value_type* __restrict__ x, value_type beta,
+  virtual void mv(value_type alpha,
+                  value_type* __restrict__ x,
+                  value_type beta,
                   value_type* __restrict__ y,
                   sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1,
-                  bool transpose = false, bool symmetric = false) const {
+                  bool transpose      = false,
+                  bool symmetric      = false) const
+  {
     using namespace sparse;
 
     RAFT_EXPECTS(x != nullptr, "Null x buffer.");
     RAFT_EXPECTS(y != nullptr, "Null y buffer.");
 
     auto cusparse_h = handle_.get_cusparse_handle();
-    auto stream = handle_.get_stream();
+    auto stream     = handle_.get_stream();
 
-    cusparseOperation_t trans =
-      transpose ? CUSPARSE_OPERATION_TRANSPOSE :  // transpose
-        CUSPARSE_OPERATION_NON_TRANSPOSE;         //non-transpose
+    cusparseOperation_t trans = transpose ? CUSPARSE_OPERATION_TRANSPOSE :  // transpose
+                                  CUSPARSE_OPERATION_NON_TRANSPOSE;         // non-transpose
 
 #if not defined CUDA_ENFORCE_LOWER and CUDA_VER_10_1_UP
     auto size_x = transpose ? nrows_ : ncols_;
@@ -196,15 +216,19 @@ struct sparse_matrix_t {
 
     cusparseSpMVAlg_t spmv_alg = translate_algorithm(alg);
 
-    //create descriptors:
+    // create descriptors:
     //(below casts are necessary, because
     // cusparseCreateCsr(...) takes non-const
     // void*; the casts should be harmless)
     //
     cusparseSpMatDescr_t matA;
-    CUSPARSE_CHECK(cusparsecreatecsr(
-      &matA, nrows_, ncols_, nnz_, const_cast<index_type*>(row_offsets_),
-      const_cast<index_type*>(col_indices_), const_cast<value_type*>(values_)));
+    CUSPARSE_CHECK(cusparsecreatecsr(&matA,
+                                     nrows_,
+                                     ncols_,
+                                     nnz_,
+                                     const_cast<index_type*>(row_offsets_),
+                                     const_cast<index_type*>(col_indices_),
+                                     const_cast<value_type*>(values_)));
 
     cusparseDnVecDescr_t vecX;
     CUSPARSE_CHECK(cusparsecreatednvec(&vecX, size_x, x));
@@ -212,31 +236,29 @@ struct sparse_matrix_t {
     cusparseDnVecDescr_t vecY;
     CUSPARSE_CHECK(cusparsecreatednvec(&vecY, size_y, y));
 
-    //get (scratch) external device buffer size:
+    // get (scratch) external device buffer size:
     //
     size_t bufferSize;
-    CUSPARSE_CHECK(cusparsespmv_buffersize(cusparse_h, trans, &alpha, matA,
-                                           vecX, &beta, vecY, spmv_alg,
-                                           &bufferSize, stream));
+    CUSPARSE_CHECK(cusparsespmv_buffersize(
+      cusparse_h, trans, &alpha, matA, vecX, &beta, vecY, spmv_alg, &bufferSize, stream));
 
-    //allocate external buffer:
+    // allocate external buffer:
     //
     vector_t<value_type> external_buffer(handle_, bufferSize);
 
-    //finally perform SpMV:
+    // finally perform SpMV:
     //
-    CUSPARSE_CHECK(cusparsespmv(cusparse_h, trans, &alpha, matA, vecX, &beta,
-                                vecY, spmv_alg, external_buffer.raw(), stream));
+    CUSPARSE_CHECK(cusparsespmv(
+      cusparse_h, trans, &alpha, matA, vecX, &beta, vecY, spmv_alg, external_buffer.raw(), stream));
 
-    //free descriptors:
+    // free descriptors:
     //(TODO: maybe wrap them in a RAII struct?)
     //
     CUSPARSE_CHECK(cusparseDestroyDnVec(vecY));
     CUSPARSE_CHECK(cusparseDestroyDnVec(vecX));
     CUSPARSE_CHECK(cusparseDestroySpMat(matA));
 #else
-    CUSPARSE_CHECK(
-      cusparsesetpointermode(cusparse_h, CUSPARSE_POINTER_MODE_HOST, stream));
+    CUSPARSE_CHECK(cusparsesetpointermode(cusparse_h, CUSPARSE_POINTER_MODE_HOST, stream));
     cusparseMatDescr_t descr = 0;
     CUSPARSE_CHECK(cusparseCreateMatDescr(&descr));
     if (symmetric) {
@@ -245,9 +267,20 @@ struct sparse_matrix_t {
       CUSPARSE_CHECK(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
     }
     CUSPARSE_CHECK(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
-    CUSPARSE_CHECK(cusparsecsrmv(cusparse_h, trans, nrows_, ncols_, nnz_,
-                                 &alpha, descr, values_, row_offsets_,
-                                 col_indices_, x, &beta, y, stream));
+    CUSPARSE_CHECK(cusparsecsrmv(cusparse_h,
+                                 trans,
+                                 nrows_,
+                                 ncols_,
+                                 nnz_,
+                                 &alpha,
+                                 descr,
+                                 values_,
+                                 row_offsets_,
+                                 col_indices_,
+                                 x,
+                                 &beta,
+                                 y,
+                                 stream));
     CUSPARSE_CHECK(cusparseDestroyMatDescr(descr));
 #endif
   }
@@ -255,19 +288,18 @@ struct sparse_matrix_t {
   handle_t const& get_handle(void) const { return handle_; }
 
 #if not defined CUDA_ENFORCE_LOWER and CUDA_VER_10_1_UP
-  cusparseSpMVAlg_t translate_algorithm(sparse_mv_alg_t alg) const {
+  cusparseSpMVAlg_t translate_algorithm(sparse_mv_alg_t alg) const
+  {
     switch (alg) {
-      case sparse_mv_alg_t::SPARSE_MV_ALG1:
-        return CUSPARSE_CSRMV_ALG1;
-      case sparse_mv_alg_t::SPARSE_MV_ALG2:
-        return CUSPARSE_CSRMV_ALG2;
-      default:
-        return CUSPARSE_MV_ALG_DEFAULT;
+      case sparse_mv_alg_t::SPARSE_MV_ALG1: return CUSPARSE_CSRMV_ALG1;
+      case sparse_mv_alg_t::SPARSE_MV_ALG2: return CUSPARSE_CSRMV_ALG2;
+      default: return CUSPARSE_MV_ALG_DEFAULT;
     }
   }
 #endif
 
-  //private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence, aggregate
+  // private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence,
+  // aggregate
 
   handle_t const& handle_;
   index_type const* row_offsets_;
@@ -284,44 +316,51 @@ struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type> {
   laplacian_matrix_t(handle_t const& raft_handle,
                      ThrustExePolicy thrust_exec_policy,
                      index_type const* row_offsets,
-                     index_type const* col_indices, value_type const* values,
-                     index_type const nrows, index_type const nnz)
-    : sparse_matrix_t<index_type, value_type>(raft_handle, row_offsets,
-                                              col_indices, values, nrows, nnz),
-      diagonal_(raft_handle, nrows) {
+                     index_type const* col_indices,
+                     value_type const* values,
+                     index_type const nrows,
+                     index_type const nnz)
+    : sparse_matrix_t<index_type, value_type>(
+        raft_handle, row_offsets, col_indices, values, nrows, nnz),
+      diagonal_(raft_handle, nrows)
+  {
     vector_t<value_type> ones{raft_handle, nrows};
     ones.fill(thrust_exec_policy, 1.0);
-    sparse_matrix_t<index_type, value_type>::mv(1, ones.raw(), 0,
-                                                diagonal_.raw());
+    sparse_matrix_t<index_type, value_type>::mv(1, ones.raw(), 0, diagonal_.raw());
   }
 
   template <typename ThrustExePolicy>
   laplacian_matrix_t(handle_t const& raft_handle,
                      ThrustExePolicy thrust_exec_policy,
                      sparse_matrix_t<index_type, value_type> const& csr_m)
-    : sparse_matrix_t<index_type, value_type>(raft_handle, csr_m.row_offsets_,
-                                              csr_m.col_indices_, csr_m.values_,
-                                              csr_m.nrows_, csr_m.nnz_),
-      diagonal_(raft_handle, csr_m.nrows_) {
+    : sparse_matrix_t<index_type, value_type>(raft_handle,
+                                              csr_m.row_offsets_,
+                                              csr_m.col_indices_,
+                                              csr_m.values_,
+                                              csr_m.nrows_,
+                                              csr_m.nnz_),
+      diagonal_(raft_handle, csr_m.nrows_)
+  {
     vector_t<value_type> ones{raft_handle, csr_m.nrows_};
     ones.fill(thrust_exec_policy, 1.0);
-    sparse_matrix_t<index_type, value_type>::mv(1, ones.raw(), 0,
-                                                diagonal_.raw());
+    sparse_matrix_t<index_type, value_type>::mv(1, ones.raw(), 0, diagonal_.raw());
   }
 
   // y = alpha*A*x + beta*y
   //
-  void mv(value_type alpha, value_type* __restrict__ x, value_type beta,
+  void mv(value_type alpha,
+          value_type* __restrict__ x,
+          value_type beta,
           value_type* __restrict__ y,
           sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1,
-          bool transpose = false, bool symmetric = false) const override {
+          bool transpose      = false,
+          bool symmetric      = false) const override
+  {
     constexpr int BLOCK_SIZE = 1024;
-    auto n = sparse_matrix_t<index_type, value_type>::nrows_;
+    auto n                   = sparse_matrix_t<index_type, value_type>::nrows_;
 
-    auto cublas_h =
-      sparse_matrix_t<index_type, value_type>::get_handle().get_cublas_handle();
-    auto stream =
-      sparse_matrix_t<index_type, value_type>::get_handle().get_stream();
+    auto cublas_h = sparse_matrix_t<index_type, value_type>::get_handle().get_cublas_handle();
+    auto stream   = sparse_matrix_t<index_type, value_type>::get_handle().get_stream();
 
     // scales y by beta:
     //
@@ -333,8 +372,7 @@ struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type> {
 
     // Apply diagonal matrix
     //
-    dim3 gridDim{
-      std::min<unsigned int>((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535), 1, 1};
+    dim3 gridDim{std::min<unsigned int>((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535), 1, 1};
 
     dim3 blockDim{BLOCK_SIZE, 1, 1};
     diagmv<<<gridDim, blockDim, 0, stream>>>(n, alpha, diagonal_.raw(), x, y);
@@ -342,8 +380,7 @@ struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type> {
 
     // Apply adjacency matrix
     //
-    sparse_matrix_t<index_type, value_type>::mv(-alpha, x, 1, y, alg, transpose,
-                                                symmetric);
+    sparse_matrix_t<index_type, value_type>::mv(-alpha, x, 1, y, alg, transpose, symmetric);
   }
 
   vector_t<value_type> diagonal_;
@@ -355,58 +392,68 @@ struct modularity_matrix_t : laplacian_matrix_t<index_type, value_type> {
   modularity_matrix_t(handle_t const& raft_handle,
                       ThrustExePolicy thrust_exec_policy,
                       index_type const* row_offsets,
-                      index_type const* col_indices, value_type const* values,
-                      index_type const nrows, index_type const nnz)
+                      index_type const* col_indices,
+                      value_type const* values,
+                      index_type const nrows,
+                      index_type const nnz)
     : laplacian_matrix_t<index_type, value_type>(
-        raft_handle, thrust_exec_policy, row_offsets, col_indices, values,
-        nrows, nnz) {
-    edge_sum_ = laplacian_matrix_t<index_type, value_type>::diagonal_.nrm1(
-      thrust_exec_policy);
+        raft_handle, thrust_exec_policy, row_offsets, col_indices, values, nrows, nnz)
+  {
+    edge_sum_ = laplacian_matrix_t<index_type, value_type>::diagonal_.nrm1(thrust_exec_policy);
   }
 
   template <typename ThrustExePolicy>
   modularity_matrix_t(handle_t const& raft_handle,
                       ThrustExePolicy thrust_exec_policy,
                       sparse_matrix_t<index_type, value_type> const& csr_m)
-    : laplacian_matrix_t<index_type, value_type>(raft_handle,
-                                                 thrust_exec_policy, csr_m) {
-    edge_sum_ = laplacian_matrix_t<index_type, value_type>::diagonal_.nrm1(
-      thrust_exec_policy);
+    : laplacian_matrix_t<index_type, value_type>(raft_handle, thrust_exec_policy, csr_m)
+  {
+    edge_sum_ = laplacian_matrix_t<index_type, value_type>::diagonal_.nrm1(thrust_exec_policy);
   }
 
   // y = alpha*A*x + beta*y
   //
-  void mv(value_type alpha, value_type* __restrict__ x, value_type beta,
+  void mv(value_type alpha,
+          value_type* __restrict__ x,
+          value_type beta,
           value_type* __restrict__ y,
           sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1,
-          bool transpose = false, bool symmetric = false) const override {
+          bool transpose      = false,
+          bool symmetric      = false) const override
+  {
     auto n = sparse_matrix_t<index_type, value_type>::nrows_;
 
-    auto cublas_h =
-      sparse_matrix_t<index_type, value_type>::get_handle().get_cublas_handle();
-    auto stream =
-      sparse_matrix_t<index_type, value_type>::get_handle().get_stream();
+    auto cublas_h = sparse_matrix_t<index_type, value_type>::get_handle().get_cublas_handle();
+    auto stream   = sparse_matrix_t<index_type, value_type>::get_handle().get_stream();
 
     // y = A*x
     //
-    sparse_matrix_t<index_type, value_type>::mv(alpha, x, 0, y, alg, transpose,
-                                                symmetric);
+    sparse_matrix_t<index_type, value_type>::mv(alpha, x, 0, y, alg, transpose, symmetric);
     value_type dot_res;
 
     // gamma = d'*x
     //
     // Cublas::dot(this->n, D.raw(), 1, x, 1, &dot_res);
-    CUBLAS_CHECK(linalg::cublasdot(
-      cublas_h, n, laplacian_matrix_t<index_type, value_type>::diagonal_.raw(),
-      1, x, 1, &dot_res, stream));
+    CUBLAS_CHECK(linalg::cublasdot(cublas_h,
+                                   n,
+                                   laplacian_matrix_t<index_type, value_type>::diagonal_.raw(),
+                                   1,
+                                   x,
+                                   1,
+                                   &dot_res,
+                                   stream));
 
     // y = y -(gamma/edge_sum)*d
     //
     value_type gamma_ = -dot_res / edge_sum_;
-    CUBLAS_CHECK(linalg::cublasaxpy(
-      cublas_h, n, &gamma_,
-      laplacian_matrix_t<index_type, value_type>::diagonal_.raw(), 1, y, 1,
-      stream));
+    CUBLAS_CHECK(linalg::cublasaxpy(cublas_h,
+                                    n,
+                                    &gamma_,
+                                    laplacian_matrix_t<index_type, value_type>::diagonal_.raw(),
+                                    1,
+                                    y,
+                                    1,
+                                    stream));
   }
 
   value_type edge_sum_;
diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp
index f8dfe5daa3..bb7087a3be 100644
--- a/cpp/include/raft/spectral/modularity_maximization.hpp
+++ b/cpp/include/raft/spectral/modularity_maximization.hpp
@@ -40,7 +40,8 @@
 #endif
 
 #ifdef COLLECT_TIME_STATISTICS
-static double timer(void) {
+static double timer(void)
+{
   struct timeval tv;
   cudaDeviceSynchronize();
   gettimeofday(&tv, NULL);
@@ -79,19 +80,27 @@ using namespace linalg;
  *    performed.
  *  @return error flag.
  */
-template <typename vertex_t, typename weight_t, typename ThrustExePolicy,
-          typename EigenSolver, typename ClusterSolver>
+template <typename vertex_t,
+          typename weight_t,
+          typename ThrustExePolicy,
+          typename EigenSolver,
+          typename ClusterSolver>
 std::tuple<vertex_t, weight_t, vertex_t> modularity_maximization(
-  handle_t const &handle, ThrustExePolicy thrust_exec_policy,
-  sparse_matrix_t<vertex_t, weight_t> const &csr_m,
-  EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver,
-  vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) {
+  handle_t const& handle,
+  ThrustExePolicy thrust_exec_policy,
+  sparse_matrix_t<vertex_t, weight_t> const& csr_m,
+  EigenSolver const& eigen_solver,
+  ClusterSolver const& cluster_solver,
+  vertex_t* __restrict__ clusters,
+  weight_t* eigVals,
+  weight_t* eigVecs)
+{
   RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer.");
   RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer.");
   RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer.");
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream = handle.get_stream();
+  auto stream   = handle.get_stream();
 
   std::tuple<vertex_t, weight_t, vertex_t>
     stats;  // # iters eigen solver, cluster solver residual, # iters cluster solver
@@ -104,11 +113,10 @@ std::tuple<vertex_t, weight_t, vertex_t> modularity_maximization(
   modularity_matrix_t<vertex_t, weight_t> B{handle, thrust_exec_policy, csr_m};
 
   auto eigen_config = eigen_solver.get_config();
-  auto nEigVecs = eigen_config.n_eigVecs;
+  auto nEigVecs     = eigen_config.n_eigVecs;
 
   // Compute eigenvectors corresponding to largest eigenvalues
-  std::get<0>(stats) =
-    eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs);
+  std::get<0>(stats) = eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs);
 
   // Whiten eigenvector matrix
   transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs);
@@ -119,8 +127,8 @@ std::tuple<vertex_t, weight_t, vertex_t> modularity_maximization(
   CHECK_CUDA(stream);
 
   // Find partition clustering
-  auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n,
-                                           nEigVecs, eigVecs, clusters);
+  auto pair_cluster =
+    cluster_solver.solve(handle, thrust_exec_policy, n, nEigVecs, eigVecs, clusters);
 
   std::get<1>(stats) = pair_cluster.first;
   std::get<2>(stats) = pair_cluster.second;
@@ -139,12 +147,13 @@ std::tuple<vertex_t, weight_t, vertex_t> modularity_maximization(
  *  @param modularity On exit, modularity
  */
 template <typename vertex_t, typename weight_t, typename ThrustExePolicy>
-void analyzeModularity(handle_t const &handle,
+void analyzeModularity(handle_t const& handle,
                        ThrustExePolicy thrust_exec_policy,
-                       sparse_matrix_t<vertex_t, weight_t> const &csr_m,
+                       sparse_matrix_t<vertex_t, weight_t> const& csr_m,
                        vertex_t nClusters,
-                       vertex_t const *__restrict__ clusters,
-                       weight_t &modularity) {
+                       vertex_t const* __restrict__ clusters,
+                       weight_t& modularity)
+{
   RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer.");
 
   vertex_t i;
@@ -152,15 +161,14 @@ void analyzeModularity(handle_t const &handle,
   weight_t partModularity, clustersize;
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream = handle.get_stream();
+  auto stream   = handle.get_stream();
 
   // Device memory
   vector_t<weight_t> part_i(handle, n);
   vector_t<weight_t> Bx(handle, n);
 
   // Initialize cuBLAS
-  CUBLAS_CHECK(
-    cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+  CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
 
   // Initialize Modularity
   modularity_matrix_t<vertex_t, weight_t> B{handle, thrust_exec_policy, csr_m};
@@ -170,8 +178,8 @@ void analyzeModularity(handle_t const &handle,
 
   // Iterate through partitions
   for (i = 0; i < nClusters; ++i) {
-    if (!construct_indicator(handle, thrust_exec_policy, i, n, clustersize,
-                             partModularity, clusters, part_i, Bx, B)) {
+    if (!construct_indicator(
+          handle, thrust_exec_policy, i, n, clustersize, partModularity, clusters, part_i, Bx, B)) {
       WARNING("empty partition");
       continue;
     }
diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp
index 841fca04d9..e2576c1d69 100644
--- a/cpp/include/raft/spectral/partition.hpp
+++ b/cpp/include/raft/spectral/partition.hpp
@@ -62,22 +62,30 @@ using namespace linalg;
  *    performed.
  *  @return statistics: number of eigensolver iterations, .
  */
-template <typename vertex_t, typename weight_t, typename ThrustExePolicy,
-          typename EigenSolver, typename ClusterSolver>
-std::tuple<vertex_t, weight_t, vertex_t> partition(
-  handle_t const &handle, ThrustExePolicy thrust_exec_policy,
-  sparse_matrix_t<vertex_t, weight_t> const &csr_m,
-  EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver,
-  vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) {
+template <typename vertex_t,
+          typename weight_t,
+          typename ThrustExePolicy,
+          typename EigenSolver,
+          typename ClusterSolver>
+std::tuple<vertex_t, weight_t, vertex_t> partition(handle_t const& handle,
+                                                   ThrustExePolicy thrust_exec_policy,
+                                                   sparse_matrix_t<vertex_t, weight_t> const& csr_m,
+                                                   EigenSolver const& eigen_solver,
+                                                   ClusterSolver const& cluster_solver,
+                                                   vertex_t* __restrict__ clusters,
+                                                   weight_t* eigVals,
+                                                   weight_t* eigVecs)
+{
   RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer.");
   RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer.");
   RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer.");
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream = handle.get_stream();
+  auto stream   = handle.get_stream();
 
   std::tuple<vertex_t, weight_t, vertex_t>
-    stats;  //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, cluster solver residual, # iters cluster solver
+    stats;  //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver,
+            //cluster solver residual, # iters cluster solver
 
   vertex_t n = csr_m.nrows_;
 
@@ -88,22 +96,21 @@ std::tuple<vertex_t, weight_t, vertex_t> partition(
   // Compute eigenvectors of Laplacian
 
   // Initialize Laplacian
-  ///sparse_matrix_t<vertex_t, weight_t> A{handle, graph};
+  /// sparse_matrix_t<vertex_t, weight_t> A{handle, graph};
   laplacian_matrix_t<vertex_t, weight_t> L{handle, thrust_exec_policy, csr_m};
 
   auto eigen_config = eigen_solver.get_config();
-  auto nEigVecs = eigen_config.n_eigVecs;
+  auto nEigVecs     = eigen_config.n_eigVecs;
 
   // Compute smallest eigenvalues and eigenvectors
-  std::get<0>(stats) =
-    eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs);
+  std::get<0>(stats) = eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs);
 
   // Whiten eigenvector matrix
   transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs);
 
   // Find partition clustering
-  auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n,
-                                           nEigVecs, eigVecs, clusters);
+  auto pair_cluster =
+    cluster_solver.solve(handle, thrust_exec_policy, n, nEigVecs, eigVecs, clusters);
 
   std::get<1>(stats) = pair_cluster.first;
   std::get<2>(stats) = pair_cluster.second;
@@ -130,18 +137,21 @@ std::tuple<vertex_t, weight_t, vertex_t> partition(
  *  @return error flag.
  */
 template <typename vertex_t, typename weight_t, typename ThrustExePolicy>
-void analyzePartition(handle_t const &handle,
+void analyzePartition(handle_t const& handle,
                       ThrustExePolicy thrust_exec_policy,
-                      sparse_matrix_t<vertex_t, weight_t> const &csr_m,
-                      vertex_t nClusters, const vertex_t *__restrict__ clusters,
-                      weight_t &edgeCut, weight_t &cost) {
+                      sparse_matrix_t<vertex_t, weight_t> const& csr_m,
+                      vertex_t nClusters,
+                      const vertex_t* __restrict__ clusters,
+                      weight_t& edgeCut,
+                      weight_t& cost)
+{
   RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer.");
 
   vertex_t i;
   vertex_t n = csr_m.nrows_;
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream = handle.get_stream();
+  auto stream   = handle.get_stream();
 
   weight_t partEdgesCut, clustersize;
 
@@ -150,22 +160,21 @@ void analyzePartition(handle_t const &handle,
   vector_t<weight_t> Lx(handle, n);
 
   // Initialize cuBLAS
-  CUBLAS_CHECK(
-    cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+  CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
 
   // Initialize Laplacian
-  ///sparse_matrix_t<vertex_t, weight_t> A{handle, graph};
+  /// sparse_matrix_t<vertex_t, weight_t> A{handle, graph};
   laplacian_matrix_t<vertex_t, weight_t> L{handle, thrust_exec_policy, csr_m};
 
   // Initialize output
-  cost = 0;
+  cost    = 0;
   edgeCut = 0;
 
   // Iterate through partitions
   for (i = 0; i < nClusters; ++i) {
     // Construct indicator vector for ith partition
-    if (!construct_indicator(handle, thrust_exec_policy, i, n, clustersize,
-                             partEdgesCut, clusters, part_i, Lx, L)) {
+    if (!construct_indicator(
+          handle, thrust_exec_policy, i, n, clustersize, partEdgesCut, clusters, part_i, Lx, L)) {
       WARNING("empty partition");
       continue;
     }
diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp
index 40dde30a74..5349cb2810 100644
--- a/cpp/include/raft/spectral/spectral_util.hpp
+++ b/cpp/include/raft/spectral/spectral_util.hpp
@@ -28,20 +28,18 @@ namespace raft {
 namespace spectral {
 
 template <typename index_type_t, typename value_type_t>
-static __global__ void scale_obs_kernel(index_type_t m, index_type_t n,
-                                        value_type_t* obs) {
+static __global__ void scale_obs_kernel(index_type_t m, index_type_t n, value_type_t* obs)
+{
   index_type_t i, j, k, index, mm;
   value_type_t alpha, v, last;
   bool valid;
   // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension
 
   // compute alpha
-  mm = (((m + blockDim.x - 1) / blockDim.x) *
-        blockDim.x);  // m in multiple of blockDim.x
+  mm    = (((m + blockDim.x - 1) / blockDim.x) * blockDim.x);  // m in multiple of blockDim.x
   alpha = 0.0;
 
-  for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n;
-       j += blockDim.y * gridDim.y) {
+  for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) {
     for (i = threadIdx.x; i < mm; i += blockDim.x) {
       // check if the thread is valid
       valid = i < m;
@@ -66,17 +64,17 @@ static __global__ void scale_obs_kernel(index_type_t m, index_type_t n,
   // scale by alpha
   alpha = __shfl_sync(warp_full_mask(), alpha, blockDim.x - 1, blockDim.x);
   alpha = std::sqrt(alpha);
-  for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n;
-       j += blockDim.y * gridDim.y) {
+  for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) {
     for (i = threadIdx.x; i < m; i += blockDim.x) {  // blockDim.x=32
-      index = i + j * m;
+      index      = i + j * m;
       obs[index] = obs[index] / alpha;
     }
   }
 }
 
 template <typename index_type_t>
-index_type_t next_pow2(index_type_t n) {
+index_type_t next_pow2(index_type_t n)
+{
   index_type_t v;
   // Reference:
   // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float
@@ -90,7 +88,8 @@ index_type_t next_pow2(index_type_t n) {
 }
 
 template <typename index_type_t, typename value_type_t>
-cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs) {
+cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs)
+{
   index_type_t p2m;
 
   // find next power of 2
@@ -102,19 +101,20 @@ cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs) {
   dim3 nblocks{1, (n + nthreads.y - 1) / nthreads.y, 1};
 
   // launch scaling kernel (scale each column of obs by its norm)
-  scale_obs_kernel<index_type_t, value_type_t>
-    <<<nblocks, nthreads>>>(m, n, obs);
+  scale_obs_kernel<index_type_t, value_type_t><<<nblocks, nthreads>>>(m, n, obs);
 
   return cudaSuccess;
 }
 
-template <typename vertex_t, typename edge_t, typename weight_t,
-          typename ThrustExePolicy>
+template <typename vertex_t, typename edge_t, typename weight_t, typename ThrustExePolicy>
 void transform_eigen_matrix(handle_t const& handle,
-                            ThrustExePolicy thrust_exec_policy, edge_t n,
-                            vertex_t nEigVecs, weight_t* eigVecs) {
+                            ThrustExePolicy thrust_exec_policy,
+                            edge_t n,
+                            vertex_t nEigVecs,
+                            weight_t* eigVecs)
+{
   auto cublas_h = handle.get_cublas_handle();
-  auto stream = handle.get_stream();
+  auto stream   = handle.get_stream();
 
   const weight_t zero{0.0};
   const weight_t one{1.0};
@@ -123,9 +123,9 @@ void transform_eigen_matrix(handle_t const& handle,
   for (auto i = 0; i < nEigVecs; ++i) {
     weight_t mean, std;
 
-    mean = thrust::reduce(
-      thrust_exec_policy, thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
-      thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)));
+    mean = thrust::reduce(thrust_exec_policy,
+                          thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
+                          thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)));
     CHECK_CUDA(stream);
     mean /= n;
     thrust::transform(thrust_exec_policy,
@@ -136,8 +136,7 @@ void transform_eigen_matrix(handle_t const& handle,
                       thrust::minus<weight_t>());
     CHECK_CUDA(stream);
 
-    CUBLAS_CHECK(
-      cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream));
+    CUBLAS_CHECK(cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream));
 
     std /= std::sqrt(static_cast<weight_t>(n));
 
@@ -154,16 +153,25 @@ void transform_eigen_matrix(handle_t const& handle,
   //   TODO: in-place transpose
   {
     vector_t<weight_t> work(handle, nEigVecs * n);
-    CUBLAS_CHECK(
-      cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
-
-    CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, nEigVecs, n,
-                            &one, eigVecs, n, &zero, (weight_t*)NULL, nEigVecs,
-                            work.raw(), nEigVecs, stream));
-
-    CUDA_TRY(cudaMemcpyAsync(eigVecs, work.raw(),
-                             nEigVecs * n * sizeof(weight_t),
-                             cudaMemcpyDeviceToDevice, stream));
+    CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+
+    CUBLAS_CHECK(cublasgeam(cublas_h,
+                            CUBLAS_OP_T,
+                            CUBLAS_OP_N,
+                            nEigVecs,
+                            n,
+                            &one,
+                            eigVecs,
+                            n,
+                            &zero,
+                            (weight_t*)NULL,
+                            nEigVecs,
+                            work.raw(),
+                            nEigVecs,
+                            stream));
+
+    CUDA_TRY(cudaMemcpyAsync(
+      eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), cudaMemcpyDeviceToDevice, stream));
   }
 }
 
@@ -178,49 +186,48 @@ struct equal_to_i_op {
  public:
   equal_to_i_op(index_type_t _i) : i(_i) {}
   template <typename Tuple_>
-  __host__ __device__ void operator()(Tuple_ t) {
-    thrust::get<1>(t) =
-      (thrust::get<0>(t) == i) ? (value_type_t)1.0 : (value_type_t)0.0;
+  __host__ __device__ void operator()(Tuple_ t)
+  {
+    thrust::get<1>(t) = (thrust::get<0>(t) == i) ? (value_type_t)1.0 : (value_type_t)0.0;
   }
 };
 }  // namespace
 
 // Construct indicator vector for ith partition
 //
-template <typename vertex_t, typename edge_t, typename weight_t,
-          typename ThrustExePolicy>
+template <typename vertex_t, typename edge_t, typename weight_t, typename ThrustExePolicy>
 bool construct_indicator(handle_t const& handle,
-                         ThrustExePolicy thrust_exec_policy, edge_t index,
-                         edge_t n, weight_t& clustersize, weight_t& partStats,
+                         ThrustExePolicy thrust_exec_policy,
+                         edge_t index,
+                         edge_t n,
+                         weight_t& clustersize,
+                         weight_t& partStats,
                          vertex_t const* __restrict__ clusters,
-                         vector_t<weight_t>& part_i, vector_t<weight_t>& Bx,
-                         laplacian_matrix_t<vertex_t, weight_t> const& B) {
+                         vector_t<weight_t>& part_i,
+                         vector_t<weight_t>& Bx,
+                         laplacian_matrix_t<vertex_t, weight_t> const& B)
+{
   auto cublas_h = handle.get_cublas_handle();
-  auto stream = handle.get_stream();
-
-  thrust::for_each(thrust_exec_policy,
-                   thrust::make_zip_iterator(thrust::make_tuple(
-                     thrust::device_pointer_cast(clusters),
-                     thrust::device_pointer_cast(part_i.raw()))),
-                   thrust::make_zip_iterator(thrust::make_tuple(
-                     thrust::device_pointer_cast(clusters + n),
-                     thrust::device_pointer_cast(part_i.raw() + n))),
-                   equal_to_i_op<vertex_t, weight_t>(index));
+  auto stream   = handle.get_stream();
+
+  thrust::for_each(
+    thrust_exec_policy,
+    thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(clusters),
+                                                 thrust::device_pointer_cast(part_i.raw()))),
+    thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(clusters + n),
+                                                 thrust::device_pointer_cast(part_i.raw() + n))),
+    equal_to_i_op<vertex_t, weight_t>(index));
   CHECK_CUDA(stream);
 
   // Compute size of ith partition
-  CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1,
-                         &clustersize, stream));
+  CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, &clustersize, stream));
 
   clustersize = round(clustersize);
-  if (clustersize < 0.5) {
-    return false;
-  }
+  if (clustersize < 0.5) { return false; }
 
   // Compute part stats
   B.mv(1, part_i.raw(), 0, Bx.raw());
-  CUBLAS_CHECK(
-    cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream));
+  CUBLAS_CHECK(cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream));
 
   return true;
 }
diff --git a/cpp/include/raft/spectral/warn_dbg.hpp b/cpp/include/raft/spectral/warn_dbg.hpp
index 406f1b7c7e..08a4e6efb5 100644
--- a/cpp/include/raft/spectral/warn_dbg.hpp
+++ b/cpp/include/raft/spectral/warn_dbg.hpp
@@ -4,13 +4,13 @@
 #include <string>
 
 #define STRINGIFY_DETAIL(x) #x
-#define RAFT_STRINGIFY(x) STRINGIFY_DETAIL(x)
+#define RAFT_STRINGIFY(x)   STRINGIFY_DETAIL(x)
 
 #ifdef DEBUG
 #define COUT() (std::cout)
 #define CERR() (std::cerr)
 
-//nope:
+// nope:
 //
 #define WARNING(message)                                                  \
   do {                                                                    \
diff --git a/cpp/include/raft/stats/mean.cuh b/cpp/include/raft/stats/mean.cuh
index 8691cabc85..4d6724482c 100644
--- a/cpp/include/raft/stats/mean.cuh
+++ b/cpp/include/raft/stats/mean.cuh
@@ -26,15 +26,15 @@ namespace stats {
 
 ///@todo: ColsPerBlk has been tested only for 32!
 template <typename Type, typename IdxType, int TPB, int ColsPerBlk = 32>
-__global__ void meanKernelRowMajor(Type *mu, const Type *data, IdxType D,
-                                   IdxType N) {
+__global__ void meanKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N)
+{
   const int RowsPerBlkPerIter = TPB / ColsPerBlk;
-  IdxType thisColId = threadIdx.x % ColsPerBlk;
-  IdxType thisRowId = threadIdx.x / ColsPerBlk;
-  IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk);
-  IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter);
-  Type thread_data = Type(0);
-  const IdxType stride = RowsPerBlkPerIter * gridDim.x;
+  IdxType thisColId           = threadIdx.x % ColsPerBlk;
+  IdxType thisRowId           = threadIdx.x / ColsPerBlk;
+  IdxType colId               = thisColId + ((IdxType)blockIdx.y * ColsPerBlk);
+  IdxType rowId               = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter);
+  Type thread_data            = Type(0);
+  const IdxType stride        = RowsPerBlkPerIter * gridDim.x;
   for (IdxType i = rowId; i < N; i += stride)
     thread_data += (colId < D) ? data[i * D + colId] : Type(0);
   __shared__ Type smu[ColsPerBlk];
@@ -46,8 +46,8 @@ __global__ void meanKernelRowMajor(Type *mu, const Type *data, IdxType D,
 }
 
 template <typename Type, typename IdxType, int TPB>
-__global__ void meanKernelColMajor(Type *mu, const Type *data, IdxType D,
-                                   IdxType N) {
+__global__ void meanKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType N)
+{
   typedef cub::BlockReduce<Type, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   Type thread_data = Type(0);
@@ -57,9 +57,7 @@ __global__ void meanKernelColMajor(Type *mu, const Type *data, IdxType D,
     thread_data += data[idx];
   }
   Type acc = BlockReduce(temp_storage).Sum(thread_data);
-  if (threadIdx.x == 0) {
-    mu[blockIdx.x] = acc / N;
-  }
+  if (threadIdx.x == 0) { mu[blockIdx.x] = acc / N; }
 }
 
 /**
@@ -80,24 +78,22 @@ __global__ void meanKernelColMajor(Type *mu, const Type *data, IdxType D,
  * @param stream: cuda stream
  */
 template <typename Type, typename IdxType = int>
-void mean(Type *mu, const Type *data, IdxType D, IdxType N, bool sample,
-          bool rowMajor, cudaStream_t stream) {
+void mean(
+  Type* mu, const Type* data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream)
+{
   static const int TPB = 256;
   if (rowMajor) {
     static const int RowsPerThread = 4;
-    static const int ColsPerBlk = 32;
-    static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread;
-    dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk),
-              raft::ceildiv(D, (IdxType)ColsPerBlk));
+    static const int ColsPerBlk    = 32;
+    static const int RowsPerBlk    = (TPB / ColsPerBlk) * RowsPerThread;
+    dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk));
     CUDA_CHECK(cudaMemsetAsync(mu, 0, sizeof(Type) * D, stream));
-    meanKernelRowMajor<Type, IdxType, TPB, ColsPerBlk>
-      <<<grid, TPB, 0, stream>>>(mu, data, D, N);
+    meanKernelRowMajor<Type, IdxType, TPB, ColsPerBlk><<<grid, TPB, 0, stream>>>(mu, data, D, N);
     CUDA_CHECK(cudaPeekAtLastError());
     Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N));
     raft::linalg::scalarMultiply(mu, mu, ratio, D, stream);
   } else {
-    meanKernelColMajor<Type, IdxType, TPB>
-      <<<D, TPB, 0, stream>>>(mu, data, D, N);
+    meanKernelColMajor<Type, IdxType, TPB><<<D, TPB, 0, stream>>>(mu, data, D, N);
   }
   CUDA_CHECK(cudaPeekAtLastError());
 }
diff --git a/cpp/include/raft/stats/mean_center.cuh b/cpp/include/raft/stats/mean_center.cuh
index 04934d4388..c0ba24312b 100644
--- a/cpp/include/raft/stats/mean_center.cuh
+++ b/cpp/include/raft/stats/mean_center.cuh
@@ -38,12 +38,25 @@ namespace stats {
  * @param stream cuda stream where to launch work
  */
 template <typename Type, typename IdxType = int, int TPB = 256>
-void meanCenter(Type *out, const Type *data, const Type *mu, IdxType D,
-                IdxType N, bool rowMajor, bool bcastAlongRows,
-                cudaStream_t stream) {
+void meanCenter(Type* out,
+                const Type* data,
+                const Type* mu,
+                IdxType D,
+                IdxType N,
+                bool rowMajor,
+                bool bcastAlongRows,
+                cudaStream_t stream)
+{
   raft::linalg::matrixVectorOp(
-    out, data, mu, D, N, rowMajor, bcastAlongRows,
-    [] __device__(Type a, Type b) { return a - b; }, stream);
+    out,
+    data,
+    mu,
+    D,
+    N,
+    rowMajor,
+    bcastAlongRows,
+    [] __device__(Type a, Type b) { return a - b; },
+    stream);
 }
 
 /**
@@ -61,11 +74,25 @@ void meanCenter(Type *out, const Type *data, const Type *mu, IdxType D,
  * @param stream cuda stream where to launch work
  */
 template <typename Type, typename IdxType = int, int TPB = 256>
-void meanAdd(Type *out, const Type *data, const Type *mu, IdxType D, IdxType N,
-             bool rowMajor, bool bcastAlongRows, cudaStream_t stream) {
+void meanAdd(Type* out,
+             const Type* data,
+             const Type* mu,
+             IdxType D,
+             IdxType N,
+             bool rowMajor,
+             bool bcastAlongRows,
+             cudaStream_t stream)
+{
   raft::linalg::matrixVectorOp(
-    out, data, mu, D, N, rowMajor, bcastAlongRows,
-    [] __device__(Type a, Type b) { return a + b; }, stream);
+    out,
+    data,
+    mu,
+    D,
+    N,
+    rowMajor,
+    bcastAlongRows,
+    [] __device__(Type a, Type b) { return a + b; },
+    stream);
 }
 
 };  // end namespace stats
diff --git a/cpp/include/raft/stats/stddev.cuh b/cpp/include/raft/stats/stddev.cuh
index f12c633829..1dd9cd56bc 100644
--- a/cpp/include/raft/stats/stddev.cuh
+++ b/cpp/include/raft/stats/stddev.cuh
@@ -26,15 +26,15 @@ namespace stats {
 
 ///@todo: ColPerBlk has been tested only for 32!
 template <typename Type, typename IdxType, int TPB, int ColsPerBlk = 32>
-__global__ void stddevKernelRowMajor(Type *std, const Type *data, IdxType D,
-                                     IdxType N) {
+__global__ void stddevKernelRowMajor(Type* std, const Type* data, IdxType D, IdxType N)
+{
   const int RowsPerBlkPerIter = TPB / ColsPerBlk;
-  IdxType thisColId = threadIdx.x % ColsPerBlk;
-  IdxType thisRowId = threadIdx.x / ColsPerBlk;
-  IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk);
-  IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter);
-  Type thread_data = Type(0);
-  const IdxType stride = RowsPerBlkPerIter * gridDim.x;
+  IdxType thisColId           = threadIdx.x % ColsPerBlk;
+  IdxType thisRowId           = threadIdx.x / ColsPerBlk;
+  IdxType colId               = thisColId + ((IdxType)blockIdx.y * ColsPerBlk);
+  IdxType rowId               = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter);
+  Type thread_data            = Type(0);
+  const IdxType stride        = RowsPerBlkPerIter * gridDim.x;
   for (IdxType i = rowId; i < N; i += stride) {
     Type val = (colId < D) ? data[i * D + colId] : Type(0);
     thread_data += val * val;
@@ -48,41 +48,39 @@ __global__ void stddevKernelRowMajor(Type *std, const Type *data, IdxType D,
 }
 
 template <typename Type, typename IdxType, int TPB>
-__global__ void stddevKernelColMajor(Type *std, const Type *data,
-                                     const Type *mu, IdxType D, IdxType N) {
+__global__ void stddevKernelColMajor(
+  Type* std, const Type* data, const Type* mu, IdxType D, IdxType N)
+{
   typedef cub::BlockReduce<Type, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   Type thread_data = Type(0);
   IdxType colStart = N * blockIdx.x;
-  Type m = mu[blockIdx.x];
+  Type m           = mu[blockIdx.x];
   for (IdxType i = threadIdx.x; i < N; i += TPB) {
     IdxType idx = colStart + i;
-    Type diff = data[idx] - m;
+    Type diff   = data[idx] - m;
     thread_data += diff * diff;
   }
   Type acc = BlockReduce(temp_storage).Sum(thread_data);
-  if (threadIdx.x == 0) {
-    std[blockIdx.x] = raft::mySqrt(acc / N);
-  }
+  if (threadIdx.x == 0) { std[blockIdx.x] = raft::mySqrt(acc / N); }
 }
 
 template <typename Type, typename IdxType, int TPB>
-__global__ void varsKernelColMajor(Type *var, const Type *data, const Type *mu,
-                                   IdxType D, IdxType N) {
+__global__ void varsKernelColMajor(
+  Type* var, const Type* data, const Type* mu, IdxType D, IdxType N)
+{
   typedef cub::BlockReduce<Type, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   Type thread_data = Type(0);
   IdxType colStart = N * blockIdx.x;
-  Type m = mu[blockIdx.x];
+  Type m           = mu[blockIdx.x];
   for (IdxType i = threadIdx.x; i < N; i += TPB) {
     IdxType idx = colStart + i;
-    Type diff = data[idx] - m;
+    Type diff   = data[idx] - m;
     thread_data += diff * diff;
   }
   Type acc = BlockReduce(temp_storage).Sum(thread_data);
-  if (threadIdx.x == 0) {
-    var[blockIdx.x] = acc / N;
-  }
+  if (threadIdx.x == 0) { var[blockIdx.x] = acc / N; }
 }
 
 /**
@@ -104,28 +102,33 @@ __global__ void varsKernelColMajor(Type *var, const Type *data, const Type *mu,
  * @param stream cuda stream where to launch work
  */
 template <typename Type, typename IdxType = int>
-void stddev(Type *std, const Type *data, const Type *mu, IdxType D, IdxType N,
-            bool sample, bool rowMajor, cudaStream_t stream) {
+void stddev(Type* std,
+            const Type* data,
+            const Type* mu,
+            IdxType D,
+            IdxType N,
+            bool sample,
+            bool rowMajor,
+            cudaStream_t stream)
+{
   static const int TPB = 256;
   if (rowMajor) {
     static const int RowsPerThread = 4;
-    static const int ColsPerBlk = 32;
-    static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread;
-    dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk),
-              raft::ceildiv(D, (IdxType)ColsPerBlk));
+    static const int ColsPerBlk    = 32;
+    static const int RowsPerBlk    = (TPB / ColsPerBlk) * RowsPerThread;
+    dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk));
     CUDA_CHECK(cudaMemset(std, 0, sizeof(Type) * D));
-    stddevKernelRowMajor<Type, IdxType, TPB, ColsPerBlk>
-      <<<grid, TPB, 0, stream>>>(std, data, D, N);
+    stddevKernelRowMajor<Type, IdxType, TPB, ColsPerBlk><<<grid, TPB, 0, stream>>>(std, data, D, N);
     Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N));
     raft::linalg::binaryOp(
-      std, std, mu, D,
-      [ratio] __device__(Type a, Type b) {
-        return raft::mySqrt(a * ratio - b * b);
-      },
+      std,
+      std,
+      mu,
+      D,
+      [ratio] __device__(Type a, Type b) { return raft::mySqrt(a * ratio - b * b); },
       stream);
   } else {
-    stddevKernelColMajor<Type, IdxType, TPB>
-      <<<D, TPB, 0, stream>>>(std, data, mu, D, N);
+    stddevKernelColMajor<Type, IdxType, TPB><<<D, TPB, 0, stream>>>(std, data, mu, D, N);
   }
   CUDA_CHECK(cudaPeekAtLastError());
 }
@@ -149,25 +152,28 @@ void stddev(Type *std, const Type *data, const Type *mu, IdxType D, IdxType N,
  * @param stream cuda stream where to launch work
  */
 template <typename Type, typename IdxType = int>
-void vars(Type *var, const Type *data, const Type *mu, IdxType D, IdxType N,
-          bool sample, bool rowMajor, cudaStream_t stream) {
+void vars(Type* var,
+          const Type* data,
+          const Type* mu,
+          IdxType D,
+          IdxType N,
+          bool sample,
+          bool rowMajor,
+          cudaStream_t stream)
+{
   static const int TPB = 256;
   if (rowMajor) {
     static const int RowsPerThread = 4;
-    static const int ColsPerBlk = 32;
-    static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread;
-    dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk),
-              raft::ceildiv(D, (IdxType)ColsPerBlk));
+    static const int ColsPerBlk    = 32;
+    static const int RowsPerBlk    = (TPB / ColsPerBlk) * RowsPerThread;
+    dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk));
     CUDA_CHECK(cudaMemset(var, 0, sizeof(Type) * D));
-    stddevKernelRowMajor<Type, IdxType, TPB, ColsPerBlk>
-      <<<grid, TPB, 0, stream>>>(var, data, D, N);
+    stddevKernelRowMajor<Type, IdxType, TPB, ColsPerBlk><<<grid, TPB, 0, stream>>>(var, data, D, N);
     Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N));
     raft::linalg::binaryOp(
-      var, var, mu, D,
-      [ratio] __device__(Type a, Type b) { return a * ratio - b * b; }, stream);
+      var, var, mu, D, [ratio] __device__(Type a, Type b) { return a * ratio - b * b; }, stream);
   } else {
-    varsKernelColMajor<Type, IdxType, TPB>
-      <<<D, TPB, 0, stream>>>(var, data, mu, D, N);
+    varsKernelColMajor<Type, IdxType, TPB><<<D, TPB, 0, stream>>>(var, data, mu, D, N);
   }
   CUDA_CHECK(cudaPeekAtLastError());
 }
diff --git a/cpp/include/raft/stats/sum.cuh b/cpp/include/raft/stats/sum.cuh
index 5f8416c7e2..c7b8ce12b6 100644
--- a/cpp/include/raft/stats/sum.cuh
+++ b/cpp/include/raft/stats/sum.cuh
@@ -26,15 +26,15 @@ namespace stats {
 
 ///@todo: ColsPerBlk has been tested only for 32!
 template <typename Type, typename IdxType, int TPB, int ColsPerBlk = 32>
-__global__ void sumKernelRowMajor(Type *mu, const Type *data, IdxType D,
-                                  IdxType N) {
+__global__ void sumKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N)
+{
   const int RowsPerBlkPerIter = TPB / ColsPerBlk;
-  IdxType thisColId = threadIdx.x % ColsPerBlk;
-  IdxType thisRowId = threadIdx.x / ColsPerBlk;
-  IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk);
-  IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter);
-  Type thread_data = Type(0);
-  const IdxType stride = RowsPerBlkPerIter * gridDim.x;
+  IdxType thisColId           = threadIdx.x % ColsPerBlk;
+  IdxType thisRowId           = threadIdx.x / ColsPerBlk;
+  IdxType colId               = thisColId + ((IdxType)blockIdx.y * ColsPerBlk);
+  IdxType rowId               = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter);
+  Type thread_data            = Type(0);
+  const IdxType stride        = RowsPerBlkPerIter * gridDim.x;
   for (IdxType i = rowId; i < N; i += stride)
     thread_data += (colId < D) ? data[i * D + colId] : Type(0);
   __shared__ Type smu[ColsPerBlk];
@@ -46,8 +46,8 @@ __global__ void sumKernelRowMajor(Type *mu, const Type *data, IdxType D,
 }
 
 template <typename Type, typename IdxType, int TPB>
-__global__ void sumKernelColMajor(Type *mu, const Type *data, IdxType D,
-                                  IdxType N) {
+__global__ void sumKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType N)
+{
   typedef cub::BlockReduce<Type, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   Type thread_data = Type(0);
@@ -57,9 +57,7 @@ __global__ void sumKernelColMajor(Type *mu, const Type *data, IdxType D,
     thread_data += data[idx];
   }
   Type acc = BlockReduce(temp_storage).Sum(thread_data);
-  if (threadIdx.x == 0) {
-    mu[blockIdx.x] = acc;
-  }
+  if (threadIdx.x == 0) { mu[blockIdx.x] = acc; }
 }
 
 /**
@@ -77,21 +75,19 @@ __global__ void sumKernelColMajor(Type *mu, const Type *data, IdxType D,
  * @param stream cuda stream where to launch work
  */
 template <typename Type, typename IdxType = int>
-void sum(Type *output, const Type *input, IdxType D, IdxType N, bool rowMajor,
-         cudaStream_t stream) {
+void sum(Type* output, const Type* input, IdxType D, IdxType N, bool rowMajor, cudaStream_t stream)
+{
   static const int TPB = 256;
   if (rowMajor) {
     static const int RowsPerThread = 4;
-    static const int ColsPerBlk = 32;
-    static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread;
-    dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk),
-              raft::ceildiv(D, (IdxType)ColsPerBlk));
+    static const int ColsPerBlk    = 32;
+    static const int RowsPerBlk    = (TPB / ColsPerBlk) * RowsPerThread;
+    dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk));
     CUDA_CHECK(cudaMemset(output, 0, sizeof(Type) * D));
     sumKernelRowMajor<Type, IdxType, TPB, ColsPerBlk>
       <<<grid, TPB, 0, stream>>>(output, input, D, N);
   } else {
-    sumKernelColMajor<Type, IdxType, TPB>
-      <<<D, TPB, 0, stream>>>(output, input, D, N);
+    sumKernelColMajor<Type, IdxType, TPB><<<D, TPB, 0, stream>>>(output, input, D, N);
   }
   CUDA_CHECK(cudaPeekAtLastError());
 }
diff --git a/cpp/include/raft/vectorized.cuh b/cpp/include/raft/vectorized.cuh
index 1829fc0351..1e0885fb99 100644
--- a/cpp/include/raft/vectorized.cuh
+++ b/cpp/include/raft/vectorized.cuh
@@ -22,11 +22,11 @@
 namespace raft {
 
 template <typename math_, int VecLen>
-struct IOType {};
+struct IOType {
+};
 template <>
 struct IOType<bool, 1> {
-  static_assert(sizeof(bool) == sizeof(int8_t),
-                "IOType bool size assumption failed");
+  static_assert(sizeof(bool) == sizeof(int8_t), "IOType bool size assumption failed");
   typedef int8_t Type;
 };
 template <>
@@ -215,42 +215,42 @@ struct IOType<double, 2> {
 };
 
 /**
-     * @struct TxN_t
-     *
-     * @brief Internal data structure that is used to define a facade for vectorized
-     * loads/stores across the most common POD types. The goal of his file is to
-     * provide with CUDA programmers, an easy way to have compiler issue vectorized
-     * load or store instructions to memory (either global or shared). Vectorized
-     * accesses to memory are important as they'll utilize its resources
-     * efficiently,
-     * when compared to their non-vectorized counterparts. Obviously, for whatever
-     * reasons if one is unable to issue such vectorized operations, one can always
-     * fallback to using POD types.
-     *
-     * Example demonstrating the use of load operations, performing math on such
-     * loaded data and finally storing it back.
-     * @code{.cu}
-     * TxN_t<uint8_t,8> mydata1, mydata2;
-     * int idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * mydata1.Ratio;
-     * mydata1.load(ptr1, idx);
-     * mydata2.load(ptr2, idx);
-     * #pragma unroll
-     * for(int i=0;i<mydata1.Ratio;++i) {
-     *     mydata1.val.data[i] += mydata2.val.data[i];
-     * }
-     * mydata1.store(ptr1, idx);
-     * @endcode
-     *
-     * By doing as above, the interesting thing is that the code effectively remains
-     * almost the same, in case one wants to upgrade to TxN_t<uint16_t,16> type.
-     * Only change required is to replace variable declaration appropriately.
-     *
-     * Obviously, it's caller's responsibility to take care of pointer alignment!
-     *
-     * @tparam math_ the data-type in which the compute/math needs to happen
-     * @tparam veclen_ the number of 'math_' types to be loaded/stored per
-     * instruction
-     */
+ * @struct TxN_t
+ *
+ * @brief Internal data structure that is used to define a facade for vectorized
+ * loads/stores across the most common POD types. The goal of this file is to
+ * provide CUDA programmers with an easy way to have the compiler issue vectorized
+ * load or store instructions to memory (either global or shared). Vectorized
+ * accesses to memory are important as they'll utilize its resources
+ * efficiently,
+ * when compared to their non-vectorized counterparts. Obviously, if for whatever
+ * reason one is unable to issue such vectorized operations, one can always
+ * fall back to using POD types.
+ *
+ * Example demonstrating the use of load operations, performing math on such
+ * loaded data and finally storing it back.
+ * @code{.cu}
+ * TxN_t<uint8_t,8> mydata1, mydata2;
+ * int idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * mydata1.Ratio;
+ * mydata1.load(ptr1, idx);
+ * mydata2.load(ptr2, idx);
+ * #pragma unroll
+ * for(int i=0;i<mydata1.Ratio;++i) {
+ *     mydata1.val.data[i] += mydata2.val.data[i];
+ * }
+ * mydata1.store(ptr1, idx);
+ * @endcode
+ *
+ * With the approach above, the code effectively remains almost the same if one
+ * wants to upgrade to the TxN_t<uint16_t,16> type. The only change required is
+ * to replace the variable declaration appropriately.
+ *
+ * Obviously, it's the caller's responsibility to take care of pointer alignment!
+ *
+ * @tparam math_ the data-type in which the compute/math needs to happen
+ * @tparam veclen_ the number of 'math_' types to be loaded/stored per
+ * instruction
+ */
 template <typename math_, int veclen_>
 struct TxN_t {
   /** underlying math data type */
@@ -274,7 +274,8 @@ struct TxN_t {
    * @brief Fill the contents of this structure with a constant value
    * @param _val the constant to be filled
    */
-  DI void fill(math_t _val) {
+  DI void fill(math_t _val)
+  {
 #pragma unroll
     for (int i = 0; i < Ratio; ++i) {
       val.data[i] = _val;
@@ -299,21 +300,24 @@ struct TxN_t {
    * @{
    */
   template <typename idx_t = int>
-  DI void load(const math_t *ptr, idx_t idx) {
-    const io_t *bptr = reinterpret_cast<const io_t *>(&ptr[idx]);
-    val.internal = __ldg(bptr);
+  DI void load(const math_t* ptr, idx_t idx)
+  {
+    const io_t* bptr = reinterpret_cast<const io_t*>(&ptr[idx]);
+    val.internal     = __ldg(bptr);
   }
 
   template <typename idx_t = int>
-  DI void load(math_t *ptr, idx_t idx) {
-    io_t *bptr = reinterpret_cast<io_t *>(&ptr[idx]);
+  DI void load(math_t* ptr, idx_t idx)
+  {
+    io_t* bptr   = reinterpret_cast<io_t*>(&ptr[idx]);
     val.internal = *bptr;
   }
 
   template <typename idx_t = int>
-  DI void store(math_t *ptr, idx_t idx) {
-    io_t *bptr = reinterpret_cast<io_t *>(&ptr[idx]);
-    *bptr = val.internal;
+  DI void store(math_t* ptr, idx_t idx)
+  {
+    io_t* bptr = reinterpret_cast<io_t*>(&ptr[idx]);
+    *bptr      = val.internal;
   }
   /** @} */
 };
@@ -330,11 +334,17 @@ struct TxN_t<math_, 0> {
 
   DI void fill(math_t _val) {}
   template <typename idx_t = int>
-  DI void load(const math_t *ptr, idx_t idx) {}
+  DI void load(const math_t* ptr, idx_t idx)
+  {
+  }
   template <typename idx_t = int>
-  DI void load(math_t *ptr, idx_t idx) {}
+  DI void load(math_t* ptr, idx_t idx)
+  {
+  }
   template <typename idx_t = int>
-  DI void store(math_t *ptr, idx_t idx) {}
+  DI void store(math_t* ptr, idx_t idx)
+  {
+  }
 };
 
 }  // namespace raft
diff --git a/cpp/test/cluster_solvers.cu b/cpp/test/cluster_solvers.cu
index 4ff6cdf5fa..284a873dec 100644
--- a/cpp/test/cluster_solvers.cu
+++ b/cpp/test/cluster_solvers.cu
@@ -23,7 +23,8 @@
 
 namespace raft {
 
-TEST(Raft, ClusterSolvers) {
+TEST(Raft, ClusterSolvers)
+{
   using namespace matrix;
   using index_type = int;
   using value_type = double;
@@ -40,7 +41,7 @@ TEST(Raft, ClusterSolvers) {
   index_type d{10};
   index_type k{5};
 
-  //nullptr expected to trigger exceptions:
+  // nullptr expected to trigger exceptions:
   //
   value_type* eigvecs{nullptr};
   index_type* codes{nullptr};
@@ -49,11 +50,11 @@ TEST(Raft, ClusterSolvers) {
 
   kmeans_solver_t<index_type, value_type> cluster_solver{cfg};
 
-  EXPECT_ANY_THROW(cluster_solver.solve(h, thrust::cuda::par.on(stream), n, d,
-                                        eigvecs, codes));
+  EXPECT_ANY_THROW(cluster_solver.solve(h, thrust::cuda::par.on(stream), n, d, eigvecs, codes));
 }
 
-TEST(Raft, ModularitySolvers) {
+TEST(Raft, ModularitySolvers)
+{
   using namespace matrix;
   using index_type = int;
   using value_type = double;
@@ -68,7 +69,7 @@ TEST(Raft, ModularitySolvers) {
   value_type tol{1.0e-10};
   bool reorthog{true};
 
-  //nullptr expected to trigger exceptions:
+  // nullptr expected to trigger exceptions:
   //
   index_type* clusters{nullptr};
   value_type* eigvals{nullptr};
@@ -82,21 +83,18 @@ TEST(Raft, ModularitySolvers) {
 
   index_type k{5};
 
-  cluster_solver_config_t<index_type, value_type> clust_cfg{k, maxiter, tol,
-                                                            seed};
+  cluster_solver_config_t<index_type, value_type> clust_cfg{k, maxiter, tol, seed};
   kmeans_solver_t<index_type, value_type> cluster_solver{clust_cfg};
 
   auto stream = h.get_stream();
-  sparse_matrix_t<index_type, value_type> sm{h,       nullptr, nullptr,
-                                             nullptr, 0,       0};
+  sparse_matrix_t<index_type, value_type> sm{h, nullptr, nullptr, nullptr, 0, 0};
   auto t_exe_p = thrust::cuda::par.on(stream);
 
   EXPECT_ANY_THROW(spectral::modularity_maximization(
     h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs));
 
   value_type modularity{0};
-  EXPECT_ANY_THROW(
-    spectral::analyzeModularity(h, t_exe_p, sm, k, clusters, modularity));
+  EXPECT_ANY_THROW(spectral::analyzeModularity(h, t_exe_p, sm, k, clusters, modularity));
 }
 
 }  // namespace raft
diff --git a/cpp/test/cudart_utils.cpp b/cpp/test/cudart_utils.cpp
index c14d880efd..150767992f 100644
--- a/cpp/test/cudart_utils.cpp
+++ b/cpp/test/cudart_utils.cpp
@@ -20,7 +20,8 @@
 
 namespace raft {
 
-TEST(Raft, Utils) {
+TEST(Raft, Utils)
+{
   ASSERT_NO_THROW(ASSERT(1 == 1, "Should not assert!"));
   ASSERT_THROW(ASSERT(1 != 1, "Should assert!"), exception);
   ASSERT_THROW(THROW("Should throw!"), exception);
diff --git a/cpp/test/distance/dist_adj.cu b/cpp/test/distance/dist_adj.cu
index e2ed2c01dc..9ed32b80ef 100644
--- a/cpp/test/distance/dist_adj.cu
+++ b/cpp/test/distance/dist_adj.cu
@@ -25,30 +25,42 @@ namespace raft {
 namespace distance {
 
 template <typename DataType>
-__global__ void naiveDistanceAdjKernel(bool *dist, const DataType *x,
-                                       const DataType *y, int m, int n, int k,
-                                       DataType eps, bool isRowMajor) {
+__global__ void naiveDistanceAdjKernel(bool* dist,
+                                       const DataType* x,
+                                       const DataType* y,
+                                       int m,
+                                       int n,
+                                       int k,
+                                       DataType eps,
+                                       bool isRowMajor)
+{
   int midx = threadIdx.x + blockIdx.x * blockDim.x;
   int nidx = threadIdx.y + blockIdx.y * blockDim.y;
   if (midx >= m || nidx >= n) return;
   DataType acc = DataType(0);
   for (int i = 0; i < k; ++i) {
-    int xidx = isRowMajor ? i + midx * k : i * m + midx;
-    int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
+    int xidx  = isRowMajor ? i + midx * k : i * m + midx;
+    int yidx  = isRowMajor ? i + nidx * k : i * n + nidx;
     auto diff = x[xidx] - y[yidx];
     acc += diff * diff;
   }
-  int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
+  int outidx   = isRowMajor ? midx * n + nidx : midx + m * nidx;
   dist[outidx] = acc <= eps;
 }
 
 template <typename DataType>
-void naiveDistanceAdj(bool *dist, const DataType *x, const DataType *y, int m,
-                      int n, int k, DataType eps, bool isRowMajor) {
+void naiveDistanceAdj(bool* dist,
+                      const DataType* x,
+                      const DataType* y,
+                      int m,
+                      int n,
+                      int k,
+                      DataType eps,
+                      bool isRowMajor)
+{
   static const dim3 TPB(16, 32, 1);
   dim3 nblks(raft::ceildiv(m, (int)TPB.x), raft::ceildiv(n, (int)TPB.y), 1);
-  naiveDistanceAdjKernel<DataType>
-    <<<nblks, TPB>>>(dist, x, y, m, n, k, eps, isRowMajor);
+  naiveDistanceAdjKernel<DataType><<<nblks, TPB>>>(dist, x, y, m, n, k, eps, isRowMajor);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -61,21 +73,21 @@ struct DistanceAdjInputs {
 };
 
 template <typename DataType>
-::std::ostream &operator<<(::std::ostream &os,
-                           const DistanceAdjInputs<DataType> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const DistanceAdjInputs<DataType>& dims)
+{
   return os;
 }
 
 template <typename DataType>
-class DistanceAdjTest
-  : public ::testing::TestWithParam<DistanceAdjInputs<DataType>> {
+class DistanceAdjTest : public ::testing::TestWithParam<DistanceAdjInputs<DataType>> {
  public:
-  void SetUp() override {
+  void SetUp() override
+  {
     params = ::testing::TestWithParam<DistanceAdjInputs<DataType>>::GetParam();
     raft::random::Rng r(params.seed);
-    int m = params.m;
-    int n = params.n;
-    int k = params.k;
+    int m           = params.m;
+    int n           = params.n;
+    int k           = params.k;
     bool isRowMajor = params.isRowMajor;
     cudaStream_t stream;
     CUDA_CHECK(cudaStreamCreate(&stream));
@@ -89,25 +101,23 @@ class DistanceAdjTest
     DataType threshold = params.eps;
 
     naiveDistanceAdj(dist_ref, x, y, m, n, k, threshold, isRowMajor);
-    char *workspace = nullptr;
-    size_t worksize =
-      raft::distance::getWorkspaceSize<raft::distance::DistanceType::L2Expanded,
-                                       DataType, DataType, bool>(x, y, m, n, k);
-    if (worksize != 0) {
-      raft::allocate(workspace, worksize);
-    }
+    char* workspace = nullptr;
+    size_t worksize = raft::distance::
+      getWorkspaceSize<raft::distance::DistanceType::L2Expanded, DataType, DataType, bool>(
+        x, y, m, n, k);
+    if (worksize != 0) { raft::allocate(workspace, worksize); }
 
     auto fin_op = [threshold] __device__(DataType d_val, int g_d_idx) {
       return d_val <= threshold;
     };
-    raft::distance::distance<raft::distance::DistanceType::L2Expanded, DataType,
-                             DataType, bool>(
+    raft::distance::distance<raft::distance::DistanceType::L2Expanded, DataType, DataType, bool>(
       x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor);
     CUDA_CHECK(cudaStreamDestroy(stream));
     CUDA_CHECK(cudaFree(workspace));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(x));
     CUDA_CHECK(cudaFree(y));
     CUDA_CHECK(cudaFree(dist_ref));
@@ -131,13 +141,13 @@ const std::vector<DistanceAdjInputs<float>> inputsf = {
   {10.0f, 1024, 1024, 32, false, 1234ULL},
 };
 typedef DistanceAdjTest<float> DistanceAdjTestF;
-TEST_P(DistanceAdjTestF, Result) {
+TEST_P(DistanceAdjTestF, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
   ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::Compare<bool>()));
 }
-INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestF,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestF, ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceAdjInputs<double>> inputsd = {
   {0.01, 1024, 1024, 32, true, 1234ULL},
@@ -150,13 +160,13 @@ const std::vector<DistanceAdjInputs<double>> inputsd = {
   {10.0, 1024, 1024, 32, false, 1234ULL},
 };
 typedef DistanceAdjTest<double> DistanceAdjTestD;
-TEST_P(DistanceAdjTestD, Result) {
+TEST_P(DistanceAdjTestD, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
   ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::Compare<bool>()));
 }
-INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestD,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestD, ::testing::ValuesIn(inputsd));
 
 }  // namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/dist_canberra.cu b/cpp/test/distance/dist_canberra.cu
index 10bc4d1899..c812a1985d 100644
--- a/cpp/test/distance/dist_canberra.cu
+++ b/cpp/test/distance/dist_canberra.cu
@@ -21,8 +21,8 @@ namespace raft {
 namespace distance {
 
 template <typename DataType>
-class DistanceCanberra
-  : public DistanceTest<raft::distance::DistanceType::Canberra, DataType> {};
+class DistanceCanberra : public DistanceTest<raft::distance::DistanceType::Canberra, DataType> {
+};
 
 const std::vector<DistanceInputs<float>> inputsf = {
   {0.001f, 1024, 1024, 32, true, 1234ULL},
@@ -35,14 +35,14 @@ const std::vector<DistanceInputs<float>> inputsf = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceCanberra<float> DistanceCanberraF;
-TEST_P(DistanceCanberraF, Result) {
+TEST_P(DistanceCanberraF, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n,
-                                raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(
+    raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraF,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraF, ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceInputs<double>> inputsd = {
   {0.001, 1024, 1024, 32, true, 1234ULL},
@@ -55,14 +55,14 @@ const std::vector<DistanceInputs<double>> inputsd = {
   {0.003, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceCanberra<double> DistanceCanberraD;
-TEST_P(DistanceCanberraD, Result) {
+TEST_P(DistanceCanberraD, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n,
-                                raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(
+    raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraD,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraD, ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/dist_chebyshev.cu b/cpp/test/distance/dist_chebyshev.cu
index 6a2b02863a..0a4a69f059 100644
--- a/cpp/test/distance/dist_chebyshev.cu
+++ b/cpp/test/distance/dist_chebyshev.cu
@@ -21,8 +21,8 @@ namespace raft {
 namespace distance {
 
 template <typename DataType>
-class DistanceLinf
-  : public DistanceTest<raft::distance::DistanceType::Linf, DataType> {};
+class DistanceLinf : public DistanceTest<raft::distance::DistanceType::Linf, DataType> {
+};
 
 const std::vector<DistanceInputs<float>> inputsf = {
   {0.001f, 1024, 1024, 32, true, 1234ULL},
@@ -35,14 +35,14 @@ const std::vector<DistanceInputs<float>> inputsf = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceLinf<float> DistanceLinfF;
-TEST_P(DistanceLinfF, Result) {
+TEST_P(DistanceLinfF, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n,
-                                raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(
+    raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfF,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfF, ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceInputs<double>> inputsd = {
   {0.001, 1024, 1024, 32, true, 1234ULL},
@@ -55,14 +55,14 @@ const std::vector<DistanceInputs<double>> inputsd = {
   {0.003, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceLinf<double> DistanceLinfD;
-TEST_P(DistanceLinfD, Result) {
+TEST_P(DistanceLinfD, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n,
-                                raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(
+    raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfD,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfD, ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/dist_cos.cu b/cpp/test/distance/dist_cos.cu
index 291c4196f9..f7510c17b1 100644
--- a/cpp/test/distance/dist_cos.cu
+++ b/cpp/test/distance/dist_cos.cu
@@ -21,9 +21,8 @@ namespace raft {
 namespace distance {
 
 template <typename DataType>
-class DistanceExpCos
-  : public DistanceTest<raft::distance::DistanceType::CosineExpanded,
-                        DataType> {};
+class DistanceExpCos : public DistanceTest<raft::distance::DistanceType::CosineExpanded, DataType> {
+};
 
 const std::vector<DistanceInputs<float>> inputsf = {
   {0.001f, 1024, 1024, 32, true, 1234ULL},
@@ -36,14 +35,13 @@ const std::vector<DistanceInputs<float>> inputsf = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceExpCos<float> DistanceExpCosF;
-TEST_P(DistanceExpCosF, Result) {
+TEST_P(DistanceExpCosF, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n,
-                          raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosF,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosF, ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceInputs<double>> inputsd = {
   {0.001, 1024, 1024, 32, true, 1234ULL},
@@ -56,14 +54,13 @@ const std::vector<DistanceInputs<double>> inputsd = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceExpCos<double> DistanceExpCosD;
-TEST_P(DistanceExpCosD, Result) {
+TEST_P(DistanceExpCosD, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n,
-                          raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosD,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosD, ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/dist_euc_exp.cu b/cpp/test/distance/dist_euc_exp.cu
index 46e7ded0ec..e90d0e83dc 100644
--- a/cpp/test/distance/dist_euc_exp.cu
+++ b/cpp/test/distance/dist_euc_exp.cu
@@ -21,8 +21,8 @@ namespace raft {
 namespace distance {
 
 template <typename DataType>
-class DistanceEucExpTest
-  : public DistanceTest<raft::distance::DistanceType::L2Expanded, DataType> {};
+class DistanceEucExpTest : public DistanceTest<raft::distance::DistanceType::L2Expanded, DataType> {
+};
 
 const std::vector<DistanceInputs<float>> inputsf = {
   {0.001f, 1024, 1024, 32, true, 1234ULL},
@@ -35,14 +35,13 @@ const std::vector<DistanceInputs<float>> inputsf = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceEucExpTest<float> DistanceEucExpTestF;
-TEST_P(DistanceEucExpTestF, Result) {
+TEST_P(DistanceEucExpTestF, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n,
-                          raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestF,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestF, ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceInputs<double>> inputsd = {
   {0.001, 1024, 1024, 32, true, 1234ULL},
@@ -55,14 +54,13 @@ const std::vector<DistanceInputs<double>> inputsd = {
   {0.003, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceEucExpTest<double> DistanceEucExpTestD;
-TEST_P(DistanceEucExpTestD, Result) {
+TEST_P(DistanceEucExpTestD, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n,
-                          raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestD,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestD, ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/dist_euc_unexp.cu b/cpp/test/distance/dist_euc_unexp.cu
index 92f424647d..90412a9cb2 100644
--- a/cpp/test/distance/dist_euc_unexp.cu
+++ b/cpp/test/distance/dist_euc_unexp.cu
@@ -36,14 +36,13 @@ const std::vector<DistanceInputs<float>> inputsf = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceEucUnexpTest<float> DistanceEucUnexpTestF;
-TEST_P(DistanceEucUnexpTestF, Result) {
+TEST_P(DistanceEucUnexpTestF, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n,
-                          raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestF,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestF, ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceInputs<double>> inputsd = {
   {0.001, 1024, 1024, 32, true, 1234ULL},
@@ -56,14 +55,13 @@ const std::vector<DistanceInputs<double>> inputsd = {
   {0.003, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceEucUnexpTest<double> DistanceEucUnexpTestD;
-TEST_P(DistanceEucUnexpTestD, Result) {
+TEST_P(DistanceEucUnexpTestD, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n,
-                          raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestD,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestD, ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/dist_hellinger.cu b/cpp/test/distance/dist_hellinger.cu
index 39dc7aaeff..95b1908dc1 100644
--- a/cpp/test/distance/dist_hellinger.cu
+++ b/cpp/test/distance/dist_hellinger.cu
@@ -22,8 +22,8 @@ namespace distance {
 
 template <typename DataType>
 class DistanceHellingerExp
-  : public DistanceTest<raft::distance::DistanceType::HellingerExpanded,
-                        DataType> {};
+  : public DistanceTest<raft::distance::DistanceType::HellingerExpanded, DataType> {
+};
 
 const std::vector<DistanceInputs<float>> inputsf = {
   {0.001f, 1024, 1024, 32, true, 1234ULL},
@@ -36,14 +36,14 @@ const std::vector<DistanceInputs<float>> inputsf = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceHellingerExp<float> DistanceHellingerExpF;
-TEST_P(DistanceHellingerExpF, Result) {
+TEST_P(DistanceHellingerExpF, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n,
-                                raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(
+    raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpF,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpF, ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceInputs<double>> inputsd = {
   {0.001, 1024, 1024, 32, true, 1234ULL},
@@ -56,14 +56,14 @@ const std::vector<DistanceInputs<double>> inputsd = {
   {0.003, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceHellingerExp<double> DistanceHellingerExpD;
-TEST_P(DistanceHellingerExpD, Result) {
+TEST_P(DistanceHellingerExpD, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n,
-                                raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(
+    raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpD,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpD, ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/dist_l1.cu b/cpp/test/distance/dist_l1.cu
index bd32837e45..d14f8d8a0b 100644
--- a/cpp/test/distance/dist_l1.cu
+++ b/cpp/test/distance/dist_l1.cu
@@ -21,8 +21,8 @@ namespace raft {
 namespace distance {
 
 template <typename DataType>
-class DistanceUnexpL1
-  : public DistanceTest<raft::distance::DistanceType::L1, DataType> {};
+class DistanceUnexpL1 : public DistanceTest<raft::distance::DistanceType::L1, DataType> {
+};
 
 const std::vector<DistanceInputs<float>> inputsf = {
   {0.001f, 1024, 1024, 32, true, 1234ULL},
@@ -35,14 +35,14 @@ const std::vector<DistanceInputs<float>> inputsf = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceUnexpL1<float> DistanceUnexpL1F;
-TEST_P(DistanceUnexpL1F, Result) {
+TEST_P(DistanceUnexpL1F, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n,
-                                raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(
+    raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1F,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1F, ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceInputs<double>> inputsd = {
   {0.001, 1024, 1024, 32, true, 1234ULL},
@@ -55,14 +55,14 @@ const std::vector<DistanceInputs<double>> inputsd = {
   {0.003, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceUnexpL1<double> DistanceUnexpL1D;
-TEST_P(DistanceUnexpL1D, Result) {
+TEST_P(DistanceUnexpL1D, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n,
-                                raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(
+    raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1D,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1D, ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/dist_minkowski.cu b/cpp/test/distance/dist_minkowski.cu
index 42b8e294ac..cc6a5f60de 100644
--- a/cpp/test/distance/dist_minkowski.cu
+++ b/cpp/test/distance/dist_minkowski.cu
@@ -21,8 +21,7 @@ namespace raft {
 namespace distance {
 
 template <typename DataType>
-class DistanceLpUnexp
-  : public DistanceTest<raft::distance::DistanceType::LpUnexpanded, DataType> {
+class DistanceLpUnexp : public DistanceTest<raft::distance::DistanceType::LpUnexpanded, DataType> {
 };
 
 const std::vector<DistanceInputs<float>> inputsf = {
@@ -36,14 +35,14 @@ const std::vector<DistanceInputs<float>> inputsf = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL, 3.0f},
 };
 typedef DistanceLpUnexp<float> DistanceLpUnexpF;
-TEST_P(DistanceLpUnexpF, Result) {
+TEST_P(DistanceLpUnexpF, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n,
-                                raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(
+    raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpF,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpF, ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceInputs<double>> inputsd = {
   {0.001, 1024, 1024, 32, true, 1234ULL, 4.0},
@@ -56,14 +55,14 @@ const std::vector<DistanceInputs<double>> inputsd = {
   {0.003, 1024, 1024, 1024, false, 1234ULL, 3.0},
 };
 typedef DistanceLpUnexp<double> DistanceLpUnexpD;
-TEST_P(DistanceLpUnexpD, Result) {
+TEST_P(DistanceLpUnexpD, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n,
-                                raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(
+    raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpD,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpD, ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/distance_base.cuh b/cpp/test/distance/distance_base.cuh
index fc7b064205..a99d307abb 100644
--- a/cpp/test/distance/distance_base.cuh
+++ b/cpp/test/distance/distance_base.cuh
@@ -25,43 +25,52 @@ namespace raft {
 namespace distance {
 
 template <typename DataType>
-__global__ void naiveDistanceKernel(DataType *dist, const DataType *x,
-                                    const DataType *y, int m, int n, int k,
+__global__ void naiveDistanceKernel(DataType* dist,
+                                    const DataType* x,
+                                    const DataType* y,
+                                    int m,
+                                    int n,
+                                    int k,
                                     raft::distance::DistanceType type,
-                                    bool isRowMajor) {
+                                    bool isRowMajor)
+{
   int midx = threadIdx.x + blockIdx.x * blockDim.x;
   int nidx = threadIdx.y + blockIdx.y * blockDim.y;
   if (midx >= m || nidx >= n) return;
   DataType acc = DataType(0);
   for (int i = 0; i < k; ++i) {
-    int xidx = isRowMajor ? i + midx * k : i * m + midx;
-    int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
+    int xidx  = isRowMajor ? i + midx * k : i * m + midx;
+    int yidx  = isRowMajor ? i + nidx * k : i * n + nidx;
     auto diff = x[xidx] - y[yidx];
     acc += diff * diff;
   }
   if (type == raft::distance::DistanceType::L2SqrtExpanded ||
       type == raft::distance::DistanceType::L2SqrtUnexpanded)
     acc = raft::mySqrt(acc);
-  int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
+  int outidx   = isRowMajor ? midx * n + nidx : midx + m * nidx;
   dist[outidx] = acc;
 }
 
 template <typename DataType>
-__global__ void naiveL1_Linf_CanberraDistanceKernel(
-  DataType *dist, const DataType *x, const DataType *y, int m, int n, int k,
-  raft::distance::DistanceType type, bool isRowMajor) {
+__global__ void naiveL1_Linf_CanberraDistanceKernel(DataType* dist,
+                                                    const DataType* x,
+                                                    const DataType* y,
+                                                    int m,
+                                                    int n,
+                                                    int k,
+                                                    raft::distance::DistanceType type,
+                                                    bool isRowMajor)
+{
   int midx = threadIdx.x + blockIdx.x * blockDim.x;
   int nidx = threadIdx.y + blockIdx.y * blockDim.y;
-  if (midx >= m || nidx >= n) {
-    return;
-  }
+  if (midx >= m || nidx >= n) { return; }
 
   DataType acc = DataType(0);
   for (int i = 0; i < k; ++i) {
-    int xidx = isRowMajor ? i + midx * k : i * m + midx;
-    int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
-    auto a = x[xidx];
-    auto b = y[yidx];
+    int xidx  = isRowMajor ? i + midx * k : i * m + midx;
+    int yidx  = isRowMajor ? i + nidx * k : i * n + nidx;
+    auto a    = x[xidx];
+    auto b    = y[yidx];
     auto diff = (a > b) ? (a - b) : (b - a);
     if (type == raft::distance::DistanceType::Linf) {
       acc = raft::myMax(acc, diff);
@@ -75,29 +84,27 @@ __global__ void naiveL1_Linf_CanberraDistanceKernel(
     }
   }
 
-  int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
+  int outidx   = isRowMajor ? midx * n + nidx : midx + m * nidx;
   dist[outidx] = acc;
 }
 
 template <typename DataType>
-__global__ void naiveCosineDistanceKernel(DataType *dist, const DataType *x,
-                                          const DataType *y, int m, int n,
-                                          int k, bool isRowMajor) {
+__global__ void naiveCosineDistanceKernel(
+  DataType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor)
+{
   int midx = threadIdx.x + blockIdx.x * blockDim.x;
   int nidx = threadIdx.y + blockIdx.y * blockDim.y;
-  if (midx >= m || nidx >= n) {
-    return;
-  }
+  if (midx >= m || nidx >= n) { return; }
 
-  DataType acc_a = DataType(0);
-  DataType acc_b = DataType(0);
+  DataType acc_a  = DataType(0);
+  DataType acc_b  = DataType(0);
   DataType acc_ab = DataType(0);
 
   for (int i = 0; i < k; ++i) {
     int xidx = isRowMajor ? i + midx * k : i * m + midx;
     int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
-    auto a = x[xidx];
-    auto b = y[yidx];
+    auto a   = x[xidx];
+    auto b   = y[yidx];
     acc_a += a * a;
     acc_b += b * b;
     acc_ab += a * b;
@@ -106,64 +113,74 @@ __global__ void naiveCosineDistanceKernel(DataType *dist, const DataType *x,
   int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
 
   // Use 1.0 - (cosine similarity) to calc the distance
-  dist[outidx] =
-    (DataType)1.0 - acc_ab / (raft::mySqrt(acc_a) * raft::mySqrt(acc_b));
+  dist[outidx] = (DataType)1.0 - acc_ab / (raft::mySqrt(acc_a) * raft::mySqrt(acc_b));
 }
 
 template <typename DataType>
-__global__ void naiveHellingerDistanceKernel(DataType *dist, const DataType *x,
-                                             const DataType *y, int m, int n,
-                                             int k, bool isRowMajor) {
+__global__ void naiveHellingerDistanceKernel(
+  DataType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor)
+{
   int midx = threadIdx.x + blockIdx.x * blockDim.x;
   int nidx = threadIdx.y + blockIdx.y * blockDim.y;
-  if (midx >= m || nidx >= n) {
-    return;
-  }
+  if (midx >= m || nidx >= n) { return; }
 
   DataType acc_ab = DataType(0);
 
   for (int i = 0; i < k; ++i) {
     int xidx = isRowMajor ? i + midx * k : i * m + midx;
     int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
-    auto a = x[xidx];
-    auto b = y[yidx];
+    auto a   = x[xidx];
+    auto b   = y[yidx];
     acc_ab += raft::mySqrt(a) * raft::mySqrt(b);
   }
 
   int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
 
   // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative
-  acc_ab = 1 - acc_ab;
+  acc_ab         = 1 - acc_ab;
   auto rectifier = (!signbit(acc_ab));
-  dist[outidx] = raft::mySqrt(rectifier * acc_ab);
+  dist[outidx]   = raft::mySqrt(rectifier * acc_ab);
 }
 
 template <typename DataType>
-__global__ void naiveLpUnexpDistanceKernel(DataType *dist, const DataType *x,
-                                           const DataType *y, int m, int n,
-                                           int k, bool isRowMajor, DataType p) {
+__global__ void naiveLpUnexpDistanceKernel(DataType* dist,
+                                           const DataType* x,
+                                           const DataType* y,
+                                           int m,
+                                           int n,
+                                           int k,
+                                           bool isRowMajor,
+                                           DataType p)
+{
   int midx = threadIdx.x + blockIdx.x * blockDim.x;
   int nidx = threadIdx.y + blockIdx.y * blockDim.y;
   if (midx >= m || nidx >= n) return;
   DataType acc = DataType(0);
   for (int i = 0; i < k; ++i) {
-    int xidx = isRowMajor ? i + midx * k : i * m + midx;
-    int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
-    auto a = x[xidx];
-    auto b = y[yidx];
+    int xidx  = isRowMajor ? i + midx * k : i * m + midx;
+    int yidx  = isRowMajor ? i + nidx * k : i * n + nidx;
+    auto a    = x[xidx];
+    auto b    = y[yidx];
     auto diff = raft::L1Op<DataType>()(a - b);
     acc += raft::myPow(diff, p);
   }
   auto one_over_p = 1 / p;
-  acc = raft::myPow(acc, one_over_p);
-  int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
-  dist[outidx] = acc;
+  acc             = raft::myPow(acc, one_over_p);
+  int outidx      = isRowMajor ? midx * n + nidx : midx + m * nidx;
+  dist[outidx]    = acc;
 }
 
 template <typename DataType>
-void naiveDistance(DataType *dist, const DataType *x, const DataType *y, int m,
-                   int n, int k, raft::distance::DistanceType type,
-                   bool isRowMajor, DataType metric_arg = 2.0f) {
+void naiveDistance(DataType* dist,
+                   const DataType* x,
+                   const DataType* y,
+                   int m,
+                   int n,
+                   int k,
+                   raft::distance::DistanceType type,
+                   bool isRowMajor,
+                   DataType metric_arg = 2.0f)
+{
   static const dim3 TPB(16, 32, 1);
   dim3 nblks(raft::ceildiv(m, (int)TPB.x), raft::ceildiv(n, (int)TPB.y), 1);
 
@@ -178,23 +195,19 @@ void naiveDistance(DataType *dist, const DataType *x, const DataType *y, int m,
     case raft::distance::DistanceType::L2Unexpanded:
     case raft::distance::DistanceType::L2SqrtExpanded:
     case raft::distance::DistanceType::L2Expanded:
-      naiveDistanceKernel<DataType>
-        <<<nblks, TPB>>>(dist, x, y, m, n, k, type, isRowMajor);
+      naiveDistanceKernel<DataType><<<nblks, TPB>>>(dist, x, y, m, n, k, type, isRowMajor);
       break;
     case raft::distance::DistanceType::CosineExpanded:
-      naiveCosineDistanceKernel<DataType>
-        <<<nblks, TPB>>>(dist, x, y, m, n, k, isRowMajor);
+      naiveCosineDistanceKernel<DataType><<<nblks, TPB>>>(dist, x, y, m, n, k, isRowMajor);
       break;
     case raft::distance::DistanceType::HellingerExpanded:
-      naiveHellingerDistanceKernel<DataType>
-        <<<nblks, TPB>>>(dist, x, y, m, n, k, isRowMajor);
+      naiveHellingerDistanceKernel<DataType><<<nblks, TPB>>>(dist, x, y, m, n, k, isRowMajor);
       break;
     case raft::distance::DistanceType::LpUnexpanded:
       naiveLpUnexpDistanceKernel<DataType>
         <<<nblks, TPB>>>(dist, x, y, m, n, k, isRowMajor, metric_arg);
       break;
-    default:
-      FAIL() << "should be here\n";
+    default: FAIL() << "should be here\n";
   }
   CUDA_CHECK(cudaPeekAtLastError());
 }
@@ -209,37 +222,47 @@ struct DistanceInputs {
 };
 
 template <typename DataType>
-::std::ostream &operator<<(::std::ostream &os,
-                           const DistanceInputs<DataType> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const DistanceInputs<DataType>& dims)
+{
   return os;
 }
 
 template <raft::distance::DistanceType distanceType, typename DataType>
-void distanceLauncher(DataType *x, DataType *y, DataType *dist, DataType *dist2,
-                      int m, int n, int k, DistanceInputs<DataType> &params,
-                      DataType threshold, char *workspace, size_t worksize,
-                      cudaStream_t stream, bool isRowMajor,
-                      DataType metric_arg = 2.0f) {
+void distanceLauncher(DataType* x,
+                      DataType* y,
+                      DataType* dist,
+                      DataType* dist2,
+                      int m,
+                      int n,
+                      int k,
+                      DistanceInputs<DataType>& params,
+                      DataType threshold,
+                      char* workspace,
+                      size_t worksize,
+                      cudaStream_t stream,
+                      bool isRowMajor,
+                      DataType metric_arg = 2.0f)
+{
   auto fin_op = [dist2, threshold] __device__(DataType d_val, int g_d_idx) {
     dist2[g_d_idx] = (d_val < threshold) ? 0.f : d_val;
     return d_val;
   };
   raft::distance::distance<distanceType, DataType, DataType, DataType>(
-    x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor,
-    metric_arg);
+    x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, metric_arg);
 }
 
 template <raft::distance::DistanceType distanceType, typename DataType>
 class DistanceTest : public ::testing::TestWithParam<DistanceInputs<DataType>> {
  public:
-  void SetUp() override {
+  void SetUp() override
+  {
     params = ::testing::TestWithParam<DistanceInputs<DataType>>::GetParam();
     raft::random::Rng r(params.seed);
-    int m = params.m;
-    int n = params.n;
-    int k = params.k;
+    int m               = params.m;
+    int n               = params.n;
+    int k               = params.k;
     DataType metric_arg = params.metric_arg;
-    bool isRowMajor = params.isRowMajor;
+    bool isRowMajor     = params.isRowMajor;
     cudaStream_t stream;
     CUDA_CHECK(cudaStreamCreate(&stream));
     raft::allocate(x, m * k);
@@ -256,25 +279,33 @@ class DistanceTest : public ::testing::TestWithParam<DistanceInputs<DataType>> {
       r.uniform(y, n * k, DataType(-1.0), DataType(1.0), stream);
     }
 
-    naiveDistance(dist_ref, x, y, m, n, k, distanceType, isRowMajor,
-                  metric_arg);
-    char *workspace = nullptr;
+    naiveDistance(dist_ref, x, y, m, n, k, distanceType, isRowMajor, metric_arg);
+    char* workspace = nullptr;
     size_t worksize =
-      raft::distance::getWorkspaceSize<distanceType, DataType, DataType,
-                                       DataType>(x, y, m, n, k);
-    if (worksize != 0) {
-      raft::allocate(workspace, worksize);
-    }
+      raft::distance::getWorkspaceSize<distanceType, DataType, DataType, DataType>(x, y, m, n, k);
+    if (worksize != 0) { raft::allocate(workspace, worksize); }
 
     DataType threshold = -10000.f;
-    distanceLauncher<distanceType, DataType>(x, y, dist, dist2, m, n, k, params,
-                                             threshold, workspace, worksize,
-                                             stream, isRowMajor, metric_arg);
+    distanceLauncher<distanceType, DataType>(x,
+                                             y,
+                                             dist,
+                                             dist2,
+                                             m,
+                                             n,
+                                             k,
+                                             params,
+                                             threshold,
+                                             workspace,
+                                             worksize,
+                                             stream,
+                                             isRowMajor,
+                                             metric_arg);
     CUDA_CHECK(cudaStreamDestroy(stream));
     CUDA_CHECK(cudaFree(workspace));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(x));
     CUDA_CHECK(cudaFree(y));
     CUDA_CHECK(cudaFree(dist_ref));
diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu
index 4573a070b6..a7b763a2bc 100644
--- a/cpp/test/distance/fused_l2_nn.cu
+++ b/cpp/test/distance/fused_l2_nn.cu
@@ -29,40 +29,40 @@ template <typename LabelT, typename DataT>
 struct CubKVPMinReduce {
   typedef cub::KeyValuePair<LabelT, DataT> KVP;
 
-  DI KVP operator()(LabelT rit, const KVP &a, const KVP &b) {
-    return b.value < a.value ? b : a;
-  }
+  DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { return b.value < a.value ? b : a; }
 
-  DI KVP operator()(const KVP &a, const KVP &b) {
-    return b.value < a.value ? b : a;
-  }
+  DI KVP operator()(const KVP& a, const KVP& b) { return b.value < a.value ? b : a; }
 
 };  // KVPMinReduce
 
 template <typename DataT, bool Sqrt, typename ReduceOpT, int NWARPS>
-__global__ void naiveKernel(cub::KeyValuePair<int, DataT> *min, DataT *x,
-                            DataT *y, int m, int n, int k, int *workspace,
-                            DataT maxVal) {
-  int midx = threadIdx.y + blockIdx.y * blockDim.y;
-  int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+__global__ void naiveKernel(cub::KeyValuePair<int, DataT>* min,
+                            DataT* x,
+                            DataT* y,
+                            int m,
+                            int n,
+                            int k,
+                            int* workspace,
+                            DataT maxVal)
+{
+  int midx  = threadIdx.y + blockIdx.y * blockDim.y;
+  int nidx  = threadIdx.x + blockIdx.x * blockDim.x;
   DataT acc = DataT(0);
   for (int i = 0; i < k; ++i) {
-    int xidx = i + midx * k;
-    int yidx = i + nidx * k;
+    int xidx  = i + midx * k;
+    int yidx  = i + nidx * k;
     auto diff = midx >= m || nidx >= n ? DataT(0) : x[xidx] - y[yidx];
     acc += diff * diff;
   }
-  if (Sqrt) {
-    acc = raft::mySqrt(acc);
-  }
+  if (Sqrt) { acc = raft::mySqrt(acc); }
   ReduceOpT redOp;
   typedef cub::WarpReduce<cub::KeyValuePair<int, DataT>> WarpReduce;
   __shared__ typename WarpReduce::TempStorage temp[NWARPS];
   int warpId = threadIdx.x / raft::WarpSize;
   cub::KeyValuePair<int, DataT> tmp;
-  tmp.key = nidx;
+  tmp.key   = nidx;
   tmp.value = midx >= m || nidx >= n ? maxVal : acc;
-  tmp = WarpReduce(temp[warpId]).Reduce(tmp, CubKVPMinReduce<int, DataT>());
+  tmp       = WarpReduce(temp[warpId]).Reduce(tmp, CubKVPMinReduce<int, DataT>());
   if (threadIdx.x % raft::WarpSize == 0 && midx < m) {
     while (atomicCAS(workspace + midx, 0, 1) == 1)
       ;
@@ -74,8 +74,15 @@ __global__ void naiveKernel(cub::KeyValuePair<int, DataT> *min, DataT *x,
 }
 
 template <typename DataT, bool Sqrt>
-void naive(cub::KeyValuePair<int, DataT> *min, DataT *x, DataT *y, int m, int n,
-           int k, int *workspace, cudaStream_t stream) {
+void naive(cub::KeyValuePair<int, DataT>* min,
+           DataT* x,
+           DataT* y,
+           int m,
+           int n,
+           int k,
+           int* workspace,
+           cudaStream_t stream)
+{
   static const dim3 TPB(32, 16, 1);
   dim3 nblks(raft::ceildiv(n, (int)TPB.x), raft::ceildiv(m, (int)TPB.y), 1);
   CUDA_CHECK(cudaMemsetAsync(workspace, 0, sizeof(int) * m, stream));
@@ -85,8 +92,7 @@ void naive(cub::KeyValuePair<int, DataT> *min, DataT *x, DataT *y, int m, int n,
     <<<blks, 256, 0, stream>>>(min, m, std::numeric_limits<DataT>::max(), op);
   CUDA_CHECK(cudaGetLastError());
   naiveKernel<DataT, Sqrt, MinAndDistanceReduceOp<int, DataT>, 16>
-    <<<nblks, TPB, 0, stream>>>(min, x, y, m, n, k, workspace,
-                                std::numeric_limits<DataT>::max());
+    <<<nblks, TPB, 0, stream>>>(min, x, y, m, n, k, workspace, std::numeric_limits<DataT>::max());
   CUDA_CHECK(cudaGetLastError());
 }
 
@@ -100,7 +106,8 @@ struct Inputs {
 template <typename DataT, bool Sqrt>
 class FusedL2NNTest : public ::testing::TestWithParam<Inputs<DataT>> {
  public:
-  void SetUp() override {
+  void SetUp() override
+  {
     params = ::testing::TestWithParam<Inputs<DataT>>::GetParam();
     raft::random::Rng r(params.seed);
     int m = params.m;
@@ -121,7 +128,8 @@ class FusedL2NNTest : public ::testing::TestWithParam<Inputs<DataT>> {
     raft::linalg::rowNorm(yn, y, k, n, raft::linalg::L2Norm, true, stream);
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaStreamDestroy(stream));
     CUDA_CHECK(cudaFree(x));
@@ -136,25 +144,38 @@ class FusedL2NNTest : public ::testing::TestWithParam<Inputs<DataT>> {
  protected:
   Inputs<DataT> params;
   DataT *x, *y, *xn, *yn;
-  char *workspace;
-  cub::KeyValuePair<int, DataT> *min, *min_ref;
+  char* workspace;
+  cub::KeyValuePair<int, DataT>*min, *min_ref;
   cudaStream_t stream;
 
-  virtual void generateGoldenResult() {
+  virtual void generateGoldenResult()
+  {
     int m = params.m;
     int n = params.n;
     int k = params.k;
-    naive<DataT, Sqrt>(min_ref, x, y, m, n, k, (int *)workspace, stream);
+    naive<DataT, Sqrt>(min_ref, x, y, m, n, k, (int*)workspace, stream);
   }
 
-  void runTest(cub::KeyValuePair<int, DataT> *out) {
+  void runTest(cub::KeyValuePair<int, DataT>* out)
+  {
     int m = params.m;
     int n = params.n;
     int k = params.k;
     MinAndDistanceReduceOp<int, DataT> redOp;
-    fusedL2NN<DataT, cub::KeyValuePair<int, DataT>, int>(
-      out, x, y, xn, yn, m, n, k, (void *)workspace, redOp,
-      raft::distance::KVPMinReduce<int, DataT>(), Sqrt, true, stream);
+    fusedL2NN<DataT, cub::KeyValuePair<int, DataT>, int>(out,
+                                                         x,
+                                                         y,
+                                                         xn,
+                                                         yn,
+                                                         m,
+                                                         n,
+                                                         k,
+                                                         (void*)workspace,
+                                                         redOp,
+                                                         raft::distance::KVPMinReduce<int, DataT>(),
+                                                         Sqrt,
+                                                         true,
+                                                         stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 };
@@ -163,9 +184,10 @@ template <typename T>
 struct CompareApproxAbsKVP {
   typedef typename cub::KeyValuePair<int, T> KVP;
   CompareApproxAbsKVP(T eps_) : eps(eps_) {}
-  bool operator()(const KVP &a, const KVP &b) const {
-    T diff = raft::abs(raft::abs(a.value) - raft::abs(b.value));
-    T m = std::max(raft::abs(a.value), raft::abs(b.value));
+  bool operator()(const KVP& a, const KVP& b) const
+  {
+    T diff  = raft::abs(raft::abs(a.value) - raft::abs(b.value));
+    T m     = std::max(raft::abs(a.value), raft::abs(b.value));
     T ratio = m >= eps ? diff / m : diff;
     return (ratio <= eps);
   }
@@ -177,17 +199,20 @@ struct CompareApproxAbsKVP {
 template <typename T>
 struct CompareExactKVP {
   typedef typename cub::KeyValuePair<int, T> KVP;
-  bool operator()(const KVP &a, const KVP &b) const {
+  bool operator()(const KVP& a, const KVP& b) const
+  {
     if (a.value != b.value) return false;
     return true;
   }
 };
 
 template <typename K, typename V, typename L>
-::testing::AssertionResult devArrMatch(const cub::KeyValuePair<K, V> *expected,
-                                       const cub::KeyValuePair<K, V> *actual,
-                                       size_t size, L eq_compare,
-                                       cudaStream_t stream = 0) {
+::testing::AssertionResult devArrMatch(const cub::KeyValuePair<K, V>* expected,
+                                       const cub::KeyValuePair<K, V>* actual,
+                                       size_t size,
+                                       L eq_compare,
+                                       cudaStream_t stream = 0)
+{
   typedef typename cub::KeyValuePair<K, V> KVP;
   std::shared_ptr<KVP> exp_h(new KVP[size]);
   std::shared_ptr<KVP> act_h(new KVP[size]);
@@ -199,47 +224,42 @@ template <typename K, typename V, typename L>
     auto act = act_h.get()[i];
     if (!eq_compare(exp, act)) {
       return ::testing::AssertionFailure()
-             << "actual=" << act.key << "," << act.value
-             << " != expected=" << exp.key << "," << exp.value << " @" << i;
+             << "actual=" << act.key << "," << act.value << " != expected=" << exp.key << ","
+             << exp.value << " @" << i;
     }
   }
   return ::testing::AssertionSuccess();
 }
 
 const std::vector<Inputs<float>> inputsf = {
-  {0.001f, 32, 32, 32, 1234ULL},   {0.001f, 32, 64, 32, 1234ULL},
-  {0.001f, 64, 32, 32, 1234ULL},   {0.001f, 64, 64, 32, 1234ULL},
-  {0.001f, 128, 32, 32, 1234ULL},  {0.001f, 128, 64, 32, 1234ULL},
+  {0.001f, 32, 32, 32, 1234ULL},   {0.001f, 32, 64, 32, 1234ULL},   {0.001f, 64, 32, 32, 1234ULL},
+  {0.001f, 64, 64, 32, 1234ULL},   {0.001f, 128, 32, 32, 1234ULL},  {0.001f, 128, 64, 32, 1234ULL},
   {0.001f, 128, 128, 64, 1234ULL}, {0.001f, 64, 128, 128, 1234ULL},
 
-  {0.001f, 32, 32, 34, 1234ULL},   {0.001f, 32, 64, 34, 1234ULL},
-  {0.001f, 64, 32, 34, 1234ULL},   {0.001f, 64, 64, 34, 1234ULL},
-  {0.001f, 128, 32, 34, 1234ULL},  {0.001f, 128, 64, 34, 1234ULL},
+  {0.001f, 32, 32, 34, 1234ULL},   {0.001f, 32, 64, 34, 1234ULL},   {0.001f, 64, 32, 34, 1234ULL},
+  {0.001f, 64, 64, 34, 1234ULL},   {0.001f, 128, 32, 34, 1234ULL},  {0.001f, 128, 64, 34, 1234ULL},
   {0.001f, 128, 128, 66, 1234ULL}, {0.001f, 64, 128, 130, 1234ULL},
 
-  {0.001f, 32, 32, 33, 1234ULL},   {0.001f, 32, 64, 33, 1234ULL},
-  {0.001f, 64, 32, 33, 1234ULL},   {0.001f, 64, 64, 33, 1234ULL},
-  {0.001f, 128, 32, 33, 1234ULL},  {0.001f, 128, 64, 33, 1234ULL},
+  {0.001f, 32, 32, 33, 1234ULL},   {0.001f, 32, 64, 33, 1234ULL},   {0.001f, 64, 32, 33, 1234ULL},
+  {0.001f, 64, 64, 33, 1234ULL},   {0.001f, 128, 32, 33, 1234ULL},  {0.001f, 128, 64, 33, 1234ULL},
   {0.001f, 128, 128, 65, 1234ULL}, {0.001f, 64, 128, 129, 1234ULL},
 
   {0.006f, 1805, 134, 2, 1234ULL},
 };
 typedef FusedL2NNTest<float, false> FusedL2NNTestF_Sq;
-TEST_P(FusedL2NNTestF_Sq, Result) {
+TEST_P(FusedL2NNTestF_Sq, Result)
+{
   runTest(min);
-  ASSERT_TRUE(devArrMatch(min_ref, min, params.m,
-                          CompareApproxAbsKVP<float>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(min_ref, min, params.m, CompareApproxAbsKVP<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sq,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sq, ::testing::ValuesIn(inputsf));
 typedef FusedL2NNTest<float, true> FusedL2NNTestF_Sqrt;
-TEST_P(FusedL2NNTestF_Sqrt, Result) {
+TEST_P(FusedL2NNTestF_Sqrt, Result)
+{
   runTest(min);
-  ASSERT_TRUE(devArrMatch(min_ref, min, params.m,
-                          CompareApproxAbsKVP<float>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(min_ref, min, params.m, CompareApproxAbsKVP<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sqrt,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sqrt, ::testing::ValuesIn(inputsf));
 
 const std::vector<Inputs<double>> inputsd = {
   {0.00001, 32, 32, 32, 1234ULL},   {0.00001, 32, 64, 32, 1234ULL},
@@ -260,38 +280,38 @@ const std::vector<Inputs<double>> inputsd = {
   {0.00001, 1805, 134, 2, 1234ULL},
 };
 typedef FusedL2NNTest<double, false> FusedL2NNTestD_Sq;
-TEST_P(FusedL2NNTestD_Sq, Result) {
+TEST_P(FusedL2NNTestD_Sq, Result)
+{
   runTest(min);
-  ASSERT_TRUE(devArrMatch(min_ref, min, params.m,
-                          CompareApproxAbsKVP<double>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(min_ref, min, params.m, CompareApproxAbsKVP<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sq,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sq, ::testing::ValuesIn(inputsd));
 typedef FusedL2NNTest<double, true> FusedL2NNTestD_Sqrt;
-TEST_P(FusedL2NNTestD_Sqrt, Result) {
+TEST_P(FusedL2NNTestD_Sqrt, Result)
+{
   runTest(min);
-  ASSERT_TRUE(devArrMatch(min_ref, min, params.m,
-                          CompareApproxAbsKVP<double>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(min_ref, min, params.m, CompareApproxAbsKVP<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sqrt,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sqrt, ::testing::ValuesIn(inputsd));
 
 /// This is to test output determinism of the prim
 template <typename DataT, bool Sqrt>
 class FusedL2NNDetTest : public FusedL2NNTest<DataT, Sqrt> {
-  void SetUp() override {
+  void SetUp() override
+  {
     FusedL2NNTest<DataT, Sqrt>::SetUp();
     int m = this->params.m;
     raft::allocate(min1, m);
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     FusedL2NNTest<DataT, Sqrt>::TearDown();
     CUDA_CHECK(cudaFree(min1));
   }
 
  protected:
-  cub::KeyValuePair<int, DataT> *min1;
+  cub::KeyValuePair<int, DataT>* min1;
 
   static const int NumRepeats = 100;
 
@@ -299,46 +319,46 @@ class FusedL2NNDetTest : public FusedL2NNTest<DataT, Sqrt> {
 };
 
 typedef FusedL2NNDetTest<float, false> FusedL2NNDetTestF_Sq;
-TEST_P(FusedL2NNDetTestF_Sq, Result) {
+TEST_P(FusedL2NNDetTestF_Sq, Result)
+{
   runTest(min);  // assumed to be golden
   for (int i = 0; i < NumRepeats; ++i) {
     runTest(min1);
     ASSERT_TRUE(devArrMatch(min, min1, params.m, CompareExactKVP<float>()));
   }
 }
-INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sq,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sq, ::testing::ValuesIn(inputsf));
 typedef FusedL2NNDetTest<float, true> FusedL2NNDetTestF_Sqrt;
-TEST_P(FusedL2NNDetTestF_Sqrt, Result) {
+TEST_P(FusedL2NNDetTestF_Sqrt, Result)
+{
   runTest(min);  // assumed to be golden
   for (int i = 0; i < NumRepeats; ++i) {
     runTest(min1);
     ASSERT_TRUE(devArrMatch(min, min1, params.m, CompareExactKVP<float>()));
   }
 }
-INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sqrt,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sqrt, ::testing::ValuesIn(inputsf));
 
 typedef FusedL2NNDetTest<double, false> FusedL2NNDetTestD_Sq;
-TEST_P(FusedL2NNDetTestD_Sq, Result) {
+TEST_P(FusedL2NNDetTestD_Sq, Result)
+{
   runTest(min);  // assumed to be golden
   for (int i = 0; i < NumRepeats; ++i) {
     runTest(min1);
     ASSERT_TRUE(devArrMatch(min, min1, params.m, CompareExactKVP<double>()));
   }
 }
-INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sq,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sq, ::testing::ValuesIn(inputsd));
 typedef FusedL2NNDetTest<double, true> FusedL2NNDetTestD_Sqrt;
-TEST_P(FusedL2NNDetTestD_Sqrt, Result) {
+TEST_P(FusedL2NNDetTestD_Sqrt, Result)
+{
   runTest(min);  // assumed to be golden
   for (int i = 0; i < NumRepeats; ++i) {
     runTest(min1);
     ASSERT_TRUE(devArrMatch(min, min1, params.m, CompareExactKVP<double>()));
   }
 }
-INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sqrt,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sqrt, ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu
index e6ee09262e..e14841eb54 100644
--- a/cpp/test/eigen_solvers.cu
+++ b/cpp/test/eigen_solvers.cu
@@ -23,7 +23,8 @@
 
 namespace raft {
 
-TEST(Raft, EigenSolvers) {
+TEST(Raft, EigenSolvers)
+{
   using namespace matrix;
   using index_type = int;
   using value_type = double;
@@ -35,10 +36,10 @@ TEST(Raft, EigenSolvers) {
   index_type* ro{nullptr};
   index_type* ci{nullptr};
   value_type* vs{nullptr};
-  index_type nnz = 0;
+  index_type nnz   = 0;
   index_type nrows = 0;
-  auto stream = h.get_stream();
-  auto t_exe_pol = thrust::cuda::par.on(stream);
+  auto stream      = h.get_stream();
+  auto t_exe_pol   = thrust::cuda::par.on(stream);
 
   sparse_matrix_t<index_type, value_type> sm1{h, ro, ci, vs, nrows, nnz};
   ASSERT_EQ(nullptr, sm1.row_offsets_);
@@ -49,7 +50,7 @@ TEST(Raft, EigenSolvers) {
   value_type tol{1.0e-10};
   bool reorthog{true};
 
-  //nullptr expected to trigger exceptions:
+  // nullptr expected to trigger exceptions:
   //
   value_type* eigvals{nullptr};
   value_type* eigvecs{nullptr};
@@ -60,14 +61,13 @@ TEST(Raft, EigenSolvers) {
 
   lanczos_solver_t<index_type, value_type> eig_solver{cfg};
 
-  EXPECT_ANY_THROW(
-    eig_solver.solve_smallest_eigenvectors(h, sm1, eigvals, eigvecs));
+  EXPECT_ANY_THROW(eig_solver.solve_smallest_eigenvectors(h, sm1, eigvals, eigvecs));
 
-  EXPECT_ANY_THROW(
-    eig_solver.solve_largest_eigenvectors(h, sm1, eigvals, eigvecs));
+  EXPECT_ANY_THROW(eig_solver.solve_largest_eigenvectors(h, sm1, eigvals, eigvecs));
 }
 
-TEST(Raft, SpectralSolvers) {
+TEST(Raft, SpectralSolvers)
+{
   using namespace matrix;
   using index_type = int;
   using value_type = double;
@@ -82,7 +82,7 @@ TEST(Raft, SpectralSolvers) {
   value_type tol{1.0e-10};
   bool reorthog{true};
 
-  //nullptr expected to trigger exceptions:
+  // nullptr expected to trigger exceptions:
   //
   index_type* clusters{nullptr};
   value_type* eigvals{nullptr};
@@ -96,22 +96,19 @@ TEST(Raft, SpectralSolvers) {
 
   index_type k{5};
 
-  cluster_solver_config_t<index_type, value_type> clust_cfg{k, maxiter, tol,
-                                                            seed};
+  cluster_solver_config_t<index_type, value_type> clust_cfg{k, maxiter, tol, seed};
   kmeans_solver_t<index_type, value_type> cluster_solver{clust_cfg};
 
   auto stream = h.get_stream();
 
   auto t_exe_p = thrust::cuda::par.on(stream);
-  sparse_matrix_t<index_type, value_type> sm{h,       nullptr, nullptr,
-                                             nullptr, 0,       0};
-  EXPECT_ANY_THROW(spectral::partition(
-    h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs));
+  sparse_matrix_t<index_type, value_type> sm{h, nullptr, nullptr, nullptr, 0, 0};
+  EXPECT_ANY_THROW(
+    spectral::partition(h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs));
 
   value_type edgeCut{0};
   value_type cost{0};
-  EXPECT_ANY_THROW(
-    spectral::analyzePartition(h, t_exe_p, sm, k, clusters, edgeCut, cost));
+  EXPECT_ANY_THROW(spectral::analyzePartition(h, t_exe_p, sm, k, clusters, edgeCut, cost));
 }
 
 }  // namespace raft
diff --git a/cpp/test/handle.cpp b/cpp/test/handle.cpp
index 4cb9809844..8023fca319 100644
--- a/cpp/test/handle.cpp
+++ b/cpp/test/handle.cpp
@@ -22,7 +22,8 @@
 
 namespace raft {
 
-TEST(Raft, HandleDefault) {
+TEST(Raft, HandleDefault)
+{
   handle_t h;
   ASSERT_EQ(0, h.get_num_internal_streams());
   ASSERT_EQ(0, h.get_device());
@@ -33,7 +34,8 @@ TEST(Raft, HandleDefault) {
   ASSERT_NE(nullptr, h.get_cusparse_handle());
 }
 
-TEST(Raft, Handle) {
+TEST(Raft, Handle)
+{
   handle_t h(4);
   ASSERT_EQ(4, h.get_num_internal_streams());
   cudaStream_t stream;
@@ -44,13 +46,15 @@ TEST(Raft, Handle) {
   CUDA_CHECK(cudaStreamDestroy(stream));
 }
 
-TEST(Raft, GetInternalStreams) {
+TEST(Raft, GetInternalStreams)
+{
   handle_t h(4);
   auto streams = h.get_internal_streams();
   ASSERT_EQ(4U, streams.size());
 }
 
-TEST(Raft, GetHandleFromPool) {
+TEST(Raft, GetHandleFromPool)
+{
   handle_t parent(4);
 
   handle_t child(parent, 2);
@@ -64,7 +68,8 @@ TEST(Raft, GetHandleFromPool) {
   ASSERT_EQ(parent.get_device(), child.get_device());
 }
 
-TEST(Raft, GetHandleFromPoolPerf) {
+TEST(Raft, GetHandleFromPoolPerf)
+{
   handle_t parent(100);
   auto start = curTimeMillis();
   for (int i = 0; i < parent.get_num_internal_streams(); i++) {
@@ -76,13 +81,13 @@ TEST(Raft, GetHandleFromPoolPerf) {
   ASSERT_LE(curTimeMillis() - start, 10);
 }
 
-TEST(Raft, GetHandleStreamViews) {
+TEST(Raft, GetHandleStreamViews)
+{
   handle_t parent(4);
 
   handle_t child(parent, 2);
   ASSERT_EQ(parent.get_internal_stream_view(2), child.get_stream_view());
-  ASSERT_EQ(parent.get_internal_stream_view(2).value(),
-            child.get_stream_view().value());
+  ASSERT_EQ(parent.get_internal_stream_view(2).value(), child.get_stream_view().value());
   EXPECT_FALSE(child.get_stream_view().is_default());
 }
 }  // namespace raft
diff --git a/cpp/test/integer_utils.cpp b/cpp/test/integer_utils.cpp
index 830d085a40..d883de59fe 100644
--- a/cpp/test/integer_utils.cpp
+++ b/cpp/test/integer_utils.cpp
@@ -20,7 +20,8 @@
 
 namespace raft {
 
-TEST(Raft, rounding_up) {
+TEST(Raft, rounding_up)
+{
   ASSERT_EQ(raft::div_rounding_up_safe(5, 3), 2);
   ASSERT_EQ(raft::div_rounding_up_safe(0, 3), 0);
   ASSERT_EQ(raft::div_rounding_up_safe(7, 8), 1);
@@ -29,7 +30,8 @@ TEST(Raft, rounding_up) {
   ASSERT_EQ(raft::div_rounding_up_unsafe(7, 8), 1);
 }
 
-TEST(Raft, is_a_power_of_two) {
+TEST(Raft, is_a_power_of_two)
+{
   ASSERT_EQ(raft::is_a_power_of_two(1 << 5), true);
   ASSERT_EQ(raft::is_a_power_of_two((1 << 5) + 1), false);
 }
diff --git a/cpp/test/label/label.cu b/cpp/test/label/label.cu
index dc2846fdba..209bb0355a 100644
--- a/cpp/test/label/label.cu
+++ b/cpp/test/label/label.cu
@@ -36,7 +36,8 @@ class labelTest : public ::testing::Test {
 };
 
 typedef labelTest MakeMonotonicTest;
-TEST_F(MakeMonotonicTest, Result) {
+TEST_F(MakeMonotonicTest, Result)
+{
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
 
@@ -48,17 +49,14 @@ TEST_F(MakeMonotonicTest, Result) {
   raft::allocate(actual, m, true);
   raft::allocate(expected, m, true);
 
-  float *data_h =
-    new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 8.0, 7.0, 8.0, 8.0, 25.0, 80.0};
+  float* data_h = new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 8.0, 7.0, 8.0, 8.0, 25.0, 80.0};
 
-  float *expected_h =
-    new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 5.0, 4.0, 5.0, 5.0, 6.0, 7.0};
+  float* expected_h = new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 5.0, 4.0, 5.0, 5.0, 6.0, 7.0};
 
   raft::update_device(data, data_h, m, stream);
   raft::update_device(expected, expected_h, m, stream);
 
-  std::shared_ptr<raft::mr::device::allocator> allocator(
-    new raft::mr::device::default_allocator);
+  std::shared_ptr<raft::mr::device::allocator> allocator(new raft::mr::device::default_allocator);
   make_monotonic(actual, data, m, stream, allocator);
 
   CUDA_CHECK(cudaStreamSynchronize(stream));
@@ -73,37 +71,36 @@ TEST_F(MakeMonotonicTest, Result) {
   delete expected_h;
 }
 
-TEST(labelTest, Classlabels) {
+TEST(labelTest, Classlabels)
+{
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  std::shared_ptr<raft::mr::device::allocator> allocator(
-    new raft::mr::device::default_allocator);
+  std::shared_ptr<raft::mr::device::allocator> allocator(new raft::mr::device::default_allocator);
 
   int n_rows = 6;
-  float *y_d;
+  float* y_d;
   raft::allocate(y_d, n_rows);
 
   float y_h[] = {2, -1, 1, 2, 1, 1};
   raft::update_device(y_d, y_h, n_rows, stream);
 
   int n_classes;
-  float *y_unique_d;
+  float* y_unique_d;
   getUniquelabels(y_d, n_rows, &y_unique_d, &n_classes, stream, allocator);
 
   ASSERT_EQ(n_classes, 3);
 
   float y_unique_exp[] = {-1, 1, 2};
-  EXPECT_TRUE(devArrMatchHost(y_unique_exp, y_unique_d, n_classes,
-                              raft::Compare<float>(), stream));
+  EXPECT_TRUE(devArrMatchHost(y_unique_exp, y_unique_d, n_classes, raft::Compare<float>(), stream));
 
-  float *y_relabeled_d;
+  float* y_relabeled_d;
   raft::allocate(y_relabeled_d, n_rows);
 
   getOvrlabels(y_d, n_rows, y_unique_d, n_classes, y_relabeled_d, 2, stream);
 
   float y_relabeled_exp[] = {1, -1, -1, 1, -1, -1};
-  EXPECT_TRUE(devArrMatchHost(y_relabeled_exp, y_relabeled_d, n_rows,
-                              raft::Compare<float>(), stream));
+  EXPECT_TRUE(
+    devArrMatchHost(y_relabeled_exp, y_relabeled_d, n_rows, raft::Compare<float>(), stream));
 
   CUDA_CHECK(cudaStreamDestroy(stream));
   CUDA_CHECK(cudaFree(y_d));
diff --git a/cpp/test/label/merge_labels.cu b/cpp/test/label/merge_labels.cu
index a2f14a8dbc..3d930ff22e 100644
--- a/cpp/test/label/merge_labels.cu
+++ b/cpp/test/label/merge_labels.cu
@@ -39,8 +39,7 @@ struct MergeLabelsInputs {
 };
 
 template <typename Index_>
-class MergeLabelsTest
-  : public ::testing::TestWithParam<MergeLabelsInputs<Index_>> {
+class MergeLabelsTest : public ::testing::TestWithParam<MergeLabelsInputs<Index_>> {
  protected:
   MergeLabelsTest()
     : params(::testing::TestWithParam<MergeLabelsInputs<Index_>>::GetParam()),
@@ -50,25 +49,23 @@ class MergeLabelsTest
       expected(params.N, stream),
       R(params.N, stream),
       mask(params.N, stream),
-      m(1, stream) {}
-
-  void Run() {
-    raft::update_device(labels_a.data(), params.labels_a.data(), params.N,
-                        stream);
-    raft::update_device(labels_b.data(), params.labels_b.data(), params.N,
-                        stream);
-    raft::update_device(expected.data(), params.expected.data(), params.N,
-                        stream);
-    raft::update_device(mask.data(),
-                        reinterpret_cast<bool *>(params.mask.data()), params.N,
-                        stream);
-
-    merge_labels(labels_a.data(), labels_b.data(), mask.data(), R.data(),
-                 m.data(), params.N, stream);
+      m(1, stream)
+  {
+  }
+
+  void Run()
+  {
+    raft::update_device(labels_a.data(), params.labels_a.data(), params.N, stream);
+    raft::update_device(labels_b.data(), params.labels_b.data(), params.N, stream);
+    raft::update_device(expected.data(), params.expected.data(), params.N, stream);
+    raft::update_device(mask.data(), reinterpret_cast<bool*>(params.mask.data()), params.N, stream);
+
+    merge_labels(
+      labels_a.data(), labels_b.data(), mask.data(), R.data(), m.data(), params.N, stream);
 
     cudaStreamSynchronize(stream);
-    ASSERT_TRUE(raft::devArrMatch<Index_>(expected.data(), labels_a.data(),
-                                          params.N, raft::Compare<Index_>()));
+    ASSERT_TRUE(raft::devArrMatch<Index_>(
+      expected.data(), labels_a.data(), params.N, raft::Compare<Index_>()));
   }
 
  protected:
@@ -85,22 +82,14 @@ TEST_P(MergeLabelsTestI, Result) { Run(); }
 using MergeLabelsTestL = MergeLabelsTest<int64_t>;
 TEST_P(MergeLabelsTestL, Result) { Run(); }
 
-constexpr int MAX32 = std::numeric_limits<int>::max();
+constexpr int MAX32     = std::numeric_limits<int>::max();
 constexpr int64_t MAX64 = std::numeric_limits<int64_t>::max();
 
 const std::vector<MergeLabelsInputs<int>> merge_inputs_32 = {
   {4, {1, 1, 3, MAX32}, {1, 3, 3, 1}, {1, 0, 1, 0}, {1, 1, 3, 1}},
   {5, {1, 2, 2, 2, 1}, {4, 2, 4, 4, 4}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
-  {6,
-   {1, 2, 1, 4, 5, MAX32},
-   {1, 2, MAX32, 4, 5, 4},
-   {1, 1, 0, 1, 1, 0},
-   {1, 2, 1, 4, 5, 4}},
-  {6,
-   {1, 2, 2, 2, 2, 6},
-   {1, 1, 1, 5, 5, 5},
-   {1, 1, 1, 1, 1, 1},
-   {1, 1, 1, 1, 1, 1}},
+  {6, {1, 2, 1, 4, 5, MAX32}, {1, 2, MAX32, 4, 5, 4}, {1, 1, 0, 1, 1, 0}, {1, 2, 1, 4, 5, 4}},
+  {6, {1, 2, 2, 2, 2, 6}, {1, 1, 1, 5, 5, 5}, {1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1}},
   {8,
    {1, 1, 3, 3, MAX32, 1, 3, MAX32},
    {1, 2, 3, 2, MAX32, 2, 2, 2},
@@ -116,16 +105,8 @@ const std::vector<MergeLabelsInputs<int>> merge_inputs_32 = {
 const std::vector<MergeLabelsInputs<int64_t>> merge_inputs_64 = {
   {4, {1, 1, 3, MAX64}, {1, 3, 3, 1}, {1, 0, 1, 0}, {1, 1, 3, 1}},
   {5, {1, 2, 2, 2, 1}, {4, 2, 4, 4, 4}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
-  {6,
-   {1, 2, 1, 4, 5, MAX64},
-   {1, 2, MAX64, 4, 5, 4},
-   {1, 1, 0, 1, 1, 0},
-   {1, 2, 1, 4, 5, 4}},
-  {6,
-   {1, 2, 2, 2, 2, 6},
-   {1, 1, 1, 5, 5, 5},
-   {1, 1, 1, 1, 1, 1},
-   {1, 1, 1, 1, 1, 1}},
+  {6, {1, 2, 1, 4, 5, MAX64}, {1, 2, MAX64, 4, 5, 4}, {1, 1, 0, 1, 1, 0}, {1, 2, 1, 4, 5, 4}},
+  {6, {1, 2, 2, 2, 2, 6}, {1, 1, 1, 5, 5, 5}, {1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1}},
   {8,
    {1, 1, 3, 3, MAX64, 1, 3, MAX64},
    {1, 2, 3, 2, MAX64, 2, 2, 2},
@@ -138,10 +119,8 @@ const std::vector<MergeLabelsInputs<int64_t>> merge_inputs_64 = {
    {1, 1, 1, 1, 1, 7, 7, 7}},
 };
 
-INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestI,
-                        ::testing::ValuesIn(merge_inputs_32));
-INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestL,
-                        ::testing::ValuesIn(merge_inputs_64));
+INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestI, ::testing::ValuesIn(merge_inputs_32));
+INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestL, ::testing::ValuesIn(merge_inputs_64));
 
 }  // namespace label
 }  // namespace raft
diff --git a/cpp/test/lap/lap.cu b/cpp/test/lap/lap.cu
index 04f473f836..61c7182c72 100644
--- a/cpp/test/lap/lap.cu
+++ b/cpp/test/lap/lap.cu
@@ -29,11 +29,11 @@
 #include <raft/lap/lap.cuh>
 #include <random>
 
-#define PROBLEMSIZE 1000  // Number of rows/columns
-#define BATCHSIZE 10      // Number of problems in the batch
-#define COSTRANGE 1000
+#define PROBLEMSIZE  1000  // Number of rows/columns
+#define BATCHSIZE    10    // Number of problems in the batch
+#define COSTRANGE    1000
 #define PROBLEMCOUNT 1
-#define REPETITIONS 1
+#define REPETITIONS  1
 
 #define SEED 01010001
 
@@ -43,41 +43,43 @@ namespace raft {
 
 // Function for generating problem with uniformly distributed integer costs between [0, COSTRANGE].
 template <typename weight_t>
-void generateProblem(weight_t *cost_matrix, int SP, int N, int costrange) {
+void generateProblem(weight_t* cost_matrix, int SP, int N, int costrange)
+{
   long N2 = SP * N * N;
 
   std::uniform_int_distribution<int> distribution(0, costrange);
 
   for (long i = 0; i < N2; i++) {
-    int val = distribution(generator);
+    int val        = distribution(generator);
     cost_matrix[i] = (weight_t)val;
   }
 }
 
 template <typename vertex_t, typename weight_t>
-void hungarian_test(int problemsize, int costrange, int problemcount,
-                    int repetitions, int batchsize, weight_t epsilon,
-                    bool verbose = false) {
+void hungarian_test(int problemsize,
+                    int costrange,
+                    int problemcount,
+                    int repetitions,
+                    int batchsize,
+                    weight_t epsilon,
+                    bool verbose = false)
+{
   raft::handle_t handle;
 
-  weight_t *h_cost = new weight_t[batchsize * problemsize * problemsize];
+  weight_t* h_cost = new weight_t[batchsize * problemsize * problemsize];
 
   for (int j = 0; j < problemcount; j++) {
     generateProblem(h_cost, batchsize, problemsize, costrange);
 
     raft::mr::device::buffer<weight_t> elements_v(
-      handle.get_device_allocator(), handle.get_stream(),
-      batchsize * problemsize * problemsize);
+      handle.get_device_allocator(), handle.get_stream(), batchsize * problemsize * problemsize);
     raft::mr::device::buffer<vertex_t> row_assignment_v(
-      handle.get_device_allocator(), handle.get_stream(),
-      batchsize * problemsize);
+      handle.get_device_allocator(), handle.get_stream(), batchsize * problemsize);
     raft::mr::device::buffer<vertex_t> col_assignment_v(
-      handle.get_device_allocator(), handle.get_stream(),
-      batchsize * problemsize);
+      handle.get_device_allocator(), handle.get_stream(), batchsize * problemsize);
 
-    raft::update_device(elements_v.data(), h_cost,
-                        batchsize * problemsize * problemsize,
-                        handle.get_stream());
+    raft::update_device(
+      elements_v.data(), h_cost, batchsize * problemsize * problemsize, handle.get_stream());
 
     for (int i = 0; i < repetitions; i++) {
       float start = omp_get_wtime();
@@ -87,20 +89,18 @@ void hungarian_test(int problemsize, int costrange, int problemcount,
         handle, problemsize, batchsize, epsilon);
 
       // Solve LAP(s) for given cost matrix
-      lpx.solve(elements_v.data(), row_assignment_v.data(),
-                col_assignment_v.data());
+      lpx.solve(elements_v.data(), row_assignment_v.data(), col_assignment_v.data());
 
       float end = omp_get_wtime();
 
       float total_time = (end - start);
 
       if (verbose) {
-        // Use getPrimalObjectiveValue and getDualObjectiveValue APIs to get primal and dual objectives. At optimality both values should match.
+        // Use getPrimalObjectiveValue and getDualObjectiveValue APIs to get primal and dual
+        // objectives. At optimality both values should match.
         for (int k = 0; k < batchsize; k++) {
-          std::cout << j << ":" << i << ":" << k << ":"
-                    << lpx.getPrimalObjectiveValue(k) << ":"
-                    << lpx.getDualObjectiveValue(k) << ":" << total_time
-                    << std::endl;
+          std::cout << j << ":" << i << ":" << k << ":" << lpx.getPrimalObjectiveValue(k) << ":"
+                    << lpx.getDualObjectiveValue(k) << ":" << total_time << std::endl;
         }
       }
     }
@@ -109,34 +109,38 @@ void hungarian_test(int problemsize, int costrange, int problemcount,
   delete[] h_cost;
 }
 
-TEST(Raft, HungarianIntFloat) {
-  hungarian_test<int, float>(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS,
-                             BATCHSIZE, float{1e-6});
+TEST(Raft, HungarianIntFloat)
+{
+  hungarian_test<int, float>(
+    PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, float{1e-6});
 }
 
-TEST(Raft, HungarianIntDouble) {
-  hungarian_test<int, double>(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS,
-                              BATCHSIZE, double{1e-6});
+TEST(Raft, HungarianIntDouble)
+{
+  hungarian_test<int, double>(
+    PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, double{1e-6});
 }
 
-TEST(Raft, HungarianIntLong) {
-  hungarian_test<int, long>(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS,
-                            BATCHSIZE, long{0});
+TEST(Raft, HungarianIntLong)
+{
+  hungarian_test<int, long>(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, long{0});
 }
 
-TEST(Raft, HungarianLongFloat) {
-  hungarian_test<long, float>(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS,
-                              BATCHSIZE, float{1e-6});
+TEST(Raft, HungarianLongFloat)
+{
+  hungarian_test<long, float>(
+    PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, float{1e-6});
 }
 
-TEST(Raft, HungarianLongDouble) {
-  hungarian_test<long, double>(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT,
-                               REPETITIONS, BATCHSIZE, double{1e-6});
+TEST(Raft, HungarianLongDouble)
+{
+  hungarian_test<long, double>(
+    PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, double{1e-6});
 }
 
-TEST(Raft, HungarianLongLong) {
-  hungarian_test<long, long>(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS,
-                             BATCHSIZE, long{0});
+TEST(Raft, HungarianLongLong)
+{
+  hungarian_test<long, long>(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, long{0});
 }
 
 }  // namespace raft
diff --git a/cpp/test/linalg/add.cu b/cpp/test/linalg/add.cu
index 2fc9d4e30f..38e189f27e 100644
--- a/cpp/test/linalg/add.cu
+++ b/cpp/test/linalg/add.cu
@@ -27,7 +27,8 @@ namespace linalg {
 template <typename InT, typename OutT = InT>
 class AddTest : public ::testing::TestWithParam<AddInputs<InT, OutT>> {
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     params = ::testing::TestWithParam<AddInputs<InT, OutT>>::GetParam();
     raft::random::Rng r(params.seed);
     int len = params.len;
@@ -42,7 +43,8 @@ class AddTest : public ::testing::TestWithParam<AddInputs<InT, OutT>> {
     add<InT, OutT>(out, in1, in2, len, stream);
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(in1));
     CUDA_CHECK(cudaFree(in2));
@@ -51,9 +53,10 @@ class AddTest : public ::testing::TestWithParam<AddInputs<InT, OutT>> {
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void compare() {
-    ASSERT_TRUE(raft::devArrMatch(out_ref, out, params.len,
-                                  raft::CompareApprox<OutT>(params.tolerance)));
+  void compare()
+  {
+    ASSERT_TRUE(
+      raft::devArrMatch(out_ref, out, params.len, raft::CompareApprox<OutT>(params.tolerance)));
   }
 
  protected:
diff --git a/cpp/test/linalg/add.cuh b/cpp/test/linalg/add.cuh
index 137419758f..1d9352bfc1 100644
--- a/cpp/test/linalg/add.cuh
+++ b/cpp/test/linalg/add.cuh
@@ -23,18 +23,17 @@ namespace raft {
 namespace linalg {
 
 template <typename InT, typename OutT = InT>
-__global__ void naiveAddElemKernel(OutT *out, const InT *in1, const InT *in2,
-                                   int len) {
+__global__ void naiveAddElemKernel(OutT* out, const InT* in1, const InT* in2, int len)
+{
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < len) {
-    out[idx] = OutT(in1[idx] + in2[idx]);
-  }
+  if (idx < len) { out[idx] = OutT(in1[idx] + in2[idx]); }
 }
 
 template <typename InT, typename OutT = InT>
-void naiveAddElem(OutT *out, const InT *in1, const InT *in2, int len) {
+void naiveAddElem(OutT* out, const InT* in1, const InT* in2, int len)
+{
   static const int TPB = 64;
-  int nblks = raft::ceildiv(len, TPB);
+  int nblks            = raft::ceildiv(len, TPB);
   naiveAddElemKernel<InT, OutT><<<nblks, TPB>>>(out, in1, in2, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
@@ -47,8 +46,8 @@ struct AddInputs {
 };
 
 template <typename InT, typename OutT = InT>
-::std::ostream &operator<<(::std::ostream &os,
-                           const AddInputs<InT, OutT> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const AddInputs<InT, OutT>& dims)
+{
   return os;
 }
 
diff --git a/cpp/test/linalg/binary_op.cu b/cpp/test/linalg/binary_op.cu
index 3ae4f86066..078c41356a 100644
--- a/cpp/test/linalg/binary_op.cu
+++ b/cpp/test/linalg/binary_op.cu
@@ -29,20 +29,19 @@ namespace linalg {
 // for an extended __device__ lambda cannot have private or protected access
 // within its class
 template <typename InType, typename IdxType, typename OutType>
-void binaryOpLaunch(OutType *out, const InType *in1, const InType *in2,
-                    IdxType len, cudaStream_t stream) {
+void binaryOpLaunch(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
+{
   binaryOp(
-    out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; },
-    stream);
+    out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
-class BinaryOpTest
-  : public ::testing::TestWithParam<BinaryOpInputs<InType, IdxType, OutType>> {
+class BinaryOpTest : public ::testing::TestWithParam<BinaryOpInputs<InType, IdxType, OutType>> {
  protected:
-  void SetUp() override {
-    params = ::testing::TestWithParam<
-      BinaryOpInputs<InType, IdxType, OutType>>::GetParam();
+  void SetUp() override
+  {
+    params = ::testing::TestWithParam<BinaryOpInputs<InType, IdxType, OutType>>::GetParam();
     raft::random::Rng r(params.seed);
 
     cudaStream_t stream;
@@ -59,7 +58,8 @@ class BinaryOpTest
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(in1));
     CUDA_CHECK(cudaFree(in2));
     CUDA_CHECK(cudaFree(out_ref));
@@ -72,67 +72,61 @@ class BinaryOpTest
   OutType *out_ref, *out;
 };
 
-const std::vector<BinaryOpInputs<float, int>> inputsf_i32 = {
-  {0.000001f, 1024 * 1024, 1234ULL}};
+const std::vector<BinaryOpInputs<float, int>> inputsf_i32 = {{0.000001f, 1024 * 1024, 1234ULL}};
 typedef BinaryOpTest<float, int> BinaryOpTestF_i32;
-TEST_P(BinaryOpTestF_i32, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(BinaryOpTestF_i32, Result)
+{
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32,
-                         ::testing::ValuesIn(inputsf_i32));
+INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32, ::testing::ValuesIn(inputsf_i32));
 
-const std::vector<BinaryOpInputs<float, size_t>> inputsf_i64 = {
-  {0.000001f, 1024 * 1024, 1234ULL}};
+const std::vector<BinaryOpInputs<float, size_t>> inputsf_i64 = {{0.000001f, 1024 * 1024, 1234ULL}};
 typedef BinaryOpTest<float, size_t> BinaryOpTestF_i64;
-TEST_P(BinaryOpTestF_i64, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(BinaryOpTestF_i64, Result)
+{
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i64,
-                         ::testing::ValuesIn(inputsf_i64));
+INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i64, ::testing::ValuesIn(inputsf_i64));
 
 const std::vector<BinaryOpInputs<float, int, double>> inputsf_i32_d = {
   {0.000001f, 1024 * 1024, 1234ULL}};
 typedef BinaryOpTest<float, int, double> BinaryOpTestF_i32_D;
-TEST_P(BinaryOpTestF_i32_D, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(BinaryOpTestF_i32_D, Result)
+{
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32_D,
-                         ::testing::ValuesIn(inputsf_i32_d));
+INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32_D, ::testing::ValuesIn(inputsf_i32_d));
 
-const std::vector<BinaryOpInputs<double, int>> inputsd_i32 = {
-  {0.00000001, 1024 * 1024, 1234ULL}};
+const std::vector<BinaryOpInputs<double, int>> inputsd_i32 = {{0.00000001, 1024 * 1024, 1234ULL}};
 typedef BinaryOpTest<double, int> BinaryOpTestD_i32;
-TEST_P(BinaryOpTestD_i32, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(BinaryOpTestD_i32, Result)
+{
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i32,
-                         ::testing::ValuesIn(inputsd_i32));
+INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i32, ::testing::ValuesIn(inputsd_i32));
 
 const std::vector<BinaryOpInputs<double, size_t>> inputsd_i64 = {
   {0.00000001, 1024 * 1024, 1234ULL}};
 typedef BinaryOpTest<double, size_t> BinaryOpTestD_i64;
-TEST_P(BinaryOpTestD_i64, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(BinaryOpTestD_i64, Result)
+{
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i64,
-                         ::testing::ValuesIn(inputsd_i64));
+INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i64, ::testing::ValuesIn(inputsd_i64));
 
 template <typename math_t>
 class BinaryOpAlignment : public ::testing::Test {
  protected:
-  BinaryOpAlignment() {
+  BinaryOpAlignment()
+  {
     CUDA_CHECK(cudaStreamCreate(&stream));
     handle.set_stream(stream);
   }
   void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); }
 
  public:
-  void Misaligned() {
+  void Misaligned()
+  {
     // Test to trigger cudaErrorMisalignedAddress if veclen is incorrectly
     // chosen.
     int n = 1024;
@@ -142,8 +136,12 @@ class BinaryOpAlignment : public ::testing::Test {
     CUDA_CHECK(cudaMemsetAsync(x.data(), 0, n * sizeof(math_t), stream));
     CUDA_CHECK(cudaMemsetAsync(y.data(), 0, n * sizeof(math_t), stream));
     raft::linalg::binaryOp(
-      z.data() + 9, x.data() + 137, y.data() + 19, 256,
-      [] __device__(math_t x, math_t y) { return x + y; }, stream);
+      z.data() + 9,
+      x.data() + 137,
+      y.data() + 19,
+      256,
+      [] __device__(math_t x, math_t y) { return x + y; },
+      stream);
   }
 
   raft::handle_t handle;
diff --git a/cpp/test/linalg/binary_op.cuh b/cpp/test/linalg/binary_op.cuh
index fd8ed6dd1e..97cb3ecb24 100644
--- a/cpp/test/linalg/binary_op.cuh
+++ b/cpp/test/linalg/binary_op.cuh
@@ -24,18 +24,17 @@ namespace raft {
 namespace linalg {
 
 template <typename InType, typename OutType, typename IdxType>
-__global__ void naiveAddKernel(OutType *out, const InType *in1,
-                               const InType *in2, IdxType len) {
+__global__ void naiveAddKernel(OutType* out, const InType* in1, const InType* in2, IdxType len)
+{
   IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * (IdxType)blockDim.x);
-  if (idx < len) {
-    out[idx] = static_cast<OutType>(in1[idx] + in2[idx]);
-  }
+  if (idx < len) { out[idx] = static_cast<OutType>(in1[idx] + in2[idx]); }
 }
 
 template <typename InType, typename IdxType = int, typename OutType = InType>
-void naiveAdd(OutType *out, const InType *in1, const InType *in2, IdxType len) {
+void naiveAdd(OutType* out, const InType* in1, const InType* in2, IdxType len)
+{
   static const IdxType TPB = 64;
-  IdxType nblks = raft::ceildiv(len, TPB);
+  IdxType nblks            = raft::ceildiv(len, TPB);
   naiveAddKernel<InType, OutType, IdxType><<<nblks, TPB>>>(out, in1, in2, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
@@ -48,8 +47,8 @@ struct BinaryOpInputs {
 };
 
 template <typename InType, typename IdxType = int, typename OutType = InType>
-::std::ostream &operator<<(::std::ostream &os,
-                           const BinaryOpInputs<InType, IdxType, OutType> &d) {
+::std::ostream& operator<<(::std::ostream& os, const BinaryOpInputs<InType, IdxType, OutType>& d)
+{
   return os;
 }
 
diff --git a/cpp/test/linalg/cholesky_r1.cu b/cpp/test/linalg/cholesky_r1.cu
index 00236d53fa..5bbe3166cf 100644
--- a/cpp/test/linalg/cholesky_r1.cu
+++ b/cpp/test/linalg/cholesky_r1.cu
@@ -36,7 +36,8 @@ class CholeskyR1Test : public ::testing::Test {
       L(allocator, handle.get_stream(), n_rows * n_rows),
       L_exp(allocator, handle.get_stream(), n_rows * n_rows),
       devInfo(allocator, handle.get_stream(), 1),
-      workspace(allocator, handle.get_stream()) {
+      workspace(allocator, handle.get_stream())
+  {
     CUDA_CHECK(cudaStreamCreate(&stream));
     handle.set_stream(stream);
     raft::update_device(G.data(), G_host, n_rows * n_rows, stream);
@@ -48,55 +49,58 @@ class CholeskyR1Test : public ::testing::Test {
     int n_bytes = 0;
     // Initializing in CUBLAS_FILL_MODE_LOWER, because that has larger workspace
     // requirements.
-    raft::linalg::choleskyRank1Update(handle, L.data(), n_rows, n_rows, nullptr,
-                                      &n_bytes, CUBLAS_FILL_MODE_LOWER, stream);
+    raft::linalg::choleskyRank1Update(
+      handle, L.data(), n_rows, n_rows, nullptr, &n_bytes, CUBLAS_FILL_MODE_LOWER, stream);
     Lwork = std::max(Lwork * sizeof(math_t), (size_t)n_bytes);
     workspace.resize(Lwork, stream);
   }
 
   void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); }
 
-  void testR1Update() {
+  void testR1Update()
+  {
     int n = n_rows * n_rows;
-    std::vector<cublasFillMode_t> fillmode{CUBLAS_FILL_MODE_LOWER,
-                                           CUBLAS_FILL_MODE_UPPER};
+    std::vector<cublasFillMode_t> fillmode{CUBLAS_FILL_MODE_LOWER, CUBLAS_FILL_MODE_UPPER};
     for (auto uplo : fillmode) {
       raft::copy(L.data(), G.data(), n, stream);
       for (int rank = 1; rank <= n_rows; rank++) {
         std::stringstream ss;
-        ss << "Rank " << rank
-           << ((uplo == CUBLAS_FILL_MODE_LOWER) ? ", lower" : ", upper");
+        ss << "Rank " << rank << ((uplo == CUBLAS_FILL_MODE_LOWER) ? ", lower" : ", upper");
         SCOPED_TRACE(ss.str());
 
         // Expected solution using Cholesky factorization from scratch
         raft::copy(L_exp.data(), G.data(), n, stream);
-        CUSOLVER_CHECK(raft::linalg::cusolverDnpotrf(
-          solver_handle, uplo, rank, L_exp.data(), n_rows,
-          (math_t*)workspace.data(), Lwork, devInfo.data(), stream));
+        CUSOLVER_CHECK(raft::linalg::cusolverDnpotrf(solver_handle,
+                                                     uplo,
+                                                     rank,
+                                                     L_exp.data(),
+                                                     n_rows,
+                                                     (math_t*)workspace.data(),
+                                                     Lwork,
+                                                     devInfo.data(),
+                                                     stream));
 
         // Incremental Cholesky factorization using rank one updates.
-        raft::linalg::choleskyRank1Update(handle, L.data(), rank, n_rows,
-                                          workspace.data(), &Lwork, uplo,
-                                          stream);
+        raft::linalg::choleskyRank1Update(
+          handle, L.data(), rank, n_rows, workspace.data(), &Lwork, uplo, stream);
 
-        ASSERT_TRUE(raft::devArrMatch(L_exp.data(), L.data(), n_rows * rank,
-                                      raft::CompareApprox<math_t>(3e-3)));
+        ASSERT_TRUE(raft::devArrMatch(
+          L_exp.data(), L.data(), n_rows * rank, raft::CompareApprox<math_t>(3e-3)));
       }
     }
   }
 
-  void testR1Error() {
+  void testR1Error()
+  {
     raft::update_device(G.data(), G2_host, 4, stream);
-    std::vector<cublasFillMode_t> fillmode{CUBLAS_FILL_MODE_LOWER,
-                                           CUBLAS_FILL_MODE_UPPER};
+    std::vector<cublasFillMode_t> fillmode{CUBLAS_FILL_MODE_LOWER, CUBLAS_FILL_MODE_UPPER};
     for (auto uplo : fillmode) {
       raft::copy(L.data(), G.data(), 4, stream);
       ASSERT_NO_THROW(raft::linalg::choleskyRank1Update(
         handle, L.data(), 1, 2, workspace.data(), &Lwork, uplo, stream));
-      ASSERT_THROW(
-        raft::linalg::choleskyRank1Update(
-          handle, L.data(), 2, 2, workspace.data(), &Lwork, uplo, stream),
-        raft::exception);
+      ASSERT_THROW(raft::linalg::choleskyRank1Update(
+                     handle, L.data(), 2, 2, workspace.data(), &Lwork, uplo, stream),
+                   raft::exception);
 
       math_t eps = std::numeric_limits<math_t>::epsilon();
       ASSERT_NO_THROW(raft::linalg::choleskyRank1Update(
diff --git a/cpp/test/linalg/coalesced_reduction.cu b/cpp/test/linalg/coalesced_reduction.cu
index e45f5651b4..2760d522bc 100644
--- a/cpp/test/linalg/coalesced_reduction.cu
+++ b/cpp/test/linalg/coalesced_reduction.cu
@@ -33,8 +33,8 @@ struct coalescedReductionInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os,
-                           const coalescedReductionInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const coalescedReductionInputs<T>& dims)
+{
   return os;
 }
 
@@ -42,17 +42,18 @@ template <typename T>
 // for an extended __device__ lambda cannot have private or protected access
 // within its class
 template <typename T>
-void coalescedReductionLaunch(T *dots, const T *data, int cols, int rows,
-                              cudaStream_t stream, bool inplace = false) {
-  coalescedReduction(dots, data, cols, rows, (T)0, stream, inplace,
-                     [] __device__(T in, int i) { return in * in; });
+void coalescedReductionLaunch(
+  T* dots, const T* data, int cols, int rows, cudaStream_t stream, bool inplace = false)
+{
+  coalescedReduction(
+    dots, data, cols, rows, (T)0, stream, inplace, [] __device__(T in, int i) { return in * in; });
 }
 
 template <typename T>
-class coalescedReductionTest
-  : public ::testing::TestWithParam<coalescedReductionInputs<T>> {
+class coalescedReductionTest : public ::testing::TestWithParam<coalescedReductionInputs<T>> {
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     params = ::testing::TestWithParam<coalescedReductionInputs<T>>::GetParam();
     raft::random::Rng r(params.seed);
     int rows = params.rows, cols = params.cols;
@@ -73,7 +74,8 @@ class coalescedReductionTest
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(data));
     CUDA_CHECK(cudaFree(dots_exp));
     CUDA_CHECK(cudaFree(dots_act));
@@ -84,34 +86,36 @@ class coalescedReductionTest
   T *data, *dots_exp, *dots_act;
 };
 
-const std::vector<coalescedReductionInputs<float>> inputsf = {
-  {0.000002f, 1024, 32, 1234ULL},
-  {0.000002f, 1024, 64, 1234ULL},
-  {0.000002f, 1024, 128, 1234ULL},
-  {0.000002f, 1024, 256, 1234ULL}};
+const std::vector<coalescedReductionInputs<float>> inputsf = {{0.000002f, 1024, 32, 1234ULL},
+                                                              {0.000002f, 1024, 64, 1234ULL},
+                                                              {0.000002f, 1024, 128, 1234ULL},
+                                                              {0.000002f, 1024, 256, 1234ULL}};
 
-const std::vector<coalescedReductionInputs<double>> inputsd = {
-  {0.000000001, 1024, 32, 1234ULL},
-  {0.000000001, 1024, 64, 1234ULL},
-  {0.000000001, 1024, 128, 1234ULL},
-  {0.000000001, 1024, 256, 1234ULL}};
+const std::vector<coalescedReductionInputs<double>> inputsd = {{0.000000001, 1024, 32, 1234ULL},
+                                                               {0.000000001, 1024, 64, 1234ULL},
+                                                               {0.000000001, 1024, 128, 1234ULL},
+                                                               {0.000000001, 1024, 256, 1234ULL}};
 
 typedef coalescedReductionTest<float> coalescedReductionTestF;
-TEST_P(coalescedReductionTestF, Result) {
-  ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.rows,
-                                raft::CompareApprox<float>(params.tolerance)));
+TEST_P(coalescedReductionTestF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    dots_exp, dots_act, params.rows, raft::CompareApprox<float>(params.tolerance)));
 }
 
 typedef coalescedReductionTest<double> coalescedReductionTestD;
-TEST_P(coalescedReductionTestD, Result) {
-  ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.rows,
-                                raft::CompareApprox<double>(params.tolerance)));
+TEST_P(coalescedReductionTestD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    dots_exp, dots_act, params.rows, raft::CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_CASE_P(coalescedReductionTests, coalescedReductionTestF,
+INSTANTIATE_TEST_CASE_P(coalescedReductionTests,
+                        coalescedReductionTestF,
                         ::testing::ValuesIn(inputsf));
 
-INSTANTIATE_TEST_CASE_P(coalescedReductionTests, coalescedReductionTestD,
+INSTANTIATE_TEST_CASE_P(coalescedReductionTests,
+                        coalescedReductionTestD,
                         ::testing::ValuesIn(inputsd));
 
 }  // end namespace linalg
diff --git a/cpp/test/linalg/divide.cu b/cpp/test/linalg/divide.cu
index 2396558939..d8995ffa0a 100644
--- a/cpp/test/linalg/divide.cu
+++ b/cpp/test/linalg/divide.cu
@@ -25,30 +25,27 @@ namespace raft {
 namespace linalg {
 
 template <typename Type>
-__global__ void naiveDivideKernel(Type *out, const Type *in, Type scalar,
-                                  int len) {
+__global__ void naiveDivideKernel(Type* out, const Type* in, Type scalar, int len)
+{
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < len) {
-    out[idx] = in[idx] / scalar;
-  }
+  if (idx < len) { out[idx] = in[idx] / scalar; }
 }
 
 template <typename Type>
-void naiveDivide(Type *out, const Type *in, Type scalar, int len,
-                 cudaStream_t stream) {
+void naiveDivide(Type* out, const Type* in, Type scalar, int len, cudaStream_t stream)
+{
   static const int TPB = 64;
-  int nblks = raft::ceildiv(len, TPB);
+  int nblks            = raft::ceildiv(len, TPB);
   naiveDivideKernel<Type><<<nblks, TPB, 0, stream>>>(out, in, scalar, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
 template <typename T>
-class DivideTest
-  : public ::testing::TestWithParam<raft::linalg::UnaryOpInputs<T>> {
+class DivideTest : public ::testing::TestWithParam<raft::linalg::UnaryOpInputs<T>> {
  protected:
-  void SetUp() override {
-    params =
-      ::testing::TestWithParam<raft::linalg::UnaryOpInputs<T>>::GetParam();
+  void SetUp() override
+  {
+    params = ::testing::TestWithParam<raft::linalg::UnaryOpInputs<T>>::GetParam();
     raft::random::Rng r(params.seed);
     int len = params.len;
     cudaStream_t stream;
@@ -63,7 +60,8 @@ class DivideTest
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(in));
     CUDA_CHECK(cudaFree(out_ref));
     CUDA_CHECK(cudaFree(out));
@@ -74,25 +72,21 @@ class DivideTest
   T *in, *out_ref, *out;
 };
 
-const std::vector<UnaryOpInputs<float>> inputsf = {
-  {0.000001f, 1024 * 1024, 2.f, 1234ULL}};
+const std::vector<UnaryOpInputs<float>> inputsf = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}};
 typedef DivideTest<float> DivideTestF;
-TEST_P(DivideTestF, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
-                          raft::CompareApprox<float>(params.tolerance)));
+TEST_P(DivideTestF, Result)
+{
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestF,
-                         ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestF, ::testing::ValuesIn(inputsf));
 
 typedef DivideTest<double> DivideTestD;
-const std::vector<UnaryOpInputs<double>> inputsd = {
-  {0.000001f, 1024 * 1024, 2.f, 1234ULL}};
-TEST_P(DivideTestD, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
-                          raft::CompareApprox<double>(params.tolerance)));
+const std::vector<UnaryOpInputs<double>> inputsd = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}};
+TEST_P(DivideTestD, Result)
+{
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestD,
-                         ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestD, ::testing::ValuesIn(inputsd));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/eig.cu b/cpp/test/linalg/eig.cu
index 159d288174..5cad657dab 100644
--- a/cpp/test/linalg/eig.cu
+++ b/cpp/test/linalg/eig.cu
@@ -35,14 +35,16 @@ struct EigInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os, const EigInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const EigInputs<T>& dims)
+{
   return os;
 }
 
 template <typename T>
 class EigTest : public ::testing::TestWithParam<EigInputs<T>> {
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     raft::handle_t handle;
     stream = handle.get_stream();
 
@@ -51,8 +53,8 @@ class EigTest : public ::testing::TestWithParam<EigInputs<T>> {
     int len = params.len;
 
     raft::allocate(cov_matrix, len);
-    T cov_matrix_h[] = {1.0,  0.9, 0.81, 0.729, 0.9,   1.0,  0.9, 0.81,
-                        0.81, 0.9, 1.0,  0.9,   0.729, 0.81, 0.9, 1.0};
+    T cov_matrix_h[] = {
+      1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0};
     ASSERT(len == 16, "This test only works with 4x4 matrices!");
     raft::update_device(cov_matrix, cov_matrix_h, len, stream);
 
@@ -61,10 +63,23 @@ class EigTest : public ::testing::TestWithParam<EigInputs<T>> {
     raft::allocate(eig_vectors_jacobi, len);
     raft::allocate(eig_vals_jacobi, params.n_col);
 
-    T eig_vectors_ref_h[] = {0.2790, -0.6498, 0.6498, -0.2789, -0.5123, 0.4874,
-                             0.4874, -0.5123, 0.6498, 0.2789,  -0.2789, -0.6498,
-                             0.4874, 0.5123,  0.5123, 0.4874};
-    T eig_vals_ref_h[] = {0.0614, 0.1024, 0.3096, 3.5266};
+    T eig_vectors_ref_h[] = {0.2790,
+                             -0.6498,
+                             0.6498,
+                             -0.2789,
+                             -0.5123,
+                             0.4874,
+                             0.4874,
+                             -0.5123,
+                             0.6498,
+                             0.2789,
+                             -0.2789,
+                             -0.6498,
+                             0.4874,
+                             0.5123,
+                             0.5123,
+                             0.4874};
+    T eig_vals_ref_h[]    = {0.0614, 0.1024, 0.3096, 3.5266};
 
     raft::allocate(eig_vectors_ref, len);
     raft::allocate(eig_vals_ref, params.n_col);
@@ -72,13 +87,19 @@ class EigTest : public ::testing::TestWithParam<EigInputs<T>> {
     raft::update_device(eig_vectors_ref, eig_vectors_ref_h, len, stream);
     raft::update_device(eig_vals_ref, eig_vals_ref_h, params.n_col, stream);
 
-    eigDC(handle, cov_matrix, params.n_row, params.n_col, eig_vectors, eig_vals,
-          stream);
+    eigDC(handle, cov_matrix, params.n_row, params.n_col, eig_vectors, eig_vals, stream);
 
-    T tol = 1.e-7;
+    T tol      = 1.e-7;
     int sweeps = 15;
-    eigJacobi(handle, cov_matrix, params.n_row, params.n_col,
-              eig_vectors_jacobi, eig_vals_jacobi, stream, tol, sweeps);
+    eigJacobi(handle,
+              cov_matrix,
+              params.n_row,
+              params.n_col,
+              eig_vectors_jacobi,
+              eig_vals_jacobi,
+              stream,
+              tol,
+              sweeps);
 
     // test code for comparing two methods
     len = params.n * params.n;
@@ -90,14 +111,20 @@ class EigTest : public ::testing::TestWithParam<EigInputs<T>> {
 
     r.uniform(cov_matrix_large, len, T(-1.0), T(1.0), stream);
 
-    eigDC(handle, cov_matrix_large, params.n, params.n, eig_vectors_large,
-          eig_vals_large, stream);
-    eigJacobi(handle, cov_matrix_large, params.n, params.n,
-              eig_vectors_jacobi_large, eig_vals_jacobi_large, stream, tol,
+    eigDC(handle, cov_matrix_large, params.n, params.n, eig_vectors_large, eig_vals_large, stream);
+    eigJacobi(handle,
+              cov_matrix_large,
+              params.n,
+              params.n,
+              eig_vectors_jacobi_large,
+              eig_vals_jacobi_large,
+              stream,
+              tol,
               sweeps);
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(cov_matrix));
     CUDA_CHECK(cudaFree(eig_vectors));
     CUDA_CHECK(cudaFree(eig_vectors_jacobi));
@@ -109,89 +136,95 @@ class EigTest : public ::testing::TestWithParam<EigInputs<T>> {
 
  protected:
   EigInputs<T> params;
-  T *cov_matrix, *eig_vectors, *eig_vectors_jacobi, *eig_vectors_ref, *eig_vals,
-    *eig_vals_jacobi, *eig_vals_ref;
+  T *cov_matrix, *eig_vectors, *eig_vectors_jacobi, *eig_vectors_ref, *eig_vals, *eig_vals_jacobi,
+    *eig_vals_ref;
 
-  T *cov_matrix_large, *eig_vectors_large, *eig_vectors_jacobi_large,
-    *eig_vals_large, *eig_vals_jacobi_large;
+  T *cov_matrix_large, *eig_vectors_large, *eig_vectors_jacobi_large, *eig_vals_large,
+    *eig_vals_jacobi_large;
 
   cudaStream_t stream;
 };
 
-const std::vector<EigInputs<float>> inputsf2 = {
-  {0.001f, 4 * 4, 4, 4, 1234ULL, 256}};
+const std::vector<EigInputs<float>> inputsf2 = {{0.001f, 4 * 4, 4, 4, 1234ULL, 256}};
 
-const std::vector<EigInputs<double>> inputsd2 = {
-  {0.001, 4 * 4, 4, 4, 1234ULL, 256}};
+const std::vector<EigInputs<double>> inputsd2 = {{0.001, 4 * 4, 4, 4, 1234ULL, 256}};
 
 typedef EigTest<float> EigTestValF;
-TEST_P(EigTestValF, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(eig_vals_ref, eig_vals, params.n_col,
-                      raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(EigTestValF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    eig_vals_ref, eig_vals, params.n_col, raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef EigTest<double> EigTestValD;
-TEST_P(EigTestValD, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(eig_vals_ref, eig_vals, params.n_col,
-                      raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(EigTestValD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    eig_vals_ref, eig_vals, params.n_col, raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
 typedef EigTest<float> EigTestVecF;
-TEST_P(EigTestVecF, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(eig_vectors_ref, eig_vectors, params.len,
-                      raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(EigTestVecF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    eig_vectors_ref, eig_vectors, params.len, raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef EigTest<double> EigTestVecD;
-TEST_P(EigTestVecD, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(eig_vectors_ref, eig_vectors, params.len,
-                      raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(EigTestVecD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    eig_vectors_ref, eig_vectors, params.len, raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
 typedef EigTest<float> EigTestValJacobiF;
-TEST_P(EigTestValJacobiF, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(eig_vals_ref, eig_vals_jacobi, params.n_col,
-                      raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(EigTestValJacobiF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    eig_vals_ref, eig_vals_jacobi, params.n_col, raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef EigTest<double> EigTestValJacobiD;
-TEST_P(EigTestValJacobiD, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(eig_vals_ref, eig_vals_jacobi, params.n_col,
-                      raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(EigTestValJacobiD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    eig_vals_ref, eig_vals_jacobi, params.n_col, raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
 typedef EigTest<float> EigTestVecJacobiF;
-TEST_P(EigTestVecJacobiF, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(eig_vectors_ref, eig_vectors_jacobi, params.len,
-                      raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(EigTestVecJacobiF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(eig_vectors_ref,
+                                eig_vectors_jacobi,
+                                params.len,
+                                raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef EigTest<double> EigTestVecJacobiD;
-TEST_P(EigTestVecJacobiD, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(eig_vectors_ref, eig_vectors_jacobi, params.len,
-                      raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(EigTestVecJacobiD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(eig_vectors_ref,
+                                eig_vectors_jacobi,
+                                params.len,
+                                raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
 typedef EigTest<float> EigTestVecCompareF;
-TEST_P(EigTestVecCompareF, Result) {
-  ASSERT_TRUE(raft::devArrMatch(
-    eig_vectors_large, eig_vectors_jacobi_large, (params.n * params.n),
-    raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(EigTestVecCompareF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(eig_vectors_large,
+                                eig_vectors_jacobi_large,
+                                (params.n * params.n),
+                                raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef EigTest<double> EigTestVecCompareD;
-TEST_P(EigTestVecCompareD, Result) {
-  ASSERT_TRUE(raft::devArrMatch(
-    eig_vectors_large, eig_vectors_jacobi_large, (params.n * params.n),
-    raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(EigTestVecCompareD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(eig_vectors_large,
+                                eig_vectors_jacobi_large,
+                                (params.n * params.n),
+                                raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
 INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValF, ::testing::ValuesIn(inputsf2));
@@ -202,17 +235,13 @@ INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecF, ::testing::ValuesIn(inputsf2));
 
 INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecD, ::testing::ValuesIn(inputsd2));
 
-INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiF,
-                         ::testing::ValuesIn(inputsf2));
+INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiF, ::testing::ValuesIn(inputsf2));
 
-INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiD,
-                         ::testing::ValuesIn(inputsd2));
+INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiD, ::testing::ValuesIn(inputsd2));
 
-INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiF,
-                         ::testing::ValuesIn(inputsf2));
+INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiF, ::testing::ValuesIn(inputsf2));
 
-INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiD,
-                         ::testing::ValuesIn(inputsd2));
+INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiD, ::testing::ValuesIn(inputsd2));
 
 }  // namespace linalg
 }  // namespace raft
diff --git a/cpp/test/linalg/eig_sel.cu b/cpp/test/linalg/eig_sel.cu
index b3980f281d..b3cfb19174 100644
--- a/cpp/test/linalg/eig_sel.cu
+++ b/cpp/test/linalg/eig_sel.cu
@@ -37,32 +37,44 @@ struct EigSelInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os, const EigSelInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const EigSelInputs<T>& dims)
+{
   return os;
 }
 
 template <typename T>
 class EigSelTest : public ::testing::TestWithParam<EigSelInputs<T>> {
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     raft::handle_t handle;
     stream = handle.get_stream();
 
-    params = ::testing::TestWithParam<EigSelInputs<T>>::GetParam();
+    params  = ::testing::TestWithParam<EigSelInputs<T>>::GetParam();
     int len = params.len;
 
     raft::allocate(cov_matrix, len);
-    T cov_matrix_h[] = {1.0,  0.9, 0.81, 0.729, 0.9,   1.0,  0.9, 0.81,
-                        0.81, 0.9, 1.0,  0.9,   0.729, 0.81, 0.9, 1.0};
+    T cov_matrix_h[] = {
+      1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0};
     ASSERT(len == 16, "This test only works with 4x4 matrices!");
     raft::update_device(cov_matrix, cov_matrix_h, len, stream);
 
     raft::allocate(eig_vectors, 12);
     raft::allocate(eig_vals, params.n_col);
 
-    T eig_vectors_ref_h[] = {-0.5123, 0.4874,  0.4874, -0.5123, 0.6498, 0.2789,
-                             -0.2789, -0.6498, 0.4874, 0.5123,  0.5123, 0.4874};
-    T eig_vals_ref_h[] = {0.1024, 0.3096, 3.5266, 3.5266};
+    T eig_vectors_ref_h[] = {-0.5123,
+                             0.4874,
+                             0.4874,
+                             -0.5123,
+                             0.6498,
+                             0.2789,
+                             -0.2789,
+                             -0.6498,
+                             0.4874,
+                             0.5123,
+                             0.5123,
+                             0.4874};
+    T eig_vals_ref_h[]    = {0.1024, 0.3096, 3.5266, 3.5266};
 
     raft::allocate(eig_vectors_ref, 12);
     raft::allocate(eig_vals_ref, params.n_col);
@@ -70,11 +82,19 @@ class EigSelTest : public ::testing::TestWithParam<EigSelInputs<T>> {
     raft::update_device(eig_vectors_ref, eig_vectors_ref_h, 12, stream);
     raft::update_device(eig_vals_ref, eig_vals_ref_h, 4, stream);
 
-    eigSelDC(handle, cov_matrix, params.n_row, params.n_col, 3, eig_vectors,
-             eig_vals, EigVecMemUsage::OVERWRITE_INPUT, stream);
+    eigSelDC(handle,
+             cov_matrix,
+             params.n_row,
+             params.n_col,
+             3,
+             eig_vectors,
+             eig_vals,
+             EigVecMemUsage::OVERWRITE_INPUT,
+             stream);
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(cov_matrix));
     CUDA_CHECK(cudaFree(eig_vectors));
     CUDA_CHECK(cudaFree(eig_vals));
@@ -89,51 +109,45 @@ class EigSelTest : public ::testing::TestWithParam<EigSelInputs<T>> {
   cudaStream_t stream;
 };
 
-const std::vector<EigSelInputs<float>> inputsf2 = {
-  {0.001f, 4 * 4, 4, 4, 1234ULL, 256}};
+const std::vector<EigSelInputs<float>> inputsf2 = {{0.001f, 4 * 4, 4, 4, 1234ULL, 256}};
 
-const std::vector<EigSelInputs<double>> inputsd2 = {
-  {0.001, 4 * 4, 4, 4, 1234ULL, 256}};
+const std::vector<EigSelInputs<double>> inputsd2 = {{0.001, 4 * 4, 4, 4, 1234ULL, 256}};
 
 typedef EigSelTest<float> EigSelTestValF;
-TEST_P(EigSelTestValF, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(eig_vals_ref, eig_vals, params.n_col,
-                      raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(EigSelTestValF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    eig_vals_ref, eig_vals, params.n_col, raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef EigSelTest<double> EigSelTestValD;
-TEST_P(EigSelTestValD, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(eig_vals_ref, eig_vals, params.n_col,
-                      raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(EigSelTestValD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    eig_vals_ref, eig_vals, params.n_col, raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
 typedef EigSelTest<float> EigSelTestVecF;
-TEST_P(EigSelTestVecF, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(eig_vectors_ref, eig_vectors, 12,
-                      raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(EigSelTestVecF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    eig_vectors_ref, eig_vectors, 12, raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef EigSelTest<double> EigSelTestVecD;
-TEST_P(EigSelTestVecD, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(eig_vectors_ref, eig_vectors, 12,
-                      raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(EigSelTestVecD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    eig_vectors_ref, eig_vectors, 12, raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValF,
-                         ::testing::ValuesIn(inputsf2));
+INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValF, ::testing::ValuesIn(inputsf2));
 
-INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValD,
-                         ::testing::ValuesIn(inputsd2));
+INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValD, ::testing::ValuesIn(inputsd2));
 
-INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecF,
-                         ::testing::ValuesIn(inputsf2));
+INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecF, ::testing::ValuesIn(inputsf2));
 
-INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecD,
-                         ::testing::ValuesIn(inputsd2));
+INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecD, ::testing::ValuesIn(inputsd2));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/eltwise.cu b/cpp/test/linalg/eltwise.cu
index 572951c557..f0e04403e8 100644
--- a/cpp/test/linalg/eltwise.cu
+++ b/cpp/test/linalg/eltwise.cu
@@ -26,19 +26,17 @@ namespace linalg {
 //// Testing unary ops
 
 template <typename Type>
-__global__ void naiveScaleKernel(Type *out, const Type *in, Type scalar,
-                                 int len) {
+__global__ void naiveScaleKernel(Type* out, const Type* in, Type scalar, int len)
+{
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < len) {
-    out[idx] = scalar * in[idx];
-  }
+  if (idx < len) { out[idx] = scalar * in[idx]; }
 }
 
 template <typename Type>
-void naiveScale(Type *out, const Type *in, Type scalar, int len,
-                cudaStream_t stream) {
+void naiveScale(Type* out, const Type* in, Type scalar, int len, cudaStream_t stream)
+{
   static const int TPB = 64;
-  int nblks = raft::ceildiv(len, TPB);
+  int nblks            = raft::ceildiv(len, TPB);
   naiveScaleKernel<Type><<<nblks, TPB, 0, stream>>>(out, in, scalar, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
@@ -52,19 +50,19 @@ struct ScalarMultiplyInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os,
-                           const ScalarMultiplyInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const ScalarMultiplyInputs<T>& dims)
+{
   return os;
 }
 
 template <typename T>
-class ScalarMultiplyTest
-  : public ::testing::TestWithParam<ScalarMultiplyInputs<T>> {
+class ScalarMultiplyTest : public ::testing::TestWithParam<ScalarMultiplyInputs<T>> {
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     params = ::testing::TestWithParam<ScalarMultiplyInputs<T>>::GetParam();
     raft::random::Rng r(params.seed);
-    int len = params.len;
+    int len  = params.len;
     T scalar = params.scalar;
 
     cudaStream_t stream;
@@ -78,7 +76,8 @@ class ScalarMultiplyTest
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(in));
     CUDA_CHECK(cudaFree(out_ref));
     CUDA_CHECK(cudaFree(out));
@@ -89,46 +88,41 @@ class ScalarMultiplyTest
   T *in, *out_ref, *out;
 };
 
-const std::vector<ScalarMultiplyInputs<float>> inputsf1 = {
-  {0.000001f, 1024 * 1024, 2.f, 1234ULL}};
+const std::vector<ScalarMultiplyInputs<float>> inputsf1 = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}};
 
 const std::vector<ScalarMultiplyInputs<double>> inputsd1 = {
   {0.00000001, 1024 * 1024, 2.0, 1234ULL}};
 
 typedef ScalarMultiplyTest<float> ScalarMultiplyTestF;
-TEST_P(ScalarMultiplyTestF, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(ScalarMultiplyTestF, Result)
+{
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<float>(params.tolerance)));
 }
 
 typedef ScalarMultiplyTest<double> ScalarMultiplyTestD;
-TEST_P(ScalarMultiplyTestD, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(ScalarMultiplyTestD, Result)
+{
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestF,
-                         ::testing::ValuesIn(inputsf1));
+INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestF, ::testing::ValuesIn(inputsf1));
 
-INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestD,
-                         ::testing::ValuesIn(inputsd1));
+INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestD, ::testing::ValuesIn(inputsd1));
 
 //// Testing binary ops
 
 template <typename Type>
-__global__ void naiveAddKernel(Type *out, const Type *in1, const Type *in2,
-                               int len) {
+__global__ void naiveAddKernel(Type* out, const Type* in1, const Type* in2, int len)
+{
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < len) {
-    out[idx] = in1[idx] + in2[idx];
-  }
+  if (idx < len) { out[idx] = in1[idx] + in2[idx]; }
 }
 
 template <typename Type>
-void naiveAdd(Type *out, const Type *in1, const Type *in2, int len,
-              cudaStream_t stream) {
+void naiveAdd(Type* out, const Type* in1, const Type* in2, int len, cudaStream_t stream)
+{
   static const int TPB = 64;
-  int nblks = raft::ceildiv(len, TPB);
+  int nblks            = raft::ceildiv(len, TPB);
   naiveAddKernel<Type><<<nblks, TPB, 0, stream>>>(out, in1, in2, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
@@ -141,15 +135,16 @@ struct EltwiseAddInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os,
-                           const EltwiseAddInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const EltwiseAddInputs<T>& dims)
+{
   return os;
 }
 
 template <typename T>
 class EltwiseAddTest : public ::testing::TestWithParam<EltwiseAddInputs<T>> {
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     params = ::testing::TestWithParam<EltwiseAddInputs<T>>::GetParam();
     raft::random::Rng r(params.seed);
 
@@ -167,7 +162,8 @@ class EltwiseAddTest : public ::testing::TestWithParam<EltwiseAddInputs<T>> {
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(in1));
     CUDA_CHECK(cudaFree(in2));
     CUDA_CHECK(cudaFree(out_ref));
@@ -179,29 +175,25 @@ class EltwiseAddTest : public ::testing::TestWithParam<EltwiseAddInputs<T>> {
   T *in1, *in2, *out_ref, *out;
 };
 
-const std::vector<EltwiseAddInputs<float>> inputsf2 = {
-  {0.000001f, 1024 * 1024, 1234ULL}};
+const std::vector<EltwiseAddInputs<float>> inputsf2 = {{0.000001f, 1024 * 1024, 1234ULL}};
 
-const std::vector<EltwiseAddInputs<double>> inputsd2 = {
-  {0.00000001, 1024 * 1024, 1234ULL}};
+const std::vector<EltwiseAddInputs<double>> inputsd2 = {{0.00000001, 1024 * 1024, 1234ULL}};
 
 typedef EltwiseAddTest<float> EltwiseAddTestF;
-TEST_P(EltwiseAddTestF, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(EltwiseAddTestF, Result)
+{
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<float>(params.tolerance)));
 }
 
 typedef EltwiseAddTest<double> EltwiseAddTestD;
-TEST_P(EltwiseAddTestD, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(EltwiseAddTestD, Result)
+{
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestF,
-                         ::testing::ValuesIn(inputsf2));
+INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestF, ::testing::ValuesIn(inputsf2));
 
-INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestD,
-                         ::testing::ValuesIn(inputsd2));
+INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestD, ::testing::ValuesIn(inputsd2));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/gemm_layout.cu b/cpp/test/linalg/gemm_layout.cu
index cecfc5eb8e..e95dbbc502 100644
--- a/cpp/test/linalg/gemm_layout.cu
+++ b/cpp/test/linalg/gemm_layout.cu
@@ -36,9 +36,9 @@ struct GemmLayoutInputs {
 
 // Reference GEMM implementation.
 template <typename T>
-__global__ void naiveGemm(T *Z, T *X, T *Y, int M, int N, int K,
-                          bool isZColMajor, bool isXColMajor,
-                          bool isYColMajor) {
+__global__ void naiveGemm(
+  T* Z, T* X, T* Y, int M, int N, int K, bool isZColMajor, bool isXColMajor, bool isYColMajor)
+{
   int tidx = blockIdx.x * blockDim.x + threadIdx.x;
   int tidy = blockIdx.y * blockDim.y + threadIdx.y;
 
@@ -51,7 +51,7 @@ __global__ void naiveGemm(T *Z, T *X, T *Y, int M, int N, int K,
         temp += X[xIndex] * Y[yIndex];
       }
       int zIndex = isZColMajor ? m + n * M : m * N + n;
-      Z[zIndex] = temp;
+      Z[zIndex]  = temp;
     }
   }
 }
@@ -59,7 +59,8 @@ __global__ void naiveGemm(T *Z, T *X, T *Y, int M, int N, int K,
 template <typename T>
 class GemmLayoutTest : public ::testing::TestWithParam<GemmLayoutInputs<T>> {
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     params = ::testing::TestWithParam<GemmLayoutInputs<T>>::GetParam();
 
     raft::handle_t handle;
@@ -72,8 +73,8 @@ class GemmLayoutTest : public ::testing::TestWithParam<GemmLayoutInputs<T>> {
     // Dimensions of Y : K x N
     // Dimensions of Z : M x N
 
-    T *X = NULL;  // Argument X
-    T *Y = NULL;  // Argument Y
+    T* X = NULL;  // Argument X
+    T* Y = NULL;  // Argument Y
 
     size_t xElems = params.M * params.K;
     size_t yElems = params.K * params.N;
@@ -87,27 +88,35 @@ class GemmLayoutTest : public ::testing::TestWithParam<GemmLayoutInputs<T>> {
     r.uniform(X, xElems, T(-10.0), T(10.0), stream);
     r.uniform(Y, yElems, T(-10.0), T(10.0), stream);
 
-    dim3 blocks(raft::ceildiv<int>(params.M, 128),
-                raft::ceildiv<int>(params.N, 4), 1);
+    dim3 blocks(raft::ceildiv<int>(params.M, 128), raft::ceildiv<int>(params.N, 4), 1);
     dim3 threads(128, 4, 1);
 
-    naiveGemm<<<blocks, threads>>>(refZ, X, Y, params.M, params.N, params.K,
-                                   params.zLayout, params.xLayout,
-                                   params.yLayout);
-
-    gemm(handle, Z, X, Y, params.M, params.N, params.K, params.zLayout,
-         params.xLayout, params.yLayout, stream);
+    naiveGemm<<<blocks, threads>>>(
+      refZ, X, Y, params.M, params.N, params.K, params.zLayout, params.xLayout, params.yLayout);
+
+    gemm(handle,
+         Z,
+         X,
+         Y,
+         params.M,
+         params.N,
+         params.K,
+         params.zLayout,
+         params.xLayout,
+         params.yLayout,
+         stream);
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(refZ));
     CUDA_CHECK(cudaFree(Z));
   }
 
  protected:
   GemmLayoutInputs<T> params;
-  T *refZ = NULL;  // Reference result for comparison
-  T *Z = NULL;     // Computed result
+  T* refZ = NULL;  // Reference result for comparison
+  T* Z    = NULL;  // Computed result
 };
 
 const std::vector<GemmLayoutInputs<float>> inputsf = {
@@ -131,22 +140,20 @@ const std::vector<GemmLayoutInputs<double>> inputsd = {
   {50, 80, 60, false, false, false, 893038ULL}};
 
 typedef GemmLayoutTest<float> GemmLayoutTestF;
-TEST_P(GemmLayoutTestF, Result) {
-  ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N,
-                                raft::CompareApprox<float>(1e-4)));
+TEST_P(GemmLayoutTestF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, raft::CompareApprox<float>(1e-4)));
 }
 
 typedef GemmLayoutTest<double> GemmLayoutTestD;
-TEST_P(GemmLayoutTestD, Result) {
-  ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N,
-                                raft::CompareApprox<float>(1e-6)));
+TEST_P(GemmLayoutTestD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, raft::CompareApprox<float>(1e-6)));
 }
 
-INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestF,
-                         ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestF, ::testing::ValuesIn(inputsf));
 
-INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestD,
-                         ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestD, ::testing::ValuesIn(inputsd));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/map.cu b/cpp/test/linalg/map.cu
index 227bce6a48..0e33d9758f 100644
--- a/cpp/test/linalg/map.cu
+++ b/cpp/test/linalg/map.cu
@@ -26,13 +26,22 @@ namespace raft {
 namespace linalg {
 
 template <typename InType, typename IdxType, typename OutType>
-void mapLaunch(OutType *out, const InType *in1, const InType *in2,
-               const InType *in3, InType scalar, IdxType len,
-               cudaStream_t stream) {
+void mapLaunch(OutType* out,
+               const InType* in1,
+               const InType* in2,
+               const InType* in3,
+               InType scalar,
+               IdxType len,
+               cudaStream_t stream)
+{
   map(
-    out, len,
+    out,
+    len,
     [=] __device__(InType a, InType b, InType c) { return a + b + c + scalar; },
-    stream, in1, in2, in3);
+    stream,
+    in1,
+    in2,
+    in3);
 }
 
 template <typename InType, typename IdxType = int, typename OutType = InType>
@@ -44,10 +53,15 @@ struct MapInputs {
 };
 
 template <typename InType, typename IdxType, typename OutType = InType>
-void create_ref(OutType *out_ref, const InType *in1, const InType *in2,
-                const InType *in3, InType scalar, IdxType len,
-                cudaStream_t stream) {
-  InType *tmp;
+void create_ref(OutType* out_ref,
+                const InType* in1,
+                const InType* in2,
+                const InType* in3,
+                InType scalar,
+                IdxType len,
+                cudaStream_t stream)
+{
+  InType* tmp;
   allocate(tmp, len);
   eltwiseAdd(tmp, in1, in2, len, stream);
   eltwiseAdd(out_ref, tmp, in3, len, stream);
@@ -56,12 +70,11 @@ void create_ref(OutType *out_ref, const InType *in1, const InType *in2,
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
-class MapTest
-  : public ::testing::TestWithParam<MapInputs<InType, IdxType, OutType>> {
+class MapTest : public ::testing::TestWithParam<MapInputs<InType, IdxType, OutType>> {
  protected:
-  void SetUp() override {
-    params =
-      ::testing::TestWithParam<MapInputs<InType, IdxType, OutType>>::GetParam();
+  void SetUp() override
+  {
+    params = ::testing::TestWithParam<MapInputs<InType, IdxType, OutType>>::GetParam();
     raft::random::Rng r(params.seed);
 
     cudaStream_t stream;
@@ -81,7 +94,8 @@ class MapTest
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(in1));
     CUDA_CHECK(cudaFree(in2));
     CUDA_CHECK(cudaFree(in3));
@@ -95,55 +109,47 @@ class MapTest
   OutType *out_ref, *out;
 };
 
-const std::vector<MapInputs<float, int>> inputsf_i32 = {
-  {0.000001f, 1024 * 1024, 1234ULL, 3.2}};
+const std::vector<MapInputs<float, int>> inputsf_i32 = {{0.000001f, 1024 * 1024, 1234ULL, 3.2}};
 typedef MapTest<float, int> MapTestF_i32;
-TEST_P(MapTestF_i32, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(MapTestF_i32, Result)
+{
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32,
-                         ::testing::ValuesIn(inputsf_i32));
+INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32, ::testing::ValuesIn(inputsf_i32));
 
-const std::vector<MapInputs<float, size_t>> inputsf_i64 = {
-  {0.000001f, 1024 * 1024, 1234ULL, 9.4}};
+const std::vector<MapInputs<float, size_t>> inputsf_i64 = {{0.000001f, 1024 * 1024, 1234ULL, 9.4}};
 typedef MapTest<float, size_t> MapTestF_i64;
-TEST_P(MapTestF_i64, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(MapTestF_i64, Result)
+{
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i64,
-                         ::testing::ValuesIn(inputsf_i64));
+INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i64, ::testing::ValuesIn(inputsf_i64));
 
 const std::vector<MapInputs<float, int, double>> inputsf_i32_d = {
   {0.000001f, 1024 * 1024, 1234ULL, 5.9}};
 typedef MapTest<float, int, double> MapTestF_i32_D;
-TEST_P(MapTestF_i32_D, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(MapTestF_i32_D, Result)
+{
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32_D,
-                         ::testing::ValuesIn(inputsf_i32_d));
+INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32_D, ::testing::ValuesIn(inputsf_i32_d));
 
-const std::vector<MapInputs<double, int>> inputsd_i32 = {
-  {0.00000001, 1024 * 1024, 1234ULL, 7.5}};
+const std::vector<MapInputs<double, int>> inputsd_i32 = {{0.00000001, 1024 * 1024, 1234ULL, 7.5}};
 typedef MapTest<double, int> MapTestD_i32;
-TEST_P(MapTestD_i32, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(MapTestD_i32, Result)
+{
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i32,
-                         ::testing::ValuesIn(inputsd_i32));
+INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i32, ::testing::ValuesIn(inputsd_i32));
 
 const std::vector<MapInputs<double, size_t>> inputsd_i64 = {
   {0.00000001, 1024 * 1024, 1234ULL, 5.2}};
 typedef MapTest<double, size_t> MapTestD_i64;
-TEST_P(MapTestD_i64, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(MapTestD_i64, Result)
+{
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i64,
-                         ::testing::ValuesIn(inputsd_i64));
+INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i64, ::testing::ValuesIn(inputsd_i64));
 
 }  // namespace linalg
 }  // namespace raft
diff --git a/cpp/test/linalg/map_then_reduce.cu b/cpp/test/linalg/map_then_reduce.cu
index 6e146fa4bb..a1b82e7644 100644
--- a/cpp/test/linalg/map_then_reduce.cu
+++ b/cpp/test/linalg/map_then_reduce.cu
@@ -25,21 +25,18 @@ namespace raft {
 namespace linalg {
 
 template <typename InType, typename OutType, typename MapOp>
-__global__ void naiveMapReduceKernel(OutType *out, const InType *in, size_t len,
-                                     MapOp map) {
+__global__ void naiveMapReduceKernel(OutType* out, const InType* in, size_t len, MapOp map)
+{
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < len) {
-    raft::myAtomicAdd(out, (OutType)map(in[idx]));
-  }
+  if (idx < len) { raft::myAtomicAdd(out, (OutType)map(in[idx])); }
 }
 
 template <typename InType, typename OutType, typename MapOp>
-void naiveMapReduce(OutType *out, const InType *in, size_t len, MapOp map,
-                    cudaStream_t stream) {
+void naiveMapReduce(OutType* out, const InType* in, size_t len, MapOp map, cudaStream_t stream)
+{
   static const int TPB = 64;
-  int nblks = raft::ceildiv(len, (size_t)TPB);
-  naiveMapReduceKernel<InType, OutType, MapOp>
-    <<<nblks, TPB, 0, stream>>>(out, in, len, map);
+  int nblks            = raft::ceildiv(len, (size_t)TPB);
+  naiveMapReduceKernel<InType, OutType, MapOp><<<nblks, TPB, 0, stream>>>(out, in, len, map);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -51,7 +48,8 @@ struct MapReduceInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os, const MapReduceInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const MapReduceInputs<T>& dims)
+{
   return os;
 }
 
@@ -59,8 +57,9 @@ template <typename T>
 // for an extended __device__ lambda cannot have private or protected access
 // within its class
 template <typename InType, typename OutType>
-void mapReduceLaunch(OutType *out_ref, OutType *out, const InType *in,
-                     size_t len, cudaStream_t stream) {
+void mapReduceLaunch(
+  OutType* out_ref, OutType* out, const InType* in, size_t len, cudaStream_t stream)
+{
   auto op = [] __device__(InType in) { return in; };
   naiveMapReduce(out_ref, in, len, op, stream);
   mapThenSumReduce(out, len, op, 0, in);
@@ -69,7 +68,8 @@ void mapReduceLaunch(OutType *out_ref, OutType *out, const InType *in,
 template <typename InType, typename OutType>
 class MapReduceTest : public ::testing::TestWithParam<MapReduceInputs<InType>> {
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     params = ::testing::TestWithParam<MapReduceInputs<InType>>::GetParam();
     raft::random::Rng r(params.seed);
     auto len = params.len;
@@ -84,7 +84,8 @@ class MapReduceTest : public ::testing::TestWithParam<MapReduceInputs<InType>> {
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(in));
     CUDA_CHECK(cudaFree(out_ref));
     CUDA_CHECK(cudaFree(out));
@@ -92,48 +93,44 @@ class MapReduceTest : public ::testing::TestWithParam<MapReduceInputs<InType>> {
 
  protected:
   MapReduceInputs<InType> params;
-  InType *in;
+  InType* in;
   OutType *out_ref, *out;
 };
 
-const std::vector<MapReduceInputs<float>> inputsf = {
-  {0.001f, 1024 * 1024, 1234ULL}};
+const std::vector<MapReduceInputs<float>> inputsf = {{0.001f, 1024 * 1024, 1234ULL}};
 typedef MapReduceTest<float, float> MapReduceTestFF;
-TEST_P(MapReduceTestFF, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(MapReduceTestFF, Result)
+{
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFF,
-                         ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFF, ::testing::ValuesIn(inputsf));
 
 typedef MapReduceTest<float, double> MapReduceTestFD;
-TEST_P(MapReduceTestFD, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(MapReduceTestFD, Result)
+{
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFD,
-                         ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFD, ::testing::ValuesIn(inputsf));
 
-const std::vector<MapReduceInputs<double>> inputsd = {
-  {0.000001, 1024 * 1024, 1234ULL}};
+const std::vector<MapReduceInputs<double>> inputsd = {{0.000001, 1024 * 1024, 1234ULL}};
 typedef MapReduceTest<double, double> MapReduceTestDD;
-TEST_P(MapReduceTestDD, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(MapReduceTestDD, Result)
+{
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestDD,
-                         ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestDD, ::testing::ValuesIn(inputsd));
 
 template <typename T>
 class MapGenericReduceTest : public ::testing::Test {
-  using InType = typename T::first_type;
+  using InType  = typename T::first_type;
   using OutType = typename T::second_type;
 
  protected:
   MapGenericReduceTest()
     : allocator(handle.get_device_allocator()),
       input(allocator, handle.get_stream(), n),
-      output(allocator, handle.get_stream(), 1) {
+      output(allocator, handle.get_stream(), 1)
+  {
     CUDA_CHECK(cudaStreamCreate(&stream));
     handle.set_stream(stream);
     initInput(input.data(), input.size(), stream);
@@ -142,7 +139,8 @@ class MapGenericReduceTest : public ::testing::Test {
   void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); }
 
  public:
-  void initInput(InType *input, int n, cudaStream_t stream) {
+  void initInput(InType* input, int n, cudaStream_t stream)
+  {
     raft::random::Rng r(137);
     r.uniform(input, n, InType(2), InType(3), stream);
     InType val = 1;
@@ -151,21 +149,19 @@ class MapGenericReduceTest : public ::testing::Test {
     raft::update_device(input + 337, &val, 1, stream);
   }
 
-  void testMin() {
-    auto op = [] __device__(InType in) { return in; };
+  void testMin()
+  {
+    auto op               = [] __device__(InType in) { return in; };
     const OutType neutral = std::numeric_limits<InType>::max();
-    mapThenReduce(output.data(), input.size(), neutral, op, cub::Min(), stream,
-                  input.data());
-    EXPECT_TRUE(raft::devArrMatch(OutType(1), output.data(), 1,
-                                  raft::Compare<OutType>()));
+    mapThenReduce(output.data(), input.size(), neutral, op, cub::Min(), stream, input.data());
+    EXPECT_TRUE(raft::devArrMatch(OutType(1), output.data(), 1, raft::Compare<OutType>()));
   }
-  void testMax() {
-    auto op = [] __device__(InType in) { return in; };
+  void testMax()
+  {
+    auto op               = [] __device__(InType in) { return in; };
     const OutType neutral = std::numeric_limits<InType>::min();
-    mapThenReduce(output.data(), input.size(), neutral, op, cub::Max(), stream,
-                  input.data());
-    EXPECT_TRUE(raft::devArrMatch(OutType(5), output.data(), 1,
-                                  raft::Compare<OutType>()));
+    mapThenReduce(output.data(), input.size(), neutral, op, cub::Max(), stream, input.data());
+    EXPECT_TRUE(raft::devArrMatch(OutType(5), output.data(), 1, raft::Compare<OutType>()));
   }
 
  protected:
@@ -178,8 +174,7 @@ class MapGenericReduceTest : public ::testing::Test {
 };
 
 using IoTypePair =
-  ::testing::Types<std::pair<float, float>, std::pair<float, double>,
-                   std::pair<double, double>>;
+  ::testing::Types<std::pair<float, float>, std::pair<float, double>, std::pair<double, double>>;
 
 TYPED_TEST_CASE(MapGenericReduceTest, IoTypePair);
 TYPED_TEST(MapGenericReduceTest, min) { this->testMin(); }
diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu
index aa46c78b0f..6ad9bfba10 100644
--- a/cpp/test/linalg/matrix_vector_op.cu
+++ b/cpp/test/linalg/matrix_vector_op.cu
@@ -32,8 +32,8 @@ struct MatVecOpInputs {
 };
 
 template <typename T, typename IdxType>
-::std::ostream &operator<<(::std::ostream &os,
-                           const MatVecOpInputs<T, IdxType> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const MatVecOpInputs<T, IdxType>& dims)
+{
   return os;
 }
 
@@ -41,26 +41,48 @@ template <typename T, typename IdxType>
 // for an extended __device__ lambda cannot have private or protected access
 // within its class
 template <typename T, typename IdxType>
-void matrixVectorOpLaunch(T *out, const T *in, const T *vec1, const T *vec2,
-                          IdxType D, IdxType N, bool rowMajor,
-                          bool bcastAlongRows, bool useTwoVectors,
-                          cudaStream_t stream) {
+void matrixVectorOpLaunch(T* out,
+                          const T* in,
+                          const T* vec1,
+                          const T* vec2,
+                          IdxType D,
+                          IdxType N,
+                          bool rowMajor,
+                          bool bcastAlongRows,
+                          bool useTwoVectors,
+                          cudaStream_t stream)
+{
   if (useTwoVectors) {
     matrixVectorOp(
-      out, in, vec1, vec2, D, N, rowMajor, bcastAlongRows,
-      [] __device__(T a, T b, T c) { return a + b + c; }, stream);
+      out,
+      in,
+      vec1,
+      vec2,
+      D,
+      N,
+      rowMajor,
+      bcastAlongRows,
+      [] __device__(T a, T b, T c) { return a + b + c; },
+      stream);
   } else {
     matrixVectorOp(
-      out, in, vec1, D, N, rowMajor, bcastAlongRows,
-      [] __device__(T a, T b) { return a + b; }, stream);
+      out,
+      in,
+      vec1,
+      D,
+      N,
+      rowMajor,
+      bcastAlongRows,
+      [] __device__(T a, T b) { return a + b; },
+      stream);
   }
 }
 
 template <typename T, typename IdxType>
-class MatVecOpTest
-  : public ::testing::TestWithParam<MatVecOpInputs<T, IdxType>> {
+class MatVecOpTest : public ::testing::TestWithParam<MatVecOpInputs<T, IdxType>> {
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     params = ::testing::TestWithParam<MatVecOpInputs<T, IdxType>>::GetParam();
     raft::random::Rng r(params.seed);
     IdxType N = params.rows, D = params.cols;
@@ -78,18 +100,25 @@ class MatVecOpTest
     r.uniform(vec1, vecLen, (T)-1.0, (T)1.0, stream);
     r.uniform(vec2, vecLen, (T)-1.0, (T)1.0, stream);
     if (params.useTwoVectors) {
-      naiveMatVec(out_ref, in, vec1, vec2, D, N, params.rowMajor,
-                  params.bcastAlongRows, (T)1.0);
+      naiveMatVec(out_ref, in, vec1, vec2, D, N, params.rowMajor, params.bcastAlongRows, (T)1.0);
     } else {
-      naiveMatVec(out_ref, in, vec1, D, N, params.rowMajor,
-                  params.bcastAlongRows, (T)1.0);
+      naiveMatVec(out_ref, in, vec1, D, N, params.rowMajor, params.bcastAlongRows, (T)1.0);
     }
-    matrixVectorOpLaunch(out, in, vec1, vec2, D, N, params.rowMajor,
-                         params.bcastAlongRows, params.useTwoVectors, stream);
+    matrixVectorOpLaunch(out,
+                         in,
+                         vec1,
+                         vec2,
+                         D,
+                         N,
+                         params.rowMajor,
+                         params.bcastAlongRows,
+                         params.useTwoVectors,
+                         stream);
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(vec1));
     CUDA_CHECK(cudaFree(vec2));
     CUDA_CHECK(cudaFree(out));
@@ -121,23 +150,23 @@ const std::vector<MatVecOpInputs<float, int>> inputsf_i32 = {
   {0.00001f, 1024, 32, false, false, true, 1234ULL},
   {0.00001f, 1024, 64, false, false, true, 1234ULL}};
 typedef MatVecOpTest<float, int> MatVecOpTestF_i32;
-TEST_P(MatVecOpTestF_i32, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.rows * params.cols,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(MatVecOpTestF_i32, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(out_ref, out, params.rows * params.cols, CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i32,
-                         ::testing::ValuesIn(inputsf_i32));
+INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i32, ::testing::ValuesIn(inputsf_i32));
 
 const std::vector<MatVecOpInputs<float, size_t>> inputsf_i64 = {
   {0.00001f, 2500, 250, false, false, false, 1234ULL},
   {0.00001f, 2500, 250, false, false, true, 1234ULL}};
 typedef MatVecOpTest<float, size_t> MatVecOpTestF_i64;
-TEST_P(MatVecOpTestF_i64, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.rows * params.cols,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(MatVecOpTestF_i64, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(out_ref, out, params.rows * params.cols, CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i64,
-                         ::testing::ValuesIn(inputsf_i64));
+INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i64, ::testing::ValuesIn(inputsf_i64));
 
 const std::vector<MatVecOpInputs<double, int>> inputsd_i32 = {
   {0.0000001, 1024, 32, true, true, false, 1234ULL},
@@ -158,23 +187,23 @@ const std::vector<MatVecOpInputs<double, int>> inputsd_i32 = {
   {0.0000001, 1024, 32, false, false, true, 1234ULL},
   {0.0000001, 1024, 64, false, false, true, 1234ULL}};
 typedef MatVecOpTest<double, int> MatVecOpTestD_i32;
-TEST_P(MatVecOpTestD_i32, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.rows * params.cols,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(MatVecOpTestD_i32, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(out_ref, out, params.rows * params.cols, CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i32,
-                         ::testing::ValuesIn(inputsd_i32));
+INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i32, ::testing::ValuesIn(inputsd_i32));
 
 const std::vector<MatVecOpInputs<double, size_t>> inputsd_i64 = {
   {0.0000001, 2500, 250, false, false, false, 1234ULL},
   {0.0000001, 2500, 250, false, false, true, 1234ULL}};
 typedef MatVecOpTest<double, size_t> MatVecOpTestD_i64;
-TEST_P(MatVecOpTestD_i64, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.rows * params.cols,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(MatVecOpTestD_i64, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(out_ref, out, params.rows * params.cols, CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i64,
-                         ::testing::ValuesIn(inputsd_i64));
+INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i64, ::testing::ValuesIn(inputsd_i64));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/matrix_vector_op.cuh b/cpp/test/linalg/matrix_vector_op.cuh
index 69c45c9866..5f9c6f1ef3 100644
--- a/cpp/test/linalg/matrix_vector_op.cuh
+++ b/cpp/test/linalg/matrix_vector_op.cuh
@@ -22,9 +22,15 @@ namespace raft {
 namespace linalg {
 
 template <typename Type, typename IdxType = int>
-__global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec,
-                                  IdxType D, IdxType N, bool rowMajor,
-                                  bool bcastAlongRows, Type scalar) {
+__global__ void naiveMatVecKernel(Type* out,
+                                  const Type* mat,
+                                  const Type* vec,
+                                  IdxType D,
+                                  IdxType N,
+                                  bool rowMajor,
+                                  bool bcastAlongRows,
+                                  Type scalar)
+{
   IdxType idx = threadIdx.x + blockIdx.x * blockDim.x;
   IdxType len = N * D;
   IdxType col;
@@ -37,27 +43,37 @@ __global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec,
   } else {
     col = idx / N;
   }
-  if (idx < len) {
-    out[idx] = mat[idx] + scalar * vec[col];
-  }
+  if (idx < len) { out[idx] = mat[idx] + scalar * vec[col]; }
 }
 
 template <typename Type, typename IdxType = int>
-void naiveMatVec(Type *out, const Type *mat, const Type *vec, IdxType D,
-                 IdxType N, bool rowMajor, bool bcastAlongRows, Type scalar) {
+void naiveMatVec(Type* out,
+                 const Type* mat,
+                 const Type* vec,
+                 IdxType D,
+                 IdxType N,
+                 bool rowMajor,
+                 bool bcastAlongRows,
+                 Type scalar)
+{
   static const IdxType TPB = 64;
-  IdxType len = N * D;
-  IdxType nblks = raft::ceildiv(len, TPB);
-  naiveMatVecKernel<Type>
-    <<<nblks, TPB>>>(out, mat, vec, D, N, rowMajor, bcastAlongRows, scalar);
+  IdxType len              = N * D;
+  IdxType nblks            = raft::ceildiv(len, TPB);
+  naiveMatVecKernel<Type><<<nblks, TPB>>>(out, mat, vec, D, N, rowMajor, bcastAlongRows, scalar);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
 template <typename Type, typename IdxType = int>
-__global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec1,
-                                  const Type *vec2, IdxType D, IdxType N,
-                                  bool rowMajor, bool bcastAlongRows,
-                                  Type scalar) {
+__global__ void naiveMatVecKernel(Type* out,
+                                  const Type* mat,
+                                  const Type* vec1,
+                                  const Type* vec2,
+                                  IdxType D,
+                                  IdxType N,
+                                  bool rowMajor,
+                                  bool bcastAlongRows,
+                                  Type scalar)
+{
   IdxType idx = threadIdx.x + blockIdx.x * blockDim.x;
   IdxType len = N * D;
   IdxType col;
@@ -70,20 +86,25 @@ __global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec1,
   } else {
     col = idx / N;
   }
-  if (idx < len) {
-    out[idx] = mat[idx] + scalar * vec1[col] + vec2[col];
-  }
+  if (idx < len) { out[idx] = mat[idx] + scalar * vec1[col] + vec2[col]; }
 }
 
 template <typename Type, typename IdxType = int>
-void naiveMatVec(Type *out, const Type *mat, const Type *vec1, const Type *vec2,
-                 IdxType D, IdxType N, bool rowMajor, bool bcastAlongRows,
-                 Type scalar) {
+void naiveMatVec(Type* out,
+                 const Type* mat,
+                 const Type* vec1,
+                 const Type* vec2,
+                 IdxType D,
+                 IdxType N,
+                 bool rowMajor,
+                 bool bcastAlongRows,
+                 Type scalar)
+{
   static const IdxType TPB = 64;
-  IdxType len = N * D;
-  IdxType nblks = raft::ceildiv(len, TPB);
-  naiveMatVecKernel<Type><<<nblks, TPB>>>(out, mat, vec1, vec2, D, N, rowMajor,
-                                          bcastAlongRows, scalar);
+  IdxType len              = N * D;
+  IdxType nblks            = raft::ceildiv(len, TPB);
+  naiveMatVecKernel<Type>
+    <<<nblks, TPB>>>(out, mat, vec1, vec2, D, N, rowMajor, bcastAlongRows, scalar);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
diff --git a/cpp/test/linalg/multiply.cu b/cpp/test/linalg/multiply.cu
index 1d3e753de3..6c38d89891 100644
--- a/cpp/test/linalg/multiply.cu
+++ b/cpp/test/linalg/multiply.cu
@@ -27,7 +27,8 @@ namespace linalg {
 template <typename T>
 class MultiplyTest : public ::testing::TestWithParam<UnaryOpInputs<T>> {
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     params = ::testing::TestWithParam<UnaryOpInputs<T>>::GetParam();
     raft::random::Rng r(params.seed);
     int len = params.len;
@@ -43,7 +44,8 @@ class MultiplyTest : public ::testing::TestWithParam<UnaryOpInputs<T>> {
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(in));
     CUDA_CHECK(cudaFree(out_ref));
     CUDA_CHECK(cudaFree(out));
@@ -54,25 +56,21 @@ class MultiplyTest : public ::testing::TestWithParam<UnaryOpInputs<T>> {
   T *in, *out_ref, *out;
 };
 
-const std::vector<UnaryOpInputs<float>> inputsf = {
-  {0.000001f, 1024 * 1024, 2.f, 1234ULL}};
+const std::vector<UnaryOpInputs<float>> inputsf = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}};
 typedef MultiplyTest<float> MultiplyTestF;
-TEST_P(MultiplyTestF, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
-                          raft::CompareApprox<float>(params.tolerance)));
+TEST_P(MultiplyTestF, Result)
+{
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestF,
-                         ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestF, ::testing::ValuesIn(inputsf));
 
 typedef MultiplyTest<double> MultiplyTestD;
-const std::vector<UnaryOpInputs<double>> inputsd = {
-  {0.000001f, 1024 * 1024, 2.f, 1234ULL}};
-TEST_P(MultiplyTestD, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
-                          raft::CompareApprox<double>(params.tolerance)));
+const std::vector<UnaryOpInputs<double>> inputsd = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}};
+TEST_P(MultiplyTestD, Result)
+{
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestD,
-                         ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestD, ::testing::ValuesIn(inputsd));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/norm.cu b/cpp/test/linalg/norm.cu
index acc25addd0..35bc72dee4 100644
--- a/cpp/test/linalg/norm.cu
+++ b/cpp/test/linalg/norm.cu
@@ -34,17 +34,19 @@ struct NormInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os, const NormInputs<T> &I) {
-  os << "{ " << I.tolerance << ", " << I.rows << ", " << I.cols << ", "
-     << I.type << ", " << I.do_sqrt << ", " << I.seed << '}' << std::endl;
+::std::ostream& operator<<(::std::ostream& os, const NormInputs<T>& I)
+{
+  os << "{ " << I.tolerance << ", " << I.rows << ", " << I.cols << ", " << I.type << ", "
+     << I.do_sqrt << ", " << I.seed << '}' << std::endl;
   return os;
 }
 
 ///// Row-wise norm test definitions
 template <typename Type>
-__global__ void naiveRowNormKernel(Type *dots, const Type *data, int D, int N,
-                                   NormType type, bool do_sqrt) {
-  Type acc = (Type)0;
+__global__ void naiveRowNormKernel(
+  Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt)
+{
+  Type acc     = (Type)0;
   int rowStart = threadIdx.x + blockIdx.x * blockDim.x;
   if (rowStart < N) {
     for (int i = 0; i < D; ++i) {
@@ -59,19 +61,20 @@ __global__ void naiveRowNormKernel(Type *dots, const Type *data, int D, int N,
 }
 
 template <typename Type>
-void naiveRowNorm(Type *dots, const Type *data, int D, int N, NormType type,
-                  bool do_sqrt, cudaStream_t stream) {
+void naiveRowNorm(
+  Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt, cudaStream_t stream)
+{
   static const int TPB = 64;
-  int nblks = raft::ceildiv(N, TPB);
-  naiveRowNormKernel<Type>
-    <<<nblks, TPB, 0, stream>>>(dots, data, D, N, type, do_sqrt);
+  int nblks            = raft::ceildiv(N, TPB);
+  naiveRowNormKernel<Type><<<nblks, TPB, 0, stream>>>(dots, data, D, N, type, do_sqrt);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
 template <typename T>
 class RowNormTest : public ::testing::TestWithParam<NormInputs<T>> {
  public:
-  void SetUp() override {
+  void SetUp() override
+  {
     CUDA_CHECK(cudaStreamCreate(&stream));
     params = ::testing::TestWithParam<NormInputs<T>>::GetParam();
     raft::random::Rng r(params.seed);
@@ -82,19 +85,18 @@ class RowNormTest : public ::testing::TestWithParam<NormInputs<T>> {
     raft::allocate(dots_exp, rows);
     raft::allocate(dots_act, rows);
     r.uniform(data, len, T(-1.0), T(1.0), stream);
-    naiveRowNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt,
-                 stream);
+    naiveRowNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt, stream);
     if (params.do_sqrt) {
       auto fin_op = [] __device__(T in) { return raft::mySqrt(in); };
-      rowNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream,
-              fin_op);
+      rowNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream, fin_op);
     } else {
       rowNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream);
     }
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(data));
     CUDA_CHECK(cudaFree(dots_exp));
     CUDA_CHECK(cudaFree(dots_act));
@@ -109,10 +111,11 @@ class RowNormTest : public ::testing::TestWithParam<NormInputs<T>> {
 
 ///// Column-wise norm test definitisons
 template <typename Type>
-__global__ void naiveColNormKernel(Type *dots, const Type *data, int D, int N,
-                                   NormType type, bool do_sqrt) {
+__global__ void naiveColNormKernel(
+  Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt)
+{
   int colID = threadIdx.x + blockIdx.x * blockDim.x;
-  if (colID > D) return;  //avoid out-of-bounds thread
+  if (colID > D) return;  // avoid out-of-bounds thread
 
   Type acc = 0;
   for (int i = 0; i < N; i++) {
@@ -124,19 +127,20 @@ __global__ void naiveColNormKernel(Type *dots, const Type *data, int D, int N,
 }
 
 template <typename Type>
-void naiveColNorm(Type *dots, const Type *data, int D, int N, NormType type,
-                  bool do_sqrt, cudaStream_t stream) {
+void naiveColNorm(
+  Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt, cudaStream_t stream)
+{
   static const int TPB = 64;
-  int nblks = raft::ceildiv(D, TPB);
-  naiveColNormKernel<Type>
-    <<<nblks, TPB, 0, stream>>>(dots, data, D, N, type, do_sqrt);
+  int nblks            = raft::ceildiv(D, TPB);
+  naiveColNormKernel<Type><<<nblks, TPB, 0, stream>>>(dots, data, D, N, type, do_sqrt);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
 template <typename T>
 class ColNormTest : public ::testing::TestWithParam<NormInputs<T>> {
  public:
-  void SetUp() override {
+  void SetUp() override
+  {
     CUDA_CHECK(cudaStreamCreate(&stream));
     params = ::testing::TestWithParam<NormInputs<T>>::GetParam();
     raft::random::Rng r(params.seed);
@@ -148,19 +152,18 @@ class ColNormTest : public ::testing::TestWithParam<NormInputs<T>> {
     raft::allocate(dots_exp, cols);
     raft::allocate(dots_act, cols);
 
-    naiveColNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt,
-                 stream);
+    naiveColNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt, stream);
     if (params.do_sqrt) {
       auto fin_op = [] __device__(T in) { return raft::mySqrt(in); };
-      colNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream,
-              fin_op);
+      colNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream, fin_op);
     } else {
       colNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream);
     }
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(data));
     CUDA_CHECK(cudaFree(dots_exp));
     CUDA_CHECK(cudaFree(dots_act));
@@ -174,24 +177,23 @@ class ColNormTest : public ::testing::TestWithParam<NormInputs<T>> {
 };
 
 ///// Row- and column-wise tests
-const std::vector<NormInputs<float>> inputsf = {
-  {0.00001f, 1024, 32, L1Norm, false, true, 1234ULL},
-  {0.00001f, 1024, 64, L1Norm, false, true, 1234ULL},
-  {0.00001f, 1024, 128, L1Norm, false, true, 1234ULL},
-  {0.00001f, 1024, 256, L1Norm, false, true, 1234ULL},
-  {0.00001f, 1024, 32, L2Norm, false, true, 1234ULL},
-  {0.00001f, 1024, 64, L2Norm, false, true, 1234ULL},
-  {0.00001f, 1024, 128, L2Norm, false, true, 1234ULL},
-  {0.00001f, 1024, 256, L2Norm, false, true, 1234ULL},
-
-  {0.00001f, 1024, 32, L1Norm, true, true, 1234ULL},
-  {0.00001f, 1024, 64, L1Norm, true, true, 1234ULL},
-  {0.00001f, 1024, 128, L1Norm, true, true, 1234ULL},
-  {0.00001f, 1024, 256, L1Norm, true, true, 1234ULL},
-  {0.00001f, 1024, 32, L2Norm, true, true, 1234ULL},
-  {0.00001f, 1024, 64, L2Norm, true, true, 1234ULL},
-  {0.00001f, 1024, 128, L2Norm, true, true, 1234ULL},
-  {0.00001f, 1024, 256, L2Norm, true, true, 1234ULL}};
+const std::vector<NormInputs<float>> inputsf = {{0.00001f, 1024, 32, L1Norm, false, true, 1234ULL},
+                                                {0.00001f, 1024, 64, L1Norm, false, true, 1234ULL},
+                                                {0.00001f, 1024, 128, L1Norm, false, true, 1234ULL},
+                                                {0.00001f, 1024, 256, L1Norm, false, true, 1234ULL},
+                                                {0.00001f, 1024, 32, L2Norm, false, true, 1234ULL},
+                                                {0.00001f, 1024, 64, L2Norm, false, true, 1234ULL},
+                                                {0.00001f, 1024, 128, L2Norm, false, true, 1234ULL},
+                                                {0.00001f, 1024, 256, L2Norm, false, true, 1234ULL},
+
+                                                {0.00001f, 1024, 32, L1Norm, true, true, 1234ULL},
+                                                {0.00001f, 1024, 64, L1Norm, true, true, 1234ULL},
+                                                {0.00001f, 1024, 128, L1Norm, true, true, 1234ULL},
+                                                {0.00001f, 1024, 256, L1Norm, true, true, 1234ULL},
+                                                {0.00001f, 1024, 32, L2Norm, true, true, 1234ULL},
+                                                {0.00001f, 1024, 64, L2Norm, true, true, 1234ULL},
+                                                {0.00001f, 1024, 128, L2Norm, true, true, 1234ULL},
+                                                {0.00001f, 1024, 256, L2Norm, true, true, 1234ULL}};
 
 const std::vector<NormInputs<double>> inputsd = {
   {0.00000001, 1024, 32, L1Norm, false, true, 1234ULL},
@@ -213,22 +215,22 @@ const std::vector<NormInputs<double>> inputsd = {
   {0.00000001, 1024, 256, L2Norm, true, true, 1234ULL}};
 
 typedef RowNormTest<float> RowNormTestF;
-TEST_P(RowNormTestF, Result) {
-  ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.rows,
-                                raft::CompareApprox<float>(params.tolerance)));
+TEST_P(RowNormTestF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    dots_exp, dots_act, params.rows, raft::CompareApprox<float>(params.tolerance)));
 }
 
 typedef RowNormTest<double> RowNormTestD;
-TEST_P(RowNormTestD, Result) {
-  ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.rows,
-                                raft::CompareApprox<double>(params.tolerance)));
+TEST_P(RowNormTestD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    dots_exp, dots_act, params.rows, raft::CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestF,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestF, ::testing::ValuesIn(inputsf));
 
-INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestD,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestD, ::testing::ValuesIn(inputsd));
 
 const std::vector<NormInputs<float>> inputscf = {
   {0.00001f, 32, 1024, L1Norm, false, true, 1234ULL},
@@ -269,22 +271,22 @@ const std::vector<NormInputs<double>> inputscd = {
   {0.00000001, 256, 1024, L2Norm, true, true, 1234ULL}};
 
 typedef ColNormTest<float> ColNormTestF;
-TEST_P(ColNormTestF, Result) {
-  ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.cols,
-                                raft::CompareApprox<float>(params.tolerance)));
+TEST_P(ColNormTestF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    dots_exp, dots_act, params.cols, raft::CompareApprox<float>(params.tolerance)));
 }
 
 typedef ColNormTest<double> ColNormTestD;
-TEST_P(ColNormTestD, Result) {
-  ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.cols,
-                                raft::CompareApprox<double>(params.tolerance)));
+TEST_P(ColNormTestD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    dots_exp, dots_act, params.cols, raft::CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestF,
-                        ::testing::ValuesIn(inputscf));
+INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestF, ::testing::ValuesIn(inputscf));
 
-INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestD,
-                        ::testing::ValuesIn(inputscd));
+INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestD, ::testing::ValuesIn(inputscd));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/reduce.cu b/cpp/test/linalg/reduce.cu
index 9082397265..85c84777e4 100644
--- a/cpp/test/linalg/reduce.cu
+++ b/cpp/test/linalg/reduce.cu
@@ -34,8 +34,8 @@ struct ReduceInputs {
 };
 
 template <typename InType, typename OutType>
-::std::ostream &operator<<(::std::ostream &os,
-                           const ReduceInputs<InType, OutType> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const ReduceInputs<InType, OutType>& dims)
+{
   return os;
 }
 
@@ -43,45 +43,55 @@ template <typename InType, typename OutType>
 // for an extended __device__ lambda cannot have private or protected access
 // within its class
 template <typename InType, typename OutType>
-void reduceLaunch(OutType *dots, const InType *data, int cols, int rows,
-                  bool rowMajor, bool alongRows, bool inplace,
-                  cudaStream_t stream) {
-  reduce(
-    dots, data, cols, rows, (OutType)0, rowMajor, alongRows, stream, inplace,
-    [] __device__(InType in, int i) { return static_cast<OutType>(in * in); });
+void reduceLaunch(OutType* dots,
+                  const InType* data,
+                  int cols,
+                  int rows,
+                  bool rowMajor,
+                  bool alongRows,
+                  bool inplace,
+                  cudaStream_t stream)
+{
+  reduce(dots,
+         data,
+         cols,
+         rows,
+         (OutType)0,
+         rowMajor,
+         alongRows,
+         stream,
+         inplace,
+         [] __device__(InType in, int i) { return static_cast<OutType>(in * in); });
 }
 
 template <typename InType, typename OutType>
-class ReduceTest
-  : public ::testing::TestWithParam<ReduceInputs<InType, OutType>> {
+class ReduceTest : public ::testing::TestWithParam<ReduceInputs<InType, OutType>> {
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     CUDA_CHECK(cudaStreamCreate(&stream));
-    params =
-      ::testing::TestWithParam<ReduceInputs<InType, OutType>>::GetParam();
+    params = ::testing::TestWithParam<ReduceInputs<InType, OutType>>::GetParam();
     raft::random::Rng r(params.seed);
     int rows = params.rows, cols = params.cols;
     int len = rows * cols;
-    outlen = params.alongRows ? rows : cols;
+    outlen  = params.alongRows ? rows : cols;
     raft::allocate(data, len);
     raft::allocate(dots_exp, outlen);
     raft::allocate(dots_act, outlen);
     r.uniform(data, len, InType(-1.0), InType(1.0), stream);
-    naiveReduction(dots_exp, data, cols, rows, params.rowMajor,
-                   params.alongRows, stream);
+    naiveReduction(dots_exp, data, cols, rows, params.rowMajor, params.alongRows, stream);
 
     // Perform reduction with default inplace = false first
-    reduceLaunch(dots_act, data, cols, rows, params.rowMajor, params.alongRows,
-                 false, stream);
+    reduceLaunch(dots_act, data, cols, rows, params.rowMajor, params.alongRows, false, stream);
     // Add to result with inplace = true next, which shouldn't affect
     // in the case of coalescedReduction!
     if (!(params.rowMajor ^ params.alongRows)) {
-      reduceLaunch(dots_act, data, cols, rows, params.rowMajor,
-                   params.alongRows, true, stream);
+      reduceLaunch(dots_act, data, cols, rows, params.rowMajor, params.alongRows, true, stream);
     }
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(data));
     CUDA_CHECK(cudaFree(dots_exp));
     CUDA_CHECK(cudaFree(dots_act));
@@ -90,7 +100,7 @@ class ReduceTest
 
  protected:
   ReduceInputs<InType, OutType> params;
-  InType *data;
+  InType* data;
   OutType *dots_exp, *dots_act;
   int outlen;
   cudaStream_t stream;
@@ -151,31 +161,31 @@ const std::vector<ReduceInputs<float, double>> inputsfd = {
   {0.000002f, 1024, 256, false, false, 1234ULL}};
 
 typedef ReduceTest<float, float> ReduceTestFF;
-TEST_P(ReduceTestFF, Result) {
-  ASSERT_TRUE(devArrMatch(dots_exp, dots_act, outlen,
-                          raft::CompareApprox<float>(params.tolerance)));
+TEST_P(ReduceTestFF, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(dots_exp, dots_act, outlen, raft::CompareApprox<float>(params.tolerance)));
 }
 
 typedef ReduceTest<double, double> ReduceTestDD;
-TEST_P(ReduceTestDD, Result) {
-  ASSERT_TRUE(devArrMatch(dots_exp, dots_act, outlen,
-                          raft::CompareApprox<double>(params.tolerance)));
+TEST_P(ReduceTestDD, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(dots_exp, dots_act, outlen, raft::CompareApprox<double>(params.tolerance)));
 }
 
 typedef ReduceTest<float, double> ReduceTestFD;
-TEST_P(ReduceTestFD, Result) {
-  ASSERT_TRUE(devArrMatch(dots_exp, dots_act, outlen,
-                          raft::CompareApprox<double>(params.tolerance)));
+TEST_P(ReduceTestFD, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(dots_exp, dots_act, outlen, raft::CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFF,
-                        ::testing::ValuesIn(inputsff));
+INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFF, ::testing::ValuesIn(inputsff));
 
-INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestDD,
-                        ::testing::ValuesIn(inputsdd));
+INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestDD, ::testing::ValuesIn(inputsdd));
 
-INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFD,
-                        ::testing::ValuesIn(inputsfd));
+INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFD, ::testing::ValuesIn(inputsfd));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/reduce.cuh b/cpp/test/linalg/reduce.cuh
index 30a9c2e271..86f9c2d8b8 100644
--- a/cpp/test/linalg/reduce.cuh
+++ b/cpp/test/linalg/reduce.cuh
@@ -26,52 +26,69 @@ namespace raft {
 namespace linalg {
 
 template <typename InType, typename OutType>
-__global__ void naiveCoalescedReductionKernel(OutType *dots, const InType *data,
-                                              int D, int N) {
-  OutType acc = (OutType)0;
+__global__ void naiveCoalescedReductionKernel(OutType* dots, const InType* data, int D, int N)
+{
+  OutType acc  = (OutType)0;
   int rowStart = threadIdx.x + blockIdx.x * blockDim.x;
   if (rowStart < N) {
     for (int i = 0; i < D; ++i) {
-      acc +=
-        static_cast<OutType>(data[rowStart * D + i] * data[rowStart * D + i]);
+      acc += static_cast<OutType>(data[rowStart * D + i] * data[rowStart * D + i]);
     }
     dots[rowStart] = 2 * acc;
   }
 }
 
 template <typename InType, typename OutType>
-void naiveCoalescedReduction(OutType *dots, const InType *data, int D, int N,
-                             cudaStream_t stream) {
+void naiveCoalescedReduction(OutType* dots, const InType* data, int D, int N, cudaStream_t stream)
+{
   static const int TPB = 64;
-  int nblks = raft::ceildiv(N, TPB);
-  naiveCoalescedReductionKernel<InType, OutType>
-    <<<nblks, TPB, 0, stream>>>(dots, data, D, N);
+  int nblks            = raft::ceildiv(N, TPB);
+  naiveCoalescedReductionKernel<InType, OutType><<<nblks, TPB, 0, stream>>>(dots, data, D, N);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
 template <typename InType, typename OutType>
-void unaryAndGemv(OutType *dots, const InType *data, int D, int N,
-                  cudaStream_t stream) {
-  //computes a MLCommon unary op on data (squares it), then computes Ax
+void unaryAndGemv(OutType* dots, const InType* data, int D, int N, cudaStream_t stream)
+{
+  // computes a MLCommon unary op on data (squares it), then computes Ax
   //(A input matrix and x column vector) to sum columns
   thrust::device_vector<OutType> sq(D * N);
   raft::linalg::unaryOp(
-    thrust::raw_pointer_cast(sq.data()), data, D * N,
-    [] __device__(InType v) { return static_cast<OutType>(v * v); }, stream);
+    thrust::raw_pointer_cast(sq.data()),
+    data,
+    D * N,
+    [] __device__(InType v) { return static_cast<OutType>(v * v); },
+    stream);
   cublasHandle_t handle;
   CUBLAS_CHECK(cublasCreate(&handle));
-  thrust::device_vector<OutType> ones(N, 1);  //column vector [1...1]
+  thrust::device_vector<OutType> ones(N, 1);  // column vector [1...1]
   OutType alpha = 1, beta = 0;
-  CUBLAS_CHECK(raft::linalg::cublasgemv(
-    handle, CUBLAS_OP_N, D, N, &alpha, thrust::raw_pointer_cast(sq.data()), D,
-    thrust::raw_pointer_cast(ones.data()), 1, &beta, dots, 1, stream));
+  CUBLAS_CHECK(raft::linalg::cublasgemv(handle,
+                                        CUBLAS_OP_N,
+                                        D,
+                                        N,
+                                        &alpha,
+                                        thrust::raw_pointer_cast(sq.data()),
+                                        D,
+                                        thrust::raw_pointer_cast(ones.data()),
+                                        1,
+                                        &beta,
+                                        dots,
+                                        1,
+                                        stream));
   CUDA_CHECK(cudaDeviceSynchronize());
   CUBLAS_CHECK(cublasDestroy(handle));
 }
 
 template <typename InType, typename OutType>
-void naiveReduction(OutType *dots, const InType *data, int D, int N,
-                    bool rowMajor, bool alongRows, cudaStream_t stream) {
+void naiveReduction(OutType* dots,
+                    const InType* data,
+                    int D,
+                    int N,
+                    bool rowMajor,
+                    bool alongRows,
+                    cudaStream_t stream)
+{
   if (rowMajor && alongRows) {
     naiveCoalescedReduction(dots, data, D, N, stream);
   } else if (rowMajor && !alongRows) {
diff --git a/cpp/test/linalg/strided_reduction.cu b/cpp/test/linalg/strided_reduction.cu
index b27fa2ac1a..57699cb050 100644
--- a/cpp/test/linalg/strided_reduction.cu
+++ b/cpp/test/linalg/strided_reduction.cu
@@ -32,17 +32,17 @@ struct stridedReductionInputs {
 };
 
 template <typename T>
-void stridedReductionLaunch(T *dots, const T *data, int cols, int rows,
-                            cudaStream_t stream) {
-  stridedReduction(dots, data, cols, rows, (T)0, stream, false,
-                   [] __device__(T in, int i) { return in * in; });
+void stridedReductionLaunch(T* dots, const T* data, int cols, int rows, cudaStream_t stream)
+{
+  stridedReduction(
+    dots, data, cols, rows, (T)0, stream, false, [] __device__(T in, int i) { return in * in; });
 }
 
 template <typename T>
-class stridedReductionTest
-  : public ::testing::TestWithParam<stridedReductionInputs<T>> {
+class stridedReductionTest : public ::testing::TestWithParam<stridedReductionInputs<T>> {
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     CUDA_CHECK(cudaStreamCreate(&stream));
     params = ::testing::TestWithParam<stridedReductionInputs<T>>::GetParam();
     raft::random::Rng r(params.seed);
@@ -50,16 +50,17 @@ class stridedReductionTest
     int len = rows * cols;
 
     raft::allocate(data, len);
-    raft::allocate(dots_exp, cols);  //expected dot products (from test)
-    raft::allocate(dots_act, cols);  //actual dot products (from prim)
+    raft::allocate(dots_exp, cols);  // expected dot products (from test)
+    raft::allocate(dots_act, cols);  // actual dot products (from prim)
     r.uniform(data, len, T(-1.0), T(1.0),
-              stream);  //initialize matrix to random
+              stream);  // initialize matrix to random
 
     unaryAndGemv(dots_exp, data, cols, rows, stream);
     stridedReductionLaunch(dots_act, data, cols, rows, stream);
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(data));
     CUDA_CHECK(cudaFree(dots_exp));
     CUDA_CHECK(cudaFree(dots_act));
@@ -72,35 +73,33 @@ class stridedReductionTest
   cudaStream_t stream;
 };
 
-const std::vector<stridedReductionInputs<float>> inputsf = {
-  {0.00001f, 1024, 32, 1234ULL},
-  {0.00001f, 1024, 64, 1234ULL},
-  {0.00001f, 1024, 128, 1234ULL},
-  {0.00001f, 1024, 256, 1234ULL}};
+const std::vector<stridedReductionInputs<float>> inputsf = {{0.00001f, 1024, 32, 1234ULL},
+                                                            {0.00001f, 1024, 64, 1234ULL},
+                                                            {0.00001f, 1024, 128, 1234ULL},
+                                                            {0.00001f, 1024, 256, 1234ULL}};
 
-const std::vector<stridedReductionInputs<double>> inputsd = {
-  {0.000000001, 1024, 32, 1234ULL},
-  {0.000000001, 1024, 64, 1234ULL},
-  {0.000000001, 1024, 128, 1234ULL},
-  {0.000000001, 1024, 256, 1234ULL}};
+const std::vector<stridedReductionInputs<double>> inputsd = {{0.000000001, 1024, 32, 1234ULL},
+                                                             {0.000000001, 1024, 64, 1234ULL},
+                                                             {0.000000001, 1024, 128, 1234ULL},
+                                                             {0.000000001, 1024, 256, 1234ULL}};
 
 typedef stridedReductionTest<float> stridedReductionTestF;
-TEST_P(stridedReductionTestF, Result) {
-  ASSERT_TRUE(devArrMatch(dots_exp, dots_act, params.cols,
-                          raft::CompareApprox<float>(params.tolerance)));
+TEST_P(stridedReductionTestF, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(dots_exp, dots_act, params.cols, raft::CompareApprox<float>(params.tolerance)));
 }
 
 typedef stridedReductionTest<double> stridedReductionTestD;
-TEST_P(stridedReductionTestD, Result) {
-  ASSERT_TRUE(devArrMatch(dots_exp, dots_act, params.cols,
-                          raft::CompareApprox<double>(params.tolerance)));
+TEST_P(stridedReductionTestD, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(dots_exp, dots_act, params.cols, raft::CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestF,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestF, ::testing::ValuesIn(inputsf));
 
-INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestD,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestD, ::testing::ValuesIn(inputsd));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/subtract.cu b/cpp/test/linalg/subtract.cu
index ced3f65fdd..4295b91f3e 100644
--- a/cpp/test/linalg/subtract.cu
+++ b/cpp/test/linalg/subtract.cu
@@ -24,39 +24,34 @@ namespace raft {
 namespace linalg {
 
 template <typename Type>
-__global__ void naiveSubtractElemKernel(Type *out, const Type *in1,
-                                        const Type *in2, int len) {
+__global__ void naiveSubtractElemKernel(Type* out, const Type* in1, const Type* in2, int len)
+{
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < len) {
-    out[idx] = in1[idx] - in2[idx];
-  }
+  if (idx < len) { out[idx] = in1[idx] - in2[idx]; }
 }
 
 template <typename Type>
-void naiveSubtractElem(Type *out, const Type *in1, const Type *in2, int len,
-                       cudaStream_t stream) {
+void naiveSubtractElem(Type* out, const Type* in1, const Type* in2, int len, cudaStream_t stream)
+{
   static const int TPB = 64;
-  int nblks = raft::ceildiv(len, TPB);
+  int nblks            = raft::ceildiv(len, TPB);
   naiveSubtractElemKernel<Type><<<nblks, TPB, 0, stream>>>(out, in1, in2, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
 template <typename Type>
-__global__ void naiveSubtractScalarKernel(Type *out, const Type *in1,
-                                          const Type in2, int len) {
+__global__ void naiveSubtractScalarKernel(Type* out, const Type* in1, const Type in2, int len)
+{
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < len) {
-    out[idx] = in1[idx] - in2;
-  }
+  if (idx < len) { out[idx] = in1[idx] - in2; }
 }
 
 template <typename Type>
-void naiveSubtractScalar(Type *out, const Type *in1, const Type in2, int len,
-                         cudaStream_t stream) {
+void naiveSubtractScalar(Type* out, const Type* in1, const Type in2, int len, cudaStream_t stream)
+{
   static const int TPB = 64;
-  int nblks = raft::ceildiv(len, TPB);
-  naiveSubtractScalarKernel<Type>
-    <<<nblks, TPB, 0, stream>>>(out, in1, in2, len);
+  int nblks            = raft::ceildiv(len, TPB);
+  naiveSubtractScalarKernel<Type><<<nblks, TPB, 0, stream>>>(out, in1, in2, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -68,14 +63,16 @@ struct SubtractInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os, const SubtractInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const SubtractInputs<T>& dims)
+{
   return os;
 }
 
 template <typename T>
 class SubtractTest : public ::testing::TestWithParam<SubtractInputs<T>> {
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     params = ::testing::TestWithParam<SubtractInputs<T>>::GetParam();
     raft::random::Rng r(params.seed);
     int len = params.len;
@@ -98,7 +95,8 @@ class SubtractTest : public ::testing::TestWithParam<SubtractInputs<T>> {
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(in1));
     CUDA_CHECK(cudaFree(in2));
     CUDA_CHECK(cudaFree(out_ref));
@@ -110,35 +108,33 @@ class SubtractTest : public ::testing::TestWithParam<SubtractInputs<T>> {
   T *in1, *in2, *out_ref, *out;
 };
 
-const std::vector<SubtractInputs<float>> inputsf2 = {
-  {0.000001f, 1024 * 1024, 1234ULL}};
+const std::vector<SubtractInputs<float>> inputsf2 = {{0.000001f, 1024 * 1024, 1234ULL}};
 
-const std::vector<SubtractInputs<double>> inputsd2 = {
-  {0.00000001, 1024 * 1024, 1234ULL}};
+const std::vector<SubtractInputs<double>> inputsd2 = {{0.00000001, 1024 * 1024, 1234ULL}};
 
 typedef SubtractTest<float> SubtractTestF;
-TEST_P(SubtractTestF, Result) {
-  ASSERT_TRUE(raft::devArrMatch(out_ref, out, params.len,
-                                raft::CompareApprox<float>(params.tolerance)));
+TEST_P(SubtractTestF, Result)
+{
+  ASSERT_TRUE(
+    raft::devArrMatch(out_ref, out, params.len, raft::CompareApprox<float>(params.tolerance)));
 
-  ASSERT_TRUE(raft::devArrMatch(out_ref, in1, params.len,
-                                raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(
+    raft::devArrMatch(out_ref, in1, params.len, raft::CompareApprox<float>(params.tolerance)));
 }
 
 typedef SubtractTest<double> SubtractTestD;
-TEST_P(SubtractTestD, Result) {
-  ASSERT_TRUE(raft::devArrMatch(out_ref, out, params.len,
-                                raft::CompareApprox<double>(params.tolerance)));
+TEST_P(SubtractTestD, Result)
+{
+  ASSERT_TRUE(
+    raft::devArrMatch(out_ref, out, params.len, raft::CompareApprox<double>(params.tolerance)));
 
-  ASSERT_TRUE(raft::devArrMatch(out_ref, in1, params.len,
-                                raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(
+    raft::devArrMatch(out_ref, in1, params.len, raft::CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestF,
-                         ::testing::ValuesIn(inputsf2));
+INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestF, ::testing::ValuesIn(inputsf2));
 
-INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestD,
-                         ::testing::ValuesIn(inputsd2));
+INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestD, ::testing::ValuesIn(inputsd2));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/svd.cu b/cpp/test/linalg/svd.cu
index fff321768f..e9e1a6dc02 100644
--- a/cpp/test/linalg/svd.cu
+++ b/cpp/test/linalg/svd.cu
@@ -35,19 +35,21 @@ struct SvdInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os, const SvdInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const SvdInputs<T>& dims)
+{
   return os;
 }
 
 template <typename T>
 class SvdTest : public ::testing::TestWithParam<SvdInputs<T>> {
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     raft::handle_t handle;
 
     params = ::testing::TestWithParam<SvdInputs<T>>::GetParam();
     raft::random::Rng r(params.seed);
-    int len = params.len;
+    int len             = params.len;
     cudaStream_t stream = handle.get_stream();
     raft::allocate(data, len);
 
@@ -56,7 +58,7 @@ class SvdTest : public ::testing::TestWithParam<SvdInputs<T>> {
     T data_h[] = {1.0, 4.0, 2.0, 2.0, 5.0, 1.0};
     raft::update_device(data, data_h, len, stream);
 
-    int left_evl = params.n_row * params.n_col;
+    int left_evl  = params.n_row * params.n_col;
     int right_evl = params.n_col * params.n_col;
 
     raft::allocate(left_eig_vectors_qr, left_evl);
@@ -67,8 +69,7 @@ class SvdTest : public ::testing::TestWithParam<SvdInputs<T>> {
     // allocate(right_eig_vectors_trans_jacobi, right_evl);
     // allocate(sing_vals_jacobi, params.n_col);
 
-    T left_eig_vectors_ref_h[] = {-0.308219, -0.906133, -0.289695,
-                                  0.488195,  0.110706,  -0.865685};
+    T left_eig_vectors_ref_h[] = {-0.308219, -0.906133, -0.289695, 0.488195, 0.110706, -0.865685};
 
     T right_eig_vectors_ref_h[] = {-0.638636, -0.769509, -0.769509, 0.638636};
 
@@ -78,18 +79,25 @@ class SvdTest : public ::testing::TestWithParam<SvdInputs<T>> {
     raft::allocate(right_eig_vectors_ref, right_evl);
     raft::allocate(sing_vals_ref, params.n_col);
 
-    raft::update_device(left_eig_vectors_ref, left_eig_vectors_ref_h, left_evl,
-                        stream);
-    raft::update_device(right_eig_vectors_ref, right_eig_vectors_ref_h,
-                        right_evl, stream);
+    raft::update_device(left_eig_vectors_ref, left_eig_vectors_ref_h, left_evl, stream);
+    raft::update_device(right_eig_vectors_ref, right_eig_vectors_ref_h, right_evl, stream);
     raft::update_device(sing_vals_ref, sing_vals_ref_h, params.n_col, stream);
 
-    svdQR(handle, data, params.n_row, params.n_col, sing_vals_qr,
-          left_eig_vectors_qr, right_eig_vectors_trans_qr, true, true, true,
+    svdQR(handle,
+          data,
+          params.n_row,
+          params.n_col,
+          sing_vals_qr,
+          left_eig_vectors_qr,
+          right_eig_vectors_trans_qr,
+          true,
+          true,
+          true,
           stream);
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(data));
     CUDA_CHECK(cudaFree(left_eig_vectors_qr));
     CUDA_CHECK(cudaFree(right_eig_vectors_trans_qr));
@@ -101,69 +109,71 @@ class SvdTest : public ::testing::TestWithParam<SvdInputs<T>> {
 
  protected:
   SvdInputs<T> params;
-  T *data, *left_eig_vectors_qr, *right_eig_vectors_trans_qr, *sing_vals_qr,
-    *left_eig_vectors_ref, *right_eig_vectors_ref, *sing_vals_ref;
+  T *data, *left_eig_vectors_qr, *right_eig_vectors_trans_qr, *sing_vals_qr, *left_eig_vectors_ref,
+    *right_eig_vectors_ref, *sing_vals_ref;
 };
 
-const std::vector<SvdInputs<float>> inputsf2 = {
-  {0.00001f, 3 * 2, 3, 2, 1234ULL}};
+const std::vector<SvdInputs<float>> inputsf2 = {{0.00001f, 3 * 2, 3, 2, 1234ULL}};
 
-const std::vector<SvdInputs<double>> inputsd2 = {
-  {0.00001, 3 * 2, 3, 2, 1234ULL}};
+const std::vector<SvdInputs<double>> inputsd2 = {{0.00001, 3 * 2, 3, 2, 1234ULL}};
 
 typedef SvdTest<float> SvdTestValF;
-TEST_P(SvdTestValF, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(sing_vals_ref, sing_vals_qr, params.n_col,
-                      raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(SvdTestValF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    sing_vals_ref, sing_vals_qr, params.n_col, raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef SvdTest<double> SvdTestValD;
-TEST_P(SvdTestValD, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(sing_vals_ref, sing_vals_qr, params.n_col,
-                      raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(SvdTestValD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    sing_vals_ref, sing_vals_qr, params.n_col, raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
 typedef SvdTest<float> SvdTestLeftVecF;
-TEST_P(SvdTestLeftVecF, Result) {
-  ASSERT_TRUE(raft::devArrMatch(
-    left_eig_vectors_ref, left_eig_vectors_qr, params.n_row * params.n_col,
-    raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(SvdTestLeftVecF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(left_eig_vectors_ref,
+                                left_eig_vectors_qr,
+                                params.n_row * params.n_col,
+                                raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef SvdTest<double> SvdTestLeftVecD;
-TEST_P(SvdTestLeftVecD, Result) {
-  ASSERT_TRUE(raft::devArrMatch(
-    left_eig_vectors_ref, left_eig_vectors_qr, params.n_row * params.n_col,
-    raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(SvdTestLeftVecD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(left_eig_vectors_ref,
+                                left_eig_vectors_qr,
+                                params.n_row * params.n_col,
+                                raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
 typedef SvdTest<float> SvdTestRightVecF;
-TEST_P(SvdTestRightVecF, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(right_eig_vectors_ref, right_eig_vectors_trans_qr,
-                      params.n_col * params.n_col,
-                      raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(SvdTestRightVecF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(right_eig_vectors_ref,
+                                right_eig_vectors_trans_qr,
+                                params.n_col * params.n_col,
+                                raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef SvdTest<double> SvdTestRightVecD;
-TEST_P(SvdTestRightVecD, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(right_eig_vectors_ref, right_eig_vectors_trans_qr,
-                      params.n_col * params.n_col,
-                      raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(SvdTestRightVecD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(right_eig_vectors_ref,
+                                right_eig_vectors_trans_qr,
+                                params.n_col * params.n_col,
+                                raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
 INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestValF, ::testing::ValuesIn(inputsf2));
 
 INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestValD, ::testing::ValuesIn(inputsd2));
 
-INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecF,
-                         ::testing::ValuesIn(inputsf2));
+INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecF, ::testing::ValuesIn(inputsf2));
 
-INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecD,
-                         ::testing::ValuesIn(inputsd2));
+INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecD, ::testing::ValuesIn(inputsd2));
 
 // INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestRightVecF,
 // ::testing::ValuesIn(inputsf2));
diff --git a/cpp/test/linalg/transpose.cu b/cpp/test/linalg/transpose.cu
index f10b029962..659bed04c6 100644
--- a/cpp/test/linalg/transpose.cu
+++ b/cpp/test/linalg/transpose.cu
@@ -34,14 +34,16 @@ struct TranposeInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os, const TranposeInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const TranposeInputs<T>& dims)
+{
   return os;
 }
 
 template <typename T>
 class TransposeTest : public ::testing::TestWithParam<TranposeInputs<T>> {
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     params = ::testing::TestWithParam<TranposeInputs<T>>::GetParam();
 
     stream = handle.get_stream();
@@ -63,7 +65,8 @@ class TransposeTest : public ::testing::TestWithParam<TranposeInputs<T>> {
     transpose(data, params.n_row, stream);
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(data));
     CUDA_CHECK(cudaFree(data_trans));
     CUDA_CHECK(cudaFree(data_trans_ref));
@@ -76,39 +79,33 @@ class TransposeTest : public ::testing::TestWithParam<TranposeInputs<T>> {
   cudaStream_t stream;
 };
 
-const std::vector<TranposeInputs<float>> inputsf2 = {
-  {0.1f, 3 * 3, 3, 3, 1234ULL}};
+const std::vector<TranposeInputs<float>> inputsf2 = {{0.1f, 3 * 3, 3, 3, 1234ULL}};
 
-const std::vector<TranposeInputs<double>> inputsd2 = {
-  {0.1, 3 * 3, 3, 3, 1234ULL}};
+const std::vector<TranposeInputs<double>> inputsd2 = {{0.1, 3 * 3, 3, 3, 1234ULL}};
 
 typedef TransposeTest<float> TransposeTestValF;
-TEST_P(TransposeTestValF, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(data_trans_ref, data_trans, params.len,
-                      raft::CompareApproxAbs<float>(params.tolerance)));
-
-  ASSERT_TRUE(
-    raft::devArrMatch(data_trans_ref, data, params.len,
-                      raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(TransposeTestValF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    data_trans_ref, data_trans, params.len, raft::CompareApproxAbs<float>(params.tolerance)));
+
+  ASSERT_TRUE(raft::devArrMatch(
+    data_trans_ref, data, params.len, raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef TransposeTest<double> TransposeTestValD;
-TEST_P(TransposeTestValD, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(data_trans_ref, data_trans, params.len,
-                      raft::CompareApproxAbs<double>(params.tolerance)));
-
-  ASSERT_TRUE(
-    raft::devArrMatch(data_trans_ref, data, params.len,
-                      raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(TransposeTestValD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    data_trans_ref, data_trans, params.len, raft::CompareApproxAbs<double>(params.tolerance)));
+
+  ASSERT_TRUE(raft::devArrMatch(
+    data_trans_ref, data, params.len, raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValF,
-                         ::testing::ValuesIn(inputsf2));
+INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValF, ::testing::ValuesIn(inputsf2));
 
-INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValD,
-                         ::testing::ValuesIn(inputsd2));
+INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValD, ::testing::ValuesIn(inputsd2));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/unary_op.cu b/cpp/test/linalg/unary_op.cu
index 666ab8619d..6349a1907a 100644
--- a/cpp/test/linalg/unary_op.cu
+++ b/cpp/test/linalg/unary_op.cu
@@ -28,28 +28,25 @@ namespace linalg {
 // for an extended __device__ lambda cannot have private or protected access
 // within its class
 template <typename InType, typename IdxType = int, typename OutType = InType>
-void unaryOpLaunch(OutType *out, const InType *in, InType scalar, IdxType len,
-                   cudaStream_t stream) {
+void unaryOpLaunch(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream)
+{
   if (in == nullptr) {
     auto op = [scalar] __device__(OutType * ptr, IdxType idx) {
       *ptr = static_cast<OutType>(scalar * idx);
     };
     writeOnlyUnaryOp<OutType, decltype(op), IdxType>(out, len, op, stream);
   } else {
-    auto op = [scalar] __device__(InType in) {
-      return static_cast<OutType>(in * scalar);
-    };
+    auto op = [scalar] __device__(InType in) { return static_cast<OutType>(in * scalar); };
     unaryOp<InType, decltype(op), IdxType, OutType>(out, in, len, op, stream);
   }
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
-class UnaryOpTest
-  : public ::testing::TestWithParam<UnaryOpInputs<InType, IdxType, OutType>> {
+class UnaryOpTest : public ::testing::TestWithParam<UnaryOpInputs<InType, IdxType, OutType>> {
  protected:
-  void SetUp() override {
-    params = ::testing::TestWithParam<
-      UnaryOpInputs<InType, IdxType, OutType>>::GetParam();
+  void SetUp() override
+  {
+    params = ::testing::TestWithParam<UnaryOpInputs<InType, IdxType, OutType>>::GetParam();
     raft::random::Rng r(params.seed);
     CUDA_CHECK(cudaStreamCreate(&stream));
     auto len = params.len;
@@ -59,7 +56,8 @@ class UnaryOpTest
     r.uniform(in, len, InType(-1.0), InType(1.0), stream);
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaStreamDestroy(stream));
     CUDA_CHECK(cudaFree(in));
@@ -67,18 +65,18 @@ class UnaryOpTest
     CUDA_CHECK(cudaFree(out));
   }
 
-  virtual void DoTest() {
-    auto len = params.len;
+  virtual void DoTest()
+  {
+    auto len    = params.len;
     auto scalar = params.scalar;
     naiveScale(out_ref, in, scalar, len, stream);
     unaryOpLaunch(out, in, scalar, len, stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
-    ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
-                            CompareApprox<OutType>(params.tolerance)));
+    ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<OutType>(params.tolerance)));
   }
 
   UnaryOpInputs<InType, IdxType, OutType> params;
-  InType *in;
+  InType* in;
   OutType *out_ref, *out;
   cudaStream_t stream;
 };
@@ -86,14 +84,15 @@ class UnaryOpTest
 template <typename OutType, typename IdxType>
 class WriteOnlyUnaryOpTest : public UnaryOpTest<OutType, IdxType, OutType> {
  protected:
-  void DoTest() override {
-    auto len = this->params.len;
+  void DoTest() override
+  {
+    auto len    = this->params.len;
     auto scalar = this->params.scalar;
-    naiveScale(this->out_ref, (OutType *)nullptr, scalar, len, this->stream);
-    unaryOpLaunch(this->out, (OutType *)nullptr, scalar, len, this->stream);
+    naiveScale(this->out_ref, (OutType*)nullptr, scalar, len, this->stream);
+    unaryOpLaunch(this->out, (OutType*)nullptr, scalar, len, this->stream);
     CUDA_CHECK(cudaStreamSynchronize(this->stream));
-    ASSERT_TRUE(devArrMatch(this->out_ref, this->out, this->params.len,
-                            CompareApprox<OutType>(this->params.tolerance)));
+    ASSERT_TRUE(devArrMatch(
+      this->out_ref, this->out, this->params.len, CompareApprox<OutType>(this->params.tolerance)));
   }
 };
 
@@ -101,8 +100,7 @@ class WriteOnlyUnaryOpTest : public UnaryOpTest<OutType, IdxType, OutType> {
   TEST_P(Name, Result) { DoTest(); } \
   INSTANTIATE_TEST_SUITE_P(UnaryOpTests, Name, ::testing::ValuesIn(inputs))
 
-const std::vector<UnaryOpInputs<float, int>> inputsf_i32 = {
-  {0.000001f, 1024 * 1024, 2.f, 1234ULL}};
+const std::vector<UnaryOpInputs<float, int>> inputsf_i32 = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}};
 typedef UnaryOpTest<float, int> UnaryOpTestF_i32;
 UNARY_OP_TEST(UnaryOpTestF_i32, inputsf_i32);
 typedef WriteOnlyUnaryOpTest<float, int> WriteOnlyUnaryOpTestF_i32;
diff --git a/cpp/test/linalg/unary_op.cuh b/cpp/test/linalg/unary_op.cuh
index be3f1124c5..3343389af8 100644
--- a/cpp/test/linalg/unary_op.cuh
+++ b/cpp/test/linalg/unary_op.cuh
@@ -24,8 +24,8 @@ namespace raft {
 namespace linalg {
 
 template <typename InType, typename OutType, typename IdxType>
-__global__ void naiveScaleKernel(OutType *out, const InType *in, InType scalar,
-                                 IdxType len) {
+__global__ void naiveScaleKernel(OutType* out, const InType* in, InType scalar, IdxType len)
+{
   IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * (IdxType)blockDim.x);
   if (idx < len) {
     if (in == nullptr) {
@@ -38,12 +38,11 @@ __global__ void naiveScaleKernel(OutType *out, const InType *in, InType scalar,
 }
 
 template <typename InType, typename IdxType = int, typename OutType = InType>
-void naiveScale(OutType *out, const InType *in, InType scalar, int len,
-                cudaStream_t stream) {
+void naiveScale(OutType* out, const InType* in, InType scalar, int len, cudaStream_t stream)
+{
   static const int TPB = 64;
-  int nblks = raft::ceildiv(len, TPB);
-  naiveScaleKernel<InType, OutType, IdxType>
-    <<<nblks, TPB, 0, stream>>>(out, in, scalar, len);
+  int nblks            = raft::ceildiv(len, TPB);
+  naiveScaleKernel<InType, OutType, IdxType><<<nblks, TPB, 0, stream>>>(out, in, scalar, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -56,8 +55,8 @@ struct UnaryOpInputs {
 };
 
 template <typename InType, typename IdxType = int, typename OutType = InType>
-::std::ostream &operator<<(::std::ostream &os,
-                           const UnaryOpInputs<InType, IdxType, OutType> &d) {
+::std::ostream& operator<<(::std::ostream& os, const UnaryOpInputs<InType, IdxType, OutType>& d)
+{
   return os;
 }
 
diff --git a/cpp/test/matrix/math.cu b/cpp/test/matrix/math.cu
index 578139623a..9cdd36b252 100644
--- a/cpp/test/matrix/math.cu
+++ b/cpp/test/matrix/math.cu
@@ -24,53 +24,51 @@ namespace raft {
 namespace matrix {
 
 template <typename Type>
-__global__ void nativePowerKernel(Type *in, Type *out, int len) {
+__global__ void nativePowerKernel(Type* in, Type* out, int len)
+{
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < len) {
-    out[idx] = in[idx] * in[idx];
-  }
+  if (idx < len) { out[idx] = in[idx] * in[idx]; }
 }
 
 template <typename Type>
-void naivePower(Type *in, Type *out, int len, cudaStream_t stream) {
+void naivePower(Type* in, Type* out, int len, cudaStream_t stream)
+{
   static const int TPB = 64;
-  int nblks = raft::ceildiv(len, TPB);
+  int nblks            = raft::ceildiv(len, TPB);
   nativePowerKernel<Type><<<nblks, TPB, 0, stream>>>(in, out, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
 template <typename Type>
-__global__ void nativeSqrtKernel(Type *in, Type *out, int len) {
+__global__ void nativeSqrtKernel(Type* in, Type* out, int len)
+{
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < len) {
-    out[idx] = sqrt(in[idx]);
-  }
+  if (idx < len) { out[idx] = sqrt(in[idx]); }
 }
 
 template <typename Type>
-void naiveSqrt(Type *in, Type *out, int len) {
+void naiveSqrt(Type* in, Type* out, int len)
+{
   static const int TPB = 64;
-  int nblks = raft::ceildiv(len, TPB);
+  int nblks            = raft::ceildiv(len, TPB);
   nativeSqrtKernel<Type><<<nblks, TPB>>>(in, out, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
 template <typename Type>
-__global__ void naiveSignFlipKernel(Type *in, Type *out, int rowCount,
-                                    int colCount) {
+__global__ void naiveSignFlipKernel(Type* in, Type* out, int rowCount, int colCount)
+{
   int d_i = blockIdx.x * rowCount;
   int end = d_i + rowCount;
 
   if (blockIdx.x < colCount) {
-    Type max = 0.0;
+    Type max      = 0.0;
     int max_index = 0;
     for (int i = d_i; i < end; i++) {
       Type val = in[i];
-      if (val < 0.0) {
-        val = -val;
-      }
+      if (val < 0.0) { val = -val; }
       if (val > max) {
-        max = val;
+        max       = val;
         max_index = i;
       }
     }
@@ -88,7 +86,8 @@ __global__ void naiveSignFlipKernel(Type *in, Type *out, int rowCount,
 }
 
 template <typename Type>
-void naiveSignFlip(Type *in, Type *out, int rowCount, int colCount) {
+void naiveSignFlip(Type* in, Type* out, int rowCount, int colCount)
+{
   naiveSignFlipKernel<Type><<<colCount, 1>>>(in, out, rowCount, colCount);
   CUDA_CHECK(cudaPeekAtLastError());
 }
@@ -103,14 +102,16 @@ struct MathInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os, const MathInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const MathInputs<T>& dims)
+{
   return os;
 }
 
 template <typename T>
 class MathTest : public ::testing::TestWithParam<MathInputs<T>> {
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     params = ::testing::TestWithParam<MathInputs<T>>::GetParam();
     random::Rng r(params.seed);
     int len = params.len;
@@ -154,7 +155,7 @@ class MathTest : public ::testing::TestWithParam<MathInputs<T>> {
     allocate(in_recip_ref, 4);
     allocate(out_recip, 4);
     // default threshold is 1e-15
-    std::vector<T> in_recip_h = {0.1, 0.01, -0.01, 0.1e-16};
+    std::vector<T> in_recip_h     = {0.1, 0.01, -0.01, 0.1e-16};
     std::vector<T> in_recip_ref_h = {10.0, 100.0, -100.0, 0.0};
     update_device(in_recip, in_recip_h.data(), 4, stream);
     update_device(in_recip_ref, in_recip_ref_h.data(), 4, stream);
@@ -165,7 +166,7 @@ class MathTest : public ::testing::TestWithParam<MathInputs<T>> {
 
     reciprocal(in_recip, recip_scalar, 4, stream, true);
 
-    std::vector<T> in_small_val_zero_h = {0.1, 1e-16, -1e-16, -0.1};
+    std::vector<T> in_small_val_zero_h     = {0.1, 1e-16, -1e-16, -0.1};
     std::vector<T> in_small_val_zero_ref_h = {0.1, 0.0, 0.0, -0.1};
     allocate(in_smallzero, 4);
     allocate(out_smallzero, 4);
@@ -177,7 +178,8 @@ class MathTest : public ::testing::TestWithParam<MathInputs<T>> {
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(in_power));
     CUDA_CHECK(cudaFree(out_power_ref));
     CUDA_CHECK(cudaFree(in_sqrt));
@@ -196,137 +198,129 @@ class MathTest : public ::testing::TestWithParam<MathInputs<T>> {
 
  protected:
   MathInputs<T> params;
-  T *in_power, *out_power_ref, *in_sqrt, *out_sqrt_ref, *in_ratio,
-    *out_ratio_ref, *in_sign_flip, *out_sign_flip_ref, *in_recip, *in_recip_ref,
-    *out_recip, *in_smallzero, *out_smallzero, *out_smallzero_ref;
+  T *in_power, *out_power_ref, *in_sqrt, *out_sqrt_ref, *in_ratio, *out_ratio_ref, *in_sign_flip,
+    *out_sign_flip_ref, *in_recip, *in_recip_ref, *out_recip, *in_smallzero, *out_smallzero,
+    *out_smallzero_ref;
 };
 
-const std::vector<MathInputs<float>> inputsf = {
-  {0.00001f, 1024, 1024, 1024 * 1024, 1234ULL}};
+const std::vector<MathInputs<float>> inputsf = {{0.00001f, 1024, 1024, 1024 * 1024, 1234ULL}};
 
-const std::vector<MathInputs<double>> inputsd = {
-  {0.00001, 1024, 1024, 1024 * 1024, 1234ULL}};
+const std::vector<MathInputs<double>> inputsd = {{0.00001, 1024, 1024, 1024 * 1024, 1234ULL}};
 
 typedef MathTest<float> MathPowerTestF;
-TEST_P(MathPowerTestF, Result) {
-  ASSERT_TRUE(devArrMatch(in_power, out_power_ref, params.len,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(MathPowerTestF, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(in_power, out_power_ref, params.len, CompareApprox<float>(params.tolerance)));
 }
 
 typedef MathTest<double> MathPowerTestD;
-TEST_P(MathPowerTestD, Result) {
-  ASSERT_TRUE(devArrMatch(in_power, out_power_ref, params.len,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(MathPowerTestD, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(in_power, out_power_ref, params.len, CompareApprox<double>(params.tolerance)));
 }
 
 typedef MathTest<float> MathSqrtTestF;
-TEST_P(MathSqrtTestF, Result) {
-  ASSERT_TRUE(devArrMatch(in_sqrt, out_sqrt_ref, params.len,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(MathSqrtTestF, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(in_sqrt, out_sqrt_ref, params.len, CompareApprox<float>(params.tolerance)));
 }
 
 typedef MathTest<double> MathSqrtTestD;
-TEST_P(MathSqrtTestD, Result) {
-  ASSERT_TRUE(devArrMatch(in_sqrt, out_sqrt_ref, params.len,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(MathSqrtTestD, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(in_sqrt, out_sqrt_ref, params.len, CompareApprox<double>(params.tolerance)));
 }
 
 typedef MathTest<float> MathRatioTestF;
-TEST_P(MathRatioTestF, Result) {
-  ASSERT_TRUE(devArrMatch(in_ratio, out_ratio_ref, 4,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(MathRatioTestF, Result)
+{
+  ASSERT_TRUE(devArrMatch(in_ratio, out_ratio_ref, 4, CompareApprox<float>(params.tolerance)));
 }
 
 typedef MathTest<double> MathRatioTestD;
-TEST_P(MathRatioTestD, Result) {
-  ASSERT_TRUE(devArrMatch(in_ratio, out_ratio_ref, 4,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(MathRatioTestD, Result)
+{
+  ASSERT_TRUE(devArrMatch(in_ratio, out_ratio_ref, 4, CompareApprox<double>(params.tolerance)));
 }
 
 typedef MathTest<float> MathSignFlipTestF;
-TEST_P(MathSignFlipTestF, Result) {
-  ASSERT_TRUE(devArrMatch(in_sign_flip, out_sign_flip_ref, params.len,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(MathSignFlipTestF, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    in_sign_flip, out_sign_flip_ref, params.len, CompareApprox<float>(params.tolerance)));
 }
 
 typedef MathTest<double> MathSignFlipTestD;
-TEST_P(MathSignFlipTestD, Result) {
-  ASSERT_TRUE(devArrMatch(in_sign_flip, out_sign_flip_ref, params.len,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(MathSignFlipTestD, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    in_sign_flip, out_sign_flip_ref, params.len, CompareApprox<double>(params.tolerance)));
 }
 
 typedef MathTest<float> MathReciprocalTestF;
-TEST_P(MathReciprocalTestF, Result) {
-  ASSERT_TRUE(devArrMatch(in_recip, in_recip_ref, 4,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(MathReciprocalTestF, Result)
+{
+  ASSERT_TRUE(devArrMatch(in_recip, in_recip_ref, 4, CompareApprox<float>(params.tolerance)));
 
   // 4-th term tests `setzero=true` functionality, not present in this version of `reciprocal`.
-  ASSERT_TRUE(devArrMatch(out_recip, in_recip_ref, 3,
-                          CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(out_recip, in_recip_ref, 3, CompareApprox<float>(params.tolerance)));
 }
 
 typedef MathTest<double> MathReciprocalTestD;
-TEST_P(MathReciprocalTestD, Result) {
-  ASSERT_TRUE(devArrMatch(in_recip, in_recip_ref, 4,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(MathReciprocalTestD, Result)
+{
+  ASSERT_TRUE(devArrMatch(in_recip, in_recip_ref, 4, CompareApprox<double>(params.tolerance)));
 
   // 4-th term tests `setzero=true` functionality, not present in this version of `reciprocal`.
-  ASSERT_TRUE(devArrMatch(out_recip, in_recip_ref, 3,
-                          CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(out_recip, in_recip_ref, 3, CompareApprox<double>(params.tolerance)));
 }
 
 typedef MathTest<float> MathSetSmallZeroTestF;
-TEST_P(MathSetSmallZeroTestF, Result) {
-  ASSERT_TRUE(devArrMatch(in_smallzero, out_smallzero_ref, 4,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(MathSetSmallZeroTestF, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(in_smallzero, out_smallzero_ref, 4, CompareApprox<float>(params.tolerance)));
 
-  ASSERT_TRUE(devArrMatch(out_smallzero, out_smallzero_ref, 4,
-                          CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(
+    devArrMatch(out_smallzero, out_smallzero_ref, 4, CompareApprox<float>(params.tolerance)));
 }
 
 typedef MathTest<double> MathSetSmallZeroTestD;
-TEST_P(MathSetSmallZeroTestD, Result) {
-  ASSERT_TRUE(devArrMatch(in_smallzero, out_smallzero_ref, 4,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(MathSetSmallZeroTestD, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(in_smallzero, out_smallzero_ref, 4, CompareApprox<double>(params.tolerance)));
 
-  ASSERT_TRUE(devArrMatch(out_smallzero, out_smallzero_ref, 4,
-                          CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(
+    devArrMatch(out_smallzero, out_smallzero_ref, 4, CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestF,
-                         ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestF, ::testing::ValuesIn(inputsf));
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestD,
-                         ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestD, ::testing::ValuesIn(inputsd));
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestF,
-                         ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestF, ::testing::ValuesIn(inputsf));
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestD,
-                         ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestD, ::testing::ValuesIn(inputsd));
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestF,
-                         ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestF, ::testing::ValuesIn(inputsf));
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestD,
-                         ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestD, ::testing::ValuesIn(inputsd));
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestF,
-                         ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestF, ::testing::ValuesIn(inputsf));
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestD,
-                         ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestD, ::testing::ValuesIn(inputsd));
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestF,
-                         ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestF, ::testing::ValuesIn(inputsf));
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestD,
-                         ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestD, ::testing::ValuesIn(inputsd));
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestF,
-                         ::testing::ValuesIn(inputsf));
-INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestD,
-                         ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestF, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestD, ::testing::ValuesIn(inputsd));
 
 }  // namespace matrix
 }  // namespace raft
diff --git a/cpp/test/matrix/matrix.cu b/cpp/test/matrix/matrix.cu
index 28222c0697..fc5a418bda 100644
--- a/cpp/test/matrix/matrix.cu
+++ b/cpp/test/matrix/matrix.cu
@@ -32,14 +32,16 @@ struct MatrixInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os, const MatrixInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const MatrixInputs<T>& dims)
+{
   return os;
 }
 
 template <typename T>
 class MatrixTest : public ::testing::TestWithParam<MatrixInputs<T>> {
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     params = ::testing::TestWithParam<MatrixInputs<T>>::GetParam();
     raft::random::Rng r(params.seed);
     int len = params.n_row * params.n_col;
@@ -54,13 +56,14 @@ class MatrixTest : public ::testing::TestWithParam<MatrixInputs<T>> {
     // copy(in1, in1_revr, params.n_row, params.n_col);
     // colReverse(in1_revr, params.n_row, params.n_col);
 
-    T *outTrunc;
+    T* outTrunc;
     raft::allocate(outTrunc, 6);
     truncZeroOrigin(in1, params.n_row, outTrunc, 3, 2, stream);
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(in1));
     CUDA_CHECK(cudaFree(in2));
     // CUDA_CHECK(cudaFree(in1_revr));
@@ -73,31 +76,30 @@ class MatrixTest : public ::testing::TestWithParam<MatrixInputs<T>> {
 
 const std::vector<MatrixInputs<float>> inputsf2 = {{0.000001f, 4, 4, 1234ULL}};
 
-const std::vector<MatrixInputs<double>> inputsd2 = {
-  {0.00000001, 4, 4, 1234ULL}};
+const std::vector<MatrixInputs<double>> inputsd2 = {{0.00000001, 4, 4, 1234ULL}};
 
 typedef MatrixTest<float> MatrixTestF;
-TEST_P(MatrixTestF, Result) {
-  ASSERT_TRUE(raft::devArrMatch(in1, in2, params.n_row * params.n_col,
-                                raft::CompareApprox<float>(params.tolerance)));
+TEST_P(MatrixTestF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    in1, in2, params.n_row * params.n_col, raft::CompareApprox<float>(params.tolerance)));
 }
 
 typedef MatrixTest<double> MatrixTestD;
-TEST_P(MatrixTestD, Result) {
-  ASSERT_TRUE(raft::devArrMatch(in1, in2, params.n_row * params.n_col,
-                                raft::CompareApprox<double>(params.tolerance)));
+TEST_P(MatrixTestD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    in1, in2, params.n_row * params.n_col, raft::CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestF,
-                         ::testing::ValuesIn(inputsf2));
+INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestF, ::testing::ValuesIn(inputsf2));
 
-INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestD,
-                         ::testing::ValuesIn(inputsd2));
+INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestD, ::testing::ValuesIn(inputsd2));
 
 template <typename T>
 class MatrixCopyRowsTest : public ::testing::Test {
-  using math_t = typename std::tuple_element<0, T>::type;
-  using idx_t = typename std::tuple_element<1, T>::type;
+  using math_t      = typename std::tuple_element<0, T>::type;
+  using idx_t       = typename std::tuple_element<1, T>::type;
   using idx_array_t = typename std::tuple_element<2, T>::type;
 
  protected:
@@ -105,42 +107,38 @@ class MatrixCopyRowsTest : public ::testing::Test {
     : allocator(handle.get_device_allocator()),
       input(allocator, handle.get_stream(), n_cols * n_rows),
       indices(allocator, handle.get_stream(), n_selected),
-      output(allocator, handle.get_stream(), n_cols * n_selected) {
+      output(allocator, handle.get_stream(), n_cols * n_selected)
+  {
     CUDA_CHECK(cudaStreamCreate(&stream));
     handle.set_stream(stream);
     raft::update_device(indices.data(), indices_host, n_selected, stream);
     // Init input array
     thrust::counting_iterator<idx_t> first(0);
     thrust::device_ptr<math_t> ptr(input.data());
-    thrust::copy(thrust::cuda::par.on(stream), first, first + n_cols * n_rows,
-                 ptr);
+    thrust::copy(thrust::cuda::par.on(stream), first, first + n_cols * n_rows, ptr);
   }
 
   void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); }
 
-  void testCopyRows() {
-    copyRows(input.data(), n_rows, n_cols, output.data(), indices.data(),
-             n_selected, stream, false);
-    EXPECT_TRUE(raft::devArrMatchHost(output_exp_colmajor, output.data(),
-                                      n_selected * n_cols,
-                                      raft::Compare<math_t>()));
-    copyRows(input.data(), n_rows, n_cols, output.data(), indices.data(),
-             n_selected, stream, true);
-    EXPECT_TRUE(raft::devArrMatchHost(output_exp_rowmajor, output.data(),
-                                      n_selected * n_cols,
-                                      raft::Compare<math_t>()));
+  void testCopyRows()
+  {
+    copyRows(
+      input.data(), n_rows, n_cols, output.data(), indices.data(), n_selected, stream, false);
+    EXPECT_TRUE(raft::devArrMatchHost(
+      output_exp_colmajor, output.data(), n_selected * n_cols, raft::Compare<math_t>()));
+    copyRows(input.data(), n_rows, n_cols, output.data(), indices.data(), n_selected, stream, true);
+    EXPECT_TRUE(raft::devArrMatchHost(
+      output_exp_rowmajor, output.data(), n_selected * n_cols, raft::Compare<math_t>()));
   }
 
  protected:
-  int n_rows = 10;
-  int n_cols = 3;
+  int n_rows     = 10;
+  int n_cols     = 3;
   int n_selected = 5;
 
-  idx_array_t indices_host[5] = {0, 3, 4, 7, 9};
-  math_t output_exp_colmajor[15] = {0,  3,  4,  7,  9,  10, 13, 14,
-                                    17, 19, 20, 23, 24, 27, 29};
-  math_t output_exp_rowmajor[15] = {0,  1,  2,  9,  10, 11, 12, 13,
-                                    14, 21, 22, 23, 27, 28, 29};
+  idx_array_t indices_host[5]    = {0, 3, 4, 7, 9};
+  math_t output_exp_colmajor[15] = {0, 3, 4, 7, 9, 10, 13, 14, 17, 19, 20, 23, 24, 27, 29};
+  math_t output_exp_rowmajor[15] = {0, 1, 2, 9, 10, 11, 12, 13, 14, 21, 22, 23, 27, 28, 29};
   raft::handle_t handle;
   cudaStream_t stream;
   std::shared_ptr<raft::mr::device::allocator> allocator;
@@ -149,10 +147,10 @@ class MatrixCopyRowsTest : public ::testing::Test {
   raft::mr::device::buffer<idx_array_t> indices;
 };
 
-using TypeTuple =
-  ::testing::Types<std::tuple<float, int, int>, std::tuple<float, int64_t, int>,
-                   std::tuple<double, int, int>,
-                   std::tuple<double, int64_t, int>>;
+using TypeTuple = ::testing::Types<std::tuple<float, int, int>,
+                                   std::tuple<float, int64_t, int>,
+                                   std::tuple<double, int, int>,
+                                   std::tuple<double, int64_t, int>>;
 
 TYPED_TEST_CASE(MatrixCopyRowsTest, TypeTuple);
 TYPED_TEST(MatrixCopyRowsTest, CopyRows) { this->testCopyRows(); }
diff --git a/cpp/test/mr/device/buffer.cpp b/cpp/test/mr/device/buffer.cpp
index 223efdbfe8..9ba2c3332b 100644
--- a/cpp/test/mr/device/buffer.cpp
+++ b/cpp/test/mr/device/buffer.cpp
@@ -25,7 +25,8 @@ namespace raft {
 namespace mr {
 namespace device {
 
-TEST(Raft, DeviceBufferAlloc) {
+TEST(Raft, DeviceBufferAlloc)
+{
   auto alloc = std::make_shared<default_allocator>();
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
@@ -52,13 +53,14 @@ TEST(Raft, DeviceBufferAlloc) {
   CUDA_CHECK(cudaStreamDestroy(stream));
 }
 
-TEST(Raft, DeviceBufferZeroResize) {
+TEST(Raft, DeviceBufferZeroResize)
+{
   // Create a limiting_resource_adaptor to track allocations
-  auto curr_mr = dynamic_cast<rmm::mr::cuda_memory_resource*>(
-    rmm::mr::get_current_device_resource());
-  auto limit_mr = std::make_shared<
-    rmm::mr::limiting_resource_adaptor<rmm::mr::cuda_memory_resource>>(curr_mr,
-                                                                       1000);
+  auto curr_mr =
+    dynamic_cast<rmm::mr::cuda_memory_resource*>(rmm::mr::get_current_device_resource());
+  auto limit_mr =
+    std::make_shared<rmm::mr::limiting_resource_adaptor<rmm::mr::cuda_memory_resource>>(curr_mr,
+                                                                                        1000);
 
   rmm::mr::set_current_device_resource(limit_mr.get());
 
diff --git a/cpp/test/mr/host/buffer.cpp b/cpp/test/mr/host/buffer.cpp
index 953f65ddfb..aadf05285c 100644
--- a/cpp/test/mr/host/buffer.cpp
+++ b/cpp/test/mr/host/buffer.cpp
@@ -24,7 +24,8 @@ namespace raft {
 namespace mr {
 namespace host {
 
-TEST(Raft, HostBuffer) {
+TEST(Raft, HostBuffer)
+{
   auto alloc = std::make_shared<default_allocator>();
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
@@ -51,14 +52,14 @@ TEST(Raft, HostBuffer) {
   CUDA_CHECK(cudaStreamDestroy(stream));
 }
 
-TEST(Raft, DeviceToHostBuffer) {
+TEST(Raft, DeviceToHostBuffer)
+{
   auto d_alloc = std::make_shared<device::default_allocator>();
   auto h_alloc = std::make_shared<default_allocator>();
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
   device::buffer<char> d_buff(d_alloc, stream, 32);
-  CUDA_CHECK(
-    cudaMemsetAsync(d_buff.data(), 0, sizeof(char) * d_buff.size(), stream));
+  CUDA_CHECK(cudaMemsetAsync(d_buff.data(), 0, sizeof(char) * d_buff.size(), stream));
   buffer<char> h_buff(h_alloc, d_buff);
   ASSERT_EQ(d_buff.size(), h_buff.size());
   CUDA_CHECK(cudaStreamSynchronize(stream));
diff --git a/cpp/test/mst.cu b/cpp/test/mst.cu
index d7aa76500b..5560c61c73 100644
--- a/cpp/test/mst.cu
+++ b/cpp/test/mst.cu
@@ -54,7 +54,8 @@ namespace mst {
 // Sequential prims function
 // Returns total weight of MST
 template <typename vertex_t, typename edge_t, typename weight_t>
-weight_t prims(CSRHost<vertex_t, edge_t, weight_t> &csr_h) {
+weight_t prims(CSRHost<vertex_t, edge_t, weight_t>& csr_h)
+{
   auto n_vertices = csr_h.offsets.size() - 1;
 
   bool active_vertex[n_vertices];
@@ -63,19 +64,18 @@ weight_t prims(CSRHost<vertex_t, edge_t, weight_t> &csr_h) {
 
   for (auto i = 0; i < n_vertices; i++) {
     active_vertex[i] = false;
-    curr_edge[i] = INT_MAX;
+    curr_edge[i]     = INT_MAX;
   }
   curr_edge[0] = 0;
 
   // function to pick next min vertex-edge
-  auto min_vertex_edge = [](auto *curr_edge, auto *active_vertex,
-                            auto n_vertices) {
+  auto min_vertex_edge = [](auto* curr_edge, auto* active_vertex, auto n_vertices) {
     weight_t min = INT_MAX;
     vertex_t min_vertex;
 
     for (auto v = 0; v < n_vertices; v++) {
       if (!active_vertex[v] && curr_edge[v] < min) {
-        min = curr_edge[v];
+        min        = curr_edge[v];
         min_vertex = v;
       }
     }
@@ -91,14 +91,13 @@ weight_t prims(CSRHost<vertex_t, edge_t, weight_t> &csr_h) {
     active_vertex[curr_v] = true;  // set to active
 
     // iterate through edges of current active vertex
-    auto edge_st = csr_h.offsets[curr_v];
+    auto edge_st  = csr_h.offsets[curr_v];
     auto edge_end = csr_h.offsets[curr_v + 1];
 
     for (auto e = edge_st; e < edge_end; e++) {
       // put edges to be considered for next iteration
       auto neighbor_idx = csr_h.indices[e];
-      if (!active_vertex[neighbor_idx] &&
-          csr_h.weights[e] < curr_edge[neighbor_idx]) {
+      if (!active_vertex[neighbor_idx] && csr_h.weights[e] < curr_edge[neighbor_idx]) {
         curr_edge[neighbor_idx] = csr_h.weights[e];
       }
     }
@@ -114,99 +113,101 @@ weight_t prims(CSRHost<vertex_t, edge_t, weight_t> &csr_h) {
 }
 
 template <typename vertex_t, typename edge_t, typename weight_t>
-class MSTTest
-  : public ::testing::TestWithParam<MSTTestInput<vertex_t, edge_t, weight_t>> {
+class MSTTest : public ::testing::TestWithParam<MSTTestInput<vertex_t, edge_t, weight_t>> {
  protected:
   std::pair<raft::Graph_COO<vertex_t, edge_t, weight_t>,
             raft::Graph_COO<vertex_t, edge_t, weight_t>>
-  mst_gpu() {
-    edge_t *offsets = static_cast<edge_t *>(csr_d.offsets.data());
-    vertex_t *indices = static_cast<vertex_t *>(csr_d.indices.data());
-    weight_t *weights = static_cast<weight_t *>(csr_d.weights.data());
+  mst_gpu()
+  {
+    edge_t* offsets   = static_cast<edge_t*>(csr_d.offsets.data());
+    vertex_t* indices = static_cast<vertex_t*>(csr_d.indices.data());
+    weight_t* weights = static_cast<weight_t*>(csr_d.weights.data());
 
     v = static_cast<vertex_t>((csr_d.offsets.size() / sizeof(vertex_t)) - 1);
     e = static_cast<edge_t>(csr_d.indices.size() / sizeof(edge_t));
 
-    rmm::device_vector<vertex_t> mst_src(2 * v - 2,
-                                         std::numeric_limits<vertex_t>::max());
-    rmm::device_vector<vertex_t> mst_dst(2 * v - 2,
-                                         std::numeric_limits<vertex_t>::max());
+    rmm::device_vector<vertex_t> mst_src(2 * v - 2, std::numeric_limits<vertex_t>::max());
+    rmm::device_vector<vertex_t> mst_dst(2 * v - 2, std::numeric_limits<vertex_t>::max());
     rmm::device_vector<vertex_t> color(v, 0);
 
-    vertex_t *color_ptr = thrust::raw_pointer_cast(color.data());
+    vertex_t* color_ptr = thrust::raw_pointer_cast(color.data());
 
     if (iterations == 0) {
       MST_solver<vertex_t, edge_t, weight_t, float> symmetric_solver(
-        handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(),
-        true, true, 0);
+        handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), true, true, 0);
       auto symmetric_result = symmetric_solver.solve();
 
       MST_solver<vertex_t, edge_t, weight_t, float> non_symmetric_solver(
-        handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(),
-        false, true, 0);
+        handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), false, true, 0);
       auto non_symmetric_result = non_symmetric_solver.solve();
 
       EXPECT_LE(symmetric_result.n_edges, 2 * v - 2);
       EXPECT_LE(non_symmetric_result.n_edges, v - 1);
 
-      return std::make_pair(std::move(symmetric_result),
-                            std::move(non_symmetric_result));
+      return std::make_pair(std::move(symmetric_result), std::move(non_symmetric_result));
     } else {
-      MST_solver<vertex_t, edge_t, weight_t, float> intermediate_solver(
-        handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(),
-        true, true, iterations);
+      MST_solver<vertex_t, edge_t, weight_t, float> intermediate_solver(handle,
+                                                                        offsets,
+                                                                        indices,
+                                                                        weights,
+                                                                        v,
+                                                                        e,
+                                                                        color_ptr,
+                                                                        handle.get_stream(),
+                                                                        true,
+                                                                        true,
+                                                                        iterations);
       auto intermediate_result = intermediate_solver.solve();
 
       MST_solver<vertex_t, edge_t, weight_t, float> symmetric_solver(
-        handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(),
-        true, false, 0);
+        handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), true, false, 0);
       auto symmetric_result = symmetric_solver.solve();
 
       // symmetric_result.n_edges += intermediate_result.n_edges;
-      auto total_edge_size =
-        symmetric_result.n_edges + intermediate_result.n_edges;
+      auto total_edge_size = symmetric_result.n_edges + intermediate_result.n_edges;
       symmetric_result.src.resize(total_edge_size, handle.get_stream());
       symmetric_result.dst.resize(total_edge_size, handle.get_stream());
       symmetric_result.weights.resize(total_edge_size, handle.get_stream());
 
       raft::copy(symmetric_result.src.data() + symmetric_result.n_edges,
-                 intermediate_result.src.data(), intermediate_result.n_edges,
+                 intermediate_result.src.data(),
+                 intermediate_result.n_edges,
                  handle.get_stream());
       raft::copy(symmetric_result.dst.data() + symmetric_result.n_edges,
-                 intermediate_result.dst.data(), intermediate_result.n_edges,
+                 intermediate_result.dst.data(),
+                 intermediate_result.n_edges,
                  handle.get_stream());
       raft::copy(symmetric_result.weights.data() + symmetric_result.n_edges,
                  intermediate_result.weights.data(),
-                 intermediate_result.n_edges, handle.get_stream());
+                 intermediate_result.n_edges,
+                 handle.get_stream());
       symmetric_result.n_edges = total_edge_size;
 
       MST_solver<vertex_t, edge_t, weight_t, float> non_symmetric_solver(
-        handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(),
-        false, true, 0);
+        handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), false, true, 0);
       auto non_symmetric_result = non_symmetric_solver.solve();
 
       EXPECT_LE(symmetric_result.n_edges, 2 * v - 2);
       EXPECT_LE(non_symmetric_result.n_edges, v - 1);
 
-      return std::make_pair(std::move(symmetric_result),
-                            std::move(non_symmetric_result));
+      return std::make_pair(std::move(symmetric_result), std::move(non_symmetric_result));
     }
   }
 
-  void SetUp() override {
-    mst_input = ::testing::TestWithParam<
-      MSTTestInput<vertex_t, edge_t, weight_t>>::GetParam();
+  void SetUp() override
+  {
+    mst_input  = ::testing::TestWithParam<MSTTestInput<vertex_t, edge_t, weight_t>>::GetParam();
     iterations = mst_input.iterations;
 
-    csr_d.offsets = rmm::device_buffer(
-      mst_input.csr_h.offsets.data(),
-      mst_input.csr_h.offsets.size() * sizeof(edge_t), handle.get_stream());
-    csr_d.indices = rmm::device_buffer(
-      mst_input.csr_h.indices.data(),
-      mst_input.csr_h.indices.size() * sizeof(vertex_t), handle.get_stream());
-    csr_d.weights = rmm::device_buffer(
-      mst_input.csr_h.weights.data(),
-      mst_input.csr_h.weights.size() * sizeof(weight_t), handle.get_stream());
+    csr_d.offsets = rmm::device_buffer(mst_input.csr_h.offsets.data(),
+                                       mst_input.csr_h.offsets.size() * sizeof(edge_t),
+                                       handle.get_stream());
+    csr_d.indices = rmm::device_buffer(mst_input.csr_h.indices.data(),
+                                       mst_input.csr_h.indices.size() * sizeof(vertex_t),
+                                       handle.get_stream());
+    csr_d.weights = rmm::device_buffer(mst_input.csr_h.weights.data(),
+                                       mst_input.csr_h.weights.size() * sizeof(weight_t),
+                                       handle.get_stream());
   }
 
   void TearDown() override {}
@@ -259,41 +260,68 @@ const std::vector<MSTTestInput<int, int, float>> csr_in_h = {
 const std::vector<CSRHost<int, int, float>> csr_in4_h = {
   {{0, 3, 5, 8, 10, 12, 14, 16},
    {2, 4, 5, 3, 6, 0, 4, 5, 1, 6, 0, 2, 0, 2, 1, 3},
-   {5.0f, 9.0f, 1.0f, 8.0f, 7.0f, 5.0f, 2.0f, 6.0f, 8.0f, 10.0f, 9.0f, 2.0f,
-    1.0f, 6.0f, 7.0f, 10.0f}}};
+   {5.0f,
+    9.0f,
+    1.0f,
+    8.0f,
+    7.0f,
+    5.0f,
+    2.0f,
+    6.0f,
+    8.0f,
+    10.0f,
+    9.0f,
+    2.0f,
+    1.0f,
+    6.0f,
+    7.0f,
+    10.0f}}};
 
 //  singletons
 const std::vector<CSRHost<int, int, float>> csr_in5_h = {
   {{0, 3, 5, 8, 10, 10, 10, 12, 14, 16, 16},
    {2, 8, 7, 3, 8, 0, 8, 7, 1, 8, 0, 2, 0, 2, 1, 3},
-   {5.0f, 9.0f, 1.0f, 8.0f, 7.0f, 5.0f, 2.0f, 6.0f, 8.0f, 10.0f, 9.0f, 2.0f,
-    1.0f, 6.0f, 7.0f, 10.0f}}};
+   {5.0f,
+    9.0f,
+    1.0f,
+    8.0f,
+    7.0f,
+    5.0f,
+    2.0f,
+    6.0f,
+    8.0f,
+    10.0f,
+    9.0f,
+    2.0f,
+    1.0f,
+    6.0f,
+    7.0f,
+    10.0f}}};
 
 typedef MSTTest<int, int, float> MSTTestSequential;
-TEST_P(MSTTestSequential, Sequential) {
-  auto results_pair = mst_gpu();
-  auto &symmetric_result = results_pair.first;
-  auto &non_symmetric_result = results_pair.second;
+TEST_P(MSTTestSequential, Sequential)
+{
+  auto results_pair          = mst_gpu();
+  auto& symmetric_result     = results_pair.first;
+  auto& non_symmetric_result = results_pair.second;
 
   // do assertions here
   // in this case, running sequential MST
   auto prims_result = prims(mst_input.csr_h);
 
-  auto symmetric_sum =
-    thrust::reduce(thrust::device, symmetric_result.weights.data(),
-                   symmetric_result.weights.data() + symmetric_result.n_edges);
-  auto non_symmetric_sum = thrust::reduce(
-    thrust::device, non_symmetric_result.weights.data(),
-    non_symmetric_result.weights.data() + non_symmetric_result.n_edges);
-
-  ASSERT_TRUE(raft::match(2 * prims_result, symmetric_sum,
-                          raft::CompareApprox<float>(0.1)));
-  ASSERT_TRUE(raft::match(prims_result, non_symmetric_sum,
-                          raft::CompareApprox<float>(0.1)));
+  auto symmetric_sum = thrust::reduce(thrust::device,
+                                      symmetric_result.weights.data(),
+                                      symmetric_result.weights.data() + symmetric_result.n_edges);
+  auto non_symmetric_sum =
+    thrust::reduce(thrust::device,
+                   non_symmetric_result.weights.data(),
+                   non_symmetric_result.weights.data() + non_symmetric_result.n_edges);
+
+  ASSERT_TRUE(raft::match(2 * prims_result, symmetric_sum, raft::CompareApprox<float>(0.1)));
+  ASSERT_TRUE(raft::match(prims_result, non_symmetric_sum, raft::CompareApprox<float>(0.1)));
 }
 
-INSTANTIATE_TEST_SUITE_P(MSTTests, MSTTestSequential,
-                         ::testing::ValuesIn(csr_in_h));
+INSTANTIATE_TEST_SUITE_P(MSTTests, MSTTestSequential, ::testing::ValuesIn(csr_in_h));
 
 }  // namespace mst
 }  // namespace raft
diff --git a/cpp/test/random/rng.cu b/cpp/test/random/rng.cu
index af10dcab30..25c8fe5084 100644
--- a/cpp/test/random/rng.cu
+++ b/cpp/test/random/rng.cu
@@ -38,12 +38,13 @@ enum RandomType {
 };
 
 template <typename T, int TPB>
-__global__ void meanKernel(T* out, const T* data, int len) {
+__global__ void meanKernel(T* out, const T* data, int len)
+{
   typedef cub::BlockReduce<T, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  T val = tid < len ? data[tid] : T(0);
-  T x = BlockReduce(temp_storage).Sum(val);
+  T val   = tid < len ? data[tid] : T(0);
+  T x     = BlockReduce(temp_storage).Sum(val);
   __syncthreads();
   T xx = BlockReduce(temp_storage).Sum(val * val);
   __syncthreads();
@@ -70,7 +71,8 @@ struct RngInputs {
 };
 
 template <typename T>
-::std::ostream& operator<<(::std::ostream& os, const RngInputs<T>& dims) {
+::std::ostream& operator<<(::std::ostream& os, const RngInputs<T>& dims)
+{
   return os;
 }
 
@@ -80,46 +82,30 @@ template <typename T>
 template <typename T>
 class RngTest : public ::testing::TestWithParam<RngInputs<T>> {
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     // Tests are configured with their expected test-values sigma. For example,
     // 4 x sigma indicates the test shouldn't fail 99.9% of the time.
     num_sigma = 10;
-    params = ::testing::TestWithParam<RngInputs<T>>::GetParam();
+    params    = ::testing::TestWithParam<RngInputs<T>>::GetParam();
     cudaStream_t stream;
     CUDA_CHECK(cudaStreamCreate(&stream));
     Rng r(params.seed, params.gtype);
     allocate(data, params.len);
     allocate(stats, 2, true);
     switch (params.type) {
-      case RNG_Normal:
-        r.normal(data, params.len, params.start, params.end, stream);
-        break;
-      case RNG_LogNormal:
-        r.lognormal(data, params.len, params.start, params.end, stream);
-        break;
-      case RNG_Uniform:
-        r.uniform(data, params.len, params.start, params.end, stream);
-        break;
-      case RNG_Gumbel:
-        r.gumbel(data, params.len, params.start, params.end, stream);
-        break;
-      case RNG_Logistic:
-        r.logistic(data, params.len, params.start, params.end, stream);
-        break;
-      case RNG_Exp:
-        r.exponential(data, params.len, params.start, stream);
-        break;
-      case RNG_Rayleigh:
-        r.rayleigh(data, params.len, params.start, stream);
-        break;
-      case RNG_Laplace:
-        r.laplace(data, params.len, params.start, params.end, stream);
-        break;
+      case RNG_Normal: r.normal(data, params.len, params.start, params.end, stream); break;
+      case RNG_LogNormal: r.lognormal(data, params.len, params.start, params.end, stream); break;
+      case RNG_Uniform: r.uniform(data, params.len, params.start, params.end, stream); break;
+      case RNG_Gumbel: r.gumbel(data, params.len, params.start, params.end, stream); break;
+      case RNG_Logistic: r.logistic(data, params.len, params.start, params.end, stream); break;
+      case RNG_Exp: r.exponential(data, params.len, params.start, stream); break;
+      case RNG_Rayleigh: r.rayleigh(data, params.len, params.start, stream); break;
+      case RNG_Laplace: r.laplace(data, params.len, params.start, params.end, stream); break;
     };
     static const int threads = 128;
     meanKernel<T, threads>
-      <<<raft::ceildiv(params.len, threads), threads, 0, stream>>>(stats, data,
-                                                                   params.len);
+      <<<raft::ceildiv(params.len, threads), threads, 0, stream>>>(stats, data, params.len);
     update_host<T>(h_stats, stats, 2, stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
     h_stats[0] /= params.len;
@@ -127,23 +113,24 @@ class RngTest : public ::testing::TestWithParam<RngInputs<T>> {
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(data));
     CUDA_CHECK(cudaFree(stats));
   }
 
-  void getExpectedMeanVar(T meanvar[2]) {
+  void getExpectedMeanVar(T meanvar[2])
+  {
     switch (params.type) {
       case RNG_Normal:
         meanvar[0] = params.start;
         meanvar[1] = params.end * params.end;
         break;
       case RNG_LogNormal: {
-        auto var = params.end * params.end;
-        auto mu = params.start;
+        auto var   = params.end * params.end;
+        auto mu    = params.start;
         meanvar[0] = raft::myExp(mu + var * T(0.5));
-        meanvar[1] =
-          (raft::myExp(var) - T(1.0)) * raft::myExp(T(2.0) * mu + var);
+        meanvar[1] = (raft::myExp(var) - T(1.0)) * raft::myExp(T(2.0) * mu + var);
         break;
       }
       case RNG_Uniform:
@@ -167,8 +154,7 @@ class RngTest : public ::testing::TestWithParam<RngInputs<T>> {
         break;
       case RNG_Rayleigh:
         meanvar[0] = params.start * raft::mySqrt(T(3.1415 / 2.0));
-        meanvar[1] =
-          ((T(4.0) - T(3.1415)) / T(2.0)) * params.start * params.start;
+        meanvar[1] = ((T(4.0) - T(3.1415)) / T(2.0)) * params.start * params.start;
         break;
       case RNG_Laplace:
         meanvar[0] = params.start;
@@ -259,13 +245,12 @@ const std::vector<RngInputs<float>> inputsf = {
   {0.02, 32 * 1024, 1.f, 1.f, RNG_Laplace, GenKiss99, 1234ULL},
   {0.04, 8 * 1024, 1.f, 1.f, RNG_Laplace, GenKiss99, 1234ULL}};
 
-TEST_P(RngTestF, Result) {
+TEST_P(RngTestF, Result)
+{
   float meanvar[2];
   getExpectedMeanVar(meanvar);
-  ASSERT_TRUE(match(meanvar[0], h_stats[0],
-                    CompareApprox<float>(num_sigma * params.tolerance)));
-  ASSERT_TRUE(match(meanvar[1], h_stats[1],
-                    CompareApprox<float>(num_sigma * params.tolerance)));
+  ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox<float>(num_sigma * params.tolerance)));
+  ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox<float>(num_sigma * params.tolerance)));
 }
 INSTANTIATE_TEST_SUITE_P(RngTests, RngTestF, ::testing::ValuesIn(inputsf));
 
@@ -321,13 +306,12 @@ const std::vector<RngInputs<double>> inputsd = {
   {0.025, 8 * 1024, 1.0, 1.0, RNG_Rayleigh, GenKiss99, 1234ULL},
   {0.02, 32 * 1024, 1.0, 1.0, RNG_Laplace, GenKiss99, 1234ULL},
   {0.04, 8 * 1024, 1.0, 1.0, RNG_Laplace, GenKiss99, 1234ULL}};
-TEST_P(RngTestD, Result) {
+TEST_P(RngTestD, Result)
+{
   double meanvar[2];
   getExpectedMeanVar(meanvar);
-  ASSERT_TRUE(match(meanvar[0], h_stats[0],
-                    CompareApprox<double>(num_sigma * params.tolerance)));
-  ASSERT_TRUE(match(meanvar[1], h_stats[1],
-                    CompareApprox<double>(num_sigma * params.tolerance)));
+  ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox<double>(num_sigma * params.tolerance)));
+  ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox<double>(num_sigma * params.tolerance)));
 }
 INSTANTIATE_TEST_SUITE_P(RngTests, RngTestD, ::testing::ValuesIn(inputsd));
 
@@ -335,7 +319,8 @@ INSTANTIATE_TEST_SUITE_P(RngTests, RngTestD, ::testing::ValuesIn(inputsd));
 // Test for expected variance in mean calculations
 
 template <typename T>
-T quick_mean(const std::vector<T>& d) {
+T quick_mean(const std::vector<T>& d)
+{
   T acc = T(0);
   for (const auto& di : d) {
     acc += di;
@@ -344,8 +329,9 @@ T quick_mean(const std::vector<T>& d) {
 }
 
 template <typename T>
-T quick_std(const std::vector<T>& d) {
-  T acc = T(0);
+T quick_std(const std::vector<T>& d)
+{
+  T acc    = T(0);
   T d_mean = quick_mean(d);
   for (const auto& di : d) {
     acc += ((di - d_mean) * (di - d_mean));
@@ -354,7 +340,8 @@ T quick_std(const std::vector<T>& d) {
 }
 
 template <typename T>
-std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
+std::ostream& operator<<(std::ostream& out, const std::vector<T>& v)
+{
   if (!v.empty()) {
     out << '[';
     std::copy(v.begin(), v.end(), std::ostream_iterator<T>(out, ", "));
@@ -369,11 +356,12 @@ std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
 // experiments computing the mean, giving us a distribution of the mean
 // itself. The mean error is simply the standard deviation of this
 // distribution (the standard deviation of the mean).
-TEST(Rng, MeanError) {
+TEST(Rng, MeanError)
+{
   timeb time_struct;
   ftime(&time_struct);
-  int seed = time_struct.millitm;
-  int num_samples = 1024;
+  int seed            = time_struct.millitm;
+  int num_samples     = 1024;
   int num_experiments = 1024;
   float* data;
   float* mean_result;
@@ -391,10 +379,9 @@ TEST(Rng, MeanError) {
     Rng r(seed, rtype);
     r.normal(data, len, 3.3f, 0.23f, stream);
     // r.uniform(data, len, -1.0, 2.0);
-    raft::stats::mean(mean_result, data, num_samples, num_experiments, false,
-                      false, stream);
-    raft::stats::stddev(std_result, data, mean_result, num_samples,
-                        num_experiments, false, false, stream);
+    raft::stats::mean(mean_result, data, num_samples, num_experiments, false, false, stream);
+    raft::stats::stddev(
+      std_result, data, mean_result, num_samples, num_experiments, false, false, stream);
     std::vector<float> h_mean_result(num_experiments);
     std::vector<float> h_std_result(num_experiments);
     update_host(h_mean_result.data(), mean_result, num_experiments, stream);
@@ -403,8 +390,8 @@ TEST(Rng, MeanError) {
     auto d_mean = quick_mean(h_mean_result);
 
     // std-dev of mean; also known as mean error
-    auto d_std_of_mean = quick_std(h_mean_result);
-    auto d_std = quick_mean(h_std_result);
+    auto d_std_of_mean            = quick_std(h_mean_result);
+    auto d_std                    = quick_mean(h_std_result);
     auto d_std_of_mean_analytical = d_std / std::sqrt(num_samples);
 
     // std::cout << "measured mean error: " << d_std_of_mean << "\n";
@@ -413,8 +400,7 @@ TEST(Rng, MeanError) {
     auto diff_expected_vs_measured_mean_error =
       std::abs(d_std_of_mean - d_std / std::sqrt(num_samples));
 
-    ASSERT_TRUE(
-      (diff_expected_vs_measured_mean_error / d_std_of_mean_analytical < 0.5));
+    ASSERT_TRUE((diff_expected_vs_measured_mean_error / d_std_of_mean_analytical < 0.5));
   }
   CUDA_CHECK(cudaStreamDestroy(stream));
   CUDA_CHECK(cudaFree(data));
@@ -427,7 +413,8 @@ TEST(Rng, MeanError) {
 template <typename T, int len, int scale>
 class ScaledBernoulliTest : public ::testing::Test {
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     CUDA_CHECK(cudaStreamCreate(&stream));
 
     Rng r(42);
@@ -438,12 +425,12 @@ class ScaledBernoulliTest : public ::testing::Test {
 
   void TearDown() override { CUDA_CHECK(cudaFree(data)); }
 
-  void rangeCheck() {
+  void rangeCheck()
+  {
     T* h_data = new T[len];
     update_host(h_data, data, len, stream);
-    ASSERT_TRUE(std::none_of(h_data, h_data + len, [](const T& a) {
-      return a < -scale || a > scale;
-    }));
+    ASSERT_TRUE(
+      std::none_of(h_data, h_data + len, [](const T& a) { return a < -scale || a > scale; }));
     delete[] h_data;
   }
 
@@ -460,7 +447,8 @@ TEST_F(ScaledBernoulliTest2, RangeCheck) { rangeCheck(); }
 template <typename T, int len>
 class BernoulliTest : public ::testing::Test {
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     CUDA_CHECK(cudaStreamCreate(&stream));
     Rng r(42);
     allocate(data, len * sizeof(bool), stream);
@@ -469,7 +457,8 @@ class BernoulliTest : public ::testing::Test {
 
   void TearDown() override { CUDA_CHECK(cudaFree(data)); }
 
-  void trueFalseCheck() {
+  void trueFalseCheck()
+  {
     // both true and false values must be present
     bool* h_data = new bool[len];
     update_host(h_data, data, len, stream);
@@ -499,21 +488,21 @@ struct RngNormalTableInputs {
 };
 
 template <typename T>
-::std::ostream& operator<<(::std::ostream& os,
-                           const RngNormalTableInputs<T>& dims) {
+::std::ostream& operator<<(::std::ostream& os, const RngNormalTableInputs<T>& dims)
+{
   return os;
 }
 
 template <typename T>
-class RngNormalTableTest
-  : public ::testing::TestWithParam<RngNormalTableInputs<T>> {
+class RngNormalTableTest : public ::testing::TestWithParam<RngNormalTableInputs<T>> {
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     // Tests are configured with their expected test-values sigma. For example,
     // 4 x sigma indicates the test shouldn't fail 99.9% of the time.
     num_sigma = 10;
-    params = ::testing::TestWithParam<RngNormalTableInputs<T>>::GetParam();
-    int len = params.rows * params.cols;
+    params    = ::testing::TestWithParam<RngNormalTableInputs<T>>::GetParam();
+    int len   = params.rows * params.cols;
 
     cudaStream_t stream;
     CUDA_CHECK(cudaStreamCreate(&stream));
@@ -523,11 +512,9 @@ class RngNormalTableTest
     allocate(mu_vec, params.cols);
     r.fill(mu_vec, params.cols, params.mu, stream);
     T* sigma_vec = nullptr;
-    r.normalTable(data, params.rows, params.cols, mu_vec, sigma_vec,
-                  params.sigma, stream);
+    r.normalTable(data, params.rows, params.cols, mu_vec, sigma_vec, params.sigma, stream);
     static const int threads = 128;
-    meanKernel<T, threads>
-      <<<raft::ceildiv(len, threads), threads, 0, stream>>>(stats, data, len);
+    meanKernel<T, threads><<<raft::ceildiv(len, threads), threads, 0, stream>>>(stats, data, len);
     update_host<T>(h_stats, stats, 2, stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
     h_stats[0] /= len;
@@ -535,13 +522,15 @@ class RngNormalTableTest
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(data));
     CUDA_CHECK(cudaFree(stats));
     CUDA_CHECK(cudaFree(mu_vec));
   }
 
-  void getExpectedMeanVar(T meanvar[2]) {
+  void getExpectedMeanVar(T meanvar[2])
+  {
     meanvar[0] = params.mu;
     meanvar[1] = params.sigma * params.sigma;
   }
@@ -562,16 +551,14 @@ const std::vector<RngNormalTableInputs<float>> inputsf_t = {
   {0.0055, 32, 1024, 1.f, 1.f, GenKiss99, 1234ULL},
   {0.011, 8, 1024, 1.f, 1.f, GenKiss99, 1234ULL}};
 
-TEST_P(RngNormalTableTestF, Result) {
+TEST_P(RngNormalTableTestF, Result)
+{
   float meanvar[2];
   getExpectedMeanVar(meanvar);
-  ASSERT_TRUE(match(meanvar[0], h_stats[0],
-                    CompareApprox<float>(num_sigma * params.tolerance)));
-  ASSERT_TRUE(match(meanvar[1], h_stats[1],
-                    CompareApprox<float>(num_sigma * params.tolerance)));
+  ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox<float>(num_sigma * params.tolerance)));
+  ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox<float>(num_sigma * params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestF,
-                         ::testing::ValuesIn(inputsf_t));
+INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestF, ::testing::ValuesIn(inputsf_t));
 
 typedef RngNormalTableTest<double> RngNormalTableTestD;
 const std::vector<RngNormalTableInputs<double>> inputsd_t = {
@@ -581,16 +568,14 @@ const std::vector<RngNormalTableInputs<double>> inputsd_t = {
   {0.011, 8, 1024, 1.0, 1.0, GenTaps, 1234ULL},
   {0.0055, 32, 1024, 1.0, 1.0, GenKiss99, 1234ULL},
   {0.011, 8, 1024, 1.0, 1.0, GenKiss99, 1234ULL}};
-TEST_P(RngNormalTableTestD, Result) {
+TEST_P(RngNormalTableTestD, Result)
+{
   double meanvar[2];
   getExpectedMeanVar(meanvar);
-  ASSERT_TRUE(match(meanvar[0], h_stats[0],
-                    CompareApprox<double>(num_sigma * params.tolerance)));
-  ASSERT_TRUE(match(meanvar[1], h_stats[1],
-                    CompareApprox<double>(num_sigma * params.tolerance)));
+  ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox<double>(num_sigma * params.tolerance)));
+  ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox<double>(num_sigma * params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestD,
-                         ::testing::ValuesIn(inputsd_t));
+INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestD, ::testing::ValuesIn(inputsd_t));
 
 struct RngAffineInputs {
   int n;
@@ -599,13 +584,15 @@ struct RngAffineInputs {
 
 class RngAffineTest : public ::testing::TestWithParam<RngAffineInputs> {
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     params = ::testing::TestWithParam<RngAffineInputs>::GetParam();
     Rng r(params.seed);
     r.affine_transform_params(params.n, a, b);
   }
 
-  void check() {
+  void check()
+  {
     ASSERT_TRUE(gcd(a, params.n) == 1);
     ASSERT_TRUE(0 <= b && b < params.n);
   }
@@ -616,13 +603,17 @@ class RngAffineTest : public ::testing::TestWithParam<RngAffineInputs> {
 };  // RngAffineTest
 
 const std::vector<RngAffineInputs> inputs_affine = {
-  {100, 123456ULL},     {100, 1234567890ULL},  {101, 123456ULL},
-  {101, 1234567890ULL}, {7, 123456ULL},        {7, 1234567890ULL},
-  {2568, 123456ULL},    {2568, 1234567890ULL},
+  {100, 123456ULL},
+  {100, 1234567890ULL},
+  {101, 123456ULL},
+  {101, 1234567890ULL},
+  {7, 123456ULL},
+  {7, 1234567890ULL},
+  {2568, 123456ULL},
+  {2568, 1234567890ULL},
 };
 TEST_P(RngAffineTest, Result) { check(); }
-INSTANTIATE_TEST_SUITE_P(RngAffineTests, RngAffineTest,
-                         ::testing::ValuesIn(inputs_affine));
+INSTANTIATE_TEST_SUITE_P(RngAffineTests, RngAffineTest, ::testing::ValuesIn(inputs_affine));
 
 }  // namespace random
 }  // namespace raft
diff --git a/cpp/test/random/rng_int.cu b/cpp/test/random/rng_int.cu
index 92f12206e8..c77c3df526 100644
--- a/cpp/test/random/rng_int.cu
+++ b/cpp/test/random/rng_int.cu
@@ -27,12 +27,13 @@ namespace random {
 enum RandomType { RNG_Uniform };
 
 template <typename T, int TPB>
-__global__ void meanKernel(float *out, const T *data, int len) {
+__global__ void meanKernel(float* out, const T* data, int len)
+{
   typedef cub::BlockReduce<float, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid   = threadIdx.x + blockIdx.x * blockDim.x;
   float val = tid < len ? data[tid] : T(0);
-  float x = BlockReduce(temp_storage).Sum(val);
+  float x   = BlockReduce(temp_storage).Sum(val);
   __syncthreads();
   float xx = BlockReduce(temp_storage).Sum(val * val);
   __syncthreads();
@@ -59,14 +60,16 @@ struct RngInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os, const RngInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const RngInputs<T>& dims)
+{
   return os;
 }
 
 template <typename T>
 class RngTest : public ::testing::TestWithParam<RngInputs<T>> {
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     params = ::testing::TestWithParam<RngInputs<T>>::GetParam();
     Rng r(params.seed, params.gtype);
 
@@ -75,14 +78,11 @@ class RngTest : public ::testing::TestWithParam<RngInputs<T>> {
     allocate(data, params.len);
     allocate(stats, 2, true);
     switch (params.type) {
-      case RNG_Uniform:
-        r.uniformInt(data, params.len, params.start, params.end, stream);
-        break;
+      case RNG_Uniform: r.uniformInt(data, params.len, params.start, params.end, stream); break;
     };
     static const int threads = 128;
     meanKernel<T, threads>
-      <<<raft::ceildiv(params.len, threads), threads, 0, stream>>>(stats, data,
-                                                                   params.len);
+      <<<raft::ceildiv(params.len, threads), threads, 0, stream>>>(stats, data, params.len);
     update_host<float>(h_stats, stats, 2, stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
     h_stats[0] /= params.len;
@@ -90,12 +90,14 @@ class RngTest : public ::testing::TestWithParam<RngInputs<T>> {
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(data));
     CUDA_CHECK(cudaFree(stats));
   }
 
-  void getExpectedMeanVar(float meanvar[2]) {
+  void getExpectedMeanVar(float meanvar[2])
+  {
     switch (params.type) {
       case RNG_Uniform:
         meanvar[0] = (params.start + params.end) * 0.5f;
@@ -107,8 +109,8 @@ class RngTest : public ::testing::TestWithParam<RngInputs<T>> {
 
  protected:
   RngInputs<T> params;
-  T *data;
-  float *stats;
+  T* data;
+  float* stats;
   float h_stats[2];  // mean, var
 };
 
@@ -120,13 +122,12 @@ const std::vector<RngInputs<uint32_t>> inputs_u32 = {
   {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL},
   {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL},
   {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}};
-TEST_P(RngTestU32, Result) {
+TEST_P(RngTestU32, Result)
+{
   float meanvar[2];
   getExpectedMeanVar(meanvar);
-  ASSERT_TRUE(
-    match(meanvar[0], h_stats[0], CompareApprox<float>(params.tolerance)));
-  ASSERT_TRUE(
-    match(meanvar[1], h_stats[1], CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox<float>(params.tolerance)));
 }
 INSTANTIATE_TEST_SUITE_P(RngTests, RngTestU32, ::testing::ValuesIn(inputs_u32));
 
@@ -138,13 +139,12 @@ const std::vector<RngInputs<uint64_t>> inputs_u64 = {
   {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL},
   {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL},
   {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}};
-TEST_P(RngTestU64, Result) {
+TEST_P(RngTestU64, Result)
+{
   float meanvar[2];
   getExpectedMeanVar(meanvar);
-  ASSERT_TRUE(
-    match(meanvar[0], h_stats[0], CompareApprox<float>(params.tolerance)));
-  ASSERT_TRUE(
-    match(meanvar[1], h_stats[1], CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox<float>(params.tolerance)));
 }
 INSTANTIATE_TEST_SUITE_P(RngTests, RngTestU64, ::testing::ValuesIn(inputs_u64));
 
@@ -156,13 +156,12 @@ const std::vector<RngInputs<int32_t>> inputs_s32 = {
   {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL},
   {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL},
   {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}};
-TEST_P(RngTestS32, Result) {
+TEST_P(RngTestS32, Result)
+{
   float meanvar[2];
   getExpectedMeanVar(meanvar);
-  ASSERT_TRUE(
-    match(meanvar[0], h_stats[0], CompareApprox<float>(params.tolerance)));
-  ASSERT_TRUE(
-    match(meanvar[1], h_stats[1], CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox<float>(params.tolerance)));
 }
 INSTANTIATE_TEST_SUITE_P(RngTests, RngTestS32, ::testing::ValuesIn(inputs_s32));
 
@@ -174,13 +173,12 @@ const std::vector<RngInputs<int64_t>> inputs_s64 = {
   {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL},
   {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL},
   {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}};
-TEST_P(RngTestS64, Result) {
+TEST_P(RngTestS64, Result)
+{
   float meanvar[2];
   getExpectedMeanVar(meanvar);
-  ASSERT_TRUE(
-    match(meanvar[0], h_stats[0], CompareApprox<float>(params.tolerance)));
-  ASSERT_TRUE(
-    match(meanvar[1], h_stats[1], CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox<float>(params.tolerance)));
 }
 INSTANTIATE_TEST_SUITE_P(RngTests, RngTestS64, ::testing::ValuesIn(inputs_s64));
 
diff --git a/cpp/test/random/sample_without_replacement.cu b/cpp/test/random/sample_without_replacement.cu
index d7e52a8958..c258841c3e 100644
--- a/cpp/test/random/sample_without_replacement.cu
+++ b/cpp/test/random/sample_without_replacement.cu
@@ -38,14 +38,16 @@ struct SWoRInputs {
 };
 
 template <typename T>
-::std::ostream& operator<<(::std::ostream& os, const SWoRInputs<T>& dims) {
+::std::ostream& operator<<(::std::ostream& os, const SWoRInputs<T>& dims)
+{
   return os;
 }
 
 template <typename T>
 class SWoRTest : public ::testing::TestWithParam<SWoRInputs<T>> {
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     params = ::testing::TestWithParam<SWoRInputs<T>>::GetParam();
     CUDA_CHECK(cudaStreamCreate(&stream));
 
@@ -58,15 +60,14 @@ class SWoRTest : public ::testing::TestWithParam<SWoRInputs<T>> {
     r.uniform(in, params.len, T(-1.0), T(1.0), stream);
     r.uniform(wts, params.len, T(1.0), T(2.0), stream);
     if (params.largeWeightIndex >= 0) {
-      update_device(wts + params.largeWeightIndex, &params.largeWeight, 1,
-                    stream);
+      update_device(wts + params.largeWeightIndex, &params.largeWeight, 1, stream);
     }
-    r.sampleWithoutReplacement(handle, out, outIdx, in, wts, params.sampledLen,
-                               params.len, stream);
+    r.sampleWithoutReplacement(handle, out, outIdx, in, wts, params.sampledLen, params.len, stream);
     update_host(&(h_outIdx[0]), outIdx, params.sampledLen, stream);
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaStreamDestroy(stream));
     CUDA_CHECK(cudaFree(in));
@@ -147,14 +148,14 @@ const std::vector<SWoRInputs<float>> inputsf = {
   {1024, 512, 10, 100000.f, GenKiss99, 1234ULL},
 };
 
-TEST_P(SWoRTestF, Result) {
+TEST_P(SWoRTestF, Result)
+{
   std::set<int> occurence;
   for (int i = 0; i < params.sampledLen; ++i) {
     auto val = h_outIdx[i];
     // indices must be in the given range
     ASSERT_TRUE(0 <= val && val < params.len)
-      << "out-of-range index @i=" << i << " val=" << val
-      << " sampledLen=" << params.sampledLen;
+      << "out-of-range index @i=" << i << " val=" << val << " sampledLen=" << params.sampledLen;
     // indices should not repeat
     ASSERT_TRUE(occurence.find(val) == occurence.end())
       << "repeated index @i=" << i << " idx=" << val;
@@ -162,9 +163,7 @@ TEST_P(SWoRTestF, Result) {
   }
   // if there's a skewed distribution, the top index should correspond to the
   // particular item with a large weight
-  if (params.largeWeightIndex >= 0) {
-    ASSERT_EQ(h_outIdx[0], params.largeWeightIndex);
-  }
+  if (params.largeWeightIndex >= 0) { ASSERT_EQ(h_outIdx[0], params.largeWeightIndex); }
 }
 INSTANTIATE_TEST_SUITE_P(SWoRTests, SWoRTestF, ::testing::ValuesIn(inputsf));
 
@@ -231,14 +230,14 @@ const std::vector<SWoRInputs<double>> inputsd = {
   {1024, 512, 10, 100000.0, GenKiss99, 1234ULL},
 };
 
-TEST_P(SWoRTestD, Result) {
+TEST_P(SWoRTestD, Result)
+{
   std::set<int> occurence;
   for (int i = 0; i < params.sampledLen; ++i) {
     auto val = h_outIdx[i];
     // indices must be in the given range
     ASSERT_TRUE(0 <= val && val < params.len)
-      << "out-of-range index @i=" << i << " val=" << val
-      << " sampledLen=" << params.sampledLen;
+      << "out-of-range index @i=" << i << " val=" << val << " sampledLen=" << params.sampledLen;
     // indices should not repeat
     ASSERT_TRUE(occurence.find(val) == occurence.end())
       << "repeated index @i=" << i << " idx=" << val;
@@ -246,9 +245,7 @@ TEST_P(SWoRTestD, Result) {
   }
   // if there's a skewed distribution, the top index should correspond to the
   // particular item with a large weight
-  if (params.largeWeightIndex >= 0) {
-    ASSERT_EQ(h_outIdx[0], params.largeWeightIndex);
-  }
+  if (params.largeWeightIndex >= 0) { ASSERT_EQ(h_outIdx[0], params.largeWeightIndex); }
 }
 INSTANTIATE_TEST_SUITE_P(SWoRTests, SWoRTestD, ::testing::ValuesIn(inputsd));
 
diff --git a/cpp/test/sparse/add.cu b/cpp/test/sparse/add.cu
index 713708d4cd..e1f814a5b6 100644
--- a/cpp/test/sparse/add.cu
+++ b/cpp/test/sparse/add.cu
@@ -44,14 +44,14 @@ struct CSRAddInputs {
 };
 
 template <typename Type_f, typename Index_>
-class CSRAddTest
-  : public ::testing::TestWithParam<CSRAddInputs<Type_f, Index_>> {
+class CSRAddTest : public ::testing::TestWithParam<CSRAddInputs<Type_f, Index_>> {
  protected:
-  void SetUp() override {
-    params = ::testing::TestWithParam<CSRAddInputs<Type_f, Index_>>::GetParam();
-    n_rows = params.matrix_a.row_ind.size();
-    nnz_a = params.matrix_a.row_ind_ptr.size();
-    nnz_b = params.matrix_b.row_ind_ptr.size();
+  void SetUp() override
+  {
+    params     = ::testing::TestWithParam<CSRAddInputs<Type_f, Index_>>::GetParam();
+    n_rows     = params.matrix_a.row_ind.size();
+    nnz_a      = params.matrix_a.row_ind_ptr.size();
+    nnz_b      = params.matrix_b.row_ind_ptr.size();
     nnz_result = params.matrix_verify.row_ind_ptr.size();
 
     cudaStreamCreate(&stream);
@@ -73,46 +73,61 @@ class CSRAddTest
     raft::allocate(values_result, nnz_result);
   }
 
-  void Run() {
-    std::shared_ptr<raft::mr::device::allocator> alloc(
-      new raft::mr::device::default_allocator);
+  void Run()
+  {
+    std::shared_ptr<raft::mr::device::allocator> alloc(new raft::mr::device::default_allocator);
 
     raft::update_device(ind_a, params.matrix_a.row_ind.data(), n_rows, stream);
-    raft::update_device(ind_ptr_a, params.matrix_a.row_ind_ptr.data(), nnz_a,
-                        stream);
+    raft::update_device(ind_ptr_a, params.matrix_a.row_ind_ptr.data(), nnz_a, stream);
     raft::update_device(values_a, params.matrix_a.values.data(), nnz_a, stream);
 
     raft::update_device(ind_b, params.matrix_b.row_ind.data(), n_rows, stream);
-    raft::update_device(ind_ptr_b, params.matrix_b.row_ind_ptr.data(), nnz_b,
-                        stream);
+    raft::update_device(ind_ptr_b, params.matrix_b.row_ind_ptr.data(), nnz_b, stream);
     raft::update_device(values_b, params.matrix_b.values.data(), nnz_b, stream);
 
-    raft::update_device(ind_verify, params.matrix_verify.row_ind.data(), n_rows,
-                        stream);
-    raft::update_device(ind_ptr_verify, params.matrix_verify.row_ind_ptr.data(),
-                        nnz_result, stream);
-    raft::update_device(values_verify, params.matrix_verify.values.data(),
-                        nnz_result, stream);
-
-    Index_ nnz = linalg::csr_add_calc_inds<Type_f, 32>(
-      ind_a, ind_ptr_a, values_a, nnz_a, ind_b, ind_ptr_b, values_b, nnz_b,
-      n_rows, ind_result, alloc, stream);
+    raft::update_device(ind_verify, params.matrix_verify.row_ind.data(), n_rows, stream);
+    raft::update_device(
+      ind_ptr_verify, params.matrix_verify.row_ind_ptr.data(), nnz_result, stream);
+    raft::update_device(values_verify, params.matrix_verify.values.data(), nnz_result, stream);
+
+    Index_ nnz = linalg::csr_add_calc_inds<Type_f, 32>(ind_a,
+                                                       ind_ptr_a,
+                                                       values_a,
+                                                       nnz_a,
+                                                       ind_b,
+                                                       ind_ptr_b,
+                                                       values_b,
+                                                       nnz_b,
+                                                       n_rows,
+                                                       ind_result,
+                                                       alloc,
+                                                       stream);
 
     ASSERT_TRUE(nnz == nnz_result);
-    ASSERT_TRUE(raft::devArrMatch<Index_>(ind_verify, ind_result, n_rows,
-                                          raft::Compare<Index_>()));
-
-    linalg::csr_add_finalize<Type_f, 32>(
-      ind_a, ind_ptr_a, values_a, nnz_a, ind_b, ind_ptr_b, values_b, nnz_b,
-      n_rows, ind_result, ind_ptr_result, values_result, stream);
-
-    ASSERT_TRUE(raft::devArrMatch<Index_>(ind_ptr_verify, ind_ptr_result, nnz,
-                                          raft::Compare<Index_>()));
-    ASSERT_TRUE(raft::devArrMatch<Type_f>(values_verify, values_result, nnz,
-                                          raft::Compare<Type_f>()));
+    ASSERT_TRUE(raft::devArrMatch<Index_>(ind_verify, ind_result, n_rows, raft::Compare<Index_>()));
+
+    linalg::csr_add_finalize<Type_f, 32>(ind_a,
+                                         ind_ptr_a,
+                                         values_a,
+                                         nnz_a,
+                                         ind_b,
+                                         ind_ptr_b,
+                                         values_b,
+                                         nnz_b,
+                                         n_rows,
+                                         ind_result,
+                                         ind_ptr_result,
+                                         values_result,
+                                         stream);
+
+    ASSERT_TRUE(
+      raft::devArrMatch<Index_>(ind_ptr_verify, ind_ptr_result, nnz, raft::Compare<Index_>()));
+    ASSERT_TRUE(
+      raft::devArrMatch<Type_f>(values_verify, values_result, nnz, raft::Compare<Type_f>()));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(ind_a));
     CUDA_CHECK(cudaFree(ind_b));
     CUDA_CHECK(cudaFree(ind_result));
@@ -131,8 +146,8 @@ class CSRAddTest
   CSRAddInputs<Type_f, Index_> params;
   cudaStream_t stream;
   Index_ n_rows, nnz_a, nnz_b, nnz_result;
-  Index_ *ind_a, *ind_b, *ind_verify, *ind_result, *ind_ptr_a, *ind_ptr_b,
-    *ind_ptr_verify, *ind_ptr_result;
+  Index_ *ind_a, *ind_b, *ind_verify, *ind_result, *ind_ptr_a, *ind_ptr_b, *ind_ptr_verify,
+    *ind_ptr_result;
   Type_f *values_a, *values_b, *values_verify, *values_result;
 };
 
@@ -165,10 +180,8 @@ const std::vector<CSRAddInputs<double, int>> csradd_inputs_d = {
     {2.0, 2.0, 0.5, 1.0, 0.5, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}}},
 };
 
-INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestF,
-                        ::testing::ValuesIn(csradd_inputs_f));
-INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestD,
-                        ::testing::ValuesIn(csradd_inputs_d));
+INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestF, ::testing::ValuesIn(csradd_inputs_f));
+INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestD, ::testing::ValuesIn(csradd_inputs_d));
 
 }  // namespace sparse
 }  // namespace raft
diff --git a/cpp/test/sparse/connect_components.cu b/cpp/test/sparse/connect_components.cu
index d98f9de9c3..3678d34bbe 100644
--- a/cpp/test/sparse/connect_components.cu
+++ b/cpp/test/sparse/connect_components.cu
@@ -51,26 +51,24 @@ struct ConnectComponentsInputs {
 };
 
 template <typename value_idx, typename value_t>
-class ConnectComponentsTest : public ::testing::TestWithParam<
-                                ConnectComponentsInputs<value_t, value_idx>> {
+class ConnectComponentsTest
+  : public ::testing::TestWithParam<ConnectComponentsInputs<value_t, value_idx>> {
  protected:
-  void basicTest() {
+  void basicTest()
+  {
     raft::handle_t handle;
 
     auto d_alloc = handle.get_device_allocator();
-    auto stream = handle.get_stream();
+    auto stream  = handle.get_stream();
 
-    params = ::testing::TestWithParam<
-      ConnectComponentsInputs<value_t, value_idx>>::GetParam();
+    params = ::testing::TestWithParam<ConnectComponentsInputs<value_t, value_idx>>::GetParam();
 
-    raft::sparse::COO<value_t, value_idx> out_edges(
-      handle.get_device_allocator(), handle.get_stream());
+    raft::sparse::COO<value_t, value_idx> out_edges(handle.get_device_allocator(),
+                                                    handle.get_stream());
 
-    rmm::device_uvector<value_t> data(params.n_row * params.n_col,
-                                      handle.get_stream());
+    rmm::device_uvector<value_t> data(params.n_row * params.n_col, handle.get_stream());
 
-    raft::copy(data.data(), params.data.data(), data.size(),
-               handle.get_stream());
+    raft::copy(data.data(), params.data.data(), data.size(), handle.get_stream());
 
     rmm::device_uvector<value_idx> indptr(params.n_row + 1, stream);
 
@@ -79,44 +77,58 @@ class ConnectComponentsTest : public ::testing::TestWithParam<
      */
     raft::sparse::COO<value_t, value_idx> knn_graph_coo(d_alloc, stream);
 
-    raft::sparse::selection::knn_graph(
-      handle, data.data(), params.n_row, params.n_col,
-      raft::distance::DistanceType::L2SqrtExpanded, knn_graph_coo, params.c);
+    raft::sparse::selection::knn_graph(handle,
+                                       data.data(),
+                                       params.n_row,
+                                       params.n_col,
+                                       raft::distance::DistanceType::L2SqrtExpanded,
+                                       knn_graph_coo,
+                                       params.c);
 
-    raft::sparse::convert::sorted_coo_to_csr(knn_graph_coo.rows(),
-                                             knn_graph_coo.nnz, indptr.data(),
-                                             params.n_row + 1, d_alloc, stream);
+    raft::sparse::convert::sorted_coo_to_csr(
+      knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), params.n_row + 1, d_alloc, stream);
 
     /**
      * 2. Construct MST, sorted by weights
      */
     rmm::device_uvector<value_idx> colors(params.n_row, stream);
 
-    auto mst_coo = raft::mst::mst<value_idx, value_idx, value_t, double>(
-      handle, indptr.data(), knn_graph_coo.cols(), knn_graph_coo.vals(),
-      params.n_row, knn_graph_coo.nnz, colors.data(), stream, false, true);
+    auto mst_coo = raft::mst::mst<value_idx, value_idx, value_t, double>(handle,
+                                                                         indptr.data(),
+                                                                         knn_graph_coo.cols(),
+                                                                         knn_graph_coo.vals(),
+                                                                         params.n_row,
+                                                                         knn_graph_coo.nnz,
+                                                                         colors.data(),
+                                                                         stream,
+                                                                         false,
+                                                                         true);
 
     /**
      * 3. connect_components to fix connectivities
      */
-    raft::linkage::FixConnectivitiesRedOp<value_idx, value_t> red_op(
-      colors.data(), params.n_row);
+    raft::linkage::FixConnectivitiesRedOp<value_idx, value_t> red_op(colors.data(), params.n_row);
     raft::linkage::connect_components<value_idx, value_t>(
-      handle, out_edges, data.data(), colors.data(), params.n_row, params.n_col,
-      red_op);
+      handle, out_edges, data.data(), colors.data(), params.n_row, params.n_col, red_op);
 
     /**
      * Construct final edge list
      */
     rmm::device_uvector<value_idx> indptr2(params.n_row + 1, stream);
 
-    raft::sparse::convert::sorted_coo_to_csr(out_edges.rows(), out_edges.nnz,
-                                             indptr2.data(), params.n_row + 1,
-                                             d_alloc, stream);
+    raft::sparse::convert::sorted_coo_to_csr(
+      out_edges.rows(), out_edges.nnz, indptr2.data(), params.n_row + 1, d_alloc, stream);
 
-    auto output_mst = raft::mst::mst<value_idx, value_idx, value_t>(
-      handle, indptr2.data(), out_edges.cols(), out_edges.vals(), params.n_row,
-      out_edges.nnz, colors.data(), stream, false, false);
+    auto output_mst = raft::mst::mst<value_idx, value_idx, value_t>(handle,
+                                                                    indptr2.data(),
+                                                                    out_edges.cols(),
+                                                                    out_edges.vals(),
+                                                                    params.n_row,
+                                                                    out_edges.nnz,
+                                                                    colors.data(),
+                                                                    stream,
+                                                                    false,
+                                                                    false);
 
     CUDA_CHECK(cudaStreamSynchronize(stream));
 
@@ -138,366 +150,199 @@ const std::vector<ConnectComponentsInputs<float, int>> fix_conn_inputsf2 = {
   // Test n_clusters == n_points
   {10,
    5,
-   {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392,
-    0.77782677, 0.43772379, 0.4035871,  0.3282796,  0.47544681, 0.59862974,
-    0.12319357, 0.06239463, 0.28200272, 0.1345717,  0.50498218, 0.5113505,
-    0.16233086, 0.62165332, 0.42281548, 0.933117,   0.41386077, 0.23264562,
-    0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674,
-    0.84854131, 0.28890216, 0.85267903, 0.74703138, 0.83842071, 0.34942792,
-    0.27864171, 0.70911132, 0.21338564, 0.32035554, 0.73788331, 0.46926692,
-    0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396,
+   {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, 0.77782677, 0.43772379,
+    0.4035871,  0.3282796,  0.47544681, 0.59862974, 0.12319357, 0.06239463, 0.28200272, 0.1345717,
+    0.50498218, 0.5113505,  0.16233086, 0.62165332, 0.42281548, 0.933117,   0.41386077, 0.23264562,
+    0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674,  0.84854131, 0.28890216,
+    0.85267903, 0.74703138, 0.83842071, 0.34942792, 0.27864171, 0.70911132, 0.21338564, 0.32035554,
+    0.73788331, 0.46926692, 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396,
     0.76166195, 0.66613745},
    -1},
   // Test n_points == 100
   {100,
    10,
-   {6.26168372e-01, 9.30437651e-01, 6.02450208e-01,
-    2.73025296e-01, 9.53050619e-01, 3.32164396e-01,
-    6.88942598e-01, 5.79163537e-01, 6.70341547e-01,
-    2.70140602e-02, 9.30429671e-01, 7.17721157e-01,
-    9.89948537e-01, 7.75253347e-01, 1.34491522e-02,
-    2.48522428e-02, 3.51413378e-01, 7.64405834e-01,
-    7.86373507e-01, 7.18748577e-01, 8.66998621e-01,
-    6.80316582e-01, 2.51288712e-01, 4.91078420e-01,
-    3.76246281e-01, 4.86828710e-01, 5.67464772e-01,
-    5.30734742e-01, 8.99478296e-01, 7.66699088e-01,
-    9.49339111e-01, 3.55248484e-01, 9.06046929e-01,
-    4.48407772e-01, 6.96395305e-01, 2.44277335e-01,
-    7.74840000e-01, 5.21046603e-01, 4.66423971e-02,
-    5.12019638e-02, 8.95019614e-01, 5.28956953e-01,
-    4.31536306e-01, 5.83857744e-01, 4.41787364e-01,
-    4.68656523e-01, 5.73971433e-01, 6.79989654e-01,
-    3.19650588e-01, 6.12579596e-01, 6.49126442e-02,
-    8.39131142e-01, 2.85252117e-01, 5.84848929e-01,
-    9.46507115e-01, 8.58440748e-01, 3.61528940e-01,
-    2.44215959e-01, 3.80101125e-01, 4.57128957e-02,
-    8.82216988e-01, 8.31498633e-01, 7.23474381e-01,
-    7.75788607e-01, 1.40864146e-01, 6.62092382e-01,
-    5.13985168e-01, 3.00686418e-01, 8.70109949e-01,
-    2.43187753e-01, 2.89391938e-01, 2.84214238e-01,
-    8.70985521e-01, 8.77491176e-01, 6.72537226e-01,
-    3.30929686e-01, 1.85934324e-01, 9.16222614e-01,
-    6.18239142e-01, 2.64768597e-01, 5.76145451e-01,
-    8.62961369e-01, 6.84757925e-01, 7.60549082e-01,
-    1.27645356e-01, 4.51004673e-01, 3.92292980e-01,
-    4.63170803e-01, 4.35449330e-02, 2.17583404e-01,
-    5.71832605e-02, 2.06763039e-01, 3.70116249e-01,
-    2.09750028e-01, 6.17283019e-01, 8.62549231e-01,
-    9.84156240e-02, 2.66249156e-01, 3.87635103e-01,
-    2.85591012e-02, 4.24826068e-01, 4.45795088e-01,
-    6.86227676e-01, 1.08848960e-01, 5.96731841e-02,
-    3.71770228e-01, 1.91548833e-01, 6.95136078e-01,
-    9.00700636e-01, 8.76363105e-01, 2.67334632e-01,
-    1.80619709e-01, 7.94060419e-01, 1.42854171e-02,
-    1.09372387e-01, 8.74028108e-01, 6.46403232e-01,
-    4.86588834e-01, 5.93446175e-02, 6.11886291e-01,
-    8.83865057e-01, 3.15879821e-01, 2.27043992e-01,
-    9.76764951e-01, 6.15620336e-01, 9.76199360e-01,
-    2.40548962e-01, 3.21795663e-01, 8.75087904e-02,
-    8.11234663e-01, 6.96070480e-01, 8.12062321e-01,
-    1.21958818e-01, 3.44348628e-02, 8.72630414e-01,
-    3.06162776e-01, 1.76043529e-02, 9.45894971e-01,
-    5.33896401e-01, 6.21642973e-01, 4.93062535e-01,
-    4.48984262e-01, 2.24560379e-01, 4.24052195e-02,
-    4.43447610e-01, 8.95646149e-01, 6.05220676e-01,
-    1.81840491e-01, 9.70831206e-01, 2.12563586e-02,
-    6.92582693e-01, 7.55946922e-01, 7.95086143e-01,
-    6.05328941e-01, 3.99350764e-01, 4.32846636e-01,
-    9.81114529e-01, 4.98266428e-01, 6.37127930e-03,
-    1.59085889e-01, 6.34682067e-05, 5.59429440e-01,
-    7.38827633e-01, 8.93214770e-01, 2.16494306e-01,
-    9.35430573e-02, 4.75665868e-02, 7.80503518e-01,
-    7.86240041e-01, 7.06854594e-01, 2.13725879e-02,
-    7.68246091e-01, 4.50234808e-01, 5.21231104e-01,
-    5.01989826e-03, 4.22081572e-02, 1.65337732e-01,
-    8.54134740e-01, 4.99430262e-01, 8.94525601e-01,
-    1.14028379e-01, 3.69739861e-01, 1.32955599e-01,
-    2.65563824e-01, 2.52811151e-01, 1.44792843e-01,
-    6.88449594e-01, 4.44921417e-01, 8.23296587e-01,
-    1.93266317e-01, 1.19033309e-01, 1.36368966e-01,
-    3.42600285e-01, 5.64505195e-01, 5.57594559e-01,
-    7.44257892e-01, 8.38231569e-02, 4.11548847e-01,
-    3.21010077e-01, 8.55081359e-01, 4.30105779e-01,
-    1.16229135e-01, 9.87731964e-02, 3.14712335e-01,
-    4.50880592e-01, 2.72289598e-01, 6.31615256e-01,
-    8.97432958e-01, 4.44764250e-01, 8.03776440e-01,
-    2.68767748e-02, 2.43374608e-01, 4.02141103e-01,
-    4.98881209e-01, 5.33173003e-01, 8.82890436e-01,
-    7.16149148e-01, 4.19664401e-01, 2.29335357e-01,
-    2.88637806e-01, 3.44696803e-01, 6.78171906e-01,
-    5.69849716e-01, 5.86454477e-01, 3.54474989e-01,
-    9.03876540e-01, 6.45980000e-01, 6.34887593e-01,
-    7.88039746e-02, 2.04814126e-01, 7.82251754e-01,
-    2.43147074e-01, 7.50951808e-01, 1.72799092e-02,
-    2.95349590e-01, 6.57991826e-01, 8.81214312e-01,
-    5.73970708e-01, 2.77610881e-01, 1.82155097e-01,
-    7.69797417e-02, 6.44792402e-01, 9.46950998e-01,
-    7.73064845e-01, 6.04733624e-01, 5.80094567e-01,
-    1.67498426e-01, 2.66514296e-01, 6.50140368e-01,
-    1.91170299e-01, 2.08752199e-01, 3.01664091e-01,
-    9.85033484e-01, 2.92909152e-01, 8.65816607e-01,
-    1.85222119e-01, 2.28814559e-01, 1.34286382e-02,
-    2.89234322e-01, 8.18668708e-01, 4.71706924e-01,
-    9.23199803e-01, 2.80879188e-01, 1.47319284e-01,
-    4.13915748e-01, 9.31274932e-02, 6.66322195e-01,
-    9.66953974e-01, 3.19405786e-01, 6.69486551e-01,
-    5.03096313e-02, 6.95225201e-01, 5.78469859e-01,
-    6.29481655e-01, 1.39252534e-01, 1.22564968e-01,
-    6.80663678e-01, 6.34607157e-01, 6.42765834e-01,
-    1.57127410e-02, 2.92132086e-01, 5.24423878e-01,
-    4.68676824e-01, 2.86003928e-01, 7.18608322e-01,
-    8.95617933e-01, 5.48844309e-01, 1.74517278e-01,
-    5.24379196e-01, 2.13526524e-01, 5.88375435e-01,
-    9.88560185e-01, 4.17435771e-01, 6.14438688e-01,
-    9.53760881e-01, 5.27151288e-01, 7.03017278e-01,
-    3.44448559e-01, 4.47059676e-01, 2.83414901e-01,
-    1.98979011e-01, 4.24917361e-01, 5.73172761e-01,
-    2.32398853e-02, 1.65887230e-01, 4.05552785e-01,
-    9.29665524e-01, 2.26135696e-01, 9.20563384e-01,
-    7.65259963e-01, 4.54820075e-01, 8.97710267e-01,
-    3.78559302e-03, 9.15219382e-01, 3.55705698e-01,
-    6.94905124e-01, 8.58540202e-01, 3.89790666e-01,
-    2.49478206e-01, 7.93679304e-01, 4.75830027e-01,
-    4.40425353e-01, 3.70579459e-01, 1.40578049e-01,
-    1.70386675e-01, 7.04056121e-01, 4.85963102e-01,
-    9.68450060e-01, 6.77178001e-01, 2.65934654e-01,
-    2.58915007e-01, 6.70052890e-01, 2.61945109e-01,
-    8.46207759e-01, 1.01928951e-01, 2.85611334e-01,
-    2.45776933e-01, 2.66658783e-01, 3.71724077e-01,
-    4.34319025e-01, 4.24407347e-01, 7.15417683e-01,
-    8.07997684e-01, 1.64296275e-01, 6.01638065e-01,
-    8.60606804e-02, 2.68719187e-01, 5.11764101e-01,
-    9.75844338e-01, 7.81226782e-01, 2.20925515e-01,
-    7.18135040e-01, 9.82395577e-01, 8.39160243e-01,
-    9.08058083e-01, 6.88010677e-01, 8.14271847e-01,
-    5.12460821e-01, 1.17311345e-01, 5.96075228e-01,
-    9.17455497e-01, 2.12052706e-01, 7.04074603e-01,
-    8.72872565e-02, 8.76047818e-01, 6.96235046e-01,
-    8.54801557e-01, 2.49729159e-01, 9.76594604e-01,
-    2.87386363e-01, 2.36461559e-02, 9.94075254e-01,
-    4.25193986e-01, 7.61869994e-01, 5.13334255e-01,
-    6.44711165e-02, 8.92156689e-01, 3.55235167e-01,
-    1.08154647e-01, 8.78446825e-01, 2.43833016e-01,
-    9.23071293e-01, 2.72724115e-01, 9.46631338e-01,
-    3.74510294e-01, 4.08451278e-02, 9.78392777e-01,
-    3.65079221e-01, 6.37199516e-01, 5.51144906e-01,
-    5.25978080e-01, 1.42803678e-01, 4.05451674e-01,
-    7.79788219e-01, 6.26009784e-01, 3.35249497e-01,
-    1.43159543e-02, 1.80363779e-01, 5.05096904e-01,
-    2.82619947e-01, 5.83561392e-01, 3.10951324e-01,
-    8.73223968e-01, 4.38545619e-01, 4.81348800e-01,
-    6.68497085e-01, 3.79345401e-01, 9.58832501e-01,
-    1.89869550e-01, 2.34083070e-01, 2.94066207e-01,
-    5.74892667e-02, 6.92106828e-02, 9.61127686e-02,
-    6.72650672e-02, 8.47345378e-01, 2.80916761e-01,
-    7.32177357e-03, 9.80785961e-01, 5.73192225e-02,
-    8.48781331e-01, 8.83225408e-01, 7.34398275e-01,
-    7.70381941e-01, 6.20778343e-01, 8.96822048e-01,
-    5.40732486e-01, 3.69704071e-01, 5.77305837e-01,
-    2.08221827e-01, 7.34275341e-01, 1.06110900e-01,
-    3.49496706e-01, 8.34948910e-01, 1.56403291e-02,
-    6.78576376e-01, 8.96141268e-01, 5.94835119e-01,
-    1.43943153e-01, 3.49618530e-01, 2.10440392e-01,
-    3.46585620e-01, 1.05153093e-01, 3.45446174e-01,
-    2.72177079e-01, 7.07946300e-01, 4.33717726e-02,
-    3.31232203e-01, 3.91874320e-01, 4.76338141e-01,
-    6.22777789e-01, 2.95989228e-02, 4.32855769e-01,
-    7.61049310e-01, 3.63279149e-01, 9.47210350e-01,
-    6.43721247e-01, 6.58025802e-01, 1.05247633e-02,
-    5.29974442e-01, 7.30675767e-01, 4.30041079e-01,
-    6.62634841e-01, 8.25936616e-01, 9.91253704e-01,
-    6.79399281e-01, 5.44177006e-01, 7.52876048e-01,
-    3.32139049e-01, 7.98732398e-01, 7.38865223e-01,
-    9.16055132e-01, 6.11736493e-01, 9.63672879e-01,
-    1.83778839e-01, 7.27558919e-02, 5.91602822e-01,
-    3.25235484e-01, 2.34741217e-01, 9.52346277e-01,
-    9.18556407e-01, 9.35373324e-01, 6.89209070e-01,
-    2.56049054e-01, 6.17975395e-01, 7.82285691e-01,
-    9.84983432e-01, 6.62322741e-01, 2.04144457e-01,
-    3.98446577e-01, 1.38918297e-01, 3.05919921e-01,
-    3.14043787e-01, 5.91072666e-01, 7.44703771e-01,
-    8.92272567e-01, 9.78017873e-01, 9.01203161e-01,
-    1.41526372e-01, 4.14878484e-01, 6.80683651e-01,
-    5.01733152e-02, 8.14635389e-01, 2.27926375e-01,
-    9.03269815e-01, 8.68443745e-01, 9.86939190e-01,
-    7.40779486e-01, 2.61005311e-01, 3.19276232e-01,
-    9.69509248e-01, 1.11908818e-01, 4.49198556e-01,
-    1.27056715e-01, 3.84064823e-01, 5.14591811e-01,
-    2.10747488e-01, 9.53884090e-01, 8.43167950e-01,
-    4.51187972e-01, 3.75331782e-01, 6.23566461e-01,
-    3.55290379e-01, 2.95705968e-01, 1.69622690e-01,
-    1.42981830e-01, 2.72180991e-01, 9.46468040e-01,
-    3.70932500e-01, 9.94292830e-01, 4.62587505e-01,
-    7.14817405e-01, 2.45370540e-02, 3.00906377e-01,
-    5.75768304e-01, 9.71448393e-01, 6.95574827e-02,
-    3.93693854e-01, 5.29306116e-01, 5.04694554e-01,
-    6.73797120e-02, 6.76596969e-01, 5.50948898e-01,
-    3.24909641e-01, 7.70337719e-01, 6.51842631e-03,
-    3.03264879e-01, 7.61037886e-03, 2.72289601e-01,
-    1.50502041e-01, 6.71103888e-02, 7.41503703e-01,
-    1.92088941e-01, 2.19043977e-01, 9.09320161e-01,
-    2.37993569e-01, 6.18107973e-02, 8.31447852e-01,
-    2.23355609e-01, 1.84789435e-01, 4.16104518e-01,
-    4.21573859e-01, 8.72446305e-02, 2.97294197e-01,
-    4.50328256e-01, 8.72199917e-01, 2.51279916e-01,
-    4.86219272e-01, 7.57071329e-01, 4.85655942e-01,
-    1.06187277e-01, 4.92341327e-01, 1.46017513e-01,
-    5.25421017e-01, 4.22637906e-01, 2.24685018e-01,
-    8.72648431e-01, 5.54051490e-01, 1.80745062e-01,
-    2.12756336e-01, 5.20883169e-01, 7.60363654e-01,
-    8.30254678e-01, 5.00003328e-01, 4.69017439e-01,
-    6.38105527e-01, 3.50638261e-02, 5.22217353e-02,
-    9.06516882e-02, 8.52975842e-01, 1.19985883e-01,
-    3.74926753e-01, 6.50302066e-01, 1.98875727e-01,
-    6.28362507e-02, 4.32693501e-01, 3.10500685e-01,
-    6.20732833e-01, 4.58503272e-01, 3.20790034e-01,
-    7.91284868e-01, 7.93054570e-01, 2.93406765e-01,
-    8.95399023e-01, 1.06441034e-01, 7.53085241e-02,
-    8.67523104e-01, 1.47963482e-01, 1.25584706e-01,
-    3.81545040e-02, 6.34338619e-01, 1.76368938e-02,
-    5.75553531e-02, 5.31607516e-01, 2.63869588e-01,
-    9.41945823e-01, 9.24028838e-02, 5.21496463e-01,
-    7.74866558e-01, 5.65210610e-01, 7.28015327e-02,
-    6.51963790e-01, 8.94727453e-01, 4.49571590e-01,
-    1.29932405e-01, 8.64026259e-01, 9.92599934e-01,
-    7.43721560e-01, 8.87300215e-01, 1.06369925e-01,
-    8.11335531e-01, 7.87734900e-01, 9.87344678e-01,
-    5.32502820e-01, 4.42612382e-01, 9.64041183e-01,
-    1.66085871e-01, 1.12937664e-01, 5.24423470e-01,
-    6.54689333e-01, 4.59119726e-01, 5.22774091e-01,
-    3.08722276e-02, 6.26979315e-01, 4.49754105e-01,
-    8.07495757e-01, 2.34199499e-01, 1.67765675e-01,
-    9.22168418e-01, 3.73210378e-01, 8.04432575e-01,
-    5.61890354e-01, 4.47025593e-01, 6.43155678e-01,
-    2.40407640e-01, 5.91631279e-01, 1.59369206e-01,
-    7.75799090e-01, 8.32067212e-01, 5.59791576e-02,
-    6.39105224e-01, 4.85274738e-01, 2.12630838e-01,
-    2.81431312e-02, 7.16205363e-01, 6.83885011e-01,
-    5.23869697e-01, 9.99418314e-01, 8.35331599e-01,
-    4.69877463e-02, 6.74712562e-01, 7.99273684e-01,
-    2.77001890e-02, 5.75809742e-01, 2.78513031e-01,
-    8.36209905e-01, 7.25472379e-01, 4.87173943e-01,
-    7.88311357e-01, 9.64676177e-01, 1.75752651e-01,
-    4.98112580e-01, 8.08850418e-02, 6.40981131e-01,
-    4.06647450e-01, 8.46539387e-01, 2.12620694e-01,
-    9.11012851e-01, 8.25041445e-01, 8.90065575e-01,
-    9.63626055e-01, 5.96689242e-01, 1.63372670e-01,
-    4.51640148e-01, 3.43026542e-01, 5.80658851e-01,
-    2.82327625e-01, 4.75535418e-01, 6.27760926e-01,
-    8.46314115e-01, 9.61961932e-01, 3.19806094e-01,
-    5.05508062e-01, 5.28102944e-01, 6.13045057e-01,
-    7.44714938e-01, 1.50586073e-01, 7.91878033e-01,
-    4.89839179e-01, 3.10496849e-01, 8.82309038e-01,
-    2.86922314e-01, 4.84687559e-01, 5.20838630e-01,
-    4.62955493e-01, 2.38185305e-01, 5.47259907e-02,
-    7.10916137e-01, 7.31887202e-01, 6.25602317e-01,
-    8.77741168e-01, 4.19881322e-01, 4.81222328e-01,
-    1.28224501e-01, 2.46034010e-01, 3.34971854e-01,
-    7.37216484e-01, 5.62134821e-02, 7.14089724e-01,
-    9.85549393e-01, 4.66295827e-01, 3.08722434e-03,
-    4.70237690e-01, 2.66524167e-01, 7.93875484e-01,
-    4.54795911e-02, 8.09702944e-01, 1.47709735e-02,
-    1.70082405e-01, 6.35905179e-01, 3.75379109e-01,
-    4.30315011e-01, 3.15788760e-01, 5.58065230e-01,
-    2.24643800e-01, 2.42142981e-01, 6.57283636e-01,
-    3.34921891e-01, 1.26588975e-01, 7.68064155e-01,
-    9.43856291e-01, 4.47518596e-01, 5.44453573e-01,
-    9.95764932e-01, 7.16444391e-01, 8.51019765e-01,
-    1.01179183e-01, 4.45473958e-01, 4.60327322e-01,
-    4.96895844e-02, 4.72907738e-01, 5.58987444e-01,
-    3.41027487e-01, 1.56175026e-01, 7.58283148e-01,
-    6.83600909e-01, 2.14623396e-01, 3.27348880e-01,
-    3.92517893e-01, 6.70418431e-01, 5.16440832e-01,
-    8.63140348e-01, 5.73277464e-01, 3.46608058e-01,
-    7.39396341e-01, 7.20852434e-01, 2.35653246e-02,
-    3.89935659e-01, 7.53783745e-01, 6.34563528e-01,
-    8.79339335e-01, 7.41599159e-02, 5.62433904e-01,
-    6.15553852e-01, 4.56956324e-01, 5.20047447e-01,
-    5.26845015e-02, 5.58471266e-01, 1.63632233e-01,
-    5.38936665e-02, 6.49593683e-01, 2.56838748e-01,
-    8.99035326e-01, 7.20847756e-01, 5.68954684e-01,
-    7.43684755e-01, 5.70924238e-01, 3.82318724e-01,
-    4.89328290e-01, 5.62208561e-01, 4.97540804e-02,
-    4.18011085e-01, 6.88041565e-01, 2.16234653e-01,
-    7.89548214e-01, 8.46136387e-01, 8.46816189e-01,
-    1.73842353e-01, 6.11627842e-02, 8.44440559e-01,
-    4.50646654e-01, 3.74785037e-01, 4.87196697e-01,
-    4.56276448e-01, 9.13284391e-01, 4.15715464e-01,
-    7.13597697e-01, 1.23641270e-02, 5.10031271e-01,
-    4.74601930e-02, 2.55731159e-01, 3.22090006e-01,
-    1.91165703e-01, 4.51170940e-01, 7.50843157e-01,
-    4.42420576e-01, 4.25380660e-01, 4.50667257e-01,
-    6.55689206e-01, 9.68257670e-02, 1.96528793e-01,
-    8.97343028e-01, 4.99940904e-01, 6.65504083e-01,
-    9.41828079e-01, 4.54397338e-01, 5.61893331e-01,
-    5.09839880e-01, 4.53117514e-01, 8.96804127e-02,
-    1.74888861e-01, 6.65641378e-01, 2.81668336e-01,
-    1.89532742e-01, 5.61668382e-01, 8.68330157e-02,
-    8.25092797e-01, 5.18106324e-01, 1.71904024e-01,
-    3.68385523e-01, 1.62005436e-01, 7.48507399e-01,
-    9.30274827e-01, 2.38198517e-01, 9.52222901e-01,
-    5.23587800e-01, 6.94384557e-01, 1.09338652e-01,
-    4.83356794e-01, 2.73050402e-01, 3.68027050e-01,
-    5.92366466e-01, 1.83192289e-01, 8.60376029e-01,
-    7.13926203e-01, 8.16750052e-01, 1.57890291e-01,
-    6.25691951e-01, 5.24831646e-01, 1.73873797e-01,
-    1.02429784e-01, 9.17488471e-01, 4.03584434e-01,
-    9.31170884e-01, 2.79386137e-01, 8.77745206e-01,
-    2.45200576e-01, 1.28896951e-01, 3.15713052e-01,
-    5.27874291e-01, 2.16444335e-01, 7.03883817e-01,
-    7.74738919e-02, 8.42422142e-01, 3.75598924e-01,
-    3.51002411e-01, 6.22752776e-01, 4.82407943e-01,
-    7.43107867e-01, 9.46182666e-01, 9.44344819e-01,
-    3.28124763e-01, 1.06147431e-01, 1.65102684e-01,
-    3.84060507e-01, 2.91057722e-01, 7.68173662e-02,
-    1.03543651e-01, 6.76698940e-01, 1.43141994e-01,
-    7.21342202e-01, 6.69471294e-03, 9.07298311e-01,
-    5.57080171e-01, 8.10954489e-01, 4.11120526e-01,
-    2.06407453e-01, 2.59590556e-01, 7.58512718e-01,
-    5.79873897e-01, 2.92875650e-01, 2.83686529e-01,
-    2.42829343e-01, 9.19323719e-01, 3.46832864e-01,
-    3.58238858e-01, 7.42827585e-01, 2.05760059e-01,
-    9.58438860e-01, 5.66326411e-01, 6.60292846e-01,
-    5.61095078e-02, 6.79465531e-01, 7.05118513e-01,
-    4.44713264e-01, 2.09732933e-01, 5.22732436e-01,
-    1.74396512e-01, 5.29356748e-01, 4.38475687e-01,
-    4.94036404e-01, 4.09785794e-01, 6.40025507e-01,
-    5.79371821e-01, 1.57726118e-01, 6.04572263e-01,
-    5.41072639e-01, 5.18847173e-01, 1.97093284e-01,
-    8.91767002e-01, 4.29050835e-01, 8.25490570e-01,
-    3.87699807e-01, 4.50705808e-01, 2.49371643e-01,
-    3.36074898e-01, 9.29925118e-01, 6.65393649e-01,
-    9.07275994e-01, 3.73075859e-01, 4.14044139e-03,
-    2.37463702e-01, 2.25893784e-01, 2.46900245e-01,
-    4.50350196e-01, 3.48618117e-01, 5.07193932e-01,
-    5.23435142e-01, 8.13611417e-01, 8.92715622e-01,
-    1.02623450e-01, 3.06088345e-01, 7.80461650e-01,
-    2.21453645e-01, 2.01419652e-01, 2.84254457e-01,
-    3.68286735e-01, 7.39358243e-01, 8.97879394e-01,
-    9.81599566e-01, 7.56526442e-01, 7.37645545e-01,
-    4.23976657e-02, 8.25922012e-01, 2.60956996e-01,
-    2.90702065e-01, 8.98388344e-01, 3.03733299e-01,
-    8.49071471e-01, 3.45835425e-01, 7.65458276e-01,
-    5.68094872e-01, 8.93770930e-01, 9.93161641e-01,
-    5.63368667e-02, 4.26548945e-01, 5.46745780e-01,
-    5.75674571e-01, 7.94599487e-01, 7.18935553e-02,
-    4.46492976e-01, 6.40240123e-01, 2.73246969e-01,
-    2.00465968e-01, 1.30718835e-01, 1.92492005e-01,
-    1.96617189e-01, 6.61271644e-01, 8.12687657e-01,
-    8.66342445e-01
+   {6.26168372e-01, 9.30437651e-01, 6.02450208e-01, 2.73025296e-01, 9.53050619e-01, 3.32164396e-01,
+    6.88942598e-01, 5.79163537e-01, 6.70341547e-01, 2.70140602e-02, 9.30429671e-01, 7.17721157e-01,
+    9.89948537e-01, 7.75253347e-01, 1.34491522e-02, 2.48522428e-02, 3.51413378e-01, 7.64405834e-01,
+    7.86373507e-01, 7.18748577e-01, 8.66998621e-01, 6.80316582e-01, 2.51288712e-01, 4.91078420e-01,
+    3.76246281e-01, 4.86828710e-01, 5.67464772e-01, 5.30734742e-01, 8.99478296e-01, 7.66699088e-01,
+    9.49339111e-01, 3.55248484e-01, 9.06046929e-01, 4.48407772e-01, 6.96395305e-01, 2.44277335e-01,
+    7.74840000e-01, 5.21046603e-01, 4.66423971e-02, 5.12019638e-02, 8.95019614e-01, 5.28956953e-01,
+    4.31536306e-01, 5.83857744e-01, 4.41787364e-01, 4.68656523e-01, 5.73971433e-01, 6.79989654e-01,
+    3.19650588e-01, 6.12579596e-01, 6.49126442e-02, 8.39131142e-01, 2.85252117e-01, 5.84848929e-01,
+    9.46507115e-01, 8.58440748e-01, 3.61528940e-01, 2.44215959e-01, 3.80101125e-01, 4.57128957e-02,
+    8.82216988e-01, 8.31498633e-01, 7.23474381e-01, 7.75788607e-01, 1.40864146e-01, 6.62092382e-01,
+    5.13985168e-01, 3.00686418e-01, 8.70109949e-01, 2.43187753e-01, 2.89391938e-01, 2.84214238e-01,
+    8.70985521e-01, 8.77491176e-01, 6.72537226e-01, 3.30929686e-01, 1.85934324e-01, 9.16222614e-01,
+    6.18239142e-01, 2.64768597e-01, 5.76145451e-01, 8.62961369e-01, 6.84757925e-01, 7.60549082e-01,
+    1.27645356e-01, 4.51004673e-01, 3.92292980e-01, 4.63170803e-01, 4.35449330e-02, 2.17583404e-01,
+    5.71832605e-02, 2.06763039e-01, 3.70116249e-01, 2.09750028e-01, 6.17283019e-01, 8.62549231e-01,
+    9.84156240e-02, 2.66249156e-01, 3.87635103e-01, 2.85591012e-02, 4.24826068e-01, 4.45795088e-01,
+    6.86227676e-01, 1.08848960e-01, 5.96731841e-02, 3.71770228e-01, 1.91548833e-01, 6.95136078e-01,
+    9.00700636e-01, 8.76363105e-01, 2.67334632e-01, 1.80619709e-01, 7.94060419e-01, 1.42854171e-02,
+    1.09372387e-01, 8.74028108e-01, 6.46403232e-01, 4.86588834e-01, 5.93446175e-02, 6.11886291e-01,
+    8.83865057e-01, 3.15879821e-01, 2.27043992e-01, 9.76764951e-01, 6.15620336e-01, 9.76199360e-01,
+    2.40548962e-01, 3.21795663e-01, 8.75087904e-02, 8.11234663e-01, 6.96070480e-01, 8.12062321e-01,
+    1.21958818e-01, 3.44348628e-02, 8.72630414e-01, 3.06162776e-01, 1.76043529e-02, 9.45894971e-01,
+    5.33896401e-01, 6.21642973e-01, 4.93062535e-01, 4.48984262e-01, 2.24560379e-01, 4.24052195e-02,
+    4.43447610e-01, 8.95646149e-01, 6.05220676e-01, 1.81840491e-01, 9.70831206e-01, 2.12563586e-02,
+    6.92582693e-01, 7.55946922e-01, 7.95086143e-01, 6.05328941e-01, 3.99350764e-01, 4.32846636e-01,
+    9.81114529e-01, 4.98266428e-01, 6.37127930e-03, 1.59085889e-01, 6.34682067e-05, 5.59429440e-01,
+    7.38827633e-01, 8.93214770e-01, 2.16494306e-01, 9.35430573e-02, 4.75665868e-02, 7.80503518e-01,
+    7.86240041e-01, 7.06854594e-01, 2.13725879e-02, 7.68246091e-01, 4.50234808e-01, 5.21231104e-01,
+    5.01989826e-03, 4.22081572e-02, 1.65337732e-01, 8.54134740e-01, 4.99430262e-01, 8.94525601e-01,
+    1.14028379e-01, 3.69739861e-01, 1.32955599e-01, 2.65563824e-01, 2.52811151e-01, 1.44792843e-01,
+    6.88449594e-01, 4.44921417e-01, 8.23296587e-01, 1.93266317e-01, 1.19033309e-01, 1.36368966e-01,
+    3.42600285e-01, 5.64505195e-01, 5.57594559e-01, 7.44257892e-01, 8.38231569e-02, 4.11548847e-01,
+    3.21010077e-01, 8.55081359e-01, 4.30105779e-01, 1.16229135e-01, 9.87731964e-02, 3.14712335e-01,
+    4.50880592e-01, 2.72289598e-01, 6.31615256e-01, 8.97432958e-01, 4.44764250e-01, 8.03776440e-01,
+    2.68767748e-02, 2.43374608e-01, 4.02141103e-01, 4.98881209e-01, 5.33173003e-01, 8.82890436e-01,
+    7.16149148e-01, 4.19664401e-01, 2.29335357e-01, 2.88637806e-01, 3.44696803e-01, 6.78171906e-01,
+    5.69849716e-01, 5.86454477e-01, 3.54474989e-01, 9.03876540e-01, 6.45980000e-01, 6.34887593e-01,
+    7.88039746e-02, 2.04814126e-01, 7.82251754e-01, 2.43147074e-01, 7.50951808e-01, 1.72799092e-02,
+    2.95349590e-01, 6.57991826e-01, 8.81214312e-01, 5.73970708e-01, 2.77610881e-01, 1.82155097e-01,
+    7.69797417e-02, 6.44792402e-01, 9.46950998e-01, 7.73064845e-01, 6.04733624e-01, 5.80094567e-01,
+    1.67498426e-01, 2.66514296e-01, 6.50140368e-01, 1.91170299e-01, 2.08752199e-01, 3.01664091e-01,
+    9.85033484e-01, 2.92909152e-01, 8.65816607e-01, 1.85222119e-01, 2.28814559e-01, 1.34286382e-02,
+    2.89234322e-01, 8.18668708e-01, 4.71706924e-01, 9.23199803e-01, 2.80879188e-01, 1.47319284e-01,
+    4.13915748e-01, 9.31274932e-02, 6.66322195e-01, 9.66953974e-01, 3.19405786e-01, 6.69486551e-01,
+    5.03096313e-02, 6.95225201e-01, 5.78469859e-01, 6.29481655e-01, 1.39252534e-01, 1.22564968e-01,
+    6.80663678e-01, 6.34607157e-01, 6.42765834e-01, 1.57127410e-02, 2.92132086e-01, 5.24423878e-01,
+    4.68676824e-01, 2.86003928e-01, 7.18608322e-01, 8.95617933e-01, 5.48844309e-01, 1.74517278e-01,
+    5.24379196e-01, 2.13526524e-01, 5.88375435e-01, 9.88560185e-01, 4.17435771e-01, 6.14438688e-01,
+    9.53760881e-01, 5.27151288e-01, 7.03017278e-01, 3.44448559e-01, 4.47059676e-01, 2.83414901e-01,
+    1.98979011e-01, 4.24917361e-01, 5.73172761e-01, 2.32398853e-02, 1.65887230e-01, 4.05552785e-01,
+    9.29665524e-01, 2.26135696e-01, 9.20563384e-01, 7.65259963e-01, 4.54820075e-01, 8.97710267e-01,
+    3.78559302e-03, 9.15219382e-01, 3.55705698e-01, 6.94905124e-01, 8.58540202e-01, 3.89790666e-01,
+    2.49478206e-01, 7.93679304e-01, 4.75830027e-01, 4.40425353e-01, 3.70579459e-01, 1.40578049e-01,
+    1.70386675e-01, 7.04056121e-01, 4.85963102e-01, 9.68450060e-01, 6.77178001e-01, 2.65934654e-01,
+    2.58915007e-01, 6.70052890e-01, 2.61945109e-01, 8.46207759e-01, 1.01928951e-01, 2.85611334e-01,
+    2.45776933e-01, 2.66658783e-01, 3.71724077e-01, 4.34319025e-01, 4.24407347e-01, 7.15417683e-01,
+    8.07997684e-01, 1.64296275e-01, 6.01638065e-01, 8.60606804e-02, 2.68719187e-01, 5.11764101e-01,
+    9.75844338e-01, 7.81226782e-01, 2.20925515e-01, 7.18135040e-01, 9.82395577e-01, 8.39160243e-01,
+    9.08058083e-01, 6.88010677e-01, 8.14271847e-01, 5.12460821e-01, 1.17311345e-01, 5.96075228e-01,
+    9.17455497e-01, 2.12052706e-01, 7.04074603e-01, 8.72872565e-02, 8.76047818e-01, 6.96235046e-01,
+    8.54801557e-01, 2.49729159e-01, 9.76594604e-01, 2.87386363e-01, 2.36461559e-02, 9.94075254e-01,
+    4.25193986e-01, 7.61869994e-01, 5.13334255e-01, 6.44711165e-02, 8.92156689e-01, 3.55235167e-01,
+    1.08154647e-01, 8.78446825e-01, 2.43833016e-01, 9.23071293e-01, 2.72724115e-01, 9.46631338e-01,
+    3.74510294e-01, 4.08451278e-02, 9.78392777e-01, 3.65079221e-01, 6.37199516e-01, 5.51144906e-01,
+    5.25978080e-01, 1.42803678e-01, 4.05451674e-01, 7.79788219e-01, 6.26009784e-01, 3.35249497e-01,
+    1.43159543e-02, 1.80363779e-01, 5.05096904e-01, 2.82619947e-01, 5.83561392e-01, 3.10951324e-01,
+    8.73223968e-01, 4.38545619e-01, 4.81348800e-01, 6.68497085e-01, 3.79345401e-01, 9.58832501e-01,
+    1.89869550e-01, 2.34083070e-01, 2.94066207e-01, 5.74892667e-02, 6.92106828e-02, 9.61127686e-02,
+    6.72650672e-02, 8.47345378e-01, 2.80916761e-01, 7.32177357e-03, 9.80785961e-01, 5.73192225e-02,
+    8.48781331e-01, 8.83225408e-01, 7.34398275e-01, 7.70381941e-01, 6.20778343e-01, 8.96822048e-01,
+    5.40732486e-01, 3.69704071e-01, 5.77305837e-01, 2.08221827e-01, 7.34275341e-01, 1.06110900e-01,
+    3.49496706e-01, 8.34948910e-01, 1.56403291e-02, 6.78576376e-01, 8.96141268e-01, 5.94835119e-01,
+    1.43943153e-01, 3.49618530e-01, 2.10440392e-01, 3.46585620e-01, 1.05153093e-01, 3.45446174e-01,
+    2.72177079e-01, 7.07946300e-01, 4.33717726e-02, 3.31232203e-01, 3.91874320e-01, 4.76338141e-01,
+    6.22777789e-01, 2.95989228e-02, 4.32855769e-01, 7.61049310e-01, 3.63279149e-01, 9.47210350e-01,
+    6.43721247e-01, 6.58025802e-01, 1.05247633e-02, 5.29974442e-01, 7.30675767e-01, 4.30041079e-01,
+    6.62634841e-01, 8.25936616e-01, 9.91253704e-01, 6.79399281e-01, 5.44177006e-01, 7.52876048e-01,
+    3.32139049e-01, 7.98732398e-01, 7.38865223e-01, 9.16055132e-01, 6.11736493e-01, 9.63672879e-01,
+    1.83778839e-01, 7.27558919e-02, 5.91602822e-01, 3.25235484e-01, 2.34741217e-01, 9.52346277e-01,
+    9.18556407e-01, 9.35373324e-01, 6.89209070e-01, 2.56049054e-01, 6.17975395e-01, 7.82285691e-01,
+    9.84983432e-01, 6.62322741e-01, 2.04144457e-01, 3.98446577e-01, 1.38918297e-01, 3.05919921e-01,
+    3.14043787e-01, 5.91072666e-01, 7.44703771e-01, 8.92272567e-01, 9.78017873e-01, 9.01203161e-01,
+    1.41526372e-01, 4.14878484e-01, 6.80683651e-01, 5.01733152e-02, 8.14635389e-01, 2.27926375e-01,
+    9.03269815e-01, 8.68443745e-01, 9.86939190e-01, 7.40779486e-01, 2.61005311e-01, 3.19276232e-01,
+    9.69509248e-01, 1.11908818e-01, 4.49198556e-01, 1.27056715e-01, 3.84064823e-01, 5.14591811e-01,
+    2.10747488e-01, 9.53884090e-01, 8.43167950e-01, 4.51187972e-01, 3.75331782e-01, 6.23566461e-01,
+    3.55290379e-01, 2.95705968e-01, 1.69622690e-01, 1.42981830e-01, 2.72180991e-01, 9.46468040e-01,
+    3.70932500e-01, 9.94292830e-01, 4.62587505e-01, 7.14817405e-01, 2.45370540e-02, 3.00906377e-01,
+    5.75768304e-01, 9.71448393e-01, 6.95574827e-02, 3.93693854e-01, 5.29306116e-01, 5.04694554e-01,
+    6.73797120e-02, 6.76596969e-01, 5.50948898e-01, 3.24909641e-01, 7.70337719e-01, 6.51842631e-03,
+    3.03264879e-01, 7.61037886e-03, 2.72289601e-01, 1.50502041e-01, 6.71103888e-02, 7.41503703e-01,
+    1.92088941e-01, 2.19043977e-01, 9.09320161e-01, 2.37993569e-01, 6.18107973e-02, 8.31447852e-01,
+    2.23355609e-01, 1.84789435e-01, 4.16104518e-01, 4.21573859e-01, 8.72446305e-02, 2.97294197e-01,
+    4.50328256e-01, 8.72199917e-01, 2.51279916e-01, 4.86219272e-01, 7.57071329e-01, 4.85655942e-01,
+    1.06187277e-01, 4.92341327e-01, 1.46017513e-01, 5.25421017e-01, 4.22637906e-01, 2.24685018e-01,
+    8.72648431e-01, 5.54051490e-01, 1.80745062e-01, 2.12756336e-01, 5.20883169e-01, 7.60363654e-01,
+    8.30254678e-01, 5.00003328e-01, 4.69017439e-01, 6.38105527e-01, 3.50638261e-02, 5.22217353e-02,
+    9.06516882e-02, 8.52975842e-01, 1.19985883e-01, 3.74926753e-01, 6.50302066e-01, 1.98875727e-01,
+    6.28362507e-02, 4.32693501e-01, 3.10500685e-01, 6.20732833e-01, 4.58503272e-01, 3.20790034e-01,
+    7.91284868e-01, 7.93054570e-01, 2.93406765e-01, 8.95399023e-01, 1.06441034e-01, 7.53085241e-02,
+    8.67523104e-01, 1.47963482e-01, 1.25584706e-01, 3.81545040e-02, 6.34338619e-01, 1.76368938e-02,
+    5.75553531e-02, 5.31607516e-01, 2.63869588e-01, 9.41945823e-01, 9.24028838e-02, 5.21496463e-01,
+    7.74866558e-01, 5.65210610e-01, 7.28015327e-02, 6.51963790e-01, 8.94727453e-01, 4.49571590e-01,
+    1.29932405e-01, 8.64026259e-01, 9.92599934e-01, 7.43721560e-01, 8.87300215e-01, 1.06369925e-01,
+    8.11335531e-01, 7.87734900e-01, 9.87344678e-01, 5.32502820e-01, 4.42612382e-01, 9.64041183e-01,
+    1.66085871e-01, 1.12937664e-01, 5.24423470e-01, 6.54689333e-01, 4.59119726e-01, 5.22774091e-01,
+    3.08722276e-02, 6.26979315e-01, 4.49754105e-01, 8.07495757e-01, 2.34199499e-01, 1.67765675e-01,
+    9.22168418e-01, 3.73210378e-01, 8.04432575e-01, 5.61890354e-01, 4.47025593e-01, 6.43155678e-01,
+    2.40407640e-01, 5.91631279e-01, 1.59369206e-01, 7.75799090e-01, 8.32067212e-01, 5.59791576e-02,
+    6.39105224e-01, 4.85274738e-01, 2.12630838e-01, 2.81431312e-02, 7.16205363e-01, 6.83885011e-01,
+    5.23869697e-01, 9.99418314e-01, 8.35331599e-01, 4.69877463e-02, 6.74712562e-01, 7.99273684e-01,
+    2.77001890e-02, 5.75809742e-01, 2.78513031e-01, 8.36209905e-01, 7.25472379e-01, 4.87173943e-01,
+    7.88311357e-01, 9.64676177e-01, 1.75752651e-01, 4.98112580e-01, 8.08850418e-02, 6.40981131e-01,
+    4.06647450e-01, 8.46539387e-01, 2.12620694e-01, 9.11012851e-01, 8.25041445e-01, 8.90065575e-01,
+    9.63626055e-01, 5.96689242e-01, 1.63372670e-01, 4.51640148e-01, 3.43026542e-01, 5.80658851e-01,
+    2.82327625e-01, 4.75535418e-01, 6.27760926e-01, 8.46314115e-01, 9.61961932e-01, 3.19806094e-01,
+    5.05508062e-01, 5.28102944e-01, 6.13045057e-01, 7.44714938e-01, 1.50586073e-01, 7.91878033e-01,
+    4.89839179e-01, 3.10496849e-01, 8.82309038e-01, 2.86922314e-01, 4.84687559e-01, 5.20838630e-01,
+    4.62955493e-01, 2.38185305e-01, 5.47259907e-02, 7.10916137e-01, 7.31887202e-01, 6.25602317e-01,
+    8.77741168e-01, 4.19881322e-01, 4.81222328e-01, 1.28224501e-01, 2.46034010e-01, 3.34971854e-01,
+    7.37216484e-01, 5.62134821e-02, 7.14089724e-01, 9.85549393e-01, 4.66295827e-01, 3.08722434e-03,
+    4.70237690e-01, 2.66524167e-01, 7.93875484e-01, 4.54795911e-02, 8.09702944e-01, 1.47709735e-02,
+    1.70082405e-01, 6.35905179e-01, 3.75379109e-01, 4.30315011e-01, 3.15788760e-01, 5.58065230e-01,
+    2.24643800e-01, 2.42142981e-01, 6.57283636e-01, 3.34921891e-01, 1.26588975e-01, 7.68064155e-01,
+    9.43856291e-01, 4.47518596e-01, 5.44453573e-01, 9.95764932e-01, 7.16444391e-01, 8.51019765e-01,
+    1.01179183e-01, 4.45473958e-01, 4.60327322e-01, 4.96895844e-02, 4.72907738e-01, 5.58987444e-01,
+    3.41027487e-01, 1.56175026e-01, 7.58283148e-01, 6.83600909e-01, 2.14623396e-01, 3.27348880e-01,
+    3.92517893e-01, 6.70418431e-01, 5.16440832e-01, 8.63140348e-01, 5.73277464e-01, 3.46608058e-01,
+    7.39396341e-01, 7.20852434e-01, 2.35653246e-02, 3.89935659e-01, 7.53783745e-01, 6.34563528e-01,
+    8.79339335e-01, 7.41599159e-02, 5.62433904e-01, 6.15553852e-01, 4.56956324e-01, 5.20047447e-01,
+    5.26845015e-02, 5.58471266e-01, 1.63632233e-01, 5.38936665e-02, 6.49593683e-01, 2.56838748e-01,
+    8.99035326e-01, 7.20847756e-01, 5.68954684e-01, 7.43684755e-01, 5.70924238e-01, 3.82318724e-01,
+    4.89328290e-01, 5.62208561e-01, 4.97540804e-02, 4.18011085e-01, 6.88041565e-01, 2.16234653e-01,
+    7.89548214e-01, 8.46136387e-01, 8.46816189e-01, 1.73842353e-01, 6.11627842e-02, 8.44440559e-01,
+    4.50646654e-01, 3.74785037e-01, 4.87196697e-01, 4.56276448e-01, 9.13284391e-01, 4.15715464e-01,
+    7.13597697e-01, 1.23641270e-02, 5.10031271e-01, 4.74601930e-02, 2.55731159e-01, 3.22090006e-01,
+    1.91165703e-01, 4.51170940e-01, 7.50843157e-01, 4.42420576e-01, 4.25380660e-01, 4.50667257e-01,
+    6.55689206e-01, 9.68257670e-02, 1.96528793e-01, 8.97343028e-01, 4.99940904e-01, 6.65504083e-01,
+    9.41828079e-01, 4.54397338e-01, 5.61893331e-01, 5.09839880e-01, 4.53117514e-01, 8.96804127e-02,
+    1.74888861e-01, 6.65641378e-01, 2.81668336e-01, 1.89532742e-01, 5.61668382e-01, 8.68330157e-02,
+    8.25092797e-01, 5.18106324e-01, 1.71904024e-01, 3.68385523e-01, 1.62005436e-01, 7.48507399e-01,
+    9.30274827e-01, 2.38198517e-01, 9.52222901e-01, 5.23587800e-01, 6.94384557e-01, 1.09338652e-01,
+    4.83356794e-01, 2.73050402e-01, 3.68027050e-01, 5.92366466e-01, 1.83192289e-01, 8.60376029e-01,
+    7.13926203e-01, 8.16750052e-01, 1.57890291e-01, 6.25691951e-01, 5.24831646e-01, 1.73873797e-01,
+    1.02429784e-01, 9.17488471e-01, 4.03584434e-01, 9.31170884e-01, 2.79386137e-01, 8.77745206e-01,
+    2.45200576e-01, 1.28896951e-01, 3.15713052e-01, 5.27874291e-01, 2.16444335e-01, 7.03883817e-01,
+    7.74738919e-02, 8.42422142e-01, 3.75598924e-01, 3.51002411e-01, 6.22752776e-01, 4.82407943e-01,
+    7.43107867e-01, 9.46182666e-01, 9.44344819e-01, 3.28124763e-01, 1.06147431e-01, 1.65102684e-01,
+    3.84060507e-01, 2.91057722e-01, 7.68173662e-02, 1.03543651e-01, 6.76698940e-01, 1.43141994e-01,
+    7.21342202e-01, 6.69471294e-03, 9.07298311e-01, 5.57080171e-01, 8.10954489e-01, 4.11120526e-01,
+    2.06407453e-01, 2.59590556e-01, 7.58512718e-01, 5.79873897e-01, 2.92875650e-01, 2.83686529e-01,
+    2.42829343e-01, 9.19323719e-01, 3.46832864e-01, 3.58238858e-01, 7.42827585e-01, 2.05760059e-01,
+    9.58438860e-01, 5.66326411e-01, 6.60292846e-01, 5.61095078e-02, 6.79465531e-01, 7.05118513e-01,
+    4.44713264e-01, 2.09732933e-01, 5.22732436e-01, 1.74396512e-01, 5.29356748e-01, 4.38475687e-01,
+    4.94036404e-01, 4.09785794e-01, 6.40025507e-01, 5.79371821e-01, 1.57726118e-01, 6.04572263e-01,
+    5.41072639e-01, 5.18847173e-01, 1.97093284e-01, 8.91767002e-01, 4.29050835e-01, 8.25490570e-01,
+    3.87699807e-01, 4.50705808e-01, 2.49371643e-01, 3.36074898e-01, 9.29925118e-01, 6.65393649e-01,
+    9.07275994e-01, 3.73075859e-01, 4.14044139e-03, 2.37463702e-01, 2.25893784e-01, 2.46900245e-01,
+    4.50350196e-01, 3.48618117e-01, 5.07193932e-01, 5.23435142e-01, 8.13611417e-01, 8.92715622e-01,
+    1.02623450e-01, 3.06088345e-01, 7.80461650e-01, 2.21453645e-01, 2.01419652e-01, 2.84254457e-01,
+    3.68286735e-01, 7.39358243e-01, 8.97879394e-01, 9.81599566e-01, 7.56526442e-01, 7.37645545e-01,
+    4.23976657e-02, 8.25922012e-01, 2.60956996e-01, 2.90702065e-01, 8.98388344e-01, 3.03733299e-01,
+    8.49071471e-01, 3.45835425e-01, 7.65458276e-01, 5.68094872e-01, 8.93770930e-01, 9.93161641e-01,
+    5.63368667e-02, 4.26548945e-01, 5.46745780e-01, 5.75674571e-01, 7.94599487e-01, 7.18935553e-02,
+    4.46492976e-01, 6.40240123e-01, 2.73246969e-01, 2.00465968e-01, 1.30718835e-01, 1.92492005e-01,
+    1.96617189e-01, 6.61271644e-01, 8.12687657e-01, 8.66342445e-01
 
    },
    -4}};
 
 typedef ConnectComponentsTest<int, float> ConnectComponentsTestF_Int;
-TEST_P(ConnectComponentsTestF_Int, Result) {
+TEST_P(ConnectComponentsTestF_Int, Result)
+{
   /**
-     * Verify the src & dst vertices on each edge have different colors
-     */
+   * Verify the src & dst vertices on each edge have different colors
+   */
   EXPECT_TRUE(final_edges == params.n_row - 1);
 }
 
-INSTANTIATE_TEST_CASE_P(ConnectComponentsTest, ConnectComponentsTestF_Int,
+INSTANTIATE_TEST_CASE_P(ConnectComponentsTest,
+                        ConnectComponentsTestF_Int,
                         ::testing::ValuesIn(fix_conn_inputsf2));
 };  // namespace sparse
 };  // end namespace raft
diff --git a/cpp/test/sparse/convert_coo.cu b/cpp/test/sparse/convert_coo.cu
index ea69ecfc53..2e4c2c1a14 100644
--- a/cpp/test/sparse/convert_coo.cu
+++ b/cpp/test/sparse/convert_coo.cu
@@ -39,7 +39,8 @@ struct CSRtoCOOInputs {
 template <typename Index_>
 class CSRtoCOOTest : public ::testing::TestWithParam<CSRtoCOOInputs<Index_>> {
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     params = ::testing::TestWithParam<CSRtoCOOInputs<Index_>>::GetParam();
 
     cudaStreamCreate(&stream);
@@ -48,20 +49,21 @@ class CSRtoCOOTest : public ::testing::TestWithParam<CSRtoCOOInputs<Index_>> {
     raft::allocate(result, params.verify.size(), true);
   }
 
-  void Run() {
+  void Run()
+  {
     Index_ n_rows = params.ex_scan.size();
-    Index_ nnz = params.verify.size();
+    Index_ nnz    = params.verify.size();
 
     raft::update_device(ex_scan, params.ex_scan.data(), n_rows, stream);
     raft::update_device(verify, params.verify.data(), nnz, stream);
 
     convert::csr_to_coo<Index_, 32>(ex_scan, n_rows, result, nnz, stream);
 
-    ASSERT_TRUE(raft::devArrMatch<Index_>(verify, result, nnz,
-                                          raft::Compare<float>(), stream));
+    ASSERT_TRUE(raft::devArrMatch<Index_>(verify, result, nnz, raft::Compare<float>(), stream));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(ex_scan));
     CUDA_CHECK(cudaFree(verify));
     CUDA_CHECK(cudaFree(result));
@@ -89,9 +91,11 @@ const std::vector<CSRtoCOOInputs<int64_t>> csrtocoo_inputs_64 = {
   {{0, 4, 8, 9}, {0, 0, 0, 0, 1, 1, 1, 1, 2, 3}},
 };
 
-INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest, CSRtoCOOTestI,
+INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest,
+                        CSRtoCOOTestI,
                         ::testing::ValuesIn(csrtocoo_inputs_32));
-INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest, CSRtoCOOTestL,
+INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest,
+                        CSRtoCOOTestL,
                         ::testing::ValuesIn(csrtocoo_inputs_64));
 
 }  // namespace sparse
diff --git a/cpp/test/sparse/convert_csr.cu b/cpp/test/sparse/convert_csr.cu
index 553ef2ddee..b2878081ae 100644
--- a/cpp/test/sparse/convert_csr.cu
+++ b/cpp/test/sparse/convert_csr.cu
@@ -37,14 +37,13 @@ struct SparseConvertCSRInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os,
-                           const SparseConvertCSRInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const SparseConvertCSRInputs<T>& dims)
+{
   return os;
 }
 
 template <typename T>
-class SparseConvertCSRTest
-  : public ::testing::TestWithParam<SparseConvertCSRInputs<T>> {
+class SparseConvertCSRTest : public ::testing::TestWithParam<SparseConvertCSRInputs<T>> {
  protected:
   void SetUp() override {}
 
@@ -54,22 +53,21 @@ class SparseConvertCSRTest
   SparseConvertCSRInputs<T> params;
 };
 
-const std::vector<SparseConvertCSRInputs<float>> inputsf = {
-  {5, 10, 5, 1234ULL}};
+const std::vector<SparseConvertCSRInputs<float>> inputsf = {{5, 10, 5, 1234ULL}};
 
 typedef SparseConvertCSRTest<float> SortedCOOToCSR;
-TEST_P(SortedCOOToCSR, Result) {
+TEST_P(SortedCOOToCSR, Result)
+{
   cudaStream_t stream;
   cudaStreamCreate(&stream);
-  std::shared_ptr<raft::mr::device::allocator> alloc(
-    new raft::mr::device::default_allocator);
+  std::shared_ptr<raft::mr::device::allocator> alloc(new raft::mr::device::default_allocator);
 
   int nnz = 8;
 
   int *in, *out, *exp;
 
-  int *in_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3};
-  int *exp_h = new int[4]{0, 2, 4, 6};
+  int* in_h  = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3};
+  int* exp_h = new int[4]{0, 2, 4, 6};
 
   raft::allocate(in, nnz, true);
   raft::allocate(exp, 4, true);
@@ -92,8 +90,7 @@ TEST_P(SortedCOOToCSR, Result) {
   CUDA_CHECK(cudaFree(out));
 }
 
-INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, SortedCOOToCSR,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, SortedCOOToCSR, ::testing::ValuesIn(inputsf));
 
 /******************************** adj graph ********************************/
 
@@ -107,10 +104,10 @@ struct CSRAdjGraphInputs {
 };
 
 template <typename Index_>
-class CSRAdjGraphTest
-  : public ::testing::TestWithParam<CSRAdjGraphInputs<Index_>> {
+class CSRAdjGraphTest : public ::testing::TestWithParam<CSRAdjGraphInputs<Index_>> {
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     params = ::testing::TestWithParam<CSRAdjGraphInputs<Index_>>::GetParam();
     cudaStreamCreate(&stream);
     nnz = params.verify.size();
@@ -121,20 +118,21 @@ class CSRAdjGraphTest
     raft::allocate(verify, nnz);
   }
 
-  void Run() {
+  void Run()
+  {
     raft::update_device(row_ind, params.row_ind.data(), params.n_rows, stream);
-    raft::update_device(adj, reinterpret_cast<bool *>(params.adj.data()),
-                        params.n_rows * params.n_cols, stream);
+    raft::update_device(
+      adj, reinterpret_cast<bool*>(params.adj.data()), params.n_rows * params.n_cols, stream);
     raft::update_device(verify, params.verify.data(), nnz, stream);
 
     convert::csr_adj_graph_batched<Index_, 32>(
       row_ind, params.n_cols, nnz, params.n_rows, adj, result, stream);
 
-    ASSERT_TRUE(
-      raft::devArrMatch<Index_>(verify, result, nnz, raft::Compare<Index_>()));
+    ASSERT_TRUE(raft::devArrMatch<Index_>(verify, result, nnz, raft::Compare<Index_>()));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(row_ind));
     CUDA_CHECK(cudaFree(adj));
     CUDA_CHECK(cudaFree(verify));
@@ -147,7 +145,7 @@ class CSRAdjGraphTest
   cudaStream_t stream;
   Index_ nnz;
   Index_ *row_ind, *result, *verify;
-  bool *adj;
+  bool* adj;
 };
 
 using CSRAdjGraphTestI = CSRAdjGraphTest<int>;
@@ -171,9 +169,11 @@ const std::vector<CSRAdjGraphInputs<int64_t>> csradjgraph_inputs_l = {
    {0, 1, 2, 0, 1, 2, 0, 1, 2}},
 };
 
-INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, CSRAdjGraphTestI,
+INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest,
+                        CSRAdjGraphTestI,
                         ::testing::ValuesIn(csradjgraph_inputs_i));
-INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, CSRAdjGraphTestL,
+INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest,
+                        CSRAdjGraphTestL,
                         ::testing::ValuesIn(csradjgraph_inputs_l));
 
 }  // namespace sparse
diff --git a/cpp/test/sparse/csr_row_slice.cu b/cpp/test/sparse/csr_row_slice.cu
index 625772a842..fe43f0d182 100644
--- a/cpp/test/sparse/csr_row_slice.cu
+++ b/cpp/test/sparse/csr_row_slice.cu
@@ -47,19 +47,19 @@ struct CSRRowSliceInputs {
 };
 
 template <typename value_idx, typename value_t>
-::std::ostream &operator<<(::std::ostream &os,
-                           const CSRRowSliceInputs<value_idx, value_t> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const CSRRowSliceInputs<value_idx, value_t>& dims)
+{
   return os;
 }
 
 template <typename value_idx, typename value_t>
-class CSRRowSliceTest
-  : public ::testing::TestWithParam<CSRRowSliceInputs<value_idx, value_t>> {
+class CSRRowSliceTest : public ::testing::TestWithParam<CSRRowSliceInputs<value_idx, value_t>> {
  protected:
-  void make_data() {
-    std::vector<value_idx> indptr_h = params.indptr_h;
+  void make_data()
+  {
+    std::vector<value_idx> indptr_h  = params.indptr_h;
     std::vector<value_idx> indices_h = params.indices_h;
-    std::vector<value_t> data_h = params.data_h;
+    std::vector<value_t> data_h      = params.data_h;
 
     allocate(indptr, indptr_h.size());
     allocate(indices, indices_h.size());
@@ -69,31 +69,27 @@ class CSRRowSliceTest
     update_device(indices, indices_h.data(), indices_h.size(), stream);
     update_device(data, data_h.data(), data_h.size(), stream);
 
-    std::vector<value_idx> out_indptr_ref_h = params.out_indptr_ref_h;
+    std::vector<value_idx> out_indptr_ref_h  = params.out_indptr_ref_h;
     std::vector<value_idx> out_indices_ref_h = params.out_indices_ref_h;
-    std::vector<value_t> out_data_ref_h = params.out_data_ref_h;
+    std::vector<value_t> out_data_ref_h      = params.out_data_ref_h;
 
     allocate(out_indptr_ref, out_indptr_ref_h.size());
     allocate(out_indices_ref, out_indices_ref_h.size());
     allocate(out_data_ref, out_data_ref_h.size());
 
-    update_device(out_indptr_ref, out_indptr_ref_h.data(),
-                  out_indptr_ref_h.size(), stream);
-    update_device(out_indices_ref, out_indices_ref_h.data(),
-                  out_indices_ref_h.size(), stream);
-    update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(),
-                  stream);
+    update_device(out_indptr_ref, out_indptr_ref_h.data(), out_indptr_ref_h.size(), stream);
+    update_device(out_indices_ref, out_indices_ref_h.data(), out_indices_ref_h.size(), stream);
+    update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(), stream);
 
     allocate(out_indptr, out_indptr_ref_h.size());
     allocate(out_indices, out_indices_ref_h.size());
     allocate(out_data, out_data_ref_h.size());
   }
 
-  void SetUp() override {
-    params = ::testing::TestWithParam<
-      CSRRowSliceInputs<value_idx, value_t>>::GetParam();
-    std::shared_ptr<raft::mr::device::allocator> alloc(
-      new raft::mr::device::default_allocator);
+  void SetUp() override
+  {
+    params = ::testing::TestWithParam<CSRRowSliceInputs<value_idx, value_t>>::GetParam();
+    std::shared_ptr<raft::mr::device::allocator> alloc(new raft::mr::device::default_allocator);
     CUDA_CHECK(cudaStreamCreate(&stream));
 
     make_data();
@@ -101,18 +97,22 @@ class CSRRowSliceTest
     int csr_start_offset;
     int csr_stop_offset;
 
-    raft::sparse::op::csr_row_slice_indptr(
-      params.start_row, params.stop_row, indptr, out_indptr, &csr_start_offset,
-      &csr_stop_offset, stream);
+    raft::sparse::op::csr_row_slice_indptr(params.start_row,
+                                           params.stop_row,
+                                           indptr,
+                                           out_indptr,
+                                           &csr_start_offset,
+                                           &csr_stop_offset,
+                                           stream);
 
-    raft::sparse::op::csr_row_slice_populate(csr_start_offset, csr_stop_offset,
-                                             indices, data, out_indices,
-                                             out_data, stream);
+    raft::sparse::op::csr_row_slice_populate(
+      csr_start_offset, csr_stop_offset, indices, data, out_indices, out_data, stream);
 
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(indptr));
     CUDA_CHECK(cudaFree(indices));
@@ -125,15 +125,14 @@ class CSRRowSliceTest
     CUDA_CHECK(cudaFree(out_data_ref));
   }
 
-  void compare() {
-    ASSERT_TRUE(devArrMatch(out_indptr, out_indptr_ref,
-                            params.out_indptr_ref_h.size(),
-                            Compare<value_t>()));
-    ASSERT_TRUE(devArrMatch(out_indices, out_indices_ref,
-                            params.out_indices_ref_h.size(),
-                            Compare<value_t>()));
-    ASSERT_TRUE(devArrMatch(out_data, out_data_ref,
-                            params.out_data_ref_h.size(), Compare<value_t>()));
+  void compare()
+  {
+    ASSERT_TRUE(
+      devArrMatch(out_indptr, out_indptr_ref, params.out_indptr_ref_h.size(), Compare<value_t>()));
+    ASSERT_TRUE(devArrMatch(
+      out_indices, out_indices_ref, params.out_indices_ref_h.size(), Compare<value_t>()));
+    ASSERT_TRUE(
+      devArrMatch(out_data, out_data_ref, params.out_data_ref_h.size(), Compare<value_t>()));
   }
 
  protected:
@@ -141,15 +140,15 @@ class CSRRowSliceTest
 
   // input data
   value_idx *indptr, *indices;
-  value_t *data;
+  value_t* data;
 
   // output data
   value_idx *out_indptr, *out_indices;
-  value_t *out_data;
+  value_t* out_data;
 
   // expected output data
   value_idx *out_indptr_ref, *out_indices_ref;
-  value_t *out_data_ref;
+  value_t* out_data_ref;
 
   CSRRowSliceInputs<value_idx, value_t> params;
 };
@@ -177,8 +176,7 @@ const std::vector<CSRRowSliceInputs<int, float>> inputs_i32_f = {
 };
 typedef CSRRowSliceTest<int, float> CSRRowSliceTestF;
 TEST_P(CSRRowSliceTestF, Result) { compare(); }
-INSTANTIATE_TEST_CASE_P(CSRRowSliceTest, CSRRowSliceTestF,
-                        ::testing::ValuesIn(inputs_i32_f));
+INSTANTIATE_TEST_CASE_P(CSRRowSliceTest, CSRRowSliceTestF, ::testing::ValuesIn(inputs_i32_f));
 
 };  // end namespace sparse
 };  // end namespace raft
diff --git a/cpp/test/sparse/csr_to_dense.cu b/cpp/test/sparse/csr_to_dense.cu
index 5535df4fe3..286493ada7 100644
--- a/cpp/test/sparse/csr_to_dense.cu
+++ b/cpp/test/sparse/csr_to_dense.cu
@@ -43,19 +43,19 @@ struct CSRToDenseInputs {
 };
 
 template <typename value_idx, typename value_t>
-::std::ostream &operator<<(::std::ostream &os,
-                           const CSRToDenseInputs<value_idx, value_t> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const CSRToDenseInputs<value_idx, value_t>& dims)
+{
   return os;
 }
 
 template <typename value_idx, typename value_t>
-class CSRToDenseTest
-  : public ::testing::TestWithParam<CSRToDenseInputs<value_idx, value_t>> {
+class CSRToDenseTest : public ::testing::TestWithParam<CSRToDenseInputs<value_idx, value_t>> {
  protected:
-  void make_data() {
-    std::vector<value_idx> indptr_h = params.indptr_h;
+  void make_data()
+  {
+    std::vector<value_idx> indptr_h  = params.indptr_h;
     std::vector<value_idx> indices_h = params.indices_h;
-    std::vector<value_t> data_h = params.data_h;
+    std::vector<value_t> data_h      = params.data_h;
 
     allocate(indptr, indptr_h.size());
     allocate(indices, indices_h.size());
@@ -74,24 +74,24 @@ class CSRToDenseTest
     allocate(out, out_ref_h.size());
   }
 
-  void SetUp() override {
-    params = ::testing::TestWithParam<
-      CSRToDenseInputs<value_idx, value_t>>::GetParam();
-    std::shared_ptr<raft::mr::device::allocator> alloc(
-      new raft::mr::device::default_allocator);
+  void SetUp() override
+  {
+    params = ::testing::TestWithParam<CSRToDenseInputs<value_idx, value_t>>::GetParam();
+    std::shared_ptr<raft::mr::device::allocator> alloc(new raft::mr::device::default_allocator);
     CUDA_CHECK(cudaStreamCreate(&stream));
     CUSPARSE_CHECK(cusparseCreate(&handle));
 
     make_data();
 
-    convert::csr_to_dense(handle, params.nrows, params.ncols, indptr, indices,
-                          data, params.nrows, out, stream, true);
+    convert::csr_to_dense(
+      handle, params.nrows, params.ncols, indptr, indices, data, params.nrows, out, stream, true);
 
     CUDA_CHECK(cudaStreamSynchronize(stream));
     CUSPARSE_CHECK(cusparseDestroy(handle));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(indptr));
     CUDA_CHECK(cudaFree(indices));
@@ -100,9 +100,9 @@ class CSRToDenseTest
     CUDA_CHECK(cudaFree(out_ref));
   }
 
-  void compare() {
-    ASSERT_TRUE(
-      devArrMatch(out, out_ref, params.out_ref_h.size(), Compare<value_t>()));
+  void compare()
+  {
+    ASSERT_TRUE(devArrMatch(out, out_ref, params.out_ref_h.size(), Compare<value_t>()));
   }
 
  protected:
@@ -111,13 +111,13 @@ class CSRToDenseTest
 
   // input data
   value_idx *indptr, *indices;
-  value_t *data;
+  value_t* data;
 
   // output data
-  value_t *out;
+  value_t* out;
 
   // expected output data
-  value_t *out_ref;
+  value_t* out_ref;
 
   CSRToDenseInputs<value_idx, value_t> params;
 };
@@ -128,13 +128,26 @@ const std::vector<CSRToDenseInputs<int, float>> inputs_i32_f = {
    {0, 2, 4, 6, 8},
    {0, 1, 2, 3, 0, 1, 2, 3},  // indices
    {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f},
-   {1.0f, 3.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 5.0f, 50.0f, 28.0f, 0.0f, 0.0f,
-    0.0f, 0.0f, 16.0f, 2.0f}},
+   {1.0f,
+    3.0f,
+    0.0f,
+    0.0f,
+    0.0f,
+    0.0f,
+    1.0f,
+    5.0f,
+    50.0f,
+    28.0f,
+    0.0f,
+    0.0f,
+    0.0f,
+    0.0f,
+    16.0f,
+    2.0f}},
 };
 typedef CSRToDenseTest<int, float> CSRToDenseTestF;
 TEST_P(CSRToDenseTestF, Result) { compare(); }
-INSTANTIATE_TEST_CASE_P(CSRToDenseTest, CSRToDenseTestF,
-                        ::testing::ValuesIn(inputs_i32_f));
+INSTANTIATE_TEST_CASE_P(CSRToDenseTest, CSRToDenseTestF, ::testing::ValuesIn(inputs_i32_f));
 
 };  // end namespace sparse
 };  // end namespace raft
diff --git a/cpp/test/sparse/csr_transpose.cu b/cpp/test/sparse/csr_transpose.cu
index c257d6eb3c..87b8b17073 100644
--- a/cpp/test/sparse/csr_transpose.cu
+++ b/cpp/test/sparse/csr_transpose.cu
@@ -49,19 +49,19 @@ struct CSRTransposeInputs {
 };
 
 template <typename value_idx, typename value_t>
-::std::ostream &operator<<(::std::ostream &os,
-                           const CSRTransposeInputs<value_idx, value_t> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const CSRTransposeInputs<value_idx, value_t>& dims)
+{
   return os;
 }
 
 template <typename value_idx, typename value_t>
-class CSRTransposeTest
-  : public ::testing::TestWithParam<CSRTransposeInputs<value_idx, value_t>> {
+class CSRTransposeTest : public ::testing::TestWithParam<CSRTransposeInputs<value_idx, value_t>> {
  protected:
-  void make_data() {
-    std::vector<value_idx> indptr_h = params.indptr_h;
+  void make_data()
+  {
+    std::vector<value_idx> indptr_h  = params.indptr_h;
     std::vector<value_idx> indices_h = params.indices_h;
-    std::vector<value_t> data_h = params.data_h;
+    std::vector<value_t> data_h      = params.data_h;
 
     allocate(indptr, indptr_h.size());
     allocate(indices, indices_h.size());
@@ -71,45 +71,51 @@ class CSRTransposeTest
     update_device(indices, indices_h.data(), indices_h.size(), stream);
     update_device(data, data_h.data(), data_h.size(), stream);
 
-    std::vector<value_idx> out_indptr_ref_h = params.out_indptr_ref_h;
+    std::vector<value_idx> out_indptr_ref_h  = params.out_indptr_ref_h;
     std::vector<value_idx> out_indices_ref_h = params.out_indices_ref_h;
-    std::vector<value_t> out_data_ref_h = params.out_data_ref_h;
+    std::vector<value_t> out_data_ref_h      = params.out_data_ref_h;
 
     allocate(out_indptr_ref, out_indptr_ref_h.size());
     allocate(out_indices_ref, out_indices_ref_h.size());
     allocate(out_data_ref, out_data_ref_h.size());
 
-    update_device(out_indptr_ref, out_indptr_ref_h.data(),
-                  out_indptr_ref_h.size(), stream);
-    update_device(out_indices_ref, out_indices_ref_h.data(),
-                  out_indices_ref_h.size(), stream);
-    update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(),
-                  stream);
+    update_device(out_indptr_ref, out_indptr_ref_h.data(), out_indptr_ref_h.size(), stream);
+    update_device(out_indices_ref, out_indices_ref_h.data(), out_indices_ref_h.size(), stream);
+    update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(), stream);
 
     allocate(out_indptr, out_indptr_ref_h.size());
     allocate(out_indices, out_indices_ref_h.size());
     allocate(out_data, out_data_ref_h.size());
   }
 
-  void SetUp() override {
-    params = ::testing::TestWithParam<
-      CSRTransposeInputs<value_idx, value_t>>::GetParam();
-    std::shared_ptr<raft::mr::device::allocator> alloc(
-      new raft::mr::device::default_allocator);
+  void SetUp() override
+  {
+    params = ::testing::TestWithParam<CSRTransposeInputs<value_idx, value_t>>::GetParam();
+    std::shared_ptr<raft::mr::device::allocator> alloc(new raft::mr::device::default_allocator);
     CUDA_CHECK(cudaStreamCreate(&stream));
     CUSPARSE_CHECK(cusparseCreate(&handle));
 
     make_data();
 
-    raft::sparse::linalg::csr_transpose(
-      handle, indptr, indices, data, out_indptr, out_indices, out_data,
-      params.nrows, params.ncols, params.nnz, alloc, stream);
+    raft::sparse::linalg::csr_transpose(handle,
+                                        indptr,
+                                        indices,
+                                        data,
+                                        out_indptr,
+                                        out_indices,
+                                        out_data,
+                                        params.nrows,
+                                        params.ncols,
+                                        params.nnz,
+                                        alloc,
+                                        stream);
 
     CUDA_CHECK(cudaStreamSynchronize(stream));
     CUSPARSE_CHECK(cusparseDestroy(handle));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(indptr));
     CUDA_CHECK(cudaFree(indices));
@@ -122,15 +128,14 @@ class CSRTransposeTest
     CUDA_CHECK(cudaFree(out_data_ref));
   }
 
-  void compare() {
-    ASSERT_TRUE(devArrMatch(out_indptr, out_indptr_ref,
-                            params.out_indptr_ref_h.size(),
-                            Compare<value_t>()));
-    ASSERT_TRUE(devArrMatch(out_indices, out_indices_ref,
-                            params.out_indices_ref_h.size(),
-                            Compare<value_t>()));
-    ASSERT_TRUE(devArrMatch(out_data, out_data_ref,
-                            params.out_data_ref_h.size(), Compare<value_t>()));
+  void compare()
+  {
+    ASSERT_TRUE(
+      devArrMatch(out_indptr, out_indptr_ref, params.out_indptr_ref_h.size(), Compare<value_t>()));
+    ASSERT_TRUE(devArrMatch(
+      out_indices, out_indices_ref, params.out_indices_ref_h.size(), Compare<value_t>()));
+    ASSERT_TRUE(
+      devArrMatch(out_data, out_data_ref, params.out_data_ref_h.size(), Compare<value_t>()));
   }
 
  protected:
@@ -139,15 +144,15 @@ class CSRTransposeTest
 
   // input data
   value_idx *indptr, *indices;
-  value_t *data;
+  value_t* data;
 
   // output data
   value_idx *out_indptr, *out_indices;
-  value_t *out_data;
+  value_t* out_data;
 
   // expected output data
   value_idx *out_indptr_ref, *out_indices_ref;
-  value_t *out_data_ref;
+  value_t* out_data_ref;
 
   CSRTransposeInputs<value_idx, value_t> params;
 };
@@ -167,8 +172,7 @@ const std::vector<CSRTransposeInputs<int, float>> inputs_i32_f = {
 };
 typedef CSRTransposeTest<int, float> CSRTransposeTestF;
 TEST_P(CSRTransposeTestF, Result) { compare(); }
-INSTANTIATE_TEST_CASE_P(CSRTransposeTest, CSRTransposeTestF,
-                        ::testing::ValuesIn(inputs_i32_f));
+INSTANTIATE_TEST_CASE_P(CSRTransposeTest, CSRTransposeTestF, ::testing::ValuesIn(inputs_i32_f));
 
 };  // end namespace sparse
 };  // end namespace raft
diff --git a/cpp/test/sparse/degree.cu b/cpp/test/sparse/degree.cu
index 5d687ad92b..c6b2a27273 100644
--- a/cpp/test/sparse/degree.cu
+++ b/cpp/test/sparse/degree.cu
@@ -33,8 +33,7 @@ struct SparseDegreeInputs {
 };
 
 template <typename T>
-class SparseDegreeTests
-  : public ::testing::TestWithParam<SparseDegreeInputs<T>> {
+class SparseDegreeTests : public ::testing::TestWithParam<SparseDegreeInputs<T>> {
  protected:
   void SetUp() override {}
 
@@ -47,11 +46,12 @@ class SparseDegreeTests
 const std::vector<SparseDegreeInputs<float>> inputsf = {{5, 10, 5, 1234ULL}};
 
 typedef SparseDegreeTests<float> COODegree;
-TEST_P(COODegree, Result) {
+TEST_P(COODegree, Result)
+{
   int *in_rows, *verify, *results;
 
   int in_rows_h[5] = {0, 0, 1, 2, 2};
-  int verify_h[5] = {2, 1, 2, 0, 0};
+  int verify_h[5]  = {2, 1, 2, 0, 0};
 
   raft::allocate(in_rows, 5);
   raft::allocate(verify, 5, true);
@@ -70,16 +70,17 @@ TEST_P(COODegree, Result) {
 }
 
 typedef SparseDegreeTests<float> COODegreeNonzero;
-TEST_P(COODegreeNonzero, Result) {
+TEST_P(COODegreeNonzero, Result)
+{
   cudaStream_t stream;
   cudaStreamCreate(&stream);
 
   int *in_rows, *verify, *results;
-  float *in_vals;
+  float* in_vals;
 
-  int in_rows_h[5] = {0, 0, 1, 2, 2};
+  int in_rows_h[5]   = {0, 0, 1, 2, 2};
   float in_vals_h[5] = {0.0, 5.0, 0.0, 1.0, 1.0};
-  int verify_h[5] = {1, 0, 2, 0, 0};
+  int verify_h[5]    = {1, 0, 2, 0, 0};
 
   raft::allocate(in_rows, 5);
   raft::allocate(verify, 5, true);
@@ -101,10 +102,8 @@ TEST_P(COODegreeNonzero, Result) {
   CUDA_CHECK(cudaStreamDestroy(stream));
 }
 
-INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegree,
-                        ::testing::ValuesIn(inputsf));
-INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegreeNonzero,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegree, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegreeNonzero, ::testing::ValuesIn(inputsf));
 
 }  // namespace sparse
 }  // namespace raft
diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/test/sparse/dist_coo_spmv.cu
index a83b93f83f..7c0db49a04 100644
--- a/cpp/test/sparse/dist_coo_spmv.cu
+++ b/cpp/test/sparse/dist_coo_spmv.cu
@@ -55,71 +55,82 @@ struct InputConfiguration {
 };
 
 using dense_smem_strategy_t = dense_smem_strategy<int, float, 1024>;
-using hash_strategy_t = hash_strategy<int, float, 1024>;
+using hash_strategy_t       = hash_strategy<int, float, 1024>;
 
 template <typename value_idx, typename value_t, typename strategy_t>
 struct SparseDistanceCOOSPMVInputs {
   InputConfiguration<value_idx, value_t> input_configuration;
 
   float capacity_threshold = 0.5;
-  int map_size = hash_strategy<value_idx, value_t, 1024>::get_map_size();
+  int map_size             = hash_strategy<value_idx, value_t, 1024>::get_map_size();
 };
 
 template <typename value_idx, typename value_t, typename strategy_t>
-::std::ostream &operator<<(
-  ::std::ostream &os,
-  const SparseDistanceCOOSPMVInputs<value_idx, value_t, strategy_t> &dims) {
+::std::ostream& operator<<(::std::ostream& os,
+                           const SparseDistanceCOOSPMVInputs<value_idx, value_t, strategy_t>& dims)
+{
   return os;
 }
 
 template <typename value_idx, typename value_t, typename strategy_t>
 class SparseDistanceCOOSPMVTest
-  : public ::testing::TestWithParam<
-      SparseDistanceCOOSPMVInputs<value_idx, value_t, strategy_t>> {
+  : public ::testing::TestWithParam<SparseDistanceCOOSPMVInputs<value_idx, value_t, strategy_t>> {
  public:
   SparseDistanceCOOSPMVTest() : dist_config(handle) {}
 
-  template <typename U,
-            std::enable_if_t<std::is_same_v<U, hash_strategy_t>> * = nullptr>
-  U make_strategy() {
+  template <typename U, std::enable_if_t<std::is_same_v<U, hash_strategy_t>>* = nullptr>
+  U make_strategy()
+  {
     return strategy_t(dist_config, params.capacity_threshold, params.map_size);
   }
 
-  template <typename U, std::enable_if_t<
-                          std::is_same_v<U, dense_smem_strategy_t>> * = nullptr>
-  U make_strategy() {
+  template <typename U, std::enable_if_t<std::is_same_v<U, dense_smem_strategy_t>>* = nullptr>
+  U make_strategy()
+  {
     return strategy_t(dist_config);
   }
 
   template <typename reduce_f, typename accum_f, typename write_f>
-  void compute_dist(reduce_f reduce_func, accum_f accum_func,
-                    write_f write_func, bool rev = true) {
-    raft::mr::device::buffer<value_idx> coo_rows(
-      dist_config.handle.get_device_allocator(),
-      dist_config.handle.get_stream(),
-      max(dist_config.b_nnz, dist_config.a_nnz));
-
-    raft::sparse::convert::csr_to_coo(dist_config.b_indptr, dist_config.b_nrows,
-                                      coo_rows.data(), dist_config.b_nnz,
+  void compute_dist(reduce_f reduce_func, accum_f accum_func, write_f write_func, bool rev = true)
+  {
+    raft::mr::device::buffer<value_idx> coo_rows(dist_config.handle.get_device_allocator(),
+                                                 dist_config.handle.get_stream(),
+                                                 max(dist_config.b_nnz, dist_config.a_nnz));
+
+    raft::sparse::convert::csr_to_coo(dist_config.b_indptr,
+                                      dist_config.b_nrows,
+                                      coo_rows.data(),
+                                      dist_config.b_nnz,
                                       dist_config.handle.get_stream());
 
     strategy_t selected_strategy = make_strategy<strategy_t>();
-    balanced_coo_pairwise_generalized_spmv<value_idx, value_t>(
-      out_dists, dist_config, coo_rows.data(), reduce_func, accum_func,
-      write_func, selected_strategy);
+    balanced_coo_pairwise_generalized_spmv<value_idx, value_t>(out_dists,
+                                                               dist_config,
+                                                               coo_rows.data(),
+                                                               reduce_func,
+                                                               accum_func,
+                                                               write_func,
+                                                               selected_strategy);
 
     if (rev) {
-      raft::sparse::convert::csr_to_coo(
-        dist_config.a_indptr, dist_config.a_nrows, coo_rows.data(),
-        dist_config.a_nnz, dist_config.handle.get_stream());
-
-      balanced_coo_pairwise_generalized_spmv_rev<value_idx, value_t>(
-        out_dists, dist_config, coo_rows.data(), reduce_func, accum_func,
-        write_func, selected_strategy);
+      raft::sparse::convert::csr_to_coo(dist_config.a_indptr,
+                                        dist_config.a_nrows,
+                                        coo_rows.data(),
+                                        dist_config.a_nnz,
+                                        dist_config.handle.get_stream());
+
+      balanced_coo_pairwise_generalized_spmv_rev<value_idx, value_t>(out_dists,
+                                                                     dist_config,
+                                                                     coo_rows.data(),
+                                                                     reduce_func,
+                                                                     accum_func,
+                                                                     write_func,
+                                                                     selected_strategy);
     }
   }
 
-  void run_spmv() {
+  void run_spmv()
+  {
     switch (params.input_configuration.metric) {
       case raft::distance::DistanceType::InnerProduct:
         compute_dist(Product(), Sum(), AtomicAdd(), true);
@@ -129,75 +140,69 @@ class SparseDistanceCOOSPMVTest
         break;
       case raft::distance::DistanceType::Canberra:
         compute_dist(
-          [] __device__(value_t a, value_t b) {
-            return fabsf(a - b) / (fabsf(a) + fabsf(b));
-          },
-          Sum(), AtomicAdd());
-        break;
-      case raft::distance::DistanceType::L1:
-        compute_dist(AbsDiff(), Sum(), AtomicAdd());
-        break;
-      case raft::distance::DistanceType::Linf:
-        compute_dist(AbsDiff(), Max(), AtomicMax());
+          [] __device__(value_t a, value_t b) { return fabsf(a - b) / (fabsf(a) + fabsf(b)); },
+          Sum(),
+          AtomicAdd());
         break;
+      case raft::distance::DistanceType::L1: compute_dist(AbsDiff(), Sum(), AtomicAdd()); break;
+      case raft::distance::DistanceType::Linf: compute_dist(AbsDiff(), Max(), AtomicMax()); break;
       case raft::distance::DistanceType::LpUnexpanded: {
-        compute_dist(PDiff(params.input_configuration.metric_arg), Sum(),
-                     AtomicAdd());
+        compute_dist(PDiff(params.input_configuration.metric_arg), Sum(), AtomicAdd());
         float p = 1.0f / params.input_configuration.metric_arg;
         raft::linalg::unaryOp<value_t>(
-          out_dists, out_dists, dist_config.a_nrows * dist_config.b_nrows,
+          out_dists,
+          out_dists,
+          dist_config.a_nrows * dist_config.b_nrows,
           [=] __device__(value_t input) { return powf(input, p); },
           dist_config.handle.get_stream());
 
       } break;
-      default:
-        throw raft::exception("Unknown distance");
+      default: throw raft::exception("Unknown distance");
     }
   }
 
  protected:
-  void make_data() {
-    std::vector<value_idx> indptr_h = params.input_configuration.indptr_h;
+  void make_data()
+  {
+    std::vector<value_idx> indptr_h  = params.input_configuration.indptr_h;
     std::vector<value_idx> indices_h = params.input_configuration.indices_h;
-    std::vector<value_t> data_h = params.input_configuration.data_h;
+    std::vector<value_t> data_h      = params.input_configuration.data_h;
 
     allocate(indptr, indptr_h.size());
     allocate(indices, indices_h.size());
     allocate(data, data_h.size());
 
-    update_device(indptr, indptr_h.data(), indptr_h.size(),
-                  handle.get_stream());
-    update_device(indices, indices_h.data(), indices_h.size(),
-                  handle.get_stream());
+    update_device(indptr, indptr_h.data(), indptr_h.size(), handle.get_stream());
+    update_device(indices, indices_h.data(), indices_h.size(), handle.get_stream());
     update_device(data, data_h.data(), data_h.size(), handle.get_stream());
 
-    std::vector<value_t> out_dists_ref_h =
-      params.input_configuration.out_dists_ref_h;
+    std::vector<value_t> out_dists_ref_h = params.input_configuration.out_dists_ref_h;
 
     allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1));
 
-    update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(),
-                  handle.get_stream());
+    update_device(
+      out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), handle.get_stream());
   }
 
-  void SetUp() override {
+  void SetUp() override
+  {
     params = ::testing::TestWithParam<
       SparseDistanceCOOSPMVInputs<value_idx, value_t, strategy_t>>::GetParam();
 
     make_data();
 
-    dist_config.b_nrows = params.input_configuration.indptr_h.size() - 1;
-    dist_config.b_ncols = params.input_configuration.n_cols;
-    dist_config.b_nnz = params.input_configuration.indices_h.size();
-    dist_config.b_indptr = indptr;
+    dist_config.b_nrows   = params.input_configuration.indptr_h.size() - 1;
+    dist_config.b_ncols   = params.input_configuration.n_cols;
+    dist_config.b_nnz     = params.input_configuration.indices_h.size();
+    dist_config.b_indptr  = indptr;
     dist_config.b_indices = indices;
-    dist_config.b_data = data;
-    dist_config.a_nrows = params.input_configuration.indptr_h.size() - 1;
-    dist_config.a_ncols = params.input_configuration.n_cols;
-    dist_config.a_nnz = params.input_configuration.indices_h.size();
-    dist_config.a_indptr = indptr;
+    dist_config.b_data    = data;
+    dist_config.a_nrows   = params.input_configuration.indptr_h.size() - 1;
+    dist_config.a_ncols   = params.input_configuration.n_cols;
+    dist_config.a_nnz     = params.input_configuration.indices_h.size();
+    dist_config.a_indptr  = indptr;
     dist_config.a_indices = indices;
-    dist_config.a_data = data;
+    dist_config.a_data    = data;
 
     int out_size = dist_config.a_nrows * dist_config.b_nrows;
 
@@ -208,7 +213,8 @@ class SparseDistanceCOOSPMVTest
     CUDA_CHECK(cudaStreamSynchronize(handle.get_stream()));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaStreamSynchronize(handle.get_stream()));
     CUDA_CHECK(cudaFree(indptr));
     CUDA_CHECK(cudaFree(indices));
@@ -217,8 +223,10 @@ class SparseDistanceCOOSPMVTest
     CUDA_CHECK(cudaFree(out_dists_ref));
   }
 
-  void compare() {
-    ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists,
+  void compare()
+  {
+    ASSERT_TRUE(devArrMatch(out_dists_ref,
+                            out_dists,
                             params.input_configuration.out_dists_ref_h.size(),
                             CompareApprox<value_t>(1e-3)));
   }
@@ -228,7 +236,7 @@ class SparseDistanceCOOSPMVTest
 
   // input data
   value_idx *indptr, *indices;
-  value_t *data;
+  value_t* data;
 
   // output data
   value_t *out_dists, *out_dists_ref;
@@ -243,8 +251,7 @@ const InputConfiguration<int, float> input_inner_product = {
   {0, 2, 4, 6, 8},
   {0, 1, 0, 1, 0, 1, 0, 1},
   {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f},
-  {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0,
-   5.0},
+  {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0},
   raft::distance::DistanceType::InnerProduct,
   0.0};
 
@@ -275,384 +282,379 @@ const InputConfiguration<int, float> input_l2_unexpanded = {
   raft::distance::DistanceType::L2Unexpanded,
   0.0};
 
-const InputConfiguration<int, float> input_canberra =
-  {10,
-   {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
-   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
-    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
-    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
-   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
-    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
-    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
-    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
-    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
-    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
-   {0.0,
-    3.3954660629919076,
-    5.6469232737388815,
-    6.373112846266441,
-    4.0212880272531715,
-    6.916281504639404,
-    5.741508386786526,
-    5.411470999663036,
-    9.0,
-    4.977014354725805,
-    3.3954660629919076,
-    0.0,
-    7.56256082439209,
-    5.540261147481582,
-    4.832322929216881,
-    4.62003193872216,
-    6.498056792320361,
-    4.309846252268695,
-    6.317531174829905,
-    6.016362684141827,
-    5.6469232737388815,
-    7.56256082439209,
-    0.0,
-    5.974878731322299,
-    4.898357301336036,
-    6.442097410320605,
-    5.227077347287883,
-    7.134101195584642,
-    5.457753923371659,
-    7.0,
-    6.373112846266441,
-    5.540261147481582,
-    5.974878731322299,
-    0.0,
-    5.5507273748583,
-    4.897749658726415,
-    9.0,
-    8.398776718824767,
-    3.908281400328807,
-    4.83431066343688,
-    4.0212880272531715,
-    4.832322929216881,
-    4.898357301336036,
-    5.5507273748583,
-    0.0,
-    6.632989819428174,
-    7.438852294822894,
-    5.6631570310967465,
-    7.579428202635459,
-    6.760811985364303,
-    6.916281504639404,
-    4.62003193872216,
-    6.442097410320605,
-    4.897749658726415,
-    6.632989819428174,
-    0.0,
-    5.249404187382862,
-    6.072559523278559,
-    4.07661278488929,
-    6.19678948003145,
-    5.741508386786526,
-    6.498056792320361,
-    5.227077347287883,
-    9.0,
-    7.438852294822894,
-    5.249404187382862,
-    0.0,
-    3.854811639654704,
-    6.652724827169063,
-    5.298236851430971,
-    5.411470999663036,
-    4.309846252268695,
-    7.134101195584642,
-    8.398776718824767,
-    5.6631570310967465,
-    6.072559523278559,
-    3.854811639654704,
-    0.0,
-    7.529184598969917,
-    6.903282911791188,
-    9.0,
-    6.317531174829905,
-    5.457753923371659,
-    3.908281400328807,
-    7.579428202635459,
-    4.07661278488929,
-    6.652724827169063,
-    7.529184598969917,
-    0.0,
-    7.0,
-    4.977014354725805,
-    6.016362684141827,
-    7.0,
-    4.83431066343688,
-    6.760811985364303,
-    6.19678948003145,
-    5.298236851430971,
-    6.903282911791188,
-    7.0,
-    0.0},
-   raft::distance::DistanceType::Canberra,
-   0.0};
-
-const InputConfiguration<int, float> input_lp_unexpanded =
-  {10,
-   {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
-   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
-    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
-    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
-   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
-    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
-    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
-    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
-    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
-    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
-   {0.0,
-    1.31462855332296,
-    1.3690307816129905,
-    1.698603990921237,
-    1.3460470789553531,
-    1.6636670712582544,
-    1.2651744044972217,
-    1.1938329352055201,
-    1.8811409082590185,
-    1.3653115050624267,
-    1.31462855332296,
-    0.0,
-    1.9447722703291133,
-    1.42818777206562,
-    1.4685491458946494,
-    1.3071999866010466,
-    1.4988622861692171,
-    0.9698559287406783,
-    1.4972023224597841,
-    1.5243383567266802,
-    1.3690307816129905,
-    1.9447722703291133,
-    0.0,
-    1.2748400840107568,
-    1.0599569946448246,
-    1.546591282841402,
-    1.147526531928459,
-    1.447002179128145,
-    1.5982242387673176,
-    1.3112533607072414,
-    1.698603990921237,
-    1.42818777206562,
-    1.2748400840107568,
-    0.0,
-    1.038121552545461,
-    1.011788365364402,
-    1.3907391109256988,
-    1.3128200942311496,
-    1.19595706584447,
-    1.3233328139624725,
-    1.3460470789553531,
-    1.4685491458946494,
-    1.0599569946448246,
-    1.038121552545461,
-    0.0,
-    1.3642741698145529,
-    1.3493868683808095,
-    1.394942694628328,
-    1.572881849642552,
-    1.380122665319464,
-    1.6636670712582544,
-    1.3071999866010466,
-    1.546591282841402,
-    1.011788365364402,
-    1.3642741698145529,
-    0.0,
-    1.018961640373018,
-    1.0114394258945634,
-    0.8338711034820684,
-    1.1247823842299223,
-    1.2651744044972217,
-    1.4988622861692171,
-    1.147526531928459,
-    1.3907391109256988,
-    1.3493868683808095,
-    1.018961640373018,
-    0.0,
-    0.7701238110357329,
-    1.245486437864406,
-    0.5551259549534626,
-    1.1938329352055201,
-    0.9698559287406783,
-    1.447002179128145,
-    1.3128200942311496,
-    1.394942694628328,
-    1.0114394258945634,
-    0.7701238110357329,
-    0.0,
-    1.1886800117391216,
-    1.0083692448135637,
-    1.8811409082590185,
-    1.4972023224597841,
-    1.5982242387673176,
-    1.19595706584447,
-    1.572881849642552,
-    0.8338711034820684,
-    1.245486437864406,
-    1.1886800117391216,
-    0.0,
-    1.3661374102525012,
-    1.3653115050624267,
-    1.5243383567266802,
-    1.3112533607072414,
-    1.3233328139624725,
-    1.380122665319464,
-    1.1247823842299223,
-    0.5551259549534626,
-    1.0083692448135637,
-    1.3661374102525012,
-    0.0},
-   raft::distance::DistanceType::LpUnexpanded,
-   2.0};
-
-const InputConfiguration<int, float> input_linf =
-  {10,
-   {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
-   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
-    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
-    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
-   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
-    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
-    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
-    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
-    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
-    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
-   {0.0,
-    0.9251771844789913,
-    0.9036452083899731,
-    0.9251771844789913,
-    0.8706483735804971,
-    0.9251771844789913,
-    0.717493881903289,
-    0.6920214832303888,
-    0.9251771844789913,
-    0.9251771844789913,
-    0.9251771844789913,
-    0.0,
-    0.9036452083899731,
-    0.8655339692155823,
-    0.8706483735804971,
-    0.8655339692155823,
-    0.8655339692155823,
-    0.6329837991017668,
-    0.8655339692155823,
-    0.8655339692155823,
-    0.9036452083899731,
-    0.9036452083899731,
-    0.0,
-    0.7988276152181608,
-    0.7028075145996631,
-    0.9036452083899731,
-    0.9036452083899731,
-    0.9036452083899731,
-    0.8429599432532096,
-    0.9036452083899731,
-    0.9251771844789913,
-    0.8655339692155823,
-    0.7988276152181608,
-    0.0,
-    0.48376552205293305,
-    0.8206394616536681,
-    0.8206394616536681,
-    0.8206394616536681,
-    0.8429599432532096,
-    0.8206394616536681,
-    0.8706483735804971,
-    0.8706483735804971,
-    0.7028075145996631,
-    0.48376552205293305,
-    0.0,
-    0.8706483735804971,
-    0.8706483735804971,
-    0.8706483735804971,
-    0.8429599432532096,
-    0.8706483735804971,
-    0.9251771844789913,
-    0.8655339692155823,
-    0.9036452083899731,
-    0.8206394616536681,
-    0.8706483735804971,
-    0.0,
-    0.8853924473642432,
-    0.535821510936138,
-    0.6497196601457607,
-    0.8853924473642432,
-    0.717493881903289,
-    0.8655339692155823,
-    0.9036452083899731,
-    0.8206394616536681,
-    0.8706483735804971,
-    0.8853924473642432,
-    0.0,
-    0.5279604218147174,
-    0.6658348373853169,
-    0.33799874888632914,
-    0.6920214832303888,
-    0.6329837991017668,
-    0.9036452083899731,
-    0.8206394616536681,
-    0.8706483735804971,
-    0.535821510936138,
-    0.5279604218147174,
-    0.0,
-    0.662579808115858,
-    0.5079750812968089,
-    0.9251771844789913,
-    0.8655339692155823,
-    0.8429599432532096,
-    0.8429599432532096,
-    0.8429599432532096,
-    0.6497196601457607,
-    0.6658348373853169,
-    0.662579808115858,
-    0.0,
-    0.8429599432532096,
-    0.9251771844789913,
-    0.8655339692155823,
-    0.9036452083899731,
-    0.8206394616536681,
-    0.8706483735804971,
-    0.8853924473642432,
-    0.33799874888632914,
-    0.5079750812968089,
-    0.8429599432532096,
-    0.0},
-   raft::distance::DistanceType::Linf,
-   0.0};
-
-const InputConfiguration<int, float> input_l1 = {
-  4,
-  {0, 1, 1, 2, 4},
-  {3, 2, 0, 1},  // indices
-  {0.99296, 0.42180, 0.11687, 0.305869},
-  {
-    // dense output
-    0.0,
-    0.99296,
-    1.41476,
-    1.415707,
-    0.99296,
-    0.0,
-    0.42180,
-    0.42274,
-    1.41476,
-    0.42180,
-    0.0,
-    0.84454,
-    1.41570,
-    0.42274,
-    0.84454,
-    0.0,
-  },
-  raft::distance::DistanceType::L1,
+const InputConfiguration<int, float> input_canberra = {
+  10,
+  {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
+  {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4,
+   6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+  {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167,
+   0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625,
+   0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853,
+   0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,  0.5279, 0.4885, 0.3495, 0.5079,
+   0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+  {0.0,
+   3.3954660629919076,
+   5.6469232737388815,
+   6.373112846266441,
+   4.0212880272531715,
+   6.916281504639404,
+   5.741508386786526,
+   5.411470999663036,
+   9.0,
+   4.977014354725805,
+   3.3954660629919076,
+   0.0,
+   7.56256082439209,
+   5.540261147481582,
+   4.832322929216881,
+   4.62003193872216,
+   6.498056792320361,
+   4.309846252268695,
+   6.317531174829905,
+   6.016362684141827,
+   5.6469232737388815,
+   7.56256082439209,
+   0.0,
+   5.974878731322299,
+   4.898357301336036,
+   6.442097410320605,
+   5.227077347287883,
+   7.134101195584642,
+   5.457753923371659,
+   7.0,
+   6.373112846266441,
+   5.540261147481582,
+   5.974878731322299,
+   0.0,
+   5.5507273748583,
+   4.897749658726415,
+   9.0,
+   8.398776718824767,
+   3.908281400328807,
+   4.83431066343688,
+   4.0212880272531715,
+   4.832322929216881,
+   4.898357301336036,
+   5.5507273748583,
+   0.0,
+   6.632989819428174,
+   7.438852294822894,
+   5.6631570310967465,
+   7.579428202635459,
+   6.760811985364303,
+   6.916281504639404,
+   4.62003193872216,
+   6.442097410320605,
+   4.897749658726415,
+   6.632989819428174,
+   0.0,
+   5.249404187382862,
+   6.072559523278559,
+   4.07661278488929,
+   6.19678948003145,
+   5.741508386786526,
+   6.498056792320361,
+   5.227077347287883,
+   9.0,
+   7.438852294822894,
+   5.249404187382862,
+   0.0,
+   3.854811639654704,
+   6.652724827169063,
+   5.298236851430971,
+   5.411470999663036,
+   4.309846252268695,
+   7.134101195584642,
+   8.398776718824767,
+   5.6631570310967465,
+   6.072559523278559,
+   3.854811639654704,
+   0.0,
+   7.529184598969917,
+   6.903282911791188,
+   9.0,
+   6.317531174829905,
+   5.457753923371659,
+   3.908281400328807,
+   7.579428202635459,
+   4.07661278488929,
+   6.652724827169063,
+   7.529184598969917,
+   0.0,
+   7.0,
+   4.977014354725805,
+   6.016362684141827,
+   7.0,
+   4.83431066343688,
+   6.760811985364303,
+   6.19678948003145,
+   5.298236851430971,
+   6.903282911791188,
+   7.0,
+   0.0},
+  raft::distance::DistanceType::Canberra,
   0.0};
 
+const InputConfiguration<int, float> input_lp_unexpanded = {
+  10,
+  {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
+  {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4,
+   6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+  {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167,
+   0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625,
+   0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853,
+   0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,  0.5279, 0.4885, 0.3495, 0.5079,
+   0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+  {0.0,
+   1.31462855332296,
+   1.3690307816129905,
+   1.698603990921237,
+   1.3460470789553531,
+   1.6636670712582544,
+   1.2651744044972217,
+   1.1938329352055201,
+   1.8811409082590185,
+   1.3653115050624267,
+   1.31462855332296,
+   0.0,
+   1.9447722703291133,
+   1.42818777206562,
+   1.4685491458946494,
+   1.3071999866010466,
+   1.4988622861692171,
+   0.9698559287406783,
+   1.4972023224597841,
+   1.5243383567266802,
+   1.3690307816129905,
+   1.9447722703291133,
+   0.0,
+   1.2748400840107568,
+   1.0599569946448246,
+   1.546591282841402,
+   1.147526531928459,
+   1.447002179128145,
+   1.5982242387673176,
+   1.3112533607072414,
+   1.698603990921237,
+   1.42818777206562,
+   1.2748400840107568,
+   0.0,
+   1.038121552545461,
+   1.011788365364402,
+   1.3907391109256988,
+   1.3128200942311496,
+   1.19595706584447,
+   1.3233328139624725,
+   1.3460470789553531,
+   1.4685491458946494,
+   1.0599569946448246,
+   1.038121552545461,
+   0.0,
+   1.3642741698145529,
+   1.3493868683808095,
+   1.394942694628328,
+   1.572881849642552,
+   1.380122665319464,
+   1.6636670712582544,
+   1.3071999866010466,
+   1.546591282841402,
+   1.011788365364402,
+   1.3642741698145529,
+   0.0,
+   1.018961640373018,
+   1.0114394258945634,
+   0.8338711034820684,
+   1.1247823842299223,
+   1.2651744044972217,
+   1.4988622861692171,
+   1.147526531928459,
+   1.3907391109256988,
+   1.3493868683808095,
+   1.018961640373018,
+   0.0,
+   0.7701238110357329,
+   1.245486437864406,
+   0.5551259549534626,
+   1.1938329352055201,
+   0.9698559287406783,
+   1.447002179128145,
+   1.3128200942311496,
+   1.394942694628328,
+   1.0114394258945634,
+   0.7701238110357329,
+   0.0,
+   1.1886800117391216,
+   1.0083692448135637,
+   1.8811409082590185,
+   1.4972023224597841,
+   1.5982242387673176,
+   1.19595706584447,
+   1.572881849642552,
+   0.8338711034820684,
+   1.245486437864406,
+   1.1886800117391216,
+   0.0,
+   1.3661374102525012,
+   1.3653115050624267,
+   1.5243383567266802,
+   1.3112533607072414,
+   1.3233328139624725,
+   1.380122665319464,
+   1.1247823842299223,
+   0.5551259549534626,
+   1.0083692448135637,
+   1.3661374102525012,
+   0.0},
+  raft::distance::DistanceType::LpUnexpanded,
+  2.0};
+
+const InputConfiguration<int, float> input_linf = {
+  10,
+  {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
+  {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4,
+   6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+  {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167,
+   0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625,
+   0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853,
+   0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,  0.5279, 0.4885, 0.3495, 0.5079,
+   0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+  {0.0,
+   0.9251771844789913,
+   0.9036452083899731,
+   0.9251771844789913,
+   0.8706483735804971,
+   0.9251771844789913,
+   0.717493881903289,
+   0.6920214832303888,
+   0.9251771844789913,
+   0.9251771844789913,
+   0.9251771844789913,
+   0.0,
+   0.9036452083899731,
+   0.8655339692155823,
+   0.8706483735804971,
+   0.8655339692155823,
+   0.8655339692155823,
+   0.6329837991017668,
+   0.8655339692155823,
+   0.8655339692155823,
+   0.9036452083899731,
+   0.9036452083899731,
+   0.0,
+   0.7988276152181608,
+   0.7028075145996631,
+   0.9036452083899731,
+   0.9036452083899731,
+   0.9036452083899731,
+   0.8429599432532096,
+   0.9036452083899731,
+   0.9251771844789913,
+   0.8655339692155823,
+   0.7988276152181608,
+   0.0,
+   0.48376552205293305,
+   0.8206394616536681,
+   0.8206394616536681,
+   0.8206394616536681,
+   0.8429599432532096,
+   0.8206394616536681,
+   0.8706483735804971,
+   0.8706483735804971,
+   0.7028075145996631,
+   0.48376552205293305,
+   0.0,
+   0.8706483735804971,
+   0.8706483735804971,
+   0.8706483735804971,
+   0.8429599432532096,
+   0.8706483735804971,
+   0.9251771844789913,
+   0.8655339692155823,
+   0.9036452083899731,
+   0.8206394616536681,
+   0.8706483735804971,
+   0.0,
+   0.8853924473642432,
+   0.535821510936138,
+   0.6497196601457607,
+   0.8853924473642432,
+   0.717493881903289,
+   0.8655339692155823,
+   0.9036452083899731,
+   0.8206394616536681,
+   0.8706483735804971,
+   0.8853924473642432,
+   0.0,
+   0.5279604218147174,
+   0.6658348373853169,
+   0.33799874888632914,
+   0.6920214832303888,
+   0.6329837991017668,
+   0.9036452083899731,
+   0.8206394616536681,
+   0.8706483735804971,
+   0.535821510936138,
+   0.5279604218147174,
+   0.0,
+   0.662579808115858,
+   0.5079750812968089,
+   0.9251771844789913,
+   0.8655339692155823,
+   0.8429599432532096,
+   0.8429599432532096,
+   0.8429599432532096,
+   0.6497196601457607,
+   0.6658348373853169,
+   0.662579808115858,
+   0.0,
+   0.8429599432532096,
+   0.9251771844789913,
+   0.8655339692155823,
+   0.9036452083899731,
+   0.8206394616536681,
+   0.8706483735804971,
+   0.8853924473642432,
+   0.33799874888632914,
+   0.5079750812968089,
+   0.8429599432532096,
+   0.0},
+  raft::distance::DistanceType::Linf,
+  0.0};
+
+const InputConfiguration<int, float> input_l1 = {4,
+                                                 {0, 1, 1, 2, 4},
+                                                 {3, 2, 0, 1},  // indices
+                                                 {0.99296, 0.42180, 0.11687, 0.305869},
+                                                 {
+                                                   // dense output
+                                                   0.0,
+                                                   0.99296,
+                                                   1.41476,
+                                                   1.415707,
+                                                   0.99296,
+                                                   0.0,
+                                                   0.42180,
+                                                   0.42274,
+                                                   1.41476,
+                                                   0.42180,
+                                                   0.0,
+                                                   0.84454,
+                                                   1.41570,
+                                                   0.42274,
+                                                   0.84454,
+                                                   0.0,
+                                                 },
+                                                 raft::distance::DistanceType::L1,
+                                                 0.0};
+
 // test dense smem strategy
-const std::vector<
-  SparseDistanceCOOSPMVInputs<int, float, dense_smem_strategy_t>>
-  inputs_dense_strategy = {{input_inner_product}, {input_l2_unexpanded},
-                           {input_canberra},      {input_lp_unexpanded},
-                           {input_linf},          {input_l1}};
+const std::vector<SparseDistanceCOOSPMVInputs<int, float, dense_smem_strategy_t>>
+  inputs_dense_strategy = {{input_inner_product},
+                           {input_l2_unexpanded},
+                           {input_canberra},
+                           {input_lp_unexpanded},
+                           {input_linf},
+                           {input_l1}};
 
 typedef SparseDistanceCOOSPMVTest<int, float, dense_smem_strategy_t>
   SparseDistanceCOOSPMVTestDenseStrategyF;
@@ -662,22 +664,22 @@ INSTANTIATE_TEST_CASE_P(SparseDistanceCOOSPMVTests,
                         ::testing::ValuesIn(inputs_dense_strategy));
 
 // test hash and chunk strategy
-const std::vector<SparseDistanceCOOSPMVInputs<int, float, hash_strategy_t>>
-  inputs_hash_strategy = {{input_inner_product},
-                          {input_inner_product, 0.5, 2},
-                          {input_l2_unexpanded},
-                          {input_l2_unexpanded, 0.5, 2},
-                          {input_canberra},
-                          {input_canberra, 0.5, 2},
-                          {input_canberra, 0.5, 6},
-                          {input_lp_unexpanded},
-                          {input_lp_unexpanded, 0.5, 2},
-                          {input_lp_unexpanded, 0.5, 6},
-                          {input_linf},
-                          {input_linf, 0.5, 2},
-                          {input_linf, 0.5, 6},
-                          {input_l1},
-                          {input_l1, 0.5, 2}};
+const std::vector<SparseDistanceCOOSPMVInputs<int, float, hash_strategy_t>> inputs_hash_strategy = {
+  {input_inner_product},
+  {input_inner_product, 0.5, 2},
+  {input_l2_unexpanded},
+  {input_l2_unexpanded, 0.5, 2},
+  {input_canberra},
+  {input_canberra, 0.5, 2},
+  {input_canberra, 0.5, 6},
+  {input_lp_unexpanded},
+  {input_lp_unexpanded, 0.5, 2},
+  {input_lp_unexpanded, 0.5, 6},
+  {input_linf},
+  {input_linf, 0.5, 2},
+  {input_linf, 0.5, 6},
+  {input_l1},
+  {input_l1, 0.5, 2}};
 
 typedef SparseDistanceCOOSPMVTest<int, float, hash_strategy_t>
   SparseDistanceCOOSPMVTestHashStrategyF;
diff --git a/cpp/test/sparse/distance.cu b/cpp/test/sparse/distance.cu
index 0589637061..8d6675f954 100644
--- a/cpp/test/sparse/distance.cu
+++ b/cpp/test/sparse/distance.cu
@@ -50,8 +50,8 @@ struct SparseDistanceInputs {
 };
 
 template <typename value_idx, typename value_t>
-::std::ostream &operator<<(
-  ::std::ostream &os, const SparseDistanceInputs<value_idx, value_t> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const SparseDistanceInputs<value_idx, value_t>& dims)
+{
   return os;
 }
 
@@ -61,24 +61,24 @@ class SparseDistanceTest
  public:
   SparseDistanceTest() : dist_config(handle) {}
 
-  void SetUp() override {
-    params = ::testing::TestWithParam<
-      SparseDistanceInputs<value_idx, value_t>>::GetParam();
+  void SetUp() override
+  {
+    params = ::testing::TestWithParam<SparseDistanceInputs<value_idx, value_t>>::GetParam();
 
     make_data();
 
-    dist_config.b_nrows = params.indptr_h.size() - 1;
-    dist_config.b_ncols = params.n_cols;
-    dist_config.b_nnz = params.indices_h.size();
-    dist_config.b_indptr = indptr;
+    dist_config.b_nrows   = params.indptr_h.size() - 1;
+    dist_config.b_ncols   = params.n_cols;
+    dist_config.b_nnz     = params.indices_h.size();
+    dist_config.b_indptr  = indptr;
     dist_config.b_indices = indices;
-    dist_config.b_data = data;
-    dist_config.a_nrows = params.indptr_h.size() - 1;
-    dist_config.a_ncols = params.n_cols;
-    dist_config.a_nnz = params.indices_h.size();
-    dist_config.a_indptr = indptr;
+    dist_config.b_data    = data;
+    dist_config.a_nrows   = params.indptr_h.size() - 1;
+    dist_config.a_ncols   = params.n_cols;
+    dist_config.a_nnz     = params.indices_h.size();
+    dist_config.a_indptr  = indptr;
     dist_config.a_indices = indices;
-    dist_config.a_data = data;
+    dist_config.a_data    = data;
 
     int out_size = dist_config.a_nrows * dist_config.b_nrows;
 
@@ -89,7 +89,8 @@ class SparseDistanceTest
     CUDA_CHECK(cudaStreamSynchronize(handle.get_stream()));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaStreamSynchronize(handle.get_stream()));
     CUDA_CHECK(cudaFree(indptr));
     CUDA_CHECK(cudaFree(indices));
@@ -98,33 +99,34 @@ class SparseDistanceTest
     CUDA_CHECK(cudaFree(out_dists_ref));
   }
 
-  void compare() {
-    ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists,
-                            params.out_dists_ref_h.size(),
-                            CompareApprox<value_t>(1e-3)));
+  void compare()
+  {
+    ASSERT_TRUE(devArrMatch(
+      out_dists_ref, out_dists, params.out_dists_ref_h.size(), CompareApprox<value_t>(1e-3)));
   }
 
  protected:
-  void make_data() {
-    std::vector<value_idx> indptr_h = params.indptr_h;
+  void make_data()
+  {
+    std::vector<value_idx> indptr_h  = params.indptr_h;
     std::vector<value_idx> indices_h = params.indices_h;
-    std::vector<value_t> data_h = params.data_h;
+    std::vector<value_t> data_h      = params.data_h;
 
     allocate(indptr, indptr_h.size());
     allocate(indices, indices_h.size());
     allocate(data, data_h.size());
 
-    update_device(indptr, indptr_h.data(), indptr_h.size(),
-                  handle.get_stream());
-    update_device(indices, indices_h.data(), indices_h.size(),
-                  handle.get_stream());
+    update_device(indptr, indptr_h.data(), indptr_h.size(), handle.get_stream());
+    update_device(indices, indices_h.data(), indices_h.size(), handle.get_stream());
     update_device(data, data_h.data(), data_h.size(), handle.get_stream());
 
     std::vector<value_t> out_dists_ref_h = params.out_dists_ref_h;
 
     allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1));
 
-    update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(),
+    update_device(out_dists_ref,
+                  out_dists_ref_h.data(),
+                  out_dists_ref_h.size(),
                   dist_config.handle.get_stream());
   }
 
@@ -132,7 +134,7 @@ class SparseDistanceTest
 
   // input data
   value_idx *indptr, *indices;
-  value_t *data;
+  value_t* data;
 
   // output data
   value_t *out_dists, *out_dists_ref;
@@ -187,8 +189,7 @@ const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
    {0, 2, 4, 6, 8},
    {0, 1, 0, 1, 0, 1, 0, 1},
    {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f},
-   {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0,
-    5.0},
+   {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0},
    raft::distance::DistanceType::InnerProduct,
    0.0},
   {2,
@@ -219,40 +220,33 @@ const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
 
   {10,
    {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
-   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
-    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
-    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
-   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
-    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
-    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
-    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
-    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
-    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
-   {0.,         0.39419924, 0.54823225, 0.79593037, 0.45658883, 0.93634219,
-    0.58146987, 0.44940102, 1.,         0.76978799, 0.39419924, 0.,
-    0.97577154, 0.48904013, 0.48300801, 0.45087445, 0.73323749, 0.21050481,
-    0.54847744, 0.78021386, 0.54823225, 0.97577154, 0.,         0.51413997,
-    0.31195441, 0.96546343, 0.67534399, 0.81665436, 0.8321819,  1.,
-    0.79593037, 0.48904013, 0.51413997, 0.,         0.28605559, 0.35772784,
-    1.,         0.60889396, 0.43324829, 0.84923694, 0.45658883, 0.48300801,
-    0.31195441, 0.28605559, 0.,         0.58623212, 0.6745457,  0.60287165,
-    0.67676228, 0.73155632, 0.93634219, 0.45087445, 0.96546343, 0.35772784,
-    0.58623212, 0.,         0.77917274, 0.48390993, 0.24558392, 0.99166225,
-    0.58146987, 0.73323749, 0.67534399, 1.,         0.6745457,  0.77917274,
-    0.,         0.27605686, 0.76064776, 0.61547536, 0.44940102, 0.21050481,
-    0.81665436, 0.60889396, 0.60287165, 0.48390993, 0.27605686, 0.,
-    0.51360432, 0.68185144, 1.,         0.54847744, 0.8321819,  0.43324829,
-    0.67676228, 0.24558392, 0.76064776, 0.51360432, 0.,         1.,
-    0.76978799, 0.78021386, 1.,         0.84923694, 0.73155632, 0.99166225,
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4,
+    6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167,
+    0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625,
+    0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853,
+    0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,  0.5279, 0.4885, 0.3495, 0.5079,
+    0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+   {0.,         0.39419924, 0.54823225, 0.79593037, 0.45658883, 0.93634219, 0.58146987, 0.44940102,
+    1.,         0.76978799, 0.39419924, 0.,         0.97577154, 0.48904013, 0.48300801, 0.45087445,
+    0.73323749, 0.21050481, 0.54847744, 0.78021386, 0.54823225, 0.97577154, 0.,         0.51413997,
+    0.31195441, 0.96546343, 0.67534399, 0.81665436, 0.8321819,  1.,         0.79593037, 0.48904013,
+    0.51413997, 0.,         0.28605559, 0.35772784, 1.,         0.60889396, 0.43324829, 0.84923694,
+    0.45658883, 0.48300801, 0.31195441, 0.28605559, 0.,         0.58623212, 0.6745457,  0.60287165,
+    0.67676228, 0.73155632, 0.93634219, 0.45087445, 0.96546343, 0.35772784, 0.58623212, 0.,
+    0.77917274, 0.48390993, 0.24558392, 0.99166225, 0.58146987, 0.73323749, 0.67534399, 1.,
+    0.6745457,  0.77917274, 0.,         0.27605686, 0.76064776, 0.61547536, 0.44940102, 0.21050481,
+    0.81665436, 0.60889396, 0.60287165, 0.48390993, 0.27605686, 0.,         0.51360432, 0.68185144,
+    1.,         0.54847744, 0.8321819,  0.43324829, 0.67676228, 0.24558392, 0.76064776, 0.51360432,
+    0.,         1.,         0.76978799, 0.78021386, 1.,         0.84923694, 0.73155632, 0.99166225,
     0.61547536, 0.68185144, 1.,         0.},
    raft::distance::DistanceType::CosineExpanded,
    0.0},
 
   {10,
    {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
-   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
-    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
-    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4,
+    6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
    {1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
     1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
     1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.},
@@ -361,15 +355,13 @@ const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
 
   {10,
    {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
-   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
-    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
-    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
-   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
-    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
-    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
-    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
-    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
-    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4,
+    6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167,
+    0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625,
+    0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853,
+    0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,  0.5279, 0.4885, 0.3495, 0.5079,
+    0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
    {0.0,
     3.3954660629919076,
     5.6469232737388815,
@@ -475,15 +467,13 @@ const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
 
   {10,
    {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
-   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
-    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
-    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
-   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
-    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
-    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
-    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
-    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
-    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4,
+    6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167,
+    0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625,
+    0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853,
+    0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,  0.5279, 0.4885, 0.3495, 0.5079,
+    0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
    {0.0,
     1.31462855332296,
     1.3690307816129905,
@@ -589,15 +579,13 @@ const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
 
   {10,
    {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
-   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
-    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
-    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
-   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
-    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
-    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
-    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
-    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
-    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4,
+    6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167,
+    0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625,
+    0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853,
+    0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,  0.5279, 0.4885, 0.3495, 0.5079,
+    0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
    {0.0,
     0.9251771844789913,
     0.9036452083899731,
@@ -703,17 +691,14 @@ const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
 
   {15,
    {0, 5, 8, 9, 15, 20, 26, 31, 34, 38, 45},
-   {0,  1, 5,  6,  9,  1, 4, 14, 7, 3, 4,  7, 9, 11, 14,
-    0,  3, 7,  8,  12, 0, 2, 5,  7, 8, 14, 4, 9, 10, 11,
-    13, 4, 10, 14, 5,  6, 8, 9,  0, 2, 3,  4, 6, 10, 11},
-   {0.13537497, 0.51440163, 0.17231936, 0.02417618, 0.15372786, 0.17760507,
-    0.73789274, 0.08450219, 1.,         0.20184723, 0.18036963, 0.12581403,
-    0.13867603, 0.24040536, 0.11288773, 0.00290246, 0.09120187, 0.31190555,
-    0.43245423, 0.16153588, 0.3233026,  0.05279589, 0.1387149,  0.05962761,
-    0.41751856, 0.00804045, 0.03262381, 0.27507131, 0.37245804, 0.16378881,
-    0.15605804, 0.3867739,  0.24908977, 0.36413632, 0.37643732, 0.28910679,
-    0.0198409,  0.31461499, 0.24412279, 0.08327667, 0.04444576, 0.05047969,
-    0.26190054, 0.2077349,  0.10803964},
+   {0, 1, 5,  6, 9, 1,  4,  14, 7, 3,  4,  7, 9, 11, 14, 0, 3, 7, 8, 12, 0,  2, 5,
+    7, 8, 14, 4, 9, 10, 11, 13, 4, 10, 14, 5, 6, 8,  9,  0, 2, 3, 4, 6,  10, 11},
+   {0.13537497, 0.51440163, 0.17231936, 0.02417618, 0.15372786, 0.17760507, 0.73789274, 0.08450219,
+    1.,         0.20184723, 0.18036963, 0.12581403, 0.13867603, 0.24040536, 0.11288773, 0.00290246,
+    0.09120187, 0.31190555, 0.43245423, 0.16153588, 0.3233026,  0.05279589, 0.1387149,  0.05962761,
+    0.41751856, 0.00804045, 0.03262381, 0.27507131, 0.37245804, 0.16378881, 0.15605804, 0.3867739,
+    0.24908977, 0.36413632, 0.37643732, 0.28910679, 0.0198409,  0.31461499, 0.24412279, 0.08327667,
+    0.04444576, 0.05047969, 0.26190054, 0.2077349,  0.10803964},
    {1.05367121e-08, 8.35309089e-01, 1.00000000e+00, 9.24116813e-01,
     9.90039274e-01, 7.97613546e-01, 8.91271059e-01, 1.00000000e+00,
     6.64669302e-01, 8.59439512e-01, 8.35309089e-01, 1.05367121e-08,
@@ -772,31 +757,25 @@ const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
    {0, 3, 8, 12, 16, 20, 25, 30, 35, 40, 45},
    {0, 3, 4, 0, 1, 2, 3, 4, 1, 2, 3, 4, 0, 2, 3, 4, 0, 1, 3, 4, 0, 1, 2,
     3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4},
-   {0.70862347, 0.8232774,  0.12108795, 0.84527547, 0.94937088, 0.03258545,
-    0.99584118, 0.76835667, 0.34426657, 0.2357925,  0.01274851, 0.11422017,
-    0.3437756,  0.31967718, 0.5956055,  0.31610373, 0.04147273, 0.03724415,
-    0.21515727, 0.04751052, 0.50283183, 0.99957274, 0.01395933, 0.96032529,
-    0.88438711, 0.46095378, 0.27432481, 0.54294211, 0.54280225, 0.59503329,
-    0.61364678, 0.22837736, 0.56609561, 0.29809423, 0.76736686, 0.56460608,
-    0.98165371, 0.02140123, 0.19881268, 0.26057815, 0.31648823, 0.89874295,
-    0.27366735, 0.5119944,  0.11416134},
+   {0.70862347, 0.8232774,  0.12108795, 0.84527547, 0.94937088, 0.03258545, 0.99584118, 0.76835667,
+    0.34426657, 0.2357925,  0.01274851, 0.11422017, 0.3437756,  0.31967718, 0.5956055,  0.31610373,
+    0.04147273, 0.03724415, 0.21515727, 0.04751052, 0.50283183, 0.99957274, 0.01395933, 0.96032529,
+    0.88438711, 0.46095378, 0.27432481, 0.54294211, 0.54280225, 0.59503329, 0.61364678, 0.22837736,
+    0.56609561, 0.29809423, 0.76736686, 0.56460608, 0.98165371, 0.02140123, 0.19881268, 0.26057815,
+    0.31648823, 0.89874295, 0.27366735, 0.5119944,  0.11416134},
    {// dense output
-    0.,         0.48769777, 1.88014197, 0.26127048, 0.26657011, 0.7874794,
-    0.76962708, 1.122858,   1.1232498,  1.08166081, 0.48769777, 0.,
-    1.31332116, 0.98318907, 0.42661815, 0.09279052, 1.35187836, 1.38429055,
-    0.40658897, 0.56136388, 1.88014197, 1.31332116, 0.,         1.82943642,
-    1.54826077, 1.05918884, 1.59360067, 1.34698954, 0.60215168, 0.46993848,
-    0.26127048, 0.98318907, 1.82943642, 0.,         0.29945563, 1.08494093,
-    0.22934281, 0.82801925, 1.74288748, 1.50610116, 0.26657011, 0.42661815,
-    1.54826077, 0.29945563, 0.,         0.45060069, 0.77814948, 1.45245711,
-    1.18328348, 0.82486987, 0.7874794,  0.09279052, 1.05918884, 1.08494093,
-    0.45060069, 0.,         1.29899154, 1.40683824, 0.48505269, 0.53862363,
-    0.76962708, 1.35187836, 1.59360067, 0.22934281, 0.77814948, 1.29899154,
-    0.,         0.33202426, 1.92108999, 1.88812175, 1.122858,   1.38429055,
-    1.34698954, 0.82801925, 1.45245711, 1.40683824, 0.33202426, 0.,
-    1.47318624, 1.92660889, 1.1232498,  0.40658897, 0.60215168, 1.74288748,
-    1.18328348, 0.48505269, 1.92108999, 1.47318624, 0.,         0.24992619,
-    1.08166081, 0.56136388, 0.46993848, 1.50610116, 0.82486987, 0.53862363,
+    0.,         0.48769777, 1.88014197, 0.26127048, 0.26657011, 0.7874794,  0.76962708, 1.122858,
+    1.1232498,  1.08166081, 0.48769777, 0.,         1.31332116, 0.98318907, 0.42661815, 0.09279052,
+    1.35187836, 1.38429055, 0.40658897, 0.56136388, 1.88014197, 1.31332116, 0.,         1.82943642,
+    1.54826077, 1.05918884, 1.59360067, 1.34698954, 0.60215168, 0.46993848, 0.26127048, 0.98318907,
+    1.82943642, 0.,         0.29945563, 1.08494093, 0.22934281, 0.82801925, 1.74288748, 1.50610116,
+    0.26657011, 0.42661815, 1.54826077, 0.29945563, 0.,         0.45060069, 0.77814948, 1.45245711,
+    1.18328348, 0.82486987, 0.7874794,  0.09279052, 1.05918884, 1.08494093, 0.45060069, 0.,
+    1.29899154, 1.40683824, 0.48505269, 0.53862363, 0.76962708, 1.35187836, 1.59360067, 0.22934281,
+    0.77814948, 1.29899154, 0.,         0.33202426, 1.92108999, 1.88812175, 1.122858,   1.38429055,
+    1.34698954, 0.82801925, 1.45245711, 1.40683824, 0.33202426, 0.,         1.47318624, 1.92660889,
+    1.1232498,  0.40658897, 0.60215168, 1.74288748, 1.18328348, 0.48505269, 1.92108999, 1.47318624,
+    0.,         0.24992619, 1.08166081, 0.56136388, 0.46993848, 1.50610116, 0.82486987, 0.53862363,
     1.88812175, 1.92660889, 0.24992619, 0.},
    raft::distance::DistanceType::CorrelationExpanded,
    0.0},
@@ -805,12 +784,11 @@ const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
    {1, 4, 0, 4, 1, 3, 0, 1, 3, 0},
    {1., 1., 1., 1., 1., 1., 1., 1., 1., 1.},
    {// dense output
-    0., 1.,  1.,  1., 0.8, 1., 1.,  0.8, 1., 1.,  1., 0.,  0.8, 1.,  1., 1., 1.,
-    1., 1.,  1.,  1., 0.8, 0., 1.,  1.,  1., 0.8, 1., 1.,  0.8, 1.,  1., 1., 0.,
-    1., 1.,  1.,  1., 1.,  1., 0.8, 1.,  1., 1.,  0., 1.,  1.,  0.8, 1., 1., 1.,
-    1., 1.,  1.,  1., 0.,  1., 0.8, 1.,  1., 1.,  1., 0.8, 1.,  1.,  1., 0., 1.,
-    1., 0.8, 0.8, 1., 1.,  1., 0.8, 0.8, 1., 0.,  1., 1.,  1.,  1.,  1., 1., 1.,
-    1., 1.,  1.,  0., 1.,  1., 1.,  0.8, 1., 1.,  1., 0.8, 1.,  1.,  0.},
+    0.,  1.,  1.,  1., 0.8, 1., 1.,  0.8, 1., 1.,  1.,  0., 0.8, 1., 1.,  1.,  1.,  1.,  1., 1.,
+    1.,  0.8, 0.,  1., 1.,  1., 0.8, 1.,  1., 0.8, 1.,  1., 1.,  0., 1.,  1.,  1.,  1.,  1., 1.,
+    0.8, 1.,  1.,  1., 0.,  1., 1.,  0.8, 1., 1.,  1.,  1., 1.,  1., 1.,  0.,  1.,  0.8, 1., 1.,
+    1.,  1.,  0.8, 1., 1.,  1., 0.,  1.,  1., 0.8, 0.8, 1., 1.,  1., 0.8, 0.8, 1.,  0.,  1., 1.,
+    1.,  1.,  1.,  1., 1.,  1., 1.,  1.,  0., 1.,  1.,  1., 0.8, 1., 1.,  1.,  0.8, 1.,  1., 0.},
    raft::distance::DistanceType::RusselRaoExpanded,
    0.0},
   {5,
@@ -818,13 +796,12 @@ const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
    {0, 3, 4, 4, 2, 3, 0, 2, 3, 2},
    {1., 1., 1., 1., 1., 1., 1., 1., 1., 1.},
    {// dense output
-    0.,  0.2, 0.6, 0.2, 0.4, 0.2, 0.6, 0.4, 0.4, 0.2, 0.2, 0.,  0.4, 0.,  0.2,
-    0.,  0.4, 0.6, 0.2, 0.,  0.6, 0.4, 0.,  0.4, 0.2, 0.4, 0.4, 0.6, 0.6, 0.4,
-    0.2, 0.,  0.4, 0.,  0.2, 0.,  0.4, 0.6, 0.2, 0.,  0.4, 0.2, 0.2, 0.2, 0.,
-    0.2, 0.6, 0.8, 0.4, 0.2, 0.2, 0.,  0.4, 0.,  0.2, 0.,  0.4, 0.6, 0.2, 0.,
-    0.6, 0.4, 0.4, 0.4, 0.6, 0.4, 0.,  0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.6, 0.8,
-    0.6, 0.2, 0.,  0.4, 0.6, 0.4, 0.2, 0.6, 0.2, 0.4, 0.2, 0.2, 0.4, 0.,  0.2,
-    0.2, 0.,  0.4, 0.,  0.2, 0.,  0.4, 0.6, 0.2, 0.},
+    0.,  0.2, 0.6, 0.2, 0.4, 0.2, 0.6, 0.4, 0.4, 0.2, 0.2, 0.,  0.4, 0.,  0.2, 0.,  0.4,
+    0.6, 0.2, 0.,  0.6, 0.4, 0.,  0.4, 0.2, 0.4, 0.4, 0.6, 0.6, 0.4, 0.2, 0.,  0.4, 0.,
+    0.2, 0.,  0.4, 0.6, 0.2, 0.,  0.4, 0.2, 0.2, 0.2, 0.,  0.2, 0.6, 0.8, 0.4, 0.2, 0.2,
+    0.,  0.4, 0.,  0.2, 0.,  0.4, 0.6, 0.2, 0.,  0.6, 0.4, 0.4, 0.4, 0.6, 0.4, 0.,  0.2,
+    0.2, 0.4, 0.4, 0.6, 0.6, 0.6, 0.8, 0.6, 0.2, 0.,  0.4, 0.6, 0.4, 0.2, 0.6, 0.2, 0.4,
+    0.2, 0.2, 0.4, 0.,  0.2, 0.2, 0.,  0.4, 0.,  0.2, 0.,  0.4, 0.6, 0.2, 0.},
    raft::distance::DistanceType::HammingUnexpanded,
    0.0},
   {3,
@@ -868,7 +845,8 @@ const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
 
 typedef SparseDistanceTest<int, float> SparseDistanceTestF;
 TEST_P(SparseDistanceTestF, Result) { compare(); }
-INSTANTIATE_TEST_CASE_P(SparseDistanceTests, SparseDistanceTestF,
+INSTANTIATE_TEST_CASE_P(SparseDistanceTests,
+                        SparseDistanceTestF,
                         ::testing::ValuesIn(inputs_i32_f));
 
 };  // namespace distance
diff --git a/cpp/test/sparse/filter.cu b/cpp/test/sparse/filter.cu
index f7954f899f..02be95c8a8 100644
--- a/cpp/test/sparse/filter.cu
+++ b/cpp/test/sparse/filter.cu
@@ -36,8 +36,7 @@ struct SparseFilterInputs {
 };
 
 template <typename T>
-class SparseFilterTests
-  : public ::testing::TestWithParam<SparseFilterInputs<T>> {
+class SparseFilterTests : public ::testing::TestWithParam<SparseFilterInputs<T>> {
  protected:
   void SetUp() override {}
 
@@ -50,14 +49,14 @@ class SparseFilterTests
 const std::vector<SparseFilterInputs<float>> inputsf = {{5, 10, 5, 1234ULL}};
 
 typedef SparseFilterTests<float> COORemoveZeros;
-TEST_P(COORemoveZeros, Result) {
+TEST_P(COORemoveZeros, Result)
+{
   cudaStream_t stream;
   cudaStreamCreate(&stream);
-  std::shared_ptr<raft::mr::device::allocator> alloc(
-    new raft::mr::device::default_allocator);
+  std::shared_ptr<raft::mr::device::allocator> alloc(new raft::mr::device::default_allocator);
   params = ::testing::TestWithParam<SparseFilterInputs<float>>::GetParam();
 
-  float *in_h_vals = new float[params.nnz];
+  float* in_h_vals = new float[params.nnz];
 
   COO<float> in(alloc, stream, params.nnz, 5, 5);
 
@@ -70,8 +69,8 @@ TEST_P(COORemoveZeros, Result) {
   in_h_vals[2] = 0;
   in_h_vals[3] = 0;
 
-  int *in_h_rows = new int[params.nnz];
-  int *in_h_cols = new int[params.nnz];
+  int* in_h_rows = new int[params.nnz];
+  int* in_h_cols = new int[params.nnz];
 
   for (int i = 0; i < params.nnz; i++) {
     in_h_rows[i] = params.nnz - i - 1;
@@ -87,9 +86,9 @@ TEST_P(COORemoveZeros, Result) {
   int out_rows_ref_h[2] = {0, 3};
   int out_cols_ref_h[2] = {4, 1};
 
-  float *out_vals_ref_h = (float *)malloc(2 * sizeof(float));
-  out_vals_ref_h[0] = in_h_vals[4];
-  out_vals_ref_h[1] = in_h_vals[1];
+  float* out_vals_ref_h = (float*)malloc(2 * sizeof(float));
+  out_vals_ref_h[0]     = in_h_vals[4];
+  out_vals_ref_h[1]     = in_h_vals[1];
 
   COO<float> out_ref(alloc, stream, 2, 5, 5);
   COO<float> out(alloc, stream);
@@ -100,12 +99,9 @@ TEST_P(COORemoveZeros, Result) {
 
   op::coo_remove_zeros<32, float>(&in, &out, alloc, stream);
 
-  ASSERT_TRUE(raft::devArrMatch<int>(out_ref.rows(), out.rows(), 2,
-                                     raft::Compare<int>()));
-  ASSERT_TRUE(raft::devArrMatch<int>(out_ref.cols(), out.cols(), 2,
-                                     raft::Compare<int>()));
-  ASSERT_TRUE(raft::devArrMatch<float>(out_ref.vals(), out.vals(), 2,
-                                       raft::Compare<float>()));
+  ASSERT_TRUE(raft::devArrMatch<int>(out_ref.rows(), out.rows(), 2, raft::Compare<int>()));
+  ASSERT_TRUE(raft::devArrMatch<int>(out_ref.cols(), out.cols(), 2, raft::Compare<int>()));
+  ASSERT_TRUE(raft::devArrMatch<float>(out_ref.vals(), out.vals(), 2, raft::Compare<float>()));
 
   CUDA_CHECK(cudaStreamDestroy(stream));
   free(out_vals_ref_h);
@@ -115,8 +111,7 @@ TEST_P(COORemoveZeros, Result) {
   delete[] in_h_vals;
 }
 
-INSTANTIATE_TEST_CASE_P(SparseFilterTests, COORemoveZeros,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(SparseFilterTests, COORemoveZeros, ::testing::ValuesIn(inputsf));
 
 }  // namespace sparse
 }  // namespace raft
diff --git a/cpp/test/sparse/knn.cu b/cpp/test/sparse/knn.cu
index 8c3bf36318..ca9da0bc05 100644
--- a/cpp/test/sparse/knn.cu
+++ b/cpp/test/sparse/knn.cu
@@ -50,39 +50,53 @@ struct SparseKNNInputs {
   int batch_size_index = 2;
   int batch_size_query = 2;
 
-  raft::distance::DistanceType metric =
-    raft::distance::DistanceType::L2SqrtExpanded;
+  raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded;
 };
 
 template <typename value_idx, typename value_t>
-::std::ostream &operator<<(::std::ostream &os,
-                           const SparseKNNInputs<value_idx, value_t> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const SparseKNNInputs<value_idx, value_t>& dims)
+{
   return os;
 }
 
 template <typename value_idx, typename value_t>
-class SparseKNNTest
-  : public ::testing::TestWithParam<SparseKNNInputs<value_idx, value_t>> {
+class SparseKNNTest : public ::testing::TestWithParam<SparseKNNInputs<value_idx, value_t>> {
  public:
-  void SetUp() override {
-    params =
-      ::testing::TestWithParam<SparseKNNInputs<value_idx, value_t>>::GetParam();
+  void SetUp() override
+  {
+    params = ::testing::TestWithParam<SparseKNNInputs<value_idx, value_t>>::GetParam();
 
     n_rows = params.indptr_h.size() - 1;
-    nnz = params.indices_h.size();
-    k = params.k;
+    nnz    = params.indices_h.size();
+    k      = params.k;
 
     make_data();
 
-    raft::sparse::selection::brute_force_knn<value_idx, value_t>(
-      indptr, indices, data, nnz, n_rows, params.n_cols, indptr, indices, data,
-      nnz, n_rows, params.n_cols, out_indices, out_dists, k, handle,
-      params.batch_size_index, params.batch_size_query, params.metric);
+    raft::sparse::selection::brute_force_knn<value_idx, value_t>(indptr,
+                                                                 indices,
+                                                                 data,
+                                                                 nnz,
+                                                                 n_rows,
+                                                                 params.n_cols,
+                                                                 indptr,
+                                                                 indices,
+                                                                 data,
+                                                                 nnz,
+                                                                 n_rows,
+                                                                 params.n_cols,
+                                                                 out_indices,
+                                                                 out_dists,
+                                                                 k,
+                                                                 handle,
+                                                                 params.batch_size_index,
+                                                                 params.batch_size_query,
+                                                                 params.metric);
 
     CUDA_CHECK(cudaStreamSynchronize(handle.get_stream()));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(indptr));
     CUDA_CHECK(cudaFree(indices));
     CUDA_CHECK(cudaFree(data));
@@ -92,39 +106,37 @@ class SparseKNNTest
     CUDA_CHECK(cudaFree(out_dists_ref));
   }
 
-  void compare() {
-    ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, n_rows * k,
-                            CompareApprox<value_t>(1e-4)));
-    ASSERT_TRUE(devArrMatch(out_indices_ref, out_indices, n_rows * k,
-                            Compare<value_idx>()));
+  void compare()
+  {
+    ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, n_rows * k, CompareApprox<value_t>(1e-4)));
+    ASSERT_TRUE(devArrMatch(out_indices_ref, out_indices, n_rows * k, Compare<value_idx>()));
   }
 
  protected:
-  void make_data() {
-    std::vector<value_idx> indptr_h = params.indptr_h;
+  void make_data()
+  {
+    std::vector<value_idx> indptr_h  = params.indptr_h;
     std::vector<value_idx> indices_h = params.indices_h;
-    std::vector<value_t> data_h = params.data_h;
+    std::vector<value_t> data_h      = params.data_h;
 
     allocate(indptr, indptr_h.size());
     allocate(indices, indices_h.size());
     allocate(data, data_h.size());
 
-    update_device(indptr, indptr_h.data(), indptr_h.size(),
-                  handle.get_stream());
-    update_device(indices, indices_h.data(), indices_h.size(),
-                  handle.get_stream());
+    update_device(indptr, indptr_h.data(), indptr_h.size(), handle.get_stream());
+    update_device(indices, indices_h.data(), indices_h.size(), handle.get_stream());
     update_device(data, data_h.data(), data_h.size(), handle.get_stream());
 
-    std::vector<value_t> out_dists_ref_h = params.out_dists_ref_h;
+    std::vector<value_t> out_dists_ref_h     = params.out_dists_ref_h;
     std::vector<value_idx> out_indices_ref_h = params.out_indices_ref_h;
 
     allocate(out_indices_ref, out_indices_ref_h.size());
     allocate(out_dists_ref, out_dists_ref_h.size());
 
-    update_device(out_indices_ref, out_indices_ref_h.data(),
-                  out_indices_ref_h.size(), handle.get_stream());
-    update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(),
-                  handle.get_stream());
+    update_device(
+      out_indices_ref, out_indices_ref_h.data(), out_indices_ref_h.size(), handle.get_stream());
+    update_device(
+      out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), handle.get_stream());
 
     allocate(out_dists, n_rows * k);
     allocate(out_indices, n_rows * k);
@@ -136,14 +148,14 @@ class SparseKNNTest
 
   // input data
   value_idx *indptr, *indices;
-  value_t *data;
+  value_t* data;
 
   // output data
-  value_idx *out_indices;
-  value_t *out_dists;
+  value_idx* out_indices;
+  value_t* out_dists;
 
-  value_idx *out_indices_ref;
-  value_t *out_dists_ref;
+  value_idx* out_indices_ref;
+  value_t* out_dists_ref;
 
   SparseKNNInputs<value_idx, value_t> params;
 };
@@ -161,8 +173,7 @@ const std::vector<SparseKNNInputs<int, float>> inputs_i32_f = {
    raft::distance::DistanceType::L2SqrtExpanded}};
 typedef SparseKNNTest<int, float> SparseKNNTestF;
 TEST_P(SparseKNNTestF, Result) { compare(); }
-INSTANTIATE_TEST_CASE_P(SparseKNNTest, SparseKNNTestF,
-                        ::testing::ValuesIn(inputs_i32_f));
+INSTANTIATE_TEST_CASE_P(SparseKNNTest, SparseKNNTestF, ::testing::ValuesIn(inputs_i32_f));
 
 };  // end namespace selection
 };  // end namespace sparse
diff --git a/cpp/test/sparse/knn_graph.cu b/cpp/test/sparse/knn_graph.cu
index ec41b32374..f660e68aa3 100644
--- a/cpp/test/sparse/knn_graph.cu
+++ b/cpp/test/sparse/knn_graph.cu
@@ -29,8 +29,9 @@ namespace raft {
 namespace sparse {
 
 template <typename value_idx, typename value_t>
-__global__ void assert_symmetry(value_idx *rows, value_idx *cols, value_t *vals,
-                                value_idx nnz, value_idx *sum) {
+__global__ void assert_symmetry(
+  value_idx* rows, value_idx* cols, value_t* vals, value_idx nnz, value_idx* sum)
+{
   int tid = blockDim.x * blockIdx.x + threadIdx.x;
 
   if (tid >= nnz) return;
@@ -50,22 +51,21 @@ struct KNNGraphInputs {
 };
 
 template <typename value_idx, typename value_t>
-::std::ostream &operator<<(::std::ostream &os,
-                           const KNNGraphInputs<value_idx, value_t> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const KNNGraphInputs<value_idx, value_t>& dims)
+{
   return os;
 }
 
 template <typename value_idx, typename value_t>
-class KNNGraphTest
-  : public ::testing::TestWithParam<KNNGraphInputs<value_idx, value_t>> {
-  void SetUp() override {
-    params =
-      ::testing::TestWithParam<KNNGraphInputs<value_idx, value_t>>::GetParam();
+class KNNGraphTest : public ::testing::TestWithParam<KNNGraphInputs<value_idx, value_t>> {
+  void SetUp() override
+  {
+    params = ::testing::TestWithParam<KNNGraphInputs<value_idx, value_t>>::GetParam();
 
     raft::handle_t handle;
 
     auto alloc = handle.get_device_allocator();
-    stream = handle.get_stream();
+    stream     = handle.get_stream();
 
     out = new raft::sparse::COO<value_t, value_idx>(alloc, stream);
 
@@ -74,8 +74,7 @@ class KNNGraphTest
     update_device(X, params.X.data(), params.X.size(), stream);
 
     raft::sparse::selection::knn_graph(
-      handle, X, params.m, params.n, raft::distance::DistanceType::L2Unexpanded,
-      *out);
+      handle, X, params.m, params.n, raft::distance::DistanceType::L2Unexpanded, *out);
 
     rmm::device_uvector<value_idx> sum(1, stream);
 
@@ -91,7 +90,8 @@ class KNNGraphTest
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(X));
 
     delete out;
@@ -101,9 +101,9 @@ class KNNGraphTest
   cudaStream_t stream;
 
   // input data
-  raft::sparse::COO<value_t, value_idx> *out;
+  raft::sparse::COO<value_t, value_idx>* out;
 
-  value_t *X;
+  value_t* X;
 
   value_idx sum_h;
 
@@ -115,13 +115,15 @@ const std::vector<KNNGraphInputs<int, float>> knn_graph_inputs_fint = {
   {4, 2, {0, 100, 0.01, 0.02, 5000, 10000, -5, -2}, 2}};
 
 typedef KNNGraphTest<int, float> KNNGraphTestF_int;
-TEST_P(KNNGraphTestF_int, Result) {
+TEST_P(KNNGraphTestF_int, Result)
+{
   // nnz should not be larger than twice m * k
   ASSERT_TRUE(out->nnz <= (params.m * params.k * 2));
   ASSERT_TRUE(sum_h == 0);
 }
 
-INSTANTIATE_TEST_CASE_P(KNNGraphTest, KNNGraphTestF_int,
+INSTANTIATE_TEST_CASE_P(KNNGraphTest,
+                        KNNGraphTestF_int,
                         ::testing::ValuesIn(knn_graph_inputs_fint));
 
 }  // namespace sparse
diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/sparse/linkage.cu
index ce567e4298..0ca7cec4e9 100644
--- a/cpp/test/sparse/linkage.cu
+++ b/cpp/test/sparse/linkage.cu
@@ -55,45 +55,44 @@ struct LinkageInputs {
  * @param b: number of pairs of points that both the clusters have classified differently
  */
 template <typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y>
-__global__ void computeTheNumerator(const T* firstClusterArray,
-                                    const T* secondClusterArray, uint64_t size,
-                                    uint64_t* a, uint64_t* b) {
-  //calculating the indices of pairs of datapoints compared by the current thread
+__global__ void computeTheNumerator(
+  const T* firstClusterArray, const T* secondClusterArray, uint64_t size, uint64_t* a, uint64_t* b)
+{
+  // calculating the indices of pairs of datapoints compared by the current thread
   uint64_t j = threadIdx.x + blockIdx.x * blockDim.x;
   uint64_t i = threadIdx.y + blockIdx.y * blockDim.y;
 
-  //thread-local variables to count a and b
+  // thread-local variables to count a and b
   uint64_t myA = 0, myB = 0;
 
   if (i < size && j < size && j < i) {
-    //checking if the pair have been classified the same by both the clusters
+    // checking if the pair have been classified the same by both the clusters
     if (firstClusterArray[i] == firstClusterArray[j] &&
         secondClusterArray[i] == secondClusterArray[j]) {
       ++myA;
     }
 
-    //checking if the pair have been classified differently by both the clusters
+    // checking if the pair have been classified differently by both the clusters
     else if (firstClusterArray[i] != firstClusterArray[j] &&
              secondClusterArray[i] != secondClusterArray[j]) {
       ++myB;
     }
   }
 
-  //specialize blockReduce for a 2D block of 1024 threads of type uint64_t
-  typedef cub::BlockReduce<uint64_t, BLOCK_DIM_X,
-                           cub::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y>
+  // specialize blockReduce for a 2D block of 1024 threads of type uint64_t
+  typedef cub::BlockReduce<uint64_t, BLOCK_DIM_X, cub::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y>
     BlockReduce;
 
-  //Allocate shared memory for blockReduce
+  // Allocate shared memory for blockReduce
   __shared__ typename BlockReduce::TempStorage temp_storage;
 
-  //summing up thread-local counts specific to a block
+  // summing up thread-local counts specific to a block
   myA = BlockReduce(temp_storage).Sum(myA);
   __syncthreads();
   myB = BlockReduce(temp_storage).Sum(myB);
   __syncthreads();
 
-  //executed once per block
+  // executed once per block
   if (threadIdx.x == 0 && threadIdx.y == 0) {
     raft::myAtomicAdd<unsigned long long int>((unsigned long long int*)a, myA);
     raft::myAtomicAdd<unsigned long long int>((unsigned long long int*)b, myB);
@@ -101,102 +100,105 @@ __global__ void computeTheNumerator(const T* firstClusterArray,
 }
 
 /**
-* @brief Function to calculate RandIndex
-* <a href="https://en.wikipedia.org/wiki/Rand_index">more info on rand index</a>
-* @param firstClusterArray: the array of classes of type T
-* @param secondClusterArray: the array of classes of type T
-* @param size: the size of the data points of type uint64_t
-* @param allocator: object that takes care of temporary device memory allocation of type std::shared_ptr<MLCommon::deviceAllocator>
-* @param stream: the cudaStream object
-*/
+ * @brief Function to calculate RandIndex
+ * <a href="https://en.wikipedia.org/wiki/Rand_index">more info on rand index</a>
+ * @param firstClusterArray: the array of classes of type T
+ * @param secondClusterArray: the array of classes of type T
+ * @param size: the size of the data points of type uint64_t
+ * @param allocator: object that takes care of temporary device memory allocation of type
+ * std::shared_ptr<MLCommon::deviceAllocator>
+ * @param stream: the cudaStream object
+ */
 template <typename T>
-double compute_rand_index(
-  T* firstClusterArray, T* secondClusterArray, uint64_t size,
-  std::shared_ptr<raft::mr::device::allocator> allocator, cudaStream_t stream) {
-  //rand index for size less than 2 is not defined
+double compute_rand_index(T* firstClusterArray,
+                          T* secondClusterArray,
+                          uint64_t size,
+                          std::shared_ptr<raft::mr::device::allocator> allocator,
+                          cudaStream_t stream)
+{
+  // rand index for size less than 2 is not defined
   ASSERT(size >= 2, "Rand Index for size less than 2 not defined!");
 
-  //allocating and initializing memory for a and b in the GPU
+  // allocating and initializing memory for a and b in the GPU
   raft::mr::device::buffer<uint64_t> arr_buf(allocator, stream, 2);
   CUDA_CHECK(cudaMemsetAsync(arr_buf.data(), 0, 2 * sizeof(uint64_t), stream));
 
-  //kernel configuration
+  // kernel configuration
   static const int BLOCK_DIM_Y = 16, BLOCK_DIM_X = 16;
   dim3 numThreadsPerBlock(BLOCK_DIM_X, BLOCK_DIM_Y);
   dim3 numBlocks(raft::ceildiv<int>(size, numThreadsPerBlock.x),
                  raft::ceildiv<int>(size, numThreadsPerBlock.y));
 
-  //calling the kernel
-  computeTheNumerator<T, BLOCK_DIM_X, BLOCK_DIM_Y>
-    <<<numBlocks, numThreadsPerBlock, 0, stream>>>(
-      firstClusterArray, secondClusterArray, size, arr_buf.data(),
-      arr_buf.data() + 1);
+  // calling the kernel
+  computeTheNumerator<T, BLOCK_DIM_X, BLOCK_DIM_Y><<<numBlocks, numThreadsPerBlock, 0, stream>>>(
+    firstClusterArray, secondClusterArray, size, arr_buf.data(), arr_buf.data() + 1);
 
-  //synchronizing and updating the calculated values of a and b from device to host
+  // synchronizing and updating the calculated values of a and b from device to host
   uint64_t ab_host[2] = {0};
   raft::update_host(ab_host, arr_buf.data(), 2, stream);
   CUDA_CHECK(cudaStreamSynchronize(stream));
 
-  //error handling
+  // error handling
   CUDA_CHECK(cudaGetLastError());
 
-  //denominator
+  // denominator
   uint64_t nChooseTwo = size * (size - 1) / 2;
 
-  //calculating the rand_index
+  // calculating the rand_index
   return (double)(((double)(ab_host[0] + ab_host[1])) / (double)nChooseTwo);
 }
 
 template <typename T, typename IdxT>
-::std::ostream& operator<<(::std::ostream& os,
-                           const LinkageInputs<T, IdxT>& dims) {
+::std::ostream& operator<<(::std::ostream& os, const LinkageInputs<T, IdxT>& dims)
+{
   return os;
 }
 
 template <typename T, typename IdxT>
 class LinkageTest : public ::testing::TestWithParam<LinkageInputs<T, IdxT>> {
  protected:
-  void basicTest() {
+  void basicTest()
+  {
     raft::handle_t handle;
 
     params = ::testing::TestWithParam<LinkageInputs<T, IdxT>>::GetParam();
 
-    rmm::device_uvector<T> data(params.n_row * params.n_col,
-                                handle.get_stream());
+    rmm::device_uvector<T> data(params.n_row * params.n_col, handle.get_stream());
 
     // Allocate result labels and expected labels on device
     raft::allocate(labels, params.n_row);
     raft::allocate(labels_ref, params.n_row);
 
-    raft::copy(data.data(), params.data.data(), data.size(),
-               handle.get_stream());
-    raft::copy(labels_ref, params.expected_labels.data(), params.n_row,
-               handle.get_stream());
+    raft::copy(data.data(), params.data.data(), data.size(), handle.get_stream());
+    raft::copy(labels_ref, params.expected_labels.data(), params.n_row, handle.get_stream());
 
     raft::hierarchy::linkage_output<IdxT, T> out_arrs;
     out_arrs.labels = labels;
 
-    rmm::device_uvector<IdxT> out_children(params.n_row * 2,
-                                           handle.get_stream());
+    rmm::device_uvector<IdxT> out_children(params.n_row * 2, handle.get_stream());
 
     out_arrs.children = out_children.data();
 
-    raft::hierarchy::single_linkage<
-      IdxT, T, raft::hierarchy::LinkageDistance::KNN_GRAPH>(
-      handle, data.data(), params.n_row, params.n_col,
-      raft::distance::DistanceType::L2SqrtExpanded, &out_arrs, params.c,
+    raft::hierarchy::single_linkage<IdxT, T, raft::hierarchy::LinkageDistance::KNN_GRAPH>(
+      handle,
+      data.data(),
+      params.n_row,
+      params.n_col,
+      raft::distance::DistanceType::L2SqrtExpanded,
+      &out_arrs,
+      params.c,
       params.n_clusters);
 
     CUDA_CHECK(cudaStreamSynchronize(handle.get_stream()));
 
-    score =
-      compute_rand_index(labels, labels_ref, params.n_row,
-                         handle.get_device_allocator(), handle.get_stream());
+    score = compute_rand_index(
+      labels, labels_ref, params.n_row, handle.get_device_allocator(), handle.get_stream());
   }
 
   void SetUp() override { basicTest(); }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(labels));
     CUDA_CHECK(cudaFree(labels_ref));
   }
@@ -212,14 +214,12 @@ const std::vector<LinkageInputs<float, int>> linkage_inputsf2 = {
   // Test n_clusters == n_points
   {10,
    5,
-   {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392,
-    0.77782677, 0.43772379, 0.4035871,  0.3282796,  0.47544681, 0.59862974,
-    0.12319357, 0.06239463, 0.28200272, 0.1345717,  0.50498218, 0.5113505,
-    0.16233086, 0.62165332, 0.42281548, 0.933117,   0.41386077, 0.23264562,
-    0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674,
-    0.84854131, 0.28890216, 0.85267903, 0.74703138, 0.83842071, 0.34942792,
-    0.27864171, 0.70911132, 0.21338564, 0.32035554, 0.73788331, 0.46926692,
-    0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396,
+   {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, 0.77782677, 0.43772379,
+    0.4035871,  0.3282796,  0.47544681, 0.59862974, 0.12319357, 0.06239463, 0.28200272, 0.1345717,
+    0.50498218, 0.5113505,  0.16233086, 0.62165332, 0.42281548, 0.933117,   0.41386077, 0.23264562,
+    0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674,  0.84854131, 0.28890216,
+    0.85267903, 0.74703138, 0.83842071, 0.34942792, 0.27864171, 0.70911132, 0.21338564, 0.32035554,
+    0.73788331, 0.46926692, 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396,
     0.76166195, 0.66613745},
    {9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
    10,
@@ -227,8 +227,7 @@ const std::vector<LinkageInputs<float, int>> linkage_inputsf2 = {
   //  // Test outlier points
   {9,
    2,
-   {-1, -50, 3, 4, 5000, 10000, 1, 3, 4, 5, 0.000005, 0.00002, 2000000, 500000,
-    10, 50, 30, 5},
+   {-1, -50, 3, 4, 5000, 10000, 1, 3, 4, 5, 0.000005, 0.00002, 2000000, 500000, 10, 50, 30, 5},
    {6, 0, 5, 0, 0, 4, 3, 2, 1},
    7,
    -1},
@@ -236,14 +235,12 @@ const std::vector<LinkageInputs<float, int>> linkage_inputsf2 = {
   // Test n_clusters == (n_points / 2)
   {10,
    5,
-   {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392,
-    0.77782677, 0.43772379, 0.4035871,  0.3282796,  0.47544681, 0.59862974,
-    0.12319357, 0.06239463, 0.28200272, 0.1345717,  0.50498218, 0.5113505,
-    0.16233086, 0.62165332, 0.42281548, 0.933117,   0.41386077, 0.23264562,
-    0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674,
-    0.84854131, 0.28890216, 0.85267903, 0.74703138, 0.83842071, 0.34942792,
-    0.27864171, 0.70911132, 0.21338564, 0.32035554, 0.73788331, 0.46926692,
-    0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396,
+   {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, 0.77782677, 0.43772379,
+    0.4035871,  0.3282796,  0.47544681, 0.59862974, 0.12319357, 0.06239463, 0.28200272, 0.1345717,
+    0.50498218, 0.5113505,  0.16233086, 0.62165332, 0.42281548, 0.933117,   0.41386077, 0.23264562,
+    0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674,  0.84854131, 0.28890216,
+    0.85267903, 0.74703138, 0.83842071, 0.34942792, 0.27864171, 0.70911132, 0.21338564, 0.32035554,
+    0.73788331, 0.46926692, 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396,
     0.76166195, 0.66613745},
    {1, 0, 4, 0, 0, 3, 2, 0, 2, 1},
    5,
@@ -252,340 +249,173 @@ const std::vector<LinkageInputs<float, int>> linkage_inputsf2 = {
   // Test n_points == 100
   {100,
    10,
-   {6.26168372e-01, 9.30437651e-01, 6.02450208e-01,
-    2.73025296e-01, 9.53050619e-01, 3.32164396e-01,
-    6.88942598e-01, 5.79163537e-01, 6.70341547e-01,
-    2.70140602e-02, 9.30429671e-01, 7.17721157e-01,
-    9.89948537e-01, 7.75253347e-01, 1.34491522e-02,
-    2.48522428e-02, 3.51413378e-01, 7.64405834e-01,
-    7.86373507e-01, 7.18748577e-01, 8.66998621e-01,
-    6.80316582e-01, 2.51288712e-01, 4.91078420e-01,
-    3.76246281e-01, 4.86828710e-01, 5.67464772e-01,
-    5.30734742e-01, 8.99478296e-01, 7.66699088e-01,
-    9.49339111e-01, 3.55248484e-01, 9.06046929e-01,
-    4.48407772e-01, 6.96395305e-01, 2.44277335e-01,
-    7.74840000e-01, 5.21046603e-01, 4.66423971e-02,
-    5.12019638e-02, 8.95019614e-01, 5.28956953e-01,
-    4.31536306e-01, 5.83857744e-01, 4.41787364e-01,
-    4.68656523e-01, 5.73971433e-01, 6.79989654e-01,
-    3.19650588e-01, 6.12579596e-01, 6.49126442e-02,
-    8.39131142e-01, 2.85252117e-01, 5.84848929e-01,
-    9.46507115e-01, 8.58440748e-01, 3.61528940e-01,
-    2.44215959e-01, 3.80101125e-01, 4.57128957e-02,
-    8.82216988e-01, 8.31498633e-01, 7.23474381e-01,
-    7.75788607e-01, 1.40864146e-01, 6.62092382e-01,
-    5.13985168e-01, 3.00686418e-01, 8.70109949e-01,
-    2.43187753e-01, 2.89391938e-01, 2.84214238e-01,
-    8.70985521e-01, 8.77491176e-01, 6.72537226e-01,
-    3.30929686e-01, 1.85934324e-01, 9.16222614e-01,
-    6.18239142e-01, 2.64768597e-01, 5.76145451e-01,
-    8.62961369e-01, 6.84757925e-01, 7.60549082e-01,
-    1.27645356e-01, 4.51004673e-01, 3.92292980e-01,
-    4.63170803e-01, 4.35449330e-02, 2.17583404e-01,
-    5.71832605e-02, 2.06763039e-01, 3.70116249e-01,
-    2.09750028e-01, 6.17283019e-01, 8.62549231e-01,
-    9.84156240e-02, 2.66249156e-01, 3.87635103e-01,
-    2.85591012e-02, 4.24826068e-01, 4.45795088e-01,
-    6.86227676e-01, 1.08848960e-01, 5.96731841e-02,
-    3.71770228e-01, 1.91548833e-01, 6.95136078e-01,
-    9.00700636e-01, 8.76363105e-01, 2.67334632e-01,
-    1.80619709e-01, 7.94060419e-01, 1.42854171e-02,
-    1.09372387e-01, 8.74028108e-01, 6.46403232e-01,
-    4.86588834e-01, 5.93446175e-02, 6.11886291e-01,
-    8.83865057e-01, 3.15879821e-01, 2.27043992e-01,
-    9.76764951e-01, 6.15620336e-01, 9.76199360e-01,
-    2.40548962e-01, 3.21795663e-01, 8.75087904e-02,
-    8.11234663e-01, 6.96070480e-01, 8.12062321e-01,
-    1.21958818e-01, 3.44348628e-02, 8.72630414e-01,
-    3.06162776e-01, 1.76043529e-02, 9.45894971e-01,
-    5.33896401e-01, 6.21642973e-01, 4.93062535e-01,
-    4.48984262e-01, 2.24560379e-01, 4.24052195e-02,
-    4.43447610e-01, 8.95646149e-01, 6.05220676e-01,
-    1.81840491e-01, 9.70831206e-01, 2.12563586e-02,
-    6.92582693e-01, 7.55946922e-01, 7.95086143e-01,
-    6.05328941e-01, 3.99350764e-01, 4.32846636e-01,
-    9.81114529e-01, 4.98266428e-01, 6.37127930e-03,
-    1.59085889e-01, 6.34682067e-05, 5.59429440e-01,
-    7.38827633e-01, 8.93214770e-01, 2.16494306e-01,
-    9.35430573e-02, 4.75665868e-02, 7.80503518e-01,
-    7.86240041e-01, 7.06854594e-01, 2.13725879e-02,
-    7.68246091e-01, 4.50234808e-01, 5.21231104e-01,
-    5.01989826e-03, 4.22081572e-02, 1.65337732e-01,
-    8.54134740e-01, 4.99430262e-01, 8.94525601e-01,
-    1.14028379e-01, 3.69739861e-01, 1.32955599e-01,
-    2.65563824e-01, 2.52811151e-01, 1.44792843e-01,
-    6.88449594e-01, 4.44921417e-01, 8.23296587e-01,
-    1.93266317e-01, 1.19033309e-01, 1.36368966e-01,
-    3.42600285e-01, 5.64505195e-01, 5.57594559e-01,
-    7.44257892e-01, 8.38231569e-02, 4.11548847e-01,
-    3.21010077e-01, 8.55081359e-01, 4.30105779e-01,
-    1.16229135e-01, 9.87731964e-02, 3.14712335e-01,
-    4.50880592e-01, 2.72289598e-01, 6.31615256e-01,
-    8.97432958e-01, 4.44764250e-01, 8.03776440e-01,
-    2.68767748e-02, 2.43374608e-01, 4.02141103e-01,
-    4.98881209e-01, 5.33173003e-01, 8.82890436e-01,
-    7.16149148e-01, 4.19664401e-01, 2.29335357e-01,
-    2.88637806e-01, 3.44696803e-01, 6.78171906e-01,
-    5.69849716e-01, 5.86454477e-01, 3.54474989e-01,
-    9.03876540e-01, 6.45980000e-01, 6.34887593e-01,
-    7.88039746e-02, 2.04814126e-01, 7.82251754e-01,
-    2.43147074e-01, 7.50951808e-01, 1.72799092e-02,
-    2.95349590e-01, 6.57991826e-01, 8.81214312e-01,
-    5.73970708e-01, 2.77610881e-01, 1.82155097e-01,
-    7.69797417e-02, 6.44792402e-01, 9.46950998e-01,
-    7.73064845e-01, 6.04733624e-01, 5.80094567e-01,
-    1.67498426e-01, 2.66514296e-01, 6.50140368e-01,
-    1.91170299e-01, 2.08752199e-01, 3.01664091e-01,
-    9.85033484e-01, 2.92909152e-01, 8.65816607e-01,
-    1.85222119e-01, 2.28814559e-01, 1.34286382e-02,
-    2.89234322e-01, 8.18668708e-01, 4.71706924e-01,
-    9.23199803e-01, 2.80879188e-01, 1.47319284e-01,
-    4.13915748e-01, 9.31274932e-02, 6.66322195e-01,
-    9.66953974e-01, 3.19405786e-01, 6.69486551e-01,
-    5.03096313e-02, 6.95225201e-01, 5.78469859e-01,
-    6.29481655e-01, 1.39252534e-01, 1.22564968e-01,
-    6.80663678e-01, 6.34607157e-01, 6.42765834e-01,
-    1.57127410e-02, 2.92132086e-01, 5.24423878e-01,
-    4.68676824e-01, 2.86003928e-01, 7.18608322e-01,
-    8.95617933e-01, 5.48844309e-01, 1.74517278e-01,
-    5.24379196e-01, 2.13526524e-01, 5.88375435e-01,
-    9.88560185e-01, 4.17435771e-01, 6.14438688e-01,
-    9.53760881e-01, 5.27151288e-01, 7.03017278e-01,
-    3.44448559e-01, 4.47059676e-01, 2.83414901e-01,
-    1.98979011e-01, 4.24917361e-01, 5.73172761e-01,
-    2.32398853e-02, 1.65887230e-01, 4.05552785e-01,
-    9.29665524e-01, 2.26135696e-01, 9.20563384e-01,
-    7.65259963e-01, 4.54820075e-01, 8.97710267e-01,
-    3.78559302e-03, 9.15219382e-01, 3.55705698e-01,
-    6.94905124e-01, 8.58540202e-01, 3.89790666e-01,
-    2.49478206e-01, 7.93679304e-01, 4.75830027e-01,
-    4.40425353e-01, 3.70579459e-01, 1.40578049e-01,
-    1.70386675e-01, 7.04056121e-01, 4.85963102e-01,
-    9.68450060e-01, 6.77178001e-01, 2.65934654e-01,
-    2.58915007e-01, 6.70052890e-01, 2.61945109e-01,
-    8.46207759e-01, 1.01928951e-01, 2.85611334e-01,
-    2.45776933e-01, 2.66658783e-01, 3.71724077e-01,
-    4.34319025e-01, 4.24407347e-01, 7.15417683e-01,
-    8.07997684e-01, 1.64296275e-01, 6.01638065e-01,
-    8.60606804e-02, 2.68719187e-01, 5.11764101e-01,
-    9.75844338e-01, 7.81226782e-01, 2.20925515e-01,
-    7.18135040e-01, 9.82395577e-01, 8.39160243e-01,
-    9.08058083e-01, 6.88010677e-01, 8.14271847e-01,
-    5.12460821e-01, 1.17311345e-01, 5.96075228e-01,
-    9.17455497e-01, 2.12052706e-01, 7.04074603e-01,
-    8.72872565e-02, 8.76047818e-01, 6.96235046e-01,
-    8.54801557e-01, 2.49729159e-01, 9.76594604e-01,
-    2.87386363e-01, 2.36461559e-02, 9.94075254e-01,
-    4.25193986e-01, 7.61869994e-01, 5.13334255e-01,
-    6.44711165e-02, 8.92156689e-01, 3.55235167e-01,
-    1.08154647e-01, 8.78446825e-01, 2.43833016e-01,
-    9.23071293e-01, 2.72724115e-01, 9.46631338e-01,
-    3.74510294e-01, 4.08451278e-02, 9.78392777e-01,
-    3.65079221e-01, 6.37199516e-01, 5.51144906e-01,
-    5.25978080e-01, 1.42803678e-01, 4.05451674e-01,
-    7.79788219e-01, 6.26009784e-01, 3.35249497e-01,
-    1.43159543e-02, 1.80363779e-01, 5.05096904e-01,
-    2.82619947e-01, 5.83561392e-01, 3.10951324e-01,
-    8.73223968e-01, 4.38545619e-01, 4.81348800e-01,
-    6.68497085e-01, 3.79345401e-01, 9.58832501e-01,
-    1.89869550e-01, 2.34083070e-01, 2.94066207e-01,
-    5.74892667e-02, 6.92106828e-02, 9.61127686e-02,
-    6.72650672e-02, 8.47345378e-01, 2.80916761e-01,
-    7.32177357e-03, 9.80785961e-01, 5.73192225e-02,
-    8.48781331e-01, 8.83225408e-01, 7.34398275e-01,
-    7.70381941e-01, 6.20778343e-01, 8.96822048e-01,
-    5.40732486e-01, 3.69704071e-01, 5.77305837e-01,
-    2.08221827e-01, 7.34275341e-01, 1.06110900e-01,
-    3.49496706e-01, 8.34948910e-01, 1.56403291e-02,
-    6.78576376e-01, 8.96141268e-01, 5.94835119e-01,
-    1.43943153e-01, 3.49618530e-01, 2.10440392e-01,
-    3.46585620e-01, 1.05153093e-01, 3.45446174e-01,
-    2.72177079e-01, 7.07946300e-01, 4.33717726e-02,
-    3.31232203e-01, 3.91874320e-01, 4.76338141e-01,
-    6.22777789e-01, 2.95989228e-02, 4.32855769e-01,
-    7.61049310e-01, 3.63279149e-01, 9.47210350e-01,
-    6.43721247e-01, 6.58025802e-01, 1.05247633e-02,
-    5.29974442e-01, 7.30675767e-01, 4.30041079e-01,
-    6.62634841e-01, 8.25936616e-01, 9.91253704e-01,
-    6.79399281e-01, 5.44177006e-01, 7.52876048e-01,
-    3.32139049e-01, 7.98732398e-01, 7.38865223e-01,
-    9.16055132e-01, 6.11736493e-01, 9.63672879e-01,
-    1.83778839e-01, 7.27558919e-02, 5.91602822e-01,
-    3.25235484e-01, 2.34741217e-01, 9.52346277e-01,
-    9.18556407e-01, 9.35373324e-01, 6.89209070e-01,
-    2.56049054e-01, 6.17975395e-01, 7.82285691e-01,
-    9.84983432e-01, 6.62322741e-01, 2.04144457e-01,
-    3.98446577e-01, 1.38918297e-01, 3.05919921e-01,
-    3.14043787e-01, 5.91072666e-01, 7.44703771e-01,
-    8.92272567e-01, 9.78017873e-01, 9.01203161e-01,
-    1.41526372e-01, 4.14878484e-01, 6.80683651e-01,
-    5.01733152e-02, 8.14635389e-01, 2.27926375e-01,
-    9.03269815e-01, 8.68443745e-01, 9.86939190e-01,
-    7.40779486e-01, 2.61005311e-01, 3.19276232e-01,
-    9.69509248e-01, 1.11908818e-01, 4.49198556e-01,
-    1.27056715e-01, 3.84064823e-01, 5.14591811e-01,
-    2.10747488e-01, 9.53884090e-01, 8.43167950e-01,
-    4.51187972e-01, 3.75331782e-01, 6.23566461e-01,
-    3.55290379e-01, 2.95705968e-01, 1.69622690e-01,
-    1.42981830e-01, 2.72180991e-01, 9.46468040e-01,
-    3.70932500e-01, 9.94292830e-01, 4.62587505e-01,
-    7.14817405e-01, 2.45370540e-02, 3.00906377e-01,
-    5.75768304e-01, 9.71448393e-01, 6.95574827e-02,
-    3.93693854e-01, 5.29306116e-01, 5.04694554e-01,
-    6.73797120e-02, 6.76596969e-01, 5.50948898e-01,
-    3.24909641e-01, 7.70337719e-01, 6.51842631e-03,
-    3.03264879e-01, 7.61037886e-03, 2.72289601e-01,
-    1.50502041e-01, 6.71103888e-02, 7.41503703e-01,
-    1.92088941e-01, 2.19043977e-01, 9.09320161e-01,
-    2.37993569e-01, 6.18107973e-02, 8.31447852e-01,
-    2.23355609e-01, 1.84789435e-01, 4.16104518e-01,
-    4.21573859e-01, 8.72446305e-02, 2.97294197e-01,
-    4.50328256e-01, 8.72199917e-01, 2.51279916e-01,
-    4.86219272e-01, 7.57071329e-01, 4.85655942e-01,
-    1.06187277e-01, 4.92341327e-01, 1.46017513e-01,
-    5.25421017e-01, 4.22637906e-01, 2.24685018e-01,
-    8.72648431e-01, 5.54051490e-01, 1.80745062e-01,
-    2.12756336e-01, 5.20883169e-01, 7.60363654e-01,
-    8.30254678e-01, 5.00003328e-01, 4.69017439e-01,
-    6.38105527e-01, 3.50638261e-02, 5.22217353e-02,
-    9.06516882e-02, 8.52975842e-01, 1.19985883e-01,
-    3.74926753e-01, 6.50302066e-01, 1.98875727e-01,
-    6.28362507e-02, 4.32693501e-01, 3.10500685e-01,
-    6.20732833e-01, 4.58503272e-01, 3.20790034e-01,
-    7.91284868e-01, 7.93054570e-01, 2.93406765e-01,
-    8.95399023e-01, 1.06441034e-01, 7.53085241e-02,
-    8.67523104e-01, 1.47963482e-01, 1.25584706e-01,
-    3.81545040e-02, 6.34338619e-01, 1.76368938e-02,
-    5.75553531e-02, 5.31607516e-01, 2.63869588e-01,
-    9.41945823e-01, 9.24028838e-02, 5.21496463e-01,
-    7.74866558e-01, 5.65210610e-01, 7.28015327e-02,
-    6.51963790e-01, 8.94727453e-01, 4.49571590e-01,
-    1.29932405e-01, 8.64026259e-01, 9.92599934e-01,
-    7.43721560e-01, 8.87300215e-01, 1.06369925e-01,
-    8.11335531e-01, 7.87734900e-01, 9.87344678e-01,
-    5.32502820e-01, 4.42612382e-01, 9.64041183e-01,
-    1.66085871e-01, 1.12937664e-01, 5.24423470e-01,
-    6.54689333e-01, 4.59119726e-01, 5.22774091e-01,
-    3.08722276e-02, 6.26979315e-01, 4.49754105e-01,
-    8.07495757e-01, 2.34199499e-01, 1.67765675e-01,
-    9.22168418e-01, 3.73210378e-01, 8.04432575e-01,
-    5.61890354e-01, 4.47025593e-01, 6.43155678e-01,
-    2.40407640e-01, 5.91631279e-01, 1.59369206e-01,
-    7.75799090e-01, 8.32067212e-01, 5.59791576e-02,
-    6.39105224e-01, 4.85274738e-01, 2.12630838e-01,
-    2.81431312e-02, 7.16205363e-01, 6.83885011e-01,
-    5.23869697e-01, 9.99418314e-01, 8.35331599e-01,
-    4.69877463e-02, 6.74712562e-01, 7.99273684e-01,
-    2.77001890e-02, 5.75809742e-01, 2.78513031e-01,
-    8.36209905e-01, 7.25472379e-01, 4.87173943e-01,
-    7.88311357e-01, 9.64676177e-01, 1.75752651e-01,
-    4.98112580e-01, 8.08850418e-02, 6.40981131e-01,
-    4.06647450e-01, 8.46539387e-01, 2.12620694e-01,
-    9.11012851e-01, 8.25041445e-01, 8.90065575e-01,
-    9.63626055e-01, 5.96689242e-01, 1.63372670e-01,
-    4.51640148e-01, 3.43026542e-01, 5.80658851e-01,
-    2.82327625e-01, 4.75535418e-01, 6.27760926e-01,
-    8.46314115e-01, 9.61961932e-01, 3.19806094e-01,
-    5.05508062e-01, 5.28102944e-01, 6.13045057e-01,
-    7.44714938e-01, 1.50586073e-01, 7.91878033e-01,
-    4.89839179e-01, 3.10496849e-01, 8.82309038e-01,
-    2.86922314e-01, 4.84687559e-01, 5.20838630e-01,
-    4.62955493e-01, 2.38185305e-01, 5.47259907e-02,
-    7.10916137e-01, 7.31887202e-01, 6.25602317e-01,
-    8.77741168e-01, 4.19881322e-01, 4.81222328e-01,
-    1.28224501e-01, 2.46034010e-01, 3.34971854e-01,
-    7.37216484e-01, 5.62134821e-02, 7.14089724e-01,
-    9.85549393e-01, 4.66295827e-01, 3.08722434e-03,
-    4.70237690e-01, 2.66524167e-01, 7.93875484e-01,
-    4.54795911e-02, 8.09702944e-01, 1.47709735e-02,
-    1.70082405e-01, 6.35905179e-01, 3.75379109e-01,
-    4.30315011e-01, 3.15788760e-01, 5.58065230e-01,
-    2.24643800e-01, 2.42142981e-01, 6.57283636e-01,
-    3.34921891e-01, 1.26588975e-01, 7.68064155e-01,
-    9.43856291e-01, 4.47518596e-01, 5.44453573e-01,
-    9.95764932e-01, 7.16444391e-01, 8.51019765e-01,
-    1.01179183e-01, 4.45473958e-01, 4.60327322e-01,
-    4.96895844e-02, 4.72907738e-01, 5.58987444e-01,
-    3.41027487e-01, 1.56175026e-01, 7.58283148e-01,
-    6.83600909e-01, 2.14623396e-01, 3.27348880e-01,
-    3.92517893e-01, 6.70418431e-01, 5.16440832e-01,
-    8.63140348e-01, 5.73277464e-01, 3.46608058e-01,
-    7.39396341e-01, 7.20852434e-01, 2.35653246e-02,
-    3.89935659e-01, 7.53783745e-01, 6.34563528e-01,
-    8.79339335e-01, 7.41599159e-02, 5.62433904e-01,
-    6.15553852e-01, 4.56956324e-01, 5.20047447e-01,
-    5.26845015e-02, 5.58471266e-01, 1.63632233e-01,
-    5.38936665e-02, 6.49593683e-01, 2.56838748e-01,
-    8.99035326e-01, 7.20847756e-01, 5.68954684e-01,
-    7.43684755e-01, 5.70924238e-01, 3.82318724e-01,
-    4.89328290e-01, 5.62208561e-01, 4.97540804e-02,
-    4.18011085e-01, 6.88041565e-01, 2.16234653e-01,
-    7.89548214e-01, 8.46136387e-01, 8.46816189e-01,
-    1.73842353e-01, 6.11627842e-02, 8.44440559e-01,
-    4.50646654e-01, 3.74785037e-01, 4.87196697e-01,
-    4.56276448e-01, 9.13284391e-01, 4.15715464e-01,
-    7.13597697e-01, 1.23641270e-02, 5.10031271e-01,
-    4.74601930e-02, 2.55731159e-01, 3.22090006e-01,
-    1.91165703e-01, 4.51170940e-01, 7.50843157e-01,
-    4.42420576e-01, 4.25380660e-01, 4.50667257e-01,
-    6.55689206e-01, 9.68257670e-02, 1.96528793e-01,
-    8.97343028e-01, 4.99940904e-01, 6.65504083e-01,
-    9.41828079e-01, 4.54397338e-01, 5.61893331e-01,
-    5.09839880e-01, 4.53117514e-01, 8.96804127e-02,
-    1.74888861e-01, 6.65641378e-01, 2.81668336e-01,
-    1.89532742e-01, 5.61668382e-01, 8.68330157e-02,
-    8.25092797e-01, 5.18106324e-01, 1.71904024e-01,
-    3.68385523e-01, 1.62005436e-01, 7.48507399e-01,
-    9.30274827e-01, 2.38198517e-01, 9.52222901e-01,
-    5.23587800e-01, 6.94384557e-01, 1.09338652e-01,
-    4.83356794e-01, 2.73050402e-01, 3.68027050e-01,
-    5.92366466e-01, 1.83192289e-01, 8.60376029e-01,
-    7.13926203e-01, 8.16750052e-01, 1.57890291e-01,
-    6.25691951e-01, 5.24831646e-01, 1.73873797e-01,
-    1.02429784e-01, 9.17488471e-01, 4.03584434e-01,
-    9.31170884e-01, 2.79386137e-01, 8.77745206e-01,
-    2.45200576e-01, 1.28896951e-01, 3.15713052e-01,
-    5.27874291e-01, 2.16444335e-01, 7.03883817e-01,
-    7.74738919e-02, 8.42422142e-01, 3.75598924e-01,
-    3.51002411e-01, 6.22752776e-01, 4.82407943e-01,
-    7.43107867e-01, 9.46182666e-01, 9.44344819e-01,
-    3.28124763e-01, 1.06147431e-01, 1.65102684e-01,
-    3.84060507e-01, 2.91057722e-01, 7.68173662e-02,
-    1.03543651e-01, 6.76698940e-01, 1.43141994e-01,
-    7.21342202e-01, 6.69471294e-03, 9.07298311e-01,
-    5.57080171e-01, 8.10954489e-01, 4.11120526e-01,
-    2.06407453e-01, 2.59590556e-01, 7.58512718e-01,
-    5.79873897e-01, 2.92875650e-01, 2.83686529e-01,
-    2.42829343e-01, 9.19323719e-01, 3.46832864e-01,
-    3.58238858e-01, 7.42827585e-01, 2.05760059e-01,
-    9.58438860e-01, 5.66326411e-01, 6.60292846e-01,
-    5.61095078e-02, 6.79465531e-01, 7.05118513e-01,
-    4.44713264e-01, 2.09732933e-01, 5.22732436e-01,
-    1.74396512e-01, 5.29356748e-01, 4.38475687e-01,
-    4.94036404e-01, 4.09785794e-01, 6.40025507e-01,
-    5.79371821e-01, 1.57726118e-01, 6.04572263e-01,
-    5.41072639e-01, 5.18847173e-01, 1.97093284e-01,
-    8.91767002e-01, 4.29050835e-01, 8.25490570e-01,
-    3.87699807e-01, 4.50705808e-01, 2.49371643e-01,
-    3.36074898e-01, 9.29925118e-01, 6.65393649e-01,
-    9.07275994e-01, 3.73075859e-01, 4.14044139e-03,
-    2.37463702e-01, 2.25893784e-01, 2.46900245e-01,
-    4.50350196e-01, 3.48618117e-01, 5.07193932e-01,
-    5.23435142e-01, 8.13611417e-01, 8.92715622e-01,
-    1.02623450e-01, 3.06088345e-01, 7.80461650e-01,
-    2.21453645e-01, 2.01419652e-01, 2.84254457e-01,
-    3.68286735e-01, 7.39358243e-01, 8.97879394e-01,
-    9.81599566e-01, 7.56526442e-01, 7.37645545e-01,
-    4.23976657e-02, 8.25922012e-01, 2.60956996e-01,
-    2.90702065e-01, 8.98388344e-01, 3.03733299e-01,
-    8.49071471e-01, 3.45835425e-01, 7.65458276e-01,
-    5.68094872e-01, 8.93770930e-01, 9.93161641e-01,
-    5.63368667e-02, 4.26548945e-01, 5.46745780e-01,
-    5.75674571e-01, 7.94599487e-01, 7.18935553e-02,
-    4.46492976e-01, 6.40240123e-01, 2.73246969e-01,
-    2.00465968e-01, 1.30718835e-01, 1.92492005e-01,
-    1.96617189e-01, 6.61271644e-01, 8.12687657e-01,
-    8.66342445e-01
+   {6.26168372e-01, 9.30437651e-01, 6.02450208e-01, 2.73025296e-01, 9.53050619e-01, 3.32164396e-01,
+    6.88942598e-01, 5.79163537e-01, 6.70341547e-01, 2.70140602e-02, 9.30429671e-01, 7.17721157e-01,
+    9.89948537e-01, 7.75253347e-01, 1.34491522e-02, 2.48522428e-02, 3.51413378e-01, 7.64405834e-01,
+    7.86373507e-01, 7.18748577e-01, 8.66998621e-01, 6.80316582e-01, 2.51288712e-01, 4.91078420e-01,
+    3.76246281e-01, 4.86828710e-01, 5.67464772e-01, 5.30734742e-01, 8.99478296e-01, 7.66699088e-01,
+    9.49339111e-01, 3.55248484e-01, 9.06046929e-01, 4.48407772e-01, 6.96395305e-01, 2.44277335e-01,
+    7.74840000e-01, 5.21046603e-01, 4.66423971e-02, 5.12019638e-02, 8.95019614e-01, 5.28956953e-01,
+    4.31536306e-01, 5.83857744e-01, 4.41787364e-01, 4.68656523e-01, 5.73971433e-01, 6.79989654e-01,
+    3.19650588e-01, 6.12579596e-01, 6.49126442e-02, 8.39131142e-01, 2.85252117e-01, 5.84848929e-01,
+    9.46507115e-01, 8.58440748e-01, 3.61528940e-01, 2.44215959e-01, 3.80101125e-01, 4.57128957e-02,
+    8.82216988e-01, 8.31498633e-01, 7.23474381e-01, 7.75788607e-01, 1.40864146e-01, 6.62092382e-01,
+    5.13985168e-01, 3.00686418e-01, 8.70109949e-01, 2.43187753e-01, 2.89391938e-01, 2.84214238e-01,
+    8.70985521e-01, 8.77491176e-01, 6.72537226e-01, 3.30929686e-01, 1.85934324e-01, 9.16222614e-01,
+    6.18239142e-01, 2.64768597e-01, 5.76145451e-01, 8.62961369e-01, 6.84757925e-01, 7.60549082e-01,
+    1.27645356e-01, 4.51004673e-01, 3.92292980e-01, 4.63170803e-01, 4.35449330e-02, 2.17583404e-01,
+    5.71832605e-02, 2.06763039e-01, 3.70116249e-01, 2.09750028e-01, 6.17283019e-01, 8.62549231e-01,
+    9.84156240e-02, 2.66249156e-01, 3.87635103e-01, 2.85591012e-02, 4.24826068e-01, 4.45795088e-01,
+    6.86227676e-01, 1.08848960e-01, 5.96731841e-02, 3.71770228e-01, 1.91548833e-01, 6.95136078e-01,
+    9.00700636e-01, 8.76363105e-01, 2.67334632e-01, 1.80619709e-01, 7.94060419e-01, 1.42854171e-02,
+    1.09372387e-01, 8.74028108e-01, 6.46403232e-01, 4.86588834e-01, 5.93446175e-02, 6.11886291e-01,
+    8.83865057e-01, 3.15879821e-01, 2.27043992e-01, 9.76764951e-01, 6.15620336e-01, 9.76199360e-01,
+    2.40548962e-01, 3.21795663e-01, 8.75087904e-02, 8.11234663e-01, 6.96070480e-01, 8.12062321e-01,
+    1.21958818e-01, 3.44348628e-02, 8.72630414e-01, 3.06162776e-01, 1.76043529e-02, 9.45894971e-01,
+    5.33896401e-01, 6.21642973e-01, 4.93062535e-01, 4.48984262e-01, 2.24560379e-01, 4.24052195e-02,
+    4.43447610e-01, 8.95646149e-01, 6.05220676e-01, 1.81840491e-01, 9.70831206e-01, 2.12563586e-02,
+    6.92582693e-01, 7.55946922e-01, 7.95086143e-01, 6.05328941e-01, 3.99350764e-01, 4.32846636e-01,
+    9.81114529e-01, 4.98266428e-01, 6.37127930e-03, 1.59085889e-01, 6.34682067e-05, 5.59429440e-01,
+    7.38827633e-01, 8.93214770e-01, 2.16494306e-01, 9.35430573e-02, 4.75665868e-02, 7.80503518e-01,
+    7.86240041e-01, 7.06854594e-01, 2.13725879e-02, 7.68246091e-01, 4.50234808e-01, 5.21231104e-01,
+    5.01989826e-03, 4.22081572e-02, 1.65337732e-01, 8.54134740e-01, 4.99430262e-01, 8.94525601e-01,
+    1.14028379e-01, 3.69739861e-01, 1.32955599e-01, 2.65563824e-01, 2.52811151e-01, 1.44792843e-01,
+    6.88449594e-01, 4.44921417e-01, 8.23296587e-01, 1.93266317e-01, 1.19033309e-01, 1.36368966e-01,
+    3.42600285e-01, 5.64505195e-01, 5.57594559e-01, 7.44257892e-01, 8.38231569e-02, 4.11548847e-01,
+    3.21010077e-01, 8.55081359e-01, 4.30105779e-01, 1.16229135e-01, 9.87731964e-02, 3.14712335e-01,
+    4.50880592e-01, 2.72289598e-01, 6.31615256e-01, 8.97432958e-01, 4.44764250e-01, 8.03776440e-01,
+    2.68767748e-02, 2.43374608e-01, 4.02141103e-01, 4.98881209e-01, 5.33173003e-01, 8.82890436e-01,
+    7.16149148e-01, 4.19664401e-01, 2.29335357e-01, 2.88637806e-01, 3.44696803e-01, 6.78171906e-01,
+    5.69849716e-01, 5.86454477e-01, 3.54474989e-01, 9.03876540e-01, 6.45980000e-01, 6.34887593e-01,
+    7.88039746e-02, 2.04814126e-01, 7.82251754e-01, 2.43147074e-01, 7.50951808e-01, 1.72799092e-02,
+    2.95349590e-01, 6.57991826e-01, 8.81214312e-01, 5.73970708e-01, 2.77610881e-01, 1.82155097e-01,
+    7.69797417e-02, 6.44792402e-01, 9.46950998e-01, 7.73064845e-01, 6.04733624e-01, 5.80094567e-01,
+    1.67498426e-01, 2.66514296e-01, 6.50140368e-01, 1.91170299e-01, 2.08752199e-01, 3.01664091e-01,
+    9.85033484e-01, 2.92909152e-01, 8.65816607e-01, 1.85222119e-01, 2.28814559e-01, 1.34286382e-02,
+    2.89234322e-01, 8.18668708e-01, 4.71706924e-01, 9.23199803e-01, 2.80879188e-01, 1.47319284e-01,
+    4.13915748e-01, 9.31274932e-02, 6.66322195e-01, 9.66953974e-01, 3.19405786e-01, 6.69486551e-01,
+    5.03096313e-02, 6.95225201e-01, 5.78469859e-01, 6.29481655e-01, 1.39252534e-01, 1.22564968e-01,
+    6.80663678e-01, 6.34607157e-01, 6.42765834e-01, 1.57127410e-02, 2.92132086e-01, 5.24423878e-01,
+    4.68676824e-01, 2.86003928e-01, 7.18608322e-01, 8.95617933e-01, 5.48844309e-01, 1.74517278e-01,
+    5.24379196e-01, 2.13526524e-01, 5.88375435e-01, 9.88560185e-01, 4.17435771e-01, 6.14438688e-01,
+    9.53760881e-01, 5.27151288e-01, 7.03017278e-01, 3.44448559e-01, 4.47059676e-01, 2.83414901e-01,
+    1.98979011e-01, 4.24917361e-01, 5.73172761e-01, 2.32398853e-02, 1.65887230e-01, 4.05552785e-01,
+    9.29665524e-01, 2.26135696e-01, 9.20563384e-01, 7.65259963e-01, 4.54820075e-01, 8.97710267e-01,
+    3.78559302e-03, 9.15219382e-01, 3.55705698e-01, 6.94905124e-01, 8.58540202e-01, 3.89790666e-01,
+    2.49478206e-01, 7.93679304e-01, 4.75830027e-01, 4.40425353e-01, 3.70579459e-01, 1.40578049e-01,
+    1.70386675e-01, 7.04056121e-01, 4.85963102e-01, 9.68450060e-01, 6.77178001e-01, 2.65934654e-01,
+    2.58915007e-01, 6.70052890e-01, 2.61945109e-01, 8.46207759e-01, 1.01928951e-01, 2.85611334e-01,
+    2.45776933e-01, 2.66658783e-01, 3.71724077e-01, 4.34319025e-01, 4.24407347e-01, 7.15417683e-01,
+    8.07997684e-01, 1.64296275e-01, 6.01638065e-01, 8.60606804e-02, 2.68719187e-01, 5.11764101e-01,
+    9.75844338e-01, 7.81226782e-01, 2.20925515e-01, 7.18135040e-01, 9.82395577e-01, 8.39160243e-01,
+    9.08058083e-01, 6.88010677e-01, 8.14271847e-01, 5.12460821e-01, 1.17311345e-01, 5.96075228e-01,
+    9.17455497e-01, 2.12052706e-01, 7.04074603e-01, 8.72872565e-02, 8.76047818e-01, 6.96235046e-01,
+    8.54801557e-01, 2.49729159e-01, 9.76594604e-01, 2.87386363e-01, 2.36461559e-02, 9.94075254e-01,
+    4.25193986e-01, 7.61869994e-01, 5.13334255e-01, 6.44711165e-02, 8.92156689e-01, 3.55235167e-01,
+    1.08154647e-01, 8.78446825e-01, 2.43833016e-01, 9.23071293e-01, 2.72724115e-01, 9.46631338e-01,
+    3.74510294e-01, 4.08451278e-02, 9.78392777e-01, 3.65079221e-01, 6.37199516e-01, 5.51144906e-01,
+    5.25978080e-01, 1.42803678e-01, 4.05451674e-01, 7.79788219e-01, 6.26009784e-01, 3.35249497e-01,
+    1.43159543e-02, 1.80363779e-01, 5.05096904e-01, 2.82619947e-01, 5.83561392e-01, 3.10951324e-01,
+    8.73223968e-01, 4.38545619e-01, 4.81348800e-01, 6.68497085e-01, 3.79345401e-01, 9.58832501e-01,
+    1.89869550e-01, 2.34083070e-01, 2.94066207e-01, 5.74892667e-02, 6.92106828e-02, 9.61127686e-02,
+    6.72650672e-02, 8.47345378e-01, 2.80916761e-01, 7.32177357e-03, 9.80785961e-01, 5.73192225e-02,
+    8.48781331e-01, 8.83225408e-01, 7.34398275e-01, 7.70381941e-01, 6.20778343e-01, 8.96822048e-01,
+    5.40732486e-01, 3.69704071e-01, 5.77305837e-01, 2.08221827e-01, 7.34275341e-01, 1.06110900e-01,
+    3.49496706e-01, 8.34948910e-01, 1.56403291e-02, 6.78576376e-01, 8.96141268e-01, 5.94835119e-01,
+    1.43943153e-01, 3.49618530e-01, 2.10440392e-01, 3.46585620e-01, 1.05153093e-01, 3.45446174e-01,
+    2.72177079e-01, 7.07946300e-01, 4.33717726e-02, 3.31232203e-01, 3.91874320e-01, 4.76338141e-01,
+    6.22777789e-01, 2.95989228e-02, 4.32855769e-01, 7.61049310e-01, 3.63279149e-01, 9.47210350e-01,
+    6.43721247e-01, 6.58025802e-01, 1.05247633e-02, 5.29974442e-01, 7.30675767e-01, 4.30041079e-01,
+    6.62634841e-01, 8.25936616e-01, 9.91253704e-01, 6.79399281e-01, 5.44177006e-01, 7.52876048e-01,
+    3.32139049e-01, 7.98732398e-01, 7.38865223e-01, 9.16055132e-01, 6.11736493e-01, 9.63672879e-01,
+    1.83778839e-01, 7.27558919e-02, 5.91602822e-01, 3.25235484e-01, 2.34741217e-01, 9.52346277e-01,
+    9.18556407e-01, 9.35373324e-01, 6.89209070e-01, 2.56049054e-01, 6.17975395e-01, 7.82285691e-01,
+    9.84983432e-01, 6.62322741e-01, 2.04144457e-01, 3.98446577e-01, 1.38918297e-01, 3.05919921e-01,
+    3.14043787e-01, 5.91072666e-01, 7.44703771e-01, 8.92272567e-01, 9.78017873e-01, 9.01203161e-01,
+    1.41526372e-01, 4.14878484e-01, 6.80683651e-01, 5.01733152e-02, 8.14635389e-01, 2.27926375e-01,
+    9.03269815e-01, 8.68443745e-01, 9.86939190e-01, 7.40779486e-01, 2.61005311e-01, 3.19276232e-01,
+    9.69509248e-01, 1.11908818e-01, 4.49198556e-01, 1.27056715e-01, 3.84064823e-01, 5.14591811e-01,
+    2.10747488e-01, 9.53884090e-01, 8.43167950e-01, 4.51187972e-01, 3.75331782e-01, 6.23566461e-01,
+    3.55290379e-01, 2.95705968e-01, 1.69622690e-01, 1.42981830e-01, 2.72180991e-01, 9.46468040e-01,
+    3.70932500e-01, 9.94292830e-01, 4.62587505e-01, 7.14817405e-01, 2.45370540e-02, 3.00906377e-01,
+    5.75768304e-01, 9.71448393e-01, 6.95574827e-02, 3.93693854e-01, 5.29306116e-01, 5.04694554e-01,
+    6.73797120e-02, 6.76596969e-01, 5.50948898e-01, 3.24909641e-01, 7.70337719e-01, 6.51842631e-03,
+    3.03264879e-01, 7.61037886e-03, 2.72289601e-01, 1.50502041e-01, 6.71103888e-02, 7.41503703e-01,
+    1.92088941e-01, 2.19043977e-01, 9.09320161e-01, 2.37993569e-01, 6.18107973e-02, 8.31447852e-01,
+    2.23355609e-01, 1.84789435e-01, 4.16104518e-01, 4.21573859e-01, 8.72446305e-02, 2.97294197e-01,
+    4.50328256e-01, 8.72199917e-01, 2.51279916e-01, 4.86219272e-01, 7.57071329e-01, 4.85655942e-01,
+    1.06187277e-01, 4.92341327e-01, 1.46017513e-01, 5.25421017e-01, 4.22637906e-01, 2.24685018e-01,
+    8.72648431e-01, 5.54051490e-01, 1.80745062e-01, 2.12756336e-01, 5.20883169e-01, 7.60363654e-01,
+    8.30254678e-01, 5.00003328e-01, 4.69017439e-01, 6.38105527e-01, 3.50638261e-02, 5.22217353e-02,
+    9.06516882e-02, 8.52975842e-01, 1.19985883e-01, 3.74926753e-01, 6.50302066e-01, 1.98875727e-01,
+    6.28362507e-02, 4.32693501e-01, 3.10500685e-01, 6.20732833e-01, 4.58503272e-01, 3.20790034e-01,
+    7.91284868e-01, 7.93054570e-01, 2.93406765e-01, 8.95399023e-01, 1.06441034e-01, 7.53085241e-02,
+    8.67523104e-01, 1.47963482e-01, 1.25584706e-01, 3.81545040e-02, 6.34338619e-01, 1.76368938e-02,
+    5.75553531e-02, 5.31607516e-01, 2.63869588e-01, 9.41945823e-01, 9.24028838e-02, 5.21496463e-01,
+    7.74866558e-01, 5.65210610e-01, 7.28015327e-02, 6.51963790e-01, 8.94727453e-01, 4.49571590e-01,
+    1.29932405e-01, 8.64026259e-01, 9.92599934e-01, 7.43721560e-01, 8.87300215e-01, 1.06369925e-01,
+    8.11335531e-01, 7.87734900e-01, 9.87344678e-01, 5.32502820e-01, 4.42612382e-01, 9.64041183e-01,
+    1.66085871e-01, 1.12937664e-01, 5.24423470e-01, 6.54689333e-01, 4.59119726e-01, 5.22774091e-01,
+    3.08722276e-02, 6.26979315e-01, 4.49754105e-01, 8.07495757e-01, 2.34199499e-01, 1.67765675e-01,
+    9.22168418e-01, 3.73210378e-01, 8.04432575e-01, 5.61890354e-01, 4.47025593e-01, 6.43155678e-01,
+    2.40407640e-01, 5.91631279e-01, 1.59369206e-01, 7.75799090e-01, 8.32067212e-01, 5.59791576e-02,
+    6.39105224e-01, 4.85274738e-01, 2.12630838e-01, 2.81431312e-02, 7.16205363e-01, 6.83885011e-01,
+    5.23869697e-01, 9.99418314e-01, 8.35331599e-01, 4.69877463e-02, 6.74712562e-01, 7.99273684e-01,
+    2.77001890e-02, 5.75809742e-01, 2.78513031e-01, 8.36209905e-01, 7.25472379e-01, 4.87173943e-01,
+    7.88311357e-01, 9.64676177e-01, 1.75752651e-01, 4.98112580e-01, 8.08850418e-02, 6.40981131e-01,
+    4.06647450e-01, 8.46539387e-01, 2.12620694e-01, 9.11012851e-01, 8.25041445e-01, 8.90065575e-01,
+    9.63626055e-01, 5.96689242e-01, 1.63372670e-01, 4.51640148e-01, 3.43026542e-01, 5.80658851e-01,
+    2.82327625e-01, 4.75535418e-01, 6.27760926e-01, 8.46314115e-01, 9.61961932e-01, 3.19806094e-01,
+    5.05508062e-01, 5.28102944e-01, 6.13045057e-01, 7.44714938e-01, 1.50586073e-01, 7.91878033e-01,
+    4.89839179e-01, 3.10496849e-01, 8.82309038e-01, 2.86922314e-01, 4.84687559e-01, 5.20838630e-01,
+    4.62955493e-01, 2.38185305e-01, 5.47259907e-02, 7.10916137e-01, 7.31887202e-01, 6.25602317e-01,
+    8.77741168e-01, 4.19881322e-01, 4.81222328e-01, 1.28224501e-01, 2.46034010e-01, 3.34971854e-01,
+    7.37216484e-01, 5.62134821e-02, 7.14089724e-01, 9.85549393e-01, 4.66295827e-01, 3.08722434e-03,
+    4.70237690e-01, 2.66524167e-01, 7.93875484e-01, 4.54795911e-02, 8.09702944e-01, 1.47709735e-02,
+    1.70082405e-01, 6.35905179e-01, 3.75379109e-01, 4.30315011e-01, 3.15788760e-01, 5.58065230e-01,
+    2.24643800e-01, 2.42142981e-01, 6.57283636e-01, 3.34921891e-01, 1.26588975e-01, 7.68064155e-01,
+    9.43856291e-01, 4.47518596e-01, 5.44453573e-01, 9.95764932e-01, 7.16444391e-01, 8.51019765e-01,
+    1.01179183e-01, 4.45473958e-01, 4.60327322e-01, 4.96895844e-02, 4.72907738e-01, 5.58987444e-01,
+    3.41027487e-01, 1.56175026e-01, 7.58283148e-01, 6.83600909e-01, 2.14623396e-01, 3.27348880e-01,
+    3.92517893e-01, 6.70418431e-01, 5.16440832e-01, 8.63140348e-01, 5.73277464e-01, 3.46608058e-01,
+    7.39396341e-01, 7.20852434e-01, 2.35653246e-02, 3.89935659e-01, 7.53783745e-01, 6.34563528e-01,
+    8.79339335e-01, 7.41599159e-02, 5.62433904e-01, 6.15553852e-01, 4.56956324e-01, 5.20047447e-01,
+    5.26845015e-02, 5.58471266e-01, 1.63632233e-01, 5.38936665e-02, 6.49593683e-01, 2.56838748e-01,
+    8.99035326e-01, 7.20847756e-01, 5.68954684e-01, 7.43684755e-01, 5.70924238e-01, 3.82318724e-01,
+    4.89328290e-01, 5.62208561e-01, 4.97540804e-02, 4.18011085e-01, 6.88041565e-01, 2.16234653e-01,
+    7.89548214e-01, 8.46136387e-01, 8.46816189e-01, 1.73842353e-01, 6.11627842e-02, 8.44440559e-01,
+    4.50646654e-01, 3.74785037e-01, 4.87196697e-01, 4.56276448e-01, 9.13284391e-01, 4.15715464e-01,
+    7.13597697e-01, 1.23641270e-02, 5.10031271e-01, 4.74601930e-02, 2.55731159e-01, 3.22090006e-01,
+    1.91165703e-01, 4.51170940e-01, 7.50843157e-01, 4.42420576e-01, 4.25380660e-01, 4.50667257e-01,
+    6.55689206e-01, 9.68257670e-02, 1.96528793e-01, 8.97343028e-01, 4.99940904e-01, 6.65504083e-01,
+    9.41828079e-01, 4.54397338e-01, 5.61893331e-01, 5.09839880e-01, 4.53117514e-01, 8.96804127e-02,
+    1.74888861e-01, 6.65641378e-01, 2.81668336e-01, 1.89532742e-01, 5.61668382e-01, 8.68330157e-02,
+    8.25092797e-01, 5.18106324e-01, 1.71904024e-01, 3.68385523e-01, 1.62005436e-01, 7.48507399e-01,
+    9.30274827e-01, 2.38198517e-01, 9.52222901e-01, 5.23587800e-01, 6.94384557e-01, 1.09338652e-01,
+    4.83356794e-01, 2.73050402e-01, 3.68027050e-01, 5.92366466e-01, 1.83192289e-01, 8.60376029e-01,
+    7.13926203e-01, 8.16750052e-01, 1.57890291e-01, 6.25691951e-01, 5.24831646e-01, 1.73873797e-01,
+    1.02429784e-01, 9.17488471e-01, 4.03584434e-01, 9.31170884e-01, 2.79386137e-01, 8.77745206e-01,
+    2.45200576e-01, 1.28896951e-01, 3.15713052e-01, 5.27874291e-01, 2.16444335e-01, 7.03883817e-01,
+    7.74738919e-02, 8.42422142e-01, 3.75598924e-01, 3.51002411e-01, 6.22752776e-01, 4.82407943e-01,
+    7.43107867e-01, 9.46182666e-01, 9.44344819e-01, 3.28124763e-01, 1.06147431e-01, 1.65102684e-01,
+    3.84060507e-01, 2.91057722e-01, 7.68173662e-02, 1.03543651e-01, 6.76698940e-01, 1.43141994e-01,
+    7.21342202e-01, 6.69471294e-03, 9.07298311e-01, 5.57080171e-01, 8.10954489e-01, 4.11120526e-01,
+    2.06407453e-01, 2.59590556e-01, 7.58512718e-01, 5.79873897e-01, 2.92875650e-01, 2.83686529e-01,
+    2.42829343e-01, 9.19323719e-01, 3.46832864e-01, 3.58238858e-01, 7.42827585e-01, 2.05760059e-01,
+    9.58438860e-01, 5.66326411e-01, 6.60292846e-01, 5.61095078e-02, 6.79465531e-01, 7.05118513e-01,
+    4.44713264e-01, 2.09732933e-01, 5.22732436e-01, 1.74396512e-01, 5.29356748e-01, 4.38475687e-01,
+    4.94036404e-01, 4.09785794e-01, 6.40025507e-01, 5.79371821e-01, 1.57726118e-01, 6.04572263e-01,
+    5.41072639e-01, 5.18847173e-01, 1.97093284e-01, 8.91767002e-01, 4.29050835e-01, 8.25490570e-01,
+    3.87699807e-01, 4.50705808e-01, 2.49371643e-01, 3.36074898e-01, 9.29925118e-01, 6.65393649e-01,
+    9.07275994e-01, 3.73075859e-01, 4.14044139e-03, 2.37463702e-01, 2.25893784e-01, 2.46900245e-01,
+    4.50350196e-01, 3.48618117e-01, 5.07193932e-01, 5.23435142e-01, 8.13611417e-01, 8.92715622e-01,
+    1.02623450e-01, 3.06088345e-01, 7.80461650e-01, 2.21453645e-01, 2.01419652e-01, 2.84254457e-01,
+    3.68286735e-01, 7.39358243e-01, 8.97879394e-01, 9.81599566e-01, 7.56526442e-01, 7.37645545e-01,
+    4.23976657e-02, 8.25922012e-01, 2.60956996e-01, 2.90702065e-01, 8.98388344e-01, 3.03733299e-01,
+    8.49071471e-01, 3.45835425e-01, 7.65458276e-01, 5.68094872e-01, 8.93770930e-01, 9.93161641e-01,
+    5.63368667e-02, 4.26548945e-01, 5.46745780e-01, 5.75674571e-01, 7.94599487e-01, 7.18935553e-02,
+    4.46492976e-01, 6.40240123e-01, 2.73246969e-01, 2.00465968e-01, 1.30718835e-01, 1.92492005e-01,
+    1.96617189e-01, 6.61271644e-01, 8.12687657e-01, 8.66342445e-01
 
    },
    {0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -598,6 +428,5 @@ const std::vector<LinkageInputs<float, int>> linkage_inputsf2 = {
 typedef LinkageTest<float, int> LinkageTestF_Int;
 TEST_P(LinkageTestF_Int, Result) { EXPECT_TRUE(score == 1.0); }
 
-INSTANTIATE_TEST_CASE_P(LinkageTest, LinkageTestF_Int,
-                        ::testing::ValuesIn(linkage_inputsf2));
+INSTANTIATE_TEST_CASE_P(LinkageTest, LinkageTestF_Int, ::testing::ValuesIn(linkage_inputsf2));
 }  // end namespace raft
diff --git a/cpp/test/sparse/norm.cu b/cpp/test/sparse/norm.cu
index 7adbbf8b9a..4897d8194b 100644
--- a/cpp/test/sparse/norm.cu
+++ b/cpp/test/sparse/norm.cu
@@ -39,12 +39,11 @@ struct CSRRowNormalizeInputs {
 };
 
 template <typename Type_f, typename Index_>
-class CSRRowNormalizeTest
-  : public ::testing::TestWithParam<CSRRowNormalizeInputs<Type_f, Index_>> {
+class CSRRowNormalizeTest : public ::testing::TestWithParam<CSRRowNormalizeInputs<Type_f, Index_>> {
  protected:
-  void SetUp() override {
-    params = ::testing::TestWithParam<
-      CSRRowNormalizeInputs<Type_f, Index_>>::GetParam();
+  void SetUp() override
+  {
+    params = ::testing::TestWithParam<CSRRowNormalizeInputs<Type_f, Index_>>::GetParam();
     cudaStreamCreate(&stream);
 
     raft::allocate(in_vals, params.in_vals.size());
@@ -53,9 +52,10 @@ class CSRRowNormalizeTest
     raft::allocate(result, params.verify.size(), true);
   }
 
-  void Run() {
+  void Run()
+  {
     Index_ n_rows = params.ex_scan.size();
-    Index_ nnz = params.in_vals.size();
+    Index_ nnz    = params.in_vals.size();
 
     raft::update_device(ex_scan, params.ex_scan.data(), n_rows, stream);
     raft::update_device(in_vals, params.in_vals.data(), nnz, stream);
@@ -63,20 +63,18 @@ class CSRRowNormalizeTest
 
     switch (params.method) {
       case MAX:
-        linalg::csr_row_normalize_max<32, Type_f>(ex_scan, in_vals, nnz, n_rows,
-                                                  result, stream);
+        linalg::csr_row_normalize_max<32, Type_f>(ex_scan, in_vals, nnz, n_rows, result, stream);
         break;
       case L1:
-        linalg::csr_row_normalize_l1<32, Type_f>(ex_scan, in_vals, nnz, n_rows,
-                                                 result, stream);
+        linalg::csr_row_normalize_l1<32, Type_f>(ex_scan, in_vals, nnz, n_rows, result, stream);
         break;
     }
 
-    ASSERT_TRUE(
-      raft::devArrMatch<Type_f>(verify, result, nnz, raft::Compare<Type_f>()));
+    ASSERT_TRUE(raft::devArrMatch<Type_f>(verify, result, nnz, raft::Compare<Type_f>()));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(ex_scan));
     CUDA_CHECK(cudaFree(in_vals));
     CUDA_CHECK(cudaFree(verify));
@@ -87,7 +85,7 @@ class CSRRowNormalizeTest
  protected:
   CSRRowNormalizeInputs<Type_f, Index_> params;
   cudaStream_t stream;
-  Index_ *ex_scan;
+  Index_* ex_scan;
   Type_f *in_vals, *result, *verify;
 };
 
@@ -118,9 +116,11 @@ const std::vector<CSRRowNormalizeInputs<double, int>> csrnormalize_inputs_d = {
    {0.5, 0.5, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 1, 0.0}},
 };
 
-INSTANTIATE_TEST_CASE_P(SparseNormTest, CSRRowNormalizeTestF,
+INSTANTIATE_TEST_CASE_P(SparseNormTest,
+                        CSRRowNormalizeTestF,
                         ::testing::ValuesIn(csrnormalize_inputs_f));
-INSTANTIATE_TEST_CASE_P(SparseNormTest, CSRRowNormalizeTestD,
+INSTANTIATE_TEST_CASE_P(SparseNormTest,
+                        CSRRowNormalizeTestD,
                         ::testing::ValuesIn(csrnormalize_inputs_d));
 
 }  // namespace sparse
diff --git a/cpp/test/sparse/reduce.cu b/cpp/test/sparse/reduce.cu
index 50b5dc5993..44098214d2 100644
--- a/cpp/test/sparse/reduce.cu
+++ b/cpp/test/sparse/reduce.cu
@@ -42,19 +42,19 @@ struct SparseReduceInputs {
 };
 
 template <typename value_t, typename value_idx>
-class SparseReduceTest
-  : public ::testing::TestWithParam<SparseReduceInputs<value_t, value_idx>> {
+class SparseReduceTest : public ::testing::TestWithParam<SparseReduceInputs<value_t, value_idx>> {
  protected:
-  void SetUp() override {
-    params = ::testing::TestWithParam<
-      SparseReduceInputs<value_t, value_idx>>::GetParam();
+  void SetUp() override
+  {
+    params = ::testing::TestWithParam<SparseReduceInputs<value_t, value_idx>>::GetParam();
   }
 
-  void Run() {
+  void Run()
+  {
     raft::handle_t handle;
 
     auto d_alloc = handle.get_device_allocator();
-    auto stream = handle.get_stream();
+    auto stream  = handle.get_stream();
 
     rmm::device_uvector<value_idx> in_rows(params.in_rows.size(), stream);
     rmm::device_uvector<value_idx> in_cols(params.in_cols.size(), stream);
@@ -63,30 +63,29 @@ class SparseReduceTest
     rmm::device_uvector<value_idx> out_cols(params.out_cols.size(), stream);
     rmm::device_uvector<value_t> out_vals(params.out_vals.size(), stream);
 
-    raft::update_device(in_rows.data(), params.in_rows.data(),
-                        params.in_rows.size(), stream);
-    raft::update_device(in_cols.data(), params.in_cols.data(),
-                        params.in_cols.size(), stream);
-    raft::update_device(in_vals.data(), params.in_vals.data(),
-                        params.in_vals.size(), stream);
-    raft::update_device(out_rows.data(), params.out_rows.data(),
-                        params.out_rows.size(), stream);
-    raft::update_device(out_cols.data(), params.out_cols.data(),
-                        params.out_cols.size(), stream);
-    raft::update_device(out_vals.data(), params.out_vals.data(),
-                        params.out_vals.size(), stream);
+    raft::update_device(in_rows.data(), params.in_rows.data(), params.in_rows.size(), stream);
+    raft::update_device(in_cols.data(), params.in_cols.data(), params.in_cols.size(), stream);
+    raft::update_device(in_vals.data(), params.in_vals.data(), params.in_vals.size(), stream);
+    raft::update_device(out_rows.data(), params.out_rows.data(), params.out_rows.size(), stream);
+    raft::update_device(out_cols.data(), params.out_cols.data(), params.out_cols.size(), stream);
+    raft::update_device(out_vals.data(), params.out_vals.data(), params.out_vals.size(), stream);
 
     raft::sparse::COO<value_t, value_idx> out(d_alloc, stream);
-    raft::sparse::op::max_duplicates(handle, out, in_rows.data(),
-                                     in_cols.data(), in_vals.data(),
-                                     params.in_rows.size(), params.m, params.n);
+    raft::sparse::op::max_duplicates(handle,
+                                     out,
+                                     in_rows.data(),
+                                     in_cols.data(),
+                                     in_vals.data(),
+                                     params.in_rows.size(),
+                                     params.m,
+                                     params.n);
 
     ASSERT_TRUE(raft::devArrMatch<value_idx>(
       out_rows.data(), out.rows(), out.nnz, raft::Compare<value_idx>()));
     ASSERT_TRUE(raft::devArrMatch<value_idx>(
       out_cols.data(), out.cols(), out.nnz, raft::Compare<value_idx>()));
-    ASSERT_TRUE(raft::devArrMatch<value_t>(out_vals.data(), out.vals(), out.nnz,
-                                           raft::Compare<value_t>()));
+    ASSERT_TRUE(
+      raft::devArrMatch<value_t>(out_vals.data(), out.vals(), out.nnz, raft::Compare<value_t>()));
   }
 
   void TearDown() override {}
@@ -115,7 +114,8 @@ const std::vector<SparseReduceInputs<float, int>> max_reduce_inputs_f = {
    4},
 };
 
-INSTANTIATE_TEST_CASE_P(SparseReduceTest, SparseReduceTestF,
+INSTANTIATE_TEST_CASE_P(SparseReduceTest,
+                        SparseReduceTestF,
                         ::testing::ValuesIn(max_reduce_inputs_f));
 
 }  // namespace sparse
diff --git a/cpp/test/sparse/row_op.cu b/cpp/test/sparse/row_op.cu
index b64fa25883..feefa7baa3 100644
--- a/cpp/test/sparse/row_op.cu
+++ b/cpp/test/sparse/row_op.cu
@@ -38,43 +38,47 @@ struct CSRRowOpInputs {
 /** Wrapper to call csr_row_op because the enclosing function of a __device__
  *  lambda cannot have private ot protected access within the class. */
 template <typename Type_f, typename Index_>
-void csr_row_op_wrapper(const Index_ *row_ind, Index_ n_rows, Index_ nnz,
-                        Type_f *result, cudaStream_t stream) {
+void csr_row_op_wrapper(
+  const Index_* row_ind, Index_ n_rows, Index_ nnz, Type_f* result, cudaStream_t stream)
+{
   op::csr_row_op<Index_, 32>(
-    row_ind, n_rows, nnz,
+    row_ind,
+    n_rows,
+    nnz,
     [result] __device__(Index_ row, Index_ start_idx, Index_ stop_idx) {
-      for (Index_ i = start_idx; i < stop_idx; i++) result[i] = row;
+      for (Index_ i = start_idx; i < stop_idx; i++)
+        result[i] = row;
     },
     stream);
 }
 
 template <typename Type_f, typename Index_>
-class CSRRowOpTest
-  : public ::testing::TestWithParam<CSRRowOpInputs<Type_f, Index_>> {
+class CSRRowOpTest : public ::testing::TestWithParam<CSRRowOpInputs<Type_f, Index_>> {
  protected:
-  void SetUp() override {
-    params =
-      ::testing::TestWithParam<CSRRowOpInputs<Type_f, Index_>>::GetParam();
+  void SetUp() override
+  {
+    params = ::testing::TestWithParam<CSRRowOpInputs<Type_f, Index_>>::GetParam();
     cudaStreamCreate(&stream);
     n_rows = params.ex_scan.size();
-    nnz = params.verify.size();
+    nnz    = params.verify.size();
 
     raft::allocate(verify, nnz);
     raft::allocate(ex_scan, n_rows);
     raft::allocate(result, nnz, true);
   }
 
-  void Run() {
+  void Run()
+  {
     raft::update_device(ex_scan, params.ex_scan.data(), n_rows, stream);
     raft::update_device(verify, params.verify.data(), nnz, stream);
 
     csr_row_op_wrapper<Type_f, Index_>(ex_scan, n_rows, nnz, result, stream);
 
-    ASSERT_TRUE(
-      raft::devArrMatch<Type_f>(verify, result, nnz, raft::Compare<Type_f>()));
+    ASSERT_TRUE(raft::devArrMatch<Type_f>(verify, result, nnz, raft::Compare<Type_f>()));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(ex_scan));
     CUDA_CHECK(cudaFree(verify));
     CUDA_CHECK(cudaFree(result));
@@ -85,7 +89,7 @@ class CSRRowOpTest
   CSRRowOpInputs<Type_f, Index_> params;
   cudaStream_t stream;
   Index_ n_rows, nnz;
-  Index_ *ex_scan;
+  Index_* ex_scan;
   Type_f *result, *verify;
 };
 
@@ -102,10 +106,8 @@ const std::vector<CSRRowOpInputs<double, int>> csrrowop_inputs_d = {
   {{0, 4, 8, 9}, {0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 2.0, 3.0}},
 };
 
-INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestF,
-                        ::testing::ValuesIn(csrrowop_inputs_f));
-INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestD,
-                        ::testing::ValuesIn(csrrowop_inputs_d));
+INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestF, ::testing::ValuesIn(csrrowop_inputs_f));
+INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestD, ::testing::ValuesIn(csrrowop_inputs_d));
 
 }  // namespace sparse
 }  // namespace raft
diff --git a/cpp/test/sparse/selection.cu b/cpp/test/sparse/selection.cu
index 46f2f6a844..5d3b2a8317 100644
--- a/cpp/test/sparse/selection.cu
+++ b/cpp/test/sparse/selection.cu
@@ -45,8 +45,9 @@ struct SparseSelectionInputs {
 };
 
 template <typename value_idx, typename value_t>
-::std::ostream &operator<<(
-  ::std::ostream &os, const SparseSelectionInputs<value_idx, value_t> &dims) {
+::std::ostream& operator<<(::std::ostream& os,
+                           const SparseSelectionInputs<value_idx, value_t>& dims)
+{
   return os;
 }
 
@@ -54,7 +55,8 @@ template <typename value_idx, typename value_t>
 class SparseSelectionTest
   : public ::testing::TestWithParam<SparseSelectionInputs<value_idx, value_t>> {
  protected:
-  void make_data() {
+  void make_data()
+  {
     std::vector<value_t> dists_h = params.dists_h;
 
     allocate(dists, n_rows * n_cols);
@@ -63,42 +65,39 @@ class SparseSelectionTest
     allocate(inds, n_rows * n_cols);
     iota_fill(inds, n_rows, n_cols, stream);
 
-    std::vector<value_t> out_dists_ref_h = params.out_dists_ref_h;
+    std::vector<value_t> out_dists_ref_h     = params.out_dists_ref_h;
     std::vector<value_idx> out_indices_ref_h = params.out_indices_ref_h;
 
     allocate(out_indices_ref, out_indices_ref_h.size());
     allocate(out_dists_ref, out_dists_ref_h.size());
 
-    update_device(out_indices_ref, out_indices_ref_h.data(),
-                  out_indices_ref_h.size(), stream);
-    update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(),
-                  stream);
+    update_device(out_indices_ref, out_indices_ref_h.data(), out_indices_ref_h.size(), stream);
+    update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), stream);
 
     allocate(out_dists, n_rows * k);
     allocate(out_indices, n_rows * k);
   }
 
-  void SetUp() override {
-    params = ::testing::TestWithParam<
-      SparseSelectionInputs<value_idx, value_t>>::GetParam();
-    std::shared_ptr<raft::mr::device::allocator> alloc(
-      new raft::mr::device::default_allocator);
+  void SetUp() override
+  {
+    params = ::testing::TestWithParam<SparseSelectionInputs<value_idx, value_t>>::GetParam();
+    std::shared_ptr<raft::mr::device::allocator> alloc(new raft::mr::device::default_allocator);
     CUDA_CHECK(cudaStreamCreate(&stream));
 
     n_rows = params.n_rows;
     n_cols = params.n_cols;
-    k = params.k;
+    k      = params.k;
 
     make_data();
 
-    raft::sparse::selection::select_k(dists, inds, n_rows, n_cols, out_dists,
-                                      out_indices, params.select_min, k,
-                                      stream);
+    raft::sparse::selection::select_k(
+      dists, inds, n_rows, n_cols, out_dists, out_indices, params.select_min, k, stream);
 
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaStreamSynchronize(stream));
 
     CUDA_CHECK(cudaFree(dists));
@@ -111,11 +110,10 @@ class SparseSelectionTest
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void compare() {
-    ASSERT_TRUE(
-      devArrMatch(out_dists_ref, out_dists, n_rows * k, Compare<value_t>()));
-    ASSERT_TRUE(devArrMatch(out_indices_ref, out_indices, n_rows * k,
-                            Compare<value_idx>()));
+  void compare()
+  {
+    ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, n_rows * k, Compare<value_t>()));
+    ASSERT_TRUE(devArrMatch(out_indices_ref, out_indices, n_rows * k, Compare<value_idx>()));
   }
 
  protected:
@@ -124,15 +122,15 @@ class SparseSelectionTest
   int n_rows, n_cols, k;
 
   // input data
-  value_t *dists;
-  value_idx *inds;
+  value_t* dists;
+  value_idx* inds;
 
   // output data
-  value_idx *out_indices;
-  value_t *out_dists;
+  value_idx* out_indices;
+  value_t* out_dists;
 
-  value_idx *out_indices_ref;
-  value_t *out_dists_ref;
+  value_idx* out_indices_ref;
+  value_t* out_dists_ref;
 
   SparseSelectionInputs<value_idx, value_t> params;
 };
@@ -149,7 +147,8 @@ const std::vector<SparseSelectionInputs<int, float>> inputs_i32_f = {
    true}};
 typedef SparseSelectionTest<int, float> SparseSelectionTestF;
 TEST_P(SparseSelectionTestF, Result) { compare(); }
-INSTANTIATE_TEST_CASE_P(SparseSelectionTest, SparseSelectionTestF,
+INSTANTIATE_TEST_CASE_P(SparseSelectionTest,
+                        SparseSelectionTestF,
                         ::testing::ValuesIn(inputs_i32_f));
 
 };  // end namespace selection
diff --git a/cpp/test/sparse/sort.cu b/cpp/test/sparse/sort.cu
index b9a8b849eb..e154d19d34 100644
--- a/cpp/test/sparse/sort.cu
+++ b/cpp/test/sparse/sort.cu
@@ -47,27 +47,27 @@ class SparseSortTest : public ::testing::TestWithParam<SparseSortInput<T>> {
 const std::vector<SparseSortInput<float>> inputsf = {{5, 10, 5, 1234ULL}};
 
 typedef SparseSortTest<float> COOSort;
-TEST_P(COOSort, Result) {
+TEST_P(COOSort, Result)
+{
   int *in_rows, *in_cols, *verify;
-  float *in_vals;
+  float* in_vals;
 
   params = ::testing::TestWithParam<SparseSortInput<float>>::GetParam();
   raft::random::Rng r(params.seed);
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  std::shared_ptr<raft::mr::device::allocator> alloc(
-    new raft::mr::device::default_allocator);
+  std::shared_ptr<raft::mr::device::allocator> alloc(new raft::mr::device::default_allocator);
 
   raft::allocate(in_vals, params.nnz);
   r.uniform(in_vals, params.nnz, float(-1.0), float(1.0), stream);
 
-  int *in_rows_h = (int *)malloc(params.nnz * sizeof(int));
-  int *in_cols_h = (int *)malloc(params.nnz * sizeof(int));
-  int *verify_h = (int *)malloc(params.nnz * sizeof(int));
+  int* in_rows_h = (int*)malloc(params.nnz * sizeof(int));
+  int* in_cols_h = (int*)malloc(params.nnz * sizeof(int));
+  int* verify_h  = (int*)malloc(params.nnz * sizeof(int));
 
   for (int i = 0; i < params.nnz; i++) {
     in_rows_h[i] = params.nnz - i - 1;
-    verify_h[i] = i;
+    verify_h[i]  = i;
     in_cols_h[i] = i;
   }
 
@@ -80,11 +80,9 @@ TEST_P(COOSort, Result) {
   raft::update_device(in_cols, in_cols_h, params.nnz, stream);
   raft::update_device(verify, verify_h, params.nnz, stream);
 
-  op::coo_sort(params.m, params.n, params.nnz, in_rows, in_cols, in_vals, alloc,
-               stream);
+  op::coo_sort(params.m, params.n, params.nnz, in_rows, in_cols, in_vals, alloc, stream);
 
-  ASSERT_TRUE(
-    raft::devArrMatch<int>(verify, in_rows, params.nnz, raft::Compare<int>()));
+  ASSERT_TRUE(raft::devArrMatch<int>(verify, in_rows, params.nnz, raft::Compare<int>()));
 
   delete[] in_rows_h;
   delete[] in_cols_h;
diff --git a/cpp/test/sparse/symmetrize.cu b/cpp/test/sparse/symmetrize.cu
index d104028d2b..6a66daa769 100644
--- a/cpp/test/sparse/symmetrize.cu
+++ b/cpp/test/sparse/symmetrize.cu
@@ -29,8 +29,9 @@ namespace raft {
 namespace sparse {
 
 template <typename value_idx, typename value_t>
-__global__ void assert_symmetry(value_idx *rows, value_idx *cols, value_t *vals,
-                                value_idx nnz, value_idx *sum) {
+__global__ void assert_symmetry(
+  value_idx* rows, value_idx* cols, value_t* vals, value_idx nnz, value_idx* sum)
+{
   int tid = blockDim.x * blockIdx.x + threadIdx.x;
 
   if (tid >= nnz) return;
@@ -49,19 +50,21 @@ struct SparseSymmetrizeInputs {
 };
 
 template <typename value_idx, typename value_t>
-::std::ostream &operator<<(
-  ::std::ostream &os, const SparseSymmetrizeInputs<value_idx, value_t> &dims) {
+::std::ostream& operator<<(::std::ostream& os,
+                           const SparseSymmetrizeInputs<value_idx, value_t>& dims)
+{
   return os;
 }
 
 template <typename value_idx, typename value_t>
-class SparseSymmetrizeTest : public ::testing::TestWithParam<
-                               SparseSymmetrizeInputs<value_idx, value_t>> {
+class SparseSymmetrizeTest
+  : public ::testing::TestWithParam<SparseSymmetrizeInputs<value_idx, value_t>> {
  protected:
-  void make_data() {
-    std::vector<value_idx> indptr_h = params.indptr_h;
+  void make_data()
+  {
+    std::vector<value_idx> indptr_h  = params.indptr_h;
     std::vector<value_idx> indices_h = params.indices_h;
-    std::vector<value_t> data_h = params.data_h;
+    std::vector<value_t> data_h      = params.data_h;
 
     allocate(indptr, indptr_h.size());
     allocate(indices, indices_h.size());
@@ -72,19 +75,19 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam<
     update_device(data, data_h.data(), data_h.size(), stream);
   }
 
-  void SetUp() override {
-    params = ::testing::TestWithParam<
-      SparseSymmetrizeInputs<value_idx, value_t>>::GetParam();
+  void SetUp() override
+  {
+    params = ::testing::TestWithParam<SparseSymmetrizeInputs<value_idx, value_t>>::GetParam();
 
     raft::handle_t handle;
 
     auto alloc = handle.get_device_allocator();
-    stream = handle.get_stream();
+    stream     = handle.get_stream();
 
     make_data();
 
-    value_idx m = params.indptr_h.size() - 1;
-    value_idx n = params.n_cols;
+    value_idx m   = params.indptr_h.size() - 1;
+    value_idx n   = params.n_cols;
     value_idx nnz = params.indices_h.size();
 
     raft::mr::device::buffer<value_idx> coo_rows(alloc, stream, nnz);
@@ -93,8 +96,8 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam<
 
     raft::sparse::COO<value_t, value_idx> out(alloc, stream);
 
-    raft::sparse::linalg::symmetrize(handle, coo_rows.data(), indices, data, m,
-                                     n, coo_rows.size(), out);
+    raft::sparse::linalg::symmetrize(
+      handle, coo_rows.data(), indices, data, m, n, coo_rows.size(), out);
 
     raft::mr::device::buffer<value_idx> sum(alloc, stream, 1);
 
@@ -107,7 +110,8 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam<
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(indptr));
     CUDA_CHECK(cudaFree(indices));
@@ -119,7 +123,7 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam<
 
   // input data
   value_idx *indptr, *indices;
-  value_t *data;
+  value_t* data;
 
   value_idx sum_h;
 
@@ -133,8 +137,7 @@ struct COOSymmetrizeInputs {
 };
 
 template <typename T>
-class COOSymmetrizeTest
-  : public ::testing::TestWithParam<COOSymmetrizeInputs<T>> {
+class COOSymmetrizeTest : public ::testing::TestWithParam<COOSymmetrizeInputs<T>> {
  protected:
   void SetUp() override {}
 
@@ -144,7 +147,8 @@ class COOSymmetrizeTest
 const std::vector<COOSymmetrizeInputs<float>> inputsf = {{5, 10, 5, 1234ULL}};
 
 typedef COOSymmetrizeTest<float> COOSymmetrize;
-TEST_P(COOSymmetrize, Result) {
+TEST_P(COOSymmetrize, Result)
+{
   cudaStream_t stream;
   cudaStreamCreate(&stream);
 
@@ -153,16 +157,14 @@ TEST_P(COOSymmetrize, Result) {
 
   int nnz = 8;
 
-  int *in_rows_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3};
-  int *in_cols_h = new int[nnz]{1, 3, 2, 3, 0, 1, 0, 2};
-  float *in_vals_h = new float[nnz]{0.5, 1.0, 0.5, 0.5, 0.5, 0.0, 0.5, 0.5};
+  int* in_rows_h   = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3};
+  int* in_cols_h   = new int[nnz]{1, 3, 2, 3, 0, 1, 0, 2};
+  float* in_vals_h = new float[nnz]{0.5, 1.0, 0.5, 0.5, 0.5, 0.0, 0.5, 0.5};
 
-  int *exp_rows_h =
-    new int[nnz * 2]{1, 0, 0, 0, 1, 3, 1, 0, 0, 2, 2, 0, 3, 2, 3, 0};
-  int *exp_cols_h =
-    new int[nnz * 2]{0, 1, 3, 0, 2, 1, 3, 0, 2, 0, 1, 0, 0, 3, 2, 0};
-  float *exp_vals_h = new float[nnz * 2]{0.5, 0.5, 1.5, 0, 0.5, 0.5, 0.5, 0,
-                                         0.5, 0.5, 0.5, 0, 1.5, 0.5, 0.5, 0.0};
+  int* exp_rows_h = new int[nnz * 2]{1, 0, 0, 0, 1, 3, 1, 0, 0, 2, 2, 0, 3, 2, 3, 0};
+  int* exp_cols_h = new int[nnz * 2]{0, 1, 3, 0, 2, 1, 3, 0, 2, 0, 1, 0, 0, 3, 2, 0};
+  float* exp_vals_h =
+    new float[nnz * 2]{0.5, 0.5, 1.5, 0, 0.5, 0.5, 0.5, 0, 0.5, 0.5, 0.5, 0, 1.5, 0.5, 0.5, 0.0};
 
   COO<float> in(alloc, stream, nnz, 4, 4);
   raft::update_device(in.rows(), *&in_rows_h, nnz, stream);
@@ -172,22 +174,19 @@ TEST_P(COOSymmetrize, Result) {
   COO<float> out(alloc, stream);
 
   linalg::coo_symmetrize<32, float>(
-    &in, &out,
-    [] __device__(int row, int col, float val, float trans) {
-      return val + trans;
-    },
-    alloc, stream);
+    &in,
+    &out,
+    [] __device__(int row, int col, float val, float trans) { return val + trans; },
+    alloc,
+    stream);
 
   CUDA_CHECK(cudaStreamSynchronize(stream));
   std::cout << out << std::endl;
 
   ASSERT_TRUE(out.nnz == nnz * 2);
-  ASSERT_TRUE(raft::devArrMatch<int>(out.rows(), exp_rows_h, out.nnz,
-                                     raft::Compare<int>()));
-  ASSERT_TRUE(raft::devArrMatch<int>(out.cols(), exp_cols_h, out.nnz,
-                                     raft::Compare<int>()));
-  ASSERT_TRUE(raft::devArrMatch<float>(out.vals(), exp_vals_h, out.nnz,
-                                       raft::Compare<float>()));
+  ASSERT_TRUE(raft::devArrMatch<int>(out.rows(), exp_rows_h, out.nnz, raft::Compare<int>()));
+  ASSERT_TRUE(raft::devArrMatch<int>(out.cols(), exp_cols_h, out.nnz, raft::Compare<int>()));
+  ASSERT_TRUE(raft::devArrMatch<float>(out.vals(), exp_vals_h, out.nnz, raft::Compare<float>()));
 
   cudaStreamDestroy(stream);
 
@@ -200,8 +199,7 @@ TEST_P(COOSymmetrize, Result) {
   delete[] exp_vals_h;
 }
 
-INSTANTIATE_TEST_CASE_P(COOSymmetrizeTest, COOSymmetrize,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(COOSymmetrizeTest, COOSymmetrize, ::testing::ValuesIn(inputsf));
 
 const std::vector<SparseSymmetrizeInputs<int, float>> symm_inputs_fint = {
   // Test n_clusters == n_points
@@ -221,7 +219,8 @@ const std::vector<SparseSymmetrizeInputs<int, float>> symm_inputs_fint = {
 typedef SparseSymmetrizeTest<int, float> SparseSymmetrizeTestF_int;
 TEST_P(SparseSymmetrizeTestF_int, Result) { ASSERT_TRUE(sum_h == 0); }
 
-INSTANTIATE_TEST_CASE_P(SparseSymmetrizeTest, SparseSymmetrizeTestF_int,
+INSTANTIATE_TEST_CASE_P(SparseSymmetrizeTest,
+                        SparseSymmetrizeTestF_int,
                         ::testing::ValuesIn(symm_inputs_fint));
 
 }  // namespace sparse
diff --git a/cpp/test/spatial/haversine.cu b/cpp/test/spatial/haversine.cu
index def1f1685b..8d35960d6a 100644
--- a/cpp/test/spatial/haversine.cu
+++ b/cpp/test/spatial/haversine.cu
@@ -29,7 +29,8 @@ namespace knn {
 template <typename value_idx, typename value_t>
 class HaversineKNNTest : public ::testing::Test {
  protected:
-  void basicTest() {
+  void basicTest()
+  {
     auto alloc = std::make_shared<raft::mr::device::default_allocator>();
 
     // Allocate input
@@ -44,31 +45,37 @@ class HaversineKNNTest : public ::testing::Test {
     raft::allocate(d_pred_D, n * n);
 
     // make testdata on host
-    std::vector<value_t> h_train_inputs = {
-      0.71113885, -1.29215058, 0.59613176, -2.08048115,
-      0.74932804, -1.33634042, 0.51486728, -1.65962873,
-      0.53154002, -1.47049808, 0.72891737, -1.54095137};
+    std::vector<value_t> h_train_inputs = {0.71113885,
+                                           -1.29215058,
+                                           0.59613176,
+                                           -2.08048115,
+                                           0.74932804,
+                                           -1.33634042,
+                                           0.51486728,
+                                           -1.65962873,
+                                           0.53154002,
+                                           -1.47049808,
+                                           0.72891737,
+                                           -1.54095137};
 
     h_train_inputs.resize(n);
     raft::update_device(d_train_inputs, h_train_inputs.data(), n * d, 0);
 
-    std::vector<value_t> h_res_D = {
-      0., 0.05041587, 0.18767063, 0.23048252, 0.35749438, 0.62925595,
-      0., 0.36575755, 0.44288665, 0.5170737,  0.59501296, 0.62925595,
-      0., 0.05041587, 0.152463,   0.2426416,  0.34925285, 0.59501296,
-      0., 0.16461092, 0.2345792,  0.34925285, 0.35749438, 0.36575755,
-      0., 0.16461092, 0.20535265, 0.23048252, 0.2426416,  0.5170737,
-      0., 0.152463,   0.18767063, 0.20535265, 0.2345792,  0.44288665};
+    std::vector<value_t> h_res_D = {0., 0.05041587, 0.18767063, 0.23048252, 0.35749438, 0.62925595,
+                                    0., 0.36575755, 0.44288665, 0.5170737,  0.59501296, 0.62925595,
+                                    0., 0.05041587, 0.152463,   0.2426416,  0.34925285, 0.59501296,
+                                    0., 0.16461092, 0.2345792,  0.34925285, 0.35749438, 0.36575755,
+                                    0., 0.16461092, 0.20535265, 0.23048252, 0.2426416,  0.5170737,
+                                    0., 0.152463,   0.18767063, 0.20535265, 0.2345792,  0.44288665};
     h_res_D.resize(n * n);
     raft::update_device(d_ref_D, h_res_D.data(), n * n, 0);
 
-    std::vector<value_idx> h_res_I = {0, 2, 5, 4, 3, 1, 1, 3, 5, 4, 2, 0,
-                                      2, 0, 5, 4, 3, 1, 3, 4, 5, 2, 0, 1,
-                                      4, 3, 5, 0, 2, 1, 5, 2, 0, 4, 3, 1};
+    std::vector<value_idx> h_res_I = {0, 2, 5, 4, 3, 1, 1, 3, 5, 4, 2, 0, 2, 0, 5, 4, 3, 1,
+                                      3, 4, 5, 2, 0, 1, 4, 3, 5, 0, 2, 1, 5, 2, 0, 4, 3, 1};
     h_res_I.resize(n * n);
     raft::update_device<value_idx>(d_ref_I, h_res_I.data(), n * n, 0);
 
-    std::vector<value_t *> input_vec = {d_train_inputs};
+    std::vector<value_t*> input_vec  = {d_train_inputs};
     std::vector<value_idx> sizes_vec = {n};
 
     cudaStream_t stream;
@@ -82,7 +89,8 @@ class HaversineKNNTest : public ::testing::Test {
 
   void SetUp() override { basicTest(); }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(d_train_inputs));
     CUDA_CHECK(cudaFree(d_pred_I));
     CUDA_CHECK(cudaFree(d_pred_D));
@@ -91,27 +99,26 @@ class HaversineKNNTest : public ::testing::Test {
   }
 
  protected:
-  value_t *d_train_inputs;
+  value_t* d_train_inputs;
 
   int n = 6;
   int d = 2;
 
   int k = 6;
 
-  value_idx *d_pred_I;
-  value_t *d_pred_D;
+  value_idx* d_pred_I;
+  value_t* d_pred_D;
 
-  value_idx *d_ref_I;
-  value_t *d_ref_D;
+  value_idx* d_ref_I;
+  value_t* d_ref_D;
 };
 
 typedef HaversineKNNTest<int, float> HaversineKNNTestF;
 
-TEST_F(HaversineKNNTestF, Fit) {
-  ASSERT_TRUE(raft::devArrMatch(d_ref_D, d_pred_D, n * n,
-                                raft::CompareApprox<float>(1e-3)));
-  ASSERT_TRUE(
-    raft::devArrMatch(d_ref_I, d_pred_I, n * n, raft::Compare<int>()));
+TEST_F(HaversineKNNTestF, Fit)
+{
+  ASSERT_TRUE(raft::devArrMatch(d_ref_D, d_pred_D, n * n, raft::CompareApprox<float>(1e-3)));
+  ASSERT_TRUE(raft::devArrMatch(d_ref_I, d_pred_I, n * n, raft::Compare<int>()));
 }
 
 }  // namespace knn
diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu
index 2b1ef89f7a..d4e35c9d54 100644
--- a/cpp/test/spatial/knn.cu
+++ b/cpp/test/spatial/knn.cu
@@ -31,18 +31,18 @@ struct KNNInputs {
   std::vector<int> labels;
 };
 
-__global__ void build_actual_output(int *output, int n_rows, int k,
-                                    const int *idx_labels,
-                                    const int64_t *indices) {
+__global__ void build_actual_output(
+  int* output, int n_rows, int k, const int* idx_labels, const int64_t* indices)
+{
   int element = threadIdx.x + blockDim.x * blockIdx.x;
   if (element >= n_rows * k) return;
 
-  int ind = (int)indices[element];
+  int ind         = (int)indices[element];
   output[element] = idx_labels[ind];
 }
 
-__global__ void build_expected_output(int *output, int n_rows, int k,
-                                      const int *labels) {
+__global__ void build_expected_output(int* output, int n_rows, int k, const int* labels)
+{
   int row = threadIdx.x + blockDim.x * blockIdx.x;
   if (row >= n_rows) return;
 
@@ -55,25 +55,33 @@ __global__ void build_expected_output(int *output, int n_rows, int k,
 template <typename T>
 class KNNTest : public ::testing::TestWithParam<KNNInputs> {
  protected:
-  void testBruteForce() {
-    raft::print_device_vector("Input array: ", input_, rows_ * cols_,
-                              std::cout);
+  void testBruteForce()
+  {
+    raft::print_device_vector("Input array: ", input_, rows_ * cols_, std::cout);
     std::cout << "K: " << k_ << "\n";
-    raft::print_device_vector("Labels array: ", search_labels_, rows_,
-                              std::cout);
+    raft::print_device_vector("Labels array: ", search_labels_, rows_, std::cout);
 
     auto stream = handle_.get_stream();
 
     raft::allocate(actual_labels_, rows_ * k_, true);
     raft::allocate(expected_labels_, rows_ * k_, true);
 
-    std::vector<float *> input_vec;
+    std::vector<float*> input_vec;
     std::vector<int> sizes_vec;
     input_vec.push_back(input_);
     sizes_vec.push_back(rows_);
 
-    brute_force_knn(handle_, input_vec, sizes_vec, cols_, search_data_, rows_,
-                    indices_, distances_, k_, true, true);
+    brute_force_knn(handle_,
+                    input_vec,
+                    sizes_vec,
+                    cols_,
+                    search_data_,
+                    rows_,
+                    indices_,
+                    distances_,
+                    k_,
+                    true,
+                    true);
 
     build_actual_output<<<raft::ceildiv(rows_ * k_, 32), 32, 0, stream>>>(
       actual_labels_, rows_, k_, search_labels_, indices_);
@@ -81,24 +89,20 @@ class KNNTest : public ::testing::TestWithParam<KNNInputs> {
     build_expected_output<<<raft::ceildiv(rows_ * k_, 32), 32, 0, stream>>>(
       expected_labels_, rows_, k_, search_labels_);
 
-    raft::print_device_vector("Output indices: ", indices_, rows_ * k_,
-                              std::cout);
-    raft::print_device_vector("Output distances: ", distances_, rows_ * k_,
-                              std::cout);
-    raft::print_device_vector("Output labels: ", actual_labels_, rows_ * k_,
-                              std::cout);
-    raft::print_device_vector("Expected labels: ", expected_labels_, rows_ * k_,
-                              std::cout);
-
-    ASSERT_TRUE(devArrMatch(expected_labels_, actual_labels_, rows_ * k_,
-                            raft::Compare<int>()));
+    raft::print_device_vector("Output indices: ", indices_, rows_ * k_, std::cout);
+    raft::print_device_vector("Output distances: ", distances_, rows_ * k_, std::cout);
+    raft::print_device_vector("Output labels: ", actual_labels_, rows_ * k_, std::cout);
+    raft::print_device_vector("Expected labels: ", expected_labels_, rows_ * k_, std::cout);
+
+    ASSERT_TRUE(devArrMatch(expected_labels_, actual_labels_, rows_ * k_, raft::Compare<int>()));
   }
 
-  void SetUp() override {
+  void SetUp() override
+  {
     params_ = ::testing::TestWithParam<KNNInputs>::GetParam();
-    rows_ = params_.input.size();
-    cols_ = params_.input[0].size();
-    k_ = params_.k;
+    rows_   = params_.input.size();
+    cols_   = params_.input[0].size();
+    k_      = params_.k;
 
     std::vector<float> row_major_input;
     for (int i = 0; i < params_.input.size(); ++i) {
@@ -107,14 +111,12 @@ class KNNTest : public ::testing::TestWithParam<KNNInputs> {
       }
     }
     rmm::device_buffer input_d = rmm::device_buffer(
-      row_major_input.data(), row_major_input.size() * sizeof(float),
-      handle_.get_stream());
-    float *input_ptr = static_cast<float *>(input_d.data());
+      row_major_input.data(), row_major_input.size() * sizeof(float), handle_.get_stream());
+    float* input_ptr = static_cast<float*>(input_d.data());
 
     rmm::device_buffer labels_d = rmm::device_buffer(
-      params_.labels.data(), params_.labels.size() * sizeof(int),
-      handle_.get_stream());
-    int *labels_ptr = static_cast<int *>(labels_d.data());
+      params_.labels.data(), params_.labels.size() * sizeof(int), handle_.get_stream());
+    int* labels_ptr = static_cast<int*>(labels_d.data());
 
     raft::allocate(input_, rows_ * cols_, true);
     raft::allocate(search_data_, rows_ * cols_, true);
@@ -127,7 +129,8 @@ class KNNTest : public ::testing::TestWithParam<KNNInputs> {
     raft::copy(search_labels_, labels_ptr, rows_, handle_.get_stream());
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(search_data_));
     CUDA_CHECK(cudaFree(indices_));
     CUDA_CHECK(cudaFree(distances_));
@@ -139,15 +142,15 @@ class KNNTest : public ::testing::TestWithParam<KNNInputs> {
   KNNInputs params_;
   int rows_;
   int cols_;
-  float *input_;
-  float *search_data_;
-  int64_t *indices_;
-  float *distances_;
+  float* input_;
+  float* search_data_;
+  int64_t* indices_;
+  float* distances_;
   int k_;
 
-  int *search_labels_;
-  int *actual_labels_;
-  int *expected_labels_;
+  int* search_labels_;
+  int* actual_labels_;
+  int* expected_labels_;
 };
 
 const std::vector<KNNInputs> inputs = {
diff --git a/cpp/test/spectral_matrix.cu b/cpp/test/spectral_matrix.cu
index e5c2d52764..2d7d713717 100644
--- a/cpp/test/spectral_matrix.cu
+++ b/cpp/test/spectral_matrix.cu
@@ -32,7 +32,8 @@ struct csr_view_t {
   index_type number_of_edges;
 };
 }  // namespace
-TEST(Raft, SpectralMatrices) {
+TEST(Raft, SpectralMatrices)
+{
   using namespace matrix;
   using index_type = int;
   using value_type = double;
@@ -49,19 +50,18 @@ TEST(Raft, SpectralMatrices) {
   index_type* ro{nullptr};
   index_type* ci{nullptr};
   value_type* vs{nullptr};
-  index_type nnz = 0;
+  index_type nnz   = 0;
   index_type nrows = 0;
   sparse_matrix_t<index_type, value_type> sm1{h, ro, ci, vs, nrows, nnz};
   sparse_matrix_t<index_type, value_type> sm2{h, csr_v};
   ASSERT_EQ(nullptr, sm1.row_offsets_);
   ASSERT_EQ(nullptr, sm2.row_offsets_);
 
-  auto stream = h.get_stream();
+  auto stream    = h.get_stream();
   auto t_exe_pol = thrust::cuda::par.on(stream);
 
   auto cnstr_lm1 = [&h, t_exe_pol, ro, ci, vs, nrows, nnz](void) {
-    laplacian_matrix_t<index_type, value_type> lm1{h,  t_exe_pol, ro, ci,
-                                                   vs, nrows,     nnz};
+    laplacian_matrix_t<index_type, value_type> lm1{h, t_exe_pol, ro, ci, vs, nrows, nnz};
   };
   EXPECT_ANY_THROW(cnstr_lm1());  // because of nullptr ptr args
 
@@ -71,8 +71,7 @@ TEST(Raft, SpectralMatrices) {
   EXPECT_ANY_THROW(cnstr_lm2());  // because of nullptr ptr args
 
   auto cnstr_mm1 = [&h, t_exe_pol, ro, ci, vs, nrows, nnz](void) {
-    modularity_matrix_t<index_type, value_type> mm1{h,  t_exe_pol, ro, ci,
-                                                    vs, nrows,     nnz};
+    modularity_matrix_t<index_type, value_type> mm1{h, t_exe_pol, ro, ci, vs, nrows, nnz};
   };
   EXPECT_ANY_THROW(cnstr_mm1());  // because of nullptr ptr args
 
diff --git a/cpp/test/stats/mean.cu b/cpp/test/stats/mean.cu
index 4a3b0ed196..8eb2f91952 100644
--- a/cpp/test/stats/mean.cu
+++ b/cpp/test/stats/mean.cu
@@ -35,14 +35,16 @@ struct MeanInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os, const MeanInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const MeanInputs<T>& dims)
+{
   return os;
 }
 
 template <typename T>
 class MeanTest : public ::testing::TestWithParam<MeanInputs<T>> {
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     params = ::testing::TestWithParam<MeanInputs<T>>::GetParam();
     raft::random::Rng r(params.seed);
 
@@ -59,13 +61,15 @@ class MeanTest : public ::testing::TestWithParam<MeanInputs<T>> {
     meanSGtest(data, stream);
   }
 
-  void meanSGtest(T *data, cudaStream_t stream) {
+  void meanSGtest(T* data, cudaStream_t stream)
+  {
     int rows = params.rows, cols = params.cols;
 
     mean(mean_act, data, cols, rows, params.sample, params.rowMajor, stream);
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(data));
     CUDA_CHECK(cudaFree(mean_act));
   }
@@ -78,52 +82,52 @@ class MeanTest : public ::testing::TestWithParam<MeanInputs<T>> {
 // Note: For 1024 samples, 256 experiments, a mean of 1.0 with stddev=1.0, the
 // measured mean (of a normal distribution) will fall outside of an epsilon of
 // 0.15 only 4/10000 times. (epsilon of 0.1 will fail 30/100 times)
-const std::vector<MeanInputs<float>> inputsf = {
-  {0.15f, 1.f, 1024, 32, true, false, 1234ULL},
-  {0.15f, 1.f, 1024, 64, true, false, 1234ULL},
-  {0.15f, 1.f, 1024, 128, true, false, 1234ULL},
-  {0.15f, 1.f, 1024, 256, true, false, 1234ULL},
-  {0.15f, -1.f, 1024, 32, false, false, 1234ULL},
-  {0.15f, -1.f, 1024, 64, false, false, 1234ULL},
-  {0.15f, -1.f, 1024, 128, false, false, 1234ULL},
-  {0.15f, -1.f, 1024, 256, false, false, 1234ULL},
-  {0.15f, 1.f, 1024, 32, true, true, 1234ULL},
-  {0.15f, 1.f, 1024, 64, true, true, 1234ULL},
-  {0.15f, 1.f, 1024, 128, true, true, 1234ULL},
-  {0.15f, 1.f, 1024, 256, true, true, 1234ULL},
-  {0.15f, -1.f, 1024, 32, false, true, 1234ULL},
-  {0.15f, -1.f, 1024, 64, false, true, 1234ULL},
-  {0.15f, -1.f, 1024, 128, false, true, 1234ULL},
-  {0.15f, -1.f, 1024, 256, false, true, 1234ULL}};
-
-const std::vector<MeanInputs<double>> inputsd = {
-  {0.15, 1.0, 1024, 32, true, false, 1234ULL},
-  {0.15, 1.0, 1024, 64, true, false, 1234ULL},
-  {0.15, 1.0, 1024, 128, true, false, 1234ULL},
-  {0.15, 1.0, 1024, 256, true, false, 1234ULL},
-  {0.15, -1.0, 1024, 32, false, false, 1234ULL},
-  {0.15, -1.0, 1024, 64, false, false, 1234ULL},
-  {0.15, -1.0, 1024, 128, false, false, 1234ULL},
-  {0.15, -1.0, 1024, 256, false, false, 1234ULL},
-  {0.15, 1.0, 1024, 32, true, true, 1234ULL},
-  {0.15, 1.0, 1024, 64, true, true, 1234ULL},
-  {0.15, 1.0, 1024, 128, true, true, 1234ULL},
-  {0.15, 1.0, 1024, 256, true, true, 1234ULL},
-  {0.15, -1.0, 1024, 32, false, true, 1234ULL},
-  {0.15, -1.0, 1024, 64, false, true, 1234ULL},
-  {0.15, -1.0, 1024, 128, false, true, 1234ULL},
-  {0.15, -1.0, 1024, 256, false, true, 1234ULL}};
+const std::vector<MeanInputs<float>> inputsf = {{0.15f, 1.f, 1024, 32, true, false, 1234ULL},
+                                                {0.15f, 1.f, 1024, 64, true, false, 1234ULL},
+                                                {0.15f, 1.f, 1024, 128, true, false, 1234ULL},
+                                                {0.15f, 1.f, 1024, 256, true, false, 1234ULL},
+                                                {0.15f, -1.f, 1024, 32, false, false, 1234ULL},
+                                                {0.15f, -1.f, 1024, 64, false, false, 1234ULL},
+                                                {0.15f, -1.f, 1024, 128, false, false, 1234ULL},
+                                                {0.15f, -1.f, 1024, 256, false, false, 1234ULL},
+                                                {0.15f, 1.f, 1024, 32, true, true, 1234ULL},
+                                                {0.15f, 1.f, 1024, 64, true, true, 1234ULL},
+                                                {0.15f, 1.f, 1024, 128, true, true, 1234ULL},
+                                                {0.15f, 1.f, 1024, 256, true, true, 1234ULL},
+                                                {0.15f, -1.f, 1024, 32, false, true, 1234ULL},
+                                                {0.15f, -1.f, 1024, 64, false, true, 1234ULL},
+                                                {0.15f, -1.f, 1024, 128, false, true, 1234ULL},
+                                                {0.15f, -1.f, 1024, 256, false, true, 1234ULL}};
+
+const std::vector<MeanInputs<double>> inputsd = {{0.15, 1.0, 1024, 32, true, false, 1234ULL},
+                                                 {0.15, 1.0, 1024, 64, true, false, 1234ULL},
+                                                 {0.15, 1.0, 1024, 128, true, false, 1234ULL},
+                                                 {0.15, 1.0, 1024, 256, true, false, 1234ULL},
+                                                 {0.15, -1.0, 1024, 32, false, false, 1234ULL},
+                                                 {0.15, -1.0, 1024, 64, false, false, 1234ULL},
+                                                 {0.15, -1.0, 1024, 128, false, false, 1234ULL},
+                                                 {0.15, -1.0, 1024, 256, false, false, 1234ULL},
+                                                 {0.15, 1.0, 1024, 32, true, true, 1234ULL},
+                                                 {0.15, 1.0, 1024, 64, true, true, 1234ULL},
+                                                 {0.15, 1.0, 1024, 128, true, true, 1234ULL},
+                                                 {0.15, 1.0, 1024, 256, true, true, 1234ULL},
+                                                 {0.15, -1.0, 1024, 32, false, true, 1234ULL},
+                                                 {0.15, -1.0, 1024, 64, false, true, 1234ULL},
+                                                 {0.15, -1.0, 1024, 128, false, true, 1234ULL},
+                                                 {0.15, -1.0, 1024, 256, false, true, 1234ULL}};
 
 typedef MeanTest<float> MeanTestF;
-TEST_P(MeanTestF, Result) {
-  ASSERT_TRUE(devArrMatch(params.mean, mean_act, params.cols,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(MeanTestF, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(params.mean, mean_act, params.cols, CompareApprox<float>(params.tolerance)));
 }
 
 typedef MeanTest<double> MeanTestD;
-TEST_P(MeanTestD, Result) {
-  ASSERT_TRUE(devArrMatch(params.mean, mean_act, params.cols,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(MeanTestD, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(params.mean, mean_act, params.cols, CompareApprox<double>(params.tolerance)));
 }
 
 INSTANTIATE_TEST_SUITE_P(MeanTests, MeanTestF, ::testing::ValuesIn(inputsf));
diff --git a/cpp/test/stats/mean_center.cu b/cpp/test/stats/mean_center.cu
index 8b0d607561..67df0def05 100644
--- a/cpp/test/stats/mean_center.cu
+++ b/cpp/test/stats/mean_center.cu
@@ -34,16 +34,16 @@ struct MeanCenterInputs {
 };
 
 template <typename T, typename IdxType>
-::std::ostream &operator<<(::std::ostream &os,
-                           const MeanCenterInputs<T, IdxType> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const MeanCenterInputs<T, IdxType>& dims)
+{
   return os;
 }
 
 template <typename T, typename IdxType>
-class MeanCenterTest
-  : public ::testing::TestWithParam<MeanCenterInputs<T, IdxType>> {
+class MeanCenterTest : public ::testing::TestWithParam<MeanCenterInputs<T, IdxType>> {
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     params = ::testing::TestWithParam<MeanCenterInputs<T, IdxType>>::GetParam();
     raft::random::Rng r(params.seed);
 
@@ -51,7 +51,7 @@ class MeanCenterTest
     CUDA_CHECK(cudaStreamCreate(&stream));
 
     auto rows = params.rows, cols = params.cols;
-    auto len = rows * cols;
+    auto len       = rows * cols;
     IdxType vecLen = params.bcastAlongRows ? cols : rows;
 
     raft::allocate(out, len);
@@ -59,16 +59,15 @@ class MeanCenterTest
     raft::allocate(data, len);
     raft::allocate(meanVec, vecLen);
     r.normal(data, len, params.mean, (T)1.0, stream);
-    raft::stats::mean(meanVec, data, cols, rows, params.sample, params.rowMajor,
-                      stream);
-    meanCenter(out, data, meanVec, cols, rows, params.rowMajor,
-               params.bcastAlongRows, stream);
-    raft::linalg::naiveMatVec(out_ref, data, meanVec, cols, rows,
-                              params.rowMajor, params.bcastAlongRows, (T)-1.0);
+    raft::stats::mean(meanVec, data, cols, rows, params.sample, params.rowMajor, stream);
+    meanCenter(out, data, meanVec, cols, rows, params.rowMajor, params.bcastAlongRows, stream);
+    raft::linalg::naiveMatVec(
+      out_ref, data, meanVec, cols, rows, params.rowMajor, params.bcastAlongRows, (T)-1.0);
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(out));
     CUDA_CHECK(cudaFree(out_ref));
     CUDA_CHECK(cudaFree(data));
@@ -106,12 +105,11 @@ const std::vector<MeanCenterInputs<float, int>> inputsf_i32 = {
   {0.05f, -1.f, 1024, 64, false, true, false, 1234ULL},
   {0.05f, -1.f, 1024, 128, false, true, false, 1234ULL}};
 typedef MeanCenterTest<float, int> MeanCenterTestF_i32;
-TEST_P(MeanCenterTestF_i32, Result) {
-  ASSERT_TRUE(devArrMatch(out, out_ref, params.cols,
-                          raft::CompareApprox<float>(params.tolerance)));
+TEST_P(MeanCenterTestF_i32, Result)
+{
+  ASSERT_TRUE(devArrMatch(out, out_ref, params.cols, raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i32,
-                         ::testing::ValuesIn(inputsf_i32));
+INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i32, ::testing::ValuesIn(inputsf_i32));
 
 const std::vector<MeanCenterInputs<float, size_t>> inputsf_i64 = {
   {0.05f, 1.f, 1024, 32, true, false, true, 1234ULL},
@@ -139,12 +137,11 @@ const std::vector<MeanCenterInputs<float, size_t>> inputsf_i64 = {
   {0.05f, -1.f, 1024, 64, false, true, false, 1234ULL},
   {0.05f, -1.f, 1024, 128, false, true, false, 1234ULL}};
 typedef MeanCenterTest<float, size_t> MeanCenterTestF_i64;
-TEST_P(MeanCenterTestF_i64, Result) {
-  ASSERT_TRUE(devArrMatch(out, out_ref, params.cols,
-                          raft::CompareApprox<float>(params.tolerance)));
+TEST_P(MeanCenterTestF_i64, Result)
+{
+  ASSERT_TRUE(devArrMatch(out, out_ref, params.cols, raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i64,
-                         ::testing::ValuesIn(inputsf_i64));
+INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i64, ::testing::ValuesIn(inputsf_i64));
 
 const std::vector<MeanCenterInputs<double, int>> inputsd_i32 = {
   {0.05, 1.0, 1024, 32, true, false, true, 1234ULL},
@@ -172,12 +169,12 @@ const std::vector<MeanCenterInputs<double, int>> inputsd_i32 = {
   {0.05, -1.0, 1024, 64, false, true, false, 1234ULL},
   {0.05, -1.0, 1024, 128, false, true, false, 1234ULL}};
 typedef MeanCenterTest<double, int> MeanCenterTestD_i32;
-TEST_P(MeanCenterTestD_i32, Result) {
-  ASSERT_TRUE(devArrMatch(out, out_ref, params.cols,
-                          raft::CompareApprox<double>(params.tolerance)));
+TEST_P(MeanCenterTestD_i32, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(out, out_ref, params.cols, raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i32,
-                         ::testing::ValuesIn(inputsd_i32));
+INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i32, ::testing::ValuesIn(inputsd_i32));
 
 const std::vector<MeanCenterInputs<double, size_t>> inputsd_i64 = {
   {0.05, 1.0, 1024, 32, true, false, true, 1234ULL},
@@ -205,12 +202,12 @@ const std::vector<MeanCenterInputs<double, size_t>> inputsd_i64 = {
   {0.05, -1.0, 1024, 64, false, true, false, 1234ULL},
   {0.05, -1.0, 1024, 128, false, true, false, 1234ULL}};
 typedef MeanCenterTest<double, size_t> MeanCenterTestD_i64;
-TEST_P(MeanCenterTestD_i64, Result) {
-  ASSERT_TRUE(devArrMatch(out, out_ref, params.cols,
-                          raft::CompareApprox<double>(params.tolerance)));
+TEST_P(MeanCenterTestD_i64, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(out, out_ref, params.cols, raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i64,
-                         ::testing::ValuesIn(inputsd_i64));
+INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i64, ::testing::ValuesIn(inputsd_i64));
 
 }  // end namespace stats
 }  // end namespace raft
diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu
index ff2698788f..8b7f75171b 100644
--- a/cpp/test/stats/stddev.cu
+++ b/cpp/test/stats/stddev.cu
@@ -34,14 +34,16 @@ struct StdDevInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os, const StdDevInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const StdDevInputs<T>& dims)
+{
   return os;
 }
 
 template <typename T>
 class StdDevTest : public ::testing::TestWithParam<StdDevInputs<T>> {
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     params = ::testing::TestWithParam<StdDevInputs<T>>::GetParam();
     random::Rng r(params.seed);
     int rows = params.rows, cols = params.cols;
@@ -58,21 +60,21 @@ class StdDevTest : public ::testing::TestWithParam<StdDevInputs<T>> {
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void stdVarSGtest(T *data, cudaStream_t stream) {
+  void stdVarSGtest(T* data, cudaStream_t stream)
+  {
     int rows = params.rows, cols = params.cols;
 
     mean(mean_act, data, cols, rows, params.sample, params.rowMajor, stream);
 
-    stddev(stddev_act, data, mean_act, cols, rows, params.sample,
-           params.rowMajor, stream);
+    stddev(stddev_act, data, mean_act, cols, rows, params.sample, params.rowMajor, stream);
 
-    vars(vars_act, data, mean_act, cols, rows, params.sample, params.rowMajor,
-         stream);
+    vars(vars_act, data, mean_act, cols, rows, params.sample, params.rowMajor, stream);
 
     raft::matrix::seqRoot(vars_act, T(1), cols, stream);
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(data));
     CUDA_CHECK(cudaFree(mean_act));
     CUDA_CHECK(cudaFree(stddev_act));
@@ -121,28 +123,28 @@ const std::vector<StdDevInputs<double>> inputsd = {
   {0.1, -1.0, 2.0, 1024, 256, false, true, 1234ULL}};
 
 typedef StdDevTest<float> StdDevTestF;
-TEST_P(StdDevTestF, Result) {
-  ASSERT_TRUE(devArrMatch(params.stddev, stddev_act, params.cols,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(StdDevTestF, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(params.stddev, stddev_act, params.cols, CompareApprox<float>(params.tolerance)));
 
-  ASSERT_TRUE(devArrMatch(stddev_act, vars_act, params.cols,
-                          CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(
+    devArrMatch(stddev_act, vars_act, params.cols, CompareApprox<float>(params.tolerance)));
 }
 
 typedef StdDevTest<double> StdDevTestD;
-TEST_P(StdDevTestD, Result) {
-  ASSERT_TRUE(devArrMatch(params.stddev, stddev_act, params.cols,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(StdDevTestD, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(params.stddev, stddev_act, params.cols, CompareApprox<double>(params.tolerance)));
 
-  ASSERT_TRUE(devArrMatch(stddev_act, vars_act, params.cols,
-                          CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(
+    devArrMatch(stddev_act, vars_act, params.cols, CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestF,
-                         ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestF, ::testing::ValuesIn(inputsf));
 
-INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestD,
-                         ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestD, ::testing::ValuesIn(inputsd));
 
 }  // end namespace stats
 }  // end namespace raft
diff --git a/cpp/test/stats/sum.cu b/cpp/test/stats/sum.cu
index c3140d4588..89e81708cc 100644
--- a/cpp/test/stats/sum.cu
+++ b/cpp/test/stats/sum.cu
@@ -32,15 +32,17 @@ struct SumInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os, const SumInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const SumInputs<T>& dims)
+{
   return os;
 }
 
 template <typename T>
 class SumTest : public ::testing::TestWithParam<SumInputs<T>> {
  protected:
-  void SetUp() override {
-    params = ::testing::TestWithParam<SumInputs<T>>::GetParam();
+  void SetUp() override
+  {
+    params   = ::testing::TestWithParam<SumInputs<T>>::GetParam();
     int rows = params.rows, cols = params.cols;
     int len = rows * cols;
     cudaStream_t stream;
@@ -59,7 +61,8 @@ class SumTest : public ::testing::TestWithParam<SumInputs<T>> {
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(data));
     CUDA_CHECK(cudaFree(sum_act));
   }
@@ -76,15 +79,17 @@ const std::vector<SumInputs<double>> inputsd = {{0.05, 1024, 32, 1234ULL},
                                                 {0.05, 1024, 256, 1234ULL}};
 
 typedef SumTest<float> SumTestF;
-TEST_P(SumTestF, Result) {
-  ASSERT_TRUE(raft::devArrMatch(float(params.rows), sum_act, params.cols,
-                                raft::CompareApprox<float>(params.tolerance)));
+TEST_P(SumTestF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    float(params.rows), sum_act, params.cols, raft::CompareApprox<float>(params.tolerance)));
 }
 
 typedef SumTest<double> SumTestD;
-TEST_P(SumTestD, Result) {
-  ASSERT_TRUE(raft::devArrMatch(double(params.rows), sum_act, params.cols,
-                                raft::CompareApprox<double>(params.tolerance)));
+TEST_P(SumTestD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    double(params.rows), sum_act, params.cols, raft::CompareApprox<double>(params.tolerance)));
 }
 
 INSTANTIATE_TEST_CASE_P(SumTests, SumTestF, ::testing::ValuesIn(inputsf));
diff --git a/cpp/test/test_utils.h b/cpp/test/test_utils.h
index b8e8fe3fa0..ca09d9c855 100644
--- a/cpp/test/test_utils.h
+++ b/cpp/test/test_utils.h
@@ -25,15 +25,16 @@ namespace raft {
 
 template <typename T>
 struct Compare {
-  bool operator()(const T &a, const T &b) const { return a == b; }
+  bool operator()(const T& a, const T& b) const { return a == b; }
 };
 
 template <typename T>
 struct CompareApprox {
   CompareApprox(T eps_) : eps(eps_) {}
-  bool operator()(const T &a, const T &b) const {
-    T diff = abs(a - b);
-    T m = std::max(abs(a), abs(b));
+  bool operator()(const T& a, const T& b) const
+  {
+    T diff  = abs(a - b);
+    T m     = std::max(abs(a), abs(b));
     T ratio = diff >= eps ? diff / m : diff;
 
     return (ratio <= eps);
@@ -46,9 +47,10 @@ struct CompareApprox {
 template <typename T>
 struct CompareApproxAbs {
   CompareApproxAbs(T eps_) : eps(eps_) {}
-  bool operator()(const T &a, const T &b) const {
-    T diff = abs(abs(a) - abs(b));
-    T m = std::max(abs(a), abs(b));
+  bool operator()(const T& a, const T& b) const
+  {
+    T diff  = abs(abs(a) - abs(b));
+    T m     = std::max(abs(a), abs(b));
     T ratio = diff >= eps ? diff / m : diff;
     return (ratio <= eps);
   }
@@ -58,25 +60,26 @@ struct CompareApproxAbs {
 };
 
 template <typename T>
-T abs(const T &a) {
+T abs(const T& a)
+{
   return a > T(0) ? a : -a;
 }
 
 /*
-     * @brief Helper function to compare 2 device n-D arrays with custom comparison
-     * @tparam T the data type of the arrays
-     * @tparam L the comparator lambda or object function
-     * @param expected expected value(s)
-     * @param actual actual values
-     * @param eq_compare the comparator
-     * @param stream cuda stream
-     * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
-     * @{
-     */
+ * @brief Helper function to compare 2 device n-D arrays with custom comparison
+ * @tparam T the data type of the arrays
+ * @tparam L the comparator lambda or object function
+ * @param expected expected value(s)
+ * @param actual actual values
+ * @param eq_compare the comparator
+ * @param stream cuda stream
+ * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
+ * @{
+ */
 template <typename T, typename L>
-testing::AssertionResult devArrMatch(const T *expected, const T *actual,
-                                     size_t size, L eq_compare,
-                                     cudaStream_t stream = 0) {
+testing::AssertionResult devArrMatch(
+  const T* expected, const T* actual, size_t size, L eq_compare, cudaStream_t stream = 0)
+{
   std::unique_ptr<T[]> exp_h(new T[size]);
   std::unique_ptr<T[]> act_h(new T[size]);
   raft::update_host<T>(exp_h.get(), expected, size, stream);
@@ -86,16 +89,16 @@ testing::AssertionResult devArrMatch(const T *expected, const T *actual,
     auto exp = exp_h.get()[i];
     auto act = act_h.get()[i];
     if (!eq_compare(exp, act)) {
-      return testing::AssertionFailure()
-             << "actual=" << act << " != expected=" << exp << " @" << i;
+      return testing::AssertionFailure() << "actual=" << act << " != expected=" << exp << " @" << i;
     }
   }
   return testing::AssertionSuccess();
 }
 
 template <typename T, typename L>
-testing::AssertionResult devArrMatch(T expected, const T *actual, size_t size,
-                                     L eq_compare, cudaStream_t stream = 0) {
+testing::AssertionResult devArrMatch(
+  T expected, const T* actual, size_t size, L eq_compare, cudaStream_t stream = 0)
+{
   std::unique_ptr<T[]> act_h(new T[size]);
   raft::update_host<T>(act_h.get(), actual, size, stream);
   CUDA_CHECK(cudaStreamSynchronize(stream));
@@ -110,9 +113,13 @@ testing::AssertionResult devArrMatch(T expected, const T *actual, size_t size,
 }
 
 template <typename T, typename L>
-testing::AssertionResult devArrMatch(const T *expected, const T *actual,
-                                     size_t rows, size_t cols, L eq_compare,
-                                     cudaStream_t stream = 0) {
+testing::AssertionResult devArrMatch(const T* expected,
+                                     const T* actual,
+                                     size_t rows,
+                                     size_t cols,
+                                     L eq_compare,
+                                     cudaStream_t stream = 0)
+{
   size_t size = rows * cols;
   std::unique_ptr<T[]> exp_h(new T[size]);
   std::unique_ptr<T[]> act_h(new T[size]);
@@ -126,8 +133,7 @@ testing::AssertionResult devArrMatch(const T *expected, const T *actual,
       auto act = act_h.get()[idx];
       if (!eq_compare(exp, act)) {
         return testing::AssertionFailure()
-               << "actual=" << act << " != expected=" << exp << " @" << i << ","
-               << j;
+               << "actual=" << act << " != expected=" << exp << " @" << i << "," << j;
       }
     }
   }
@@ -135,9 +141,9 @@ testing::AssertionResult devArrMatch(const T *expected, const T *actual,
 }
 
 template <typename T, typename L>
-testing::AssertionResult devArrMatch(T expected, const T *actual, size_t rows,
-                                     size_t cols, L eq_compare,
-                                     cudaStream_t stream = 0) {
+testing::AssertionResult devArrMatch(
+  T expected, const T* actual, size_t rows, size_t cols, L eq_compare, cudaStream_t stream = 0)
+{
   size_t size = rows * cols;
   std::unique_ptr<T[]> act_h(new T[size]);
   raft::update_host<T>(act_h.get(), actual, size, stream);
@@ -148,8 +154,7 @@ testing::AssertionResult devArrMatch(T expected, const T *actual, size_t rows,
       auto act = act_h.get()[idx];
       if (!eq_compare(expected, act)) {
         return testing::AssertionFailure()
-               << "actual=" << act << " != expected=" << expected << " @" << i
-               << "," << j;
+               << "actual=" << act << " != expected=" << expected << " @" << i << "," << j;
       }
     }
   }
@@ -157,24 +162,24 @@ testing::AssertionResult devArrMatch(T expected, const T *actual, size_t rows,
 }
 
 /*
-     * @brief Helper function to compare a device n-D arrays with an expected array
-     * on the host, using a custom comparison
-     * @tparam T the data type of the arrays
-     * @tparam L the comparator lambda or object function
-     * @param expected_h host array of expected value(s)
-     * @param actual_d device array actual values
-     * @param eq_compare the comparator
-     * @param stream cuda stream
-     * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
-     */
+ * @brief Helper function to compare a device n-D arrays with an expected array
+ * on the host, using a custom comparison
+ * @tparam T the data type of the arrays
+ * @tparam L the comparator lambda or object function
+ * @param expected_h host array of expected value(s)
+ * @param actual_d device array actual values
+ * @param eq_compare the comparator
+ * @param stream cuda stream
+ * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
+ */
 template <typename T, typename L>
-testing::AssertionResult devArrMatchHost(const T *expected_h, const T *actual_d,
-                                         size_t size, L eq_compare,
-                                         cudaStream_t stream = 0) {
+testing::AssertionResult devArrMatchHost(
+  const T* expected_h, const T* actual_d, size_t size, L eq_compare, cudaStream_t stream = 0)
+{
   std::unique_ptr<T[]> act_h(new T[size]);
   raft::update_host<T>(act_h.get(), actual_d, size, stream);
   CUDA_CHECK(cudaStreamSynchronize(stream));
-  bool ok = true;
+  bool ok   = true;
   auto fail = testing::AssertionFailure();
   for (size_t i(0); i < size; ++i) {
     auto exp = expected_h[i];
@@ -189,19 +194,19 @@ testing::AssertionResult devArrMatchHost(const T *expected_h, const T *actual_d,
 }
 
 /*
-     * @brief Helper function to compare diagonal values of a 2D matrix
-     * @tparam T the data type of the arrays
-     * @tparam L the comparator lambda or object function
-     * @param expected expected value along diagonal
-     * @param actual actual matrix
-     * @param eq_compare the comparator
-     * @param stream cuda stream
-     * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
-     */
+ * @brief Helper function to compare diagonal values of a 2D matrix
+ * @tparam T the data type of the arrays
+ * @tparam L the comparator lambda or object function
+ * @param expected expected value along diagonal
+ * @param actual actual matrix
+ * @param eq_compare the comparator
+ * @param stream cuda stream
+ * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
+ */
 template <typename T, typename L>
-testing::AssertionResult diagonalMatch(T expected, const T *actual, size_t rows,
-                                       size_t cols, L eq_compare,
-                                       cudaStream_t stream = 0) {
+testing::AssertionResult diagonalMatch(
+  T expected, const T* actual, size_t rows, size_t cols, L eq_compare, cudaStream_t stream = 0)
+{
   size_t size = rows * cols;
   std::unique_ptr<T[]> act_h(new T[size]);
   raft::update_host<T>(act_h.get(), actual, size, stream);
@@ -213,8 +218,7 @@ testing::AssertionResult diagonalMatch(T expected, const T *actual, size_t rows,
       auto act = act_h.get()[idx];
       if (!eq_compare(expected, act)) {
         return testing::AssertionFailure()
-               << "actual=" << act << " != expected=" << expected << " @" << i
-               << "," << j;
+               << "actual=" << act << " != expected=" << expected << " @" << i << "," << j;
       }
     }
   }
@@ -222,10 +226,10 @@ testing::AssertionResult diagonalMatch(T expected, const T *actual, size_t rows,
 }
 
 template <typename T, typename L>
-testing::AssertionResult match(const T expected, T actual, L eq_compare) {
+testing::AssertionResult match(const T expected, T actual, L eq_compare)
+{
   if (!eq_compare(expected, actual)) {
-    return testing::AssertionFailure()
-           << "actual=" << actual << " != expected=" << expected;
+    return testing::AssertionFailure() << "actual=" << actual << " != expected=" << expected;
   }
   return testing::AssertionSuccess();
 }

From 31bf93e7b2ff1f64ed16f31717453586487856f1 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Wed, 24 Nov 2021 17:58:22 -0500
Subject: [PATCH 3/5] Revert "Formatting changes"

This reverts commit cc03dbac0da3a25b51404fec2526c43812982be7.
---
 cpp/include/raft.hpp                          |    3 +-
 cpp/include/raft/cache/cache_util.cuh         |  104 +-
 cpp/include/raft/common/cub_wrappers.cuh      |   42 +-
 .../raft/common/device_loads_stores.cuh       |   87 +-
 cpp/include/raft/common/scatter.cuh           |   77 +-
 cpp/include/raft/comms/comms.hpp              |  342 ++--
 cpp/include/raft/comms/helper.hpp             |   37 +-
 cpp/include/raft/comms/mpi_comms.hpp          |  300 ++--
 cpp/include/raft/comms/std_comms.hpp          |  328 ++--
 cpp/include/raft/comms/test.hpp               |  236 +--
 cpp/include/raft/comms/ucp_helper.hpp         |  138 +-
 cpp/include/raft/comms/util.hpp               |  114 +-
 cpp/include/raft/cuda_utils.cuh               |  259 +--
 cpp/include/raft/cudart_utils.h               |  190 +-
 cpp/include/raft/device_atomics.cuh           |  265 +--
 cpp/include/raft/distance/canberra.cuh        |  136 +-
 cpp/include/raft/distance/chebyshev.cuh       |  136 +-
 cpp/include/raft/distance/cosine.cuh          |  175 +-
 cpp/include/raft/distance/distance.cuh        |  520 ++----
 cpp/include/raft/distance/euclidean.cuh       |  314 ++--
 cpp/include/raft/distance/fused_l2_nn.cuh     |  254 +--
 cpp/include/raft/distance/hellinger.cuh       |  154 +-
 cpp/include/raft/distance/l1.cuh              |  128 +-
 cpp/include/raft/distance/minkowski.cuh       |  139 +-
 .../raft/distance/pairwise_distance_base.cuh  |  159 +-
 cpp/include/raft/error.hpp                    |   50 +-
 cpp/include/raft/handle.hpp                   |  121 +-
 cpp/include/raft/integer_utils.h              |   55 +-
 cpp/include/raft/label/classlabels.cuh        |  137 +-
 cpp/include/raft/label/merge_labels.cuh       |   31 +-
 cpp/include/raft/lap/d_structs.h              |   20 +-
 cpp/include/raft/lap/lap.cuh                  |  161 +-
 cpp/include/raft/lap/lap_functions.cuh        |  399 ++---
 cpp/include/raft/lap/lap_kernels.cuh          |  343 ++--
 cpp/include/raft/linalg/add.cuh               |   35 +-
 cpp/include/raft/linalg/binary_op.cuh         |   61 +-
 .../raft/linalg/cholesky_r1_update.cuh        |   63 +-
 .../raft/linalg/coalesced_reduction.cuh       |   55 +-
 cpp/include/raft/linalg/contractions.cuh      |   76 +-
 cpp/include/raft/linalg/cublas_wrappers.h     |  921 +++-------
 cpp/include/raft/linalg/cusolver_wrappers.h   | 1144 +++---------
 cpp/include/raft/linalg/divide.cuh            |    7 +-
 cpp/include/raft/linalg/eig.cuh               |  169 +-
 cpp/include/raft/linalg/eltwise.cuh           |   56 +-
 cpp/include/raft/linalg/gemm.cuh              |   85 +-
 cpp/include/raft/linalg/gemv.h                |   54 +-
 cpp/include/raft/linalg/init.h                |    6 +-
 cpp/include/raft/linalg/lanczos.hpp           |  786 +++-----
 cpp/include/raft/linalg/map.cuh               |   31 +-
 cpp/include/raft/linalg/map_then_reduce.cuh   |   92 +-
 cpp/include/raft/linalg/matrix_vector_op.cuh  |  102 +-
 .../raft/linalg/mean_squared_error.cuh        |   10 +-
 cpp/include/raft/linalg/multiply.cuh          |    7 +-
 cpp/include/raft/linalg/norm.cuh              |   92 +-
 cpp/include/raft/linalg/qr.cuh                |   87 +-
 cpp/include/raft/linalg/reduce.cuh            |   37 +-
 cpp/include/raft/linalg/strided_reduction.cuh |   74 +-
 cpp/include/raft/linalg/subtract.cuh          |   34 +-
 cpp/include/raft/linalg/svd.cuh               |  238 +--
 cpp/include/raft/linalg/transpose.h           |   61 +-
 cpp/include/raft/linalg/unary_op.cuh          |   86 +-
 cpp/include/raft/matrix/math.cuh              |  286 +--
 cpp/include/raft/matrix/matrix.cuh            |  208 +--
 cpp/include/raft/mr/buffer_base.hpp           |   59 +-
 cpp/include/raft/mr/device/allocator.hpp      |    9 +-
 cpp/include/raft/mr/device/buffer.hpp         |   14 +-
 cpp/include/raft/mr/host/allocator.hpp        |   13 +-
 cpp/include/raft/mr/host/buffer.hpp           |   21 +-
 cpp/include/raft/random/rng.cuh               |  319 ++--
 cpp/include/raft/random/rng_impl.cuh          |   89 +-
 cpp/include/raft/sparse/convert/coo.cuh       |   20 +-
 cpp/include/raft/sparse/convert/csr.cuh       |  126 +-
 cpp/include/raft/sparse/convert/dense.cuh     |   35 +-
 cpp/include/raft/sparse/coo.cuh               |  192 +-
 cpp/include/raft/sparse/csr.cuh               |  131 +-
 cpp/include/raft/sparse/cusparse_wrappers.h   | 1590 +++++------------
 .../raft/sparse/distance/bin_distance.cuh     |  189 +-
 cpp/include/raft/sparse/distance/common.h     |   18 +-
 cpp/include/raft/sparse/distance/coo_spmv.cuh |  118 +-
 .../coo_spmv_strategies/base_strategy.cuh     |  138 +-
 .../coo_mask_row_iterators.cuh                |  166 +-
 .../dense_smem_strategy.cuh                   |  104 +-
 .../coo_spmv_strategies/hash_strategy.cuh     |  277 ++-
 .../distance/detail/coo_spmv_kernel.cuh       |  196 +-
 cpp/include/raft/sparse/distance/distance.cuh |   48 +-
 .../raft/sparse/distance/ip_distance.cuh      |   27 +-
 .../raft/sparse/distance/l2_distance.cuh      |  386 ++--
 .../raft/sparse/distance/lp_distance.cuh      |  199 +--
 .../raft/sparse/distance/operators.cuh        |   29 +-
 cpp/include/raft/sparse/distance/utils.cuh    |    6 +-
 cpp/include/raft/sparse/hierarchy/common.h    |   10 +-
 .../sparse/hierarchy/detail/agglomerative.cuh |  124 +-
 .../hierarchy/detail/connectivities.cuh       |   92 +-
 .../raft/sparse/hierarchy/detail/mst.cuh      |   93 +-
 .../raft/sparse/hierarchy/single_linkage.hpp  |   66 +-
 cpp/include/raft/sparse/linalg/add.cuh        |  116 +-
 cpp/include/raft/sparse/linalg/degree.cuh     |   56 +-
 cpp/include/raft/sparse/linalg/norm.cuh       |   51 +-
 cpp/include/raft/sparse/linalg/spectral.cuh   |   72 +-
 cpp/include/raft/sparse/linalg/symmetrize.cuh |  157 +-
 cpp/include/raft/sparse/linalg/transpose.h    |   56 +-
 .../raft/sparse/mst/detail/mst_kernels.cuh    |  160 +-
 .../raft/sparse/mst/detail/mst_solver_inl.cuh |  258 ++-
 cpp/include/raft/sparse/mst/detail/utils.cuh  |   19 +-
 cpp/include/raft/sparse/mst/mst.cuh           |   34 +-
 cpp/include/raft/sparse/mst/mst_solver.cuh    |   48 +-
 cpp/include/raft/sparse/op/filter.cuh         |  115 +-
 cpp/include/raft/sparse/op/reduce.cuh         |   55 +-
 cpp/include/raft/sparse/op/row_op.cuh         |   16 +-
 cpp/include/raft/sparse/op/slice.h            |   34 +-
 cpp/include/raft/sparse/op/sort.h             |   35 +-
 .../sparse/selection/connect_components.cuh   |  224 +--
 cpp/include/raft/sparse/selection/knn.cuh     |  444 ++---
 .../raft/sparse/selection/knn_graph.cuh       |   54 +-
 .../raft/sparse/selection/selection.cuh       |   99 +-
 cpp/include/raft/sparse/utils.h               |   22 +-
 cpp/include/raft/spatial/knn/ann.hpp          |   31 +-
 cpp/include/raft/spatial/knn/ann_common.h     |   10 +-
 .../knn/detail/ann_quantized_faiss.cuh        |  141 +-
 .../raft/spatial/knn/detail/common_faiss.h    |   37 +-
 .../spatial/knn/detail/haversine_distance.cuh |   56 +-
 .../knn/detail/knn_brute_force_faiss.cuh      |  178 +-
 .../raft/spatial/knn/detail/processing.hpp    |  134 +-
 cpp/include/raft/spatial/knn/knn.hpp          |   64 +-
 cpp/include/raft/spectral/cluster_solvers.hpp |   39 +-
 cpp/include/raft/spectral/eigen_solvers.hpp   |   66 +-
 cpp/include/raft/spectral/kmeans.hpp          |  476 ++---
 cpp/include/raft/spectral/lapack.hpp          |  552 ++----
 cpp/include/raft/spectral/matrix_wrappers.hpp |  279 ++-
 .../raft/spectral/modularity_maximization.hpp |   52 +-
 cpp/include/raft/spectral/partition.hpp       |   61 +-
 cpp/include/raft/spectral/spectral_util.hpp   |  125 +-
 cpp/include/raft/spectral/warn_dbg.hpp        |    4 +-
 cpp/include/raft/stats/mean.cuh               |   42 +-
 cpp/include/raft/stats/mean_center.cuh        |   45 +-
 cpp/include/raft/stats/stddev.cuh             |  102 +-
 cpp/include/raft/stats/sum.cuh                |   38 +-
 cpp/include/raft/vectorized.cuh               |  112 +-
 cpp/test/cluster_solvers.cu                   |   22 +-
 cpp/test/cudart_utils.cpp                     |    3 +-
 cpp/test/distance/dist_adj.cu                 |   78 +-
 cpp/test/distance/dist_canberra.cu            |   24 +-
 cpp/test/distance/dist_chebyshev.cu           |   24 +-
 cpp/test/distance/dist_cos.cu                 |   23 +-
 cpp/test/distance/dist_euc_exp.cu             |   22 +-
 cpp/test/distance/dist_euc_unexp.cu           |   18 +-
 cpp/test/distance/dist_hellinger.cu           |   24 +-
 cpp/test/distance/dist_l1.cu                  |   24 +-
 cpp/test/distance/dist_minkowski.cu           |   23 +-
 cpp/test/distance/distance_base.cuh           |  203 +--
 cpp/test/distance/fused_l2_nn.cu              |  192 +-
 cpp/test/eigen_solvers.cu                     |   35 +-
 cpp/test/handle.cpp                           |   21 +-
 cpp/test/integer_utils.cpp                    |    6 +-
 cpp/test/label/label.cu                       |   31 +-
 cpp/test/label/merge_labels.cu                |   67 +-
 cpp/test/lap/lap.cu                           |   92 +-
 cpp/test/linalg/add.cu                        |   13 +-
 cpp/test/linalg/add.cuh                       |   17 +-
 cpp/test/linalg/binary_op.cu                  |   88 +-
 cpp/test/linalg/binary_op.cuh                 |   17 +-
 cpp/test/linalg/cholesky_r1.cu                |   50 +-
 cpp/test/linalg/coalesced_reduction.cu        |   60 +-
 cpp/test/linalg/divide.cu                     |   50 +-
 cpp/test/linalg/eig.cu                        |  177 +-
 cpp/test/linalg/eig_sel.cu                    |   92 +-
 cpp/test/linalg/eltwise.cu                    |   98 +-
 cpp/test/linalg/gemm_layout.cu                |   63 +-
 cpp/test/linalg/map.cu                        |   98 +-
 cpp/test/linalg/map_then_reduce.cu            |   99 +-
 cpp/test/linalg/matrix_vector_op.cu           |  109 +-
 cpp/test/linalg/matrix_vector_op.cuh          |   73 +-
 cpp/test/linalg/multiply.cu                   |   30 +-
 cpp/test/linalg/norm.cu                       |  140 +-
 cpp/test/linalg/reduce.cu                     |   84 +-
 cpp/test/linalg/reduce.cuh                    |   59 +-
 cpp/test/linalg/strided_reduction.cu          |   61 +-
 cpp/test/linalg/subtract.cu                   |   74 +-
 cpp/test/linalg/svd.cu                        |  108 +-
 cpp/test/linalg/transpose.cu                  |   51 +-
 cpp/test/linalg/unary_op.cu                   |   46 +-
 cpp/test/linalg/unary_op.cuh                  |   17 +-
 cpp/test/matrix/math.cu                       |  194 +-
 cpp/test/matrix/matrix.cu                     |   84 +-
 cpp/test/mr/device/buffer.cpp                 |   16 +-
 cpp/test/mr/host/buffer.cpp                   |    9 +-
 cpp/test/mst.cu                               |  172 +-
 cpp/test/random/rng.cu                        |  203 ++-
 cpp/test/random/rng_int.cu                    |   66 +-
 cpp/test/random/sample_without_replacement.cu |   35 +-
 cpp/test/sparse/add.cu                        |   97 +-
 cpp/test/sparse/connect_components.cu         |  599 ++++---
 cpp/test/sparse/convert_coo.cu                |   20 +-
 cpp/test/sparse/convert_csr.cu                |   50 +-
 cpp/test/sparse/csr_row_slice.cu              |   80 +-
 cpp/test/sparse/csr_to_dense.cu               |   63 +-
 cpp/test/sparse/csr_transpose.cu              |   80 +-
 cpp/test/sparse/degree.cu                     |   23 +-
 cpp/test/sparse/dist_coo_spmv.cu              |  936 +++++-----
 cpp/test/sparse/distance.cu                   |  248 +--
 cpp/test/sparse/filter.cu                     |   33 +-
 cpp/test/sparse/knn.cu                        |   91 +-
 cpp/test/sparse/knn_graph.cu                  |   36 +-
 cpp/test/sparse/linkage.cu                    |  647 ++++---
 cpp/test/sparse/norm.cu                       |   34 +-
 cpp/test/sparse/reduce.cu                     |   50 +-
 cpp/test/sparse/row_op.cu                     |   40 +-
 cpp/test/sparse/selection.cu                  |   59 +-
 cpp/test/sparse/sort.cu                       |   22 +-
 cpp/test/sparse/symmetrize.cu                 |   89 +-
 cpp/test/spatial/haversine.cu                 |   61 +-
 cpp/test/spatial/knn.cu                       |   89 +-
 cpp/test/spectral_matrix.cu                   |   13 +-
 cpp/test/stats/mean.cu                        |   94 +-
 cpp/test/stats/mean_center.cu                 |   63 +-
 cpp/test/stats/stddev.cu                      |   46 +-
 cpp/test/stats/sum.cu                         |   25 +-
 cpp/test/test_utils.h                         |  136 +-
 218 files changed, 11470 insertions(+), 16429 deletions(-)

diff --git a/cpp/include/raft.hpp b/cpp/include/raft.hpp
index 08f836d3a8..f380d276b2 100644
--- a/cpp/include/raft.hpp
+++ b/cpp/include/raft.hpp
@@ -21,8 +21,7 @@ namespace raft {
 /* Function for testing RAFT include
  *
  * @return message indicating RAFT has been included succesfully*/
-inline std::string test_raft()
-{
+inline std::string test_raft() {
   std::string status = "RAFT Setup succesfully";
   return status;
 }
diff --git a/cpp/include/raft/cache/cache_util.cuh b/cpp/include/raft/cache/cache_util.cuh
index f63040fa00..ce8ef9a095 100644
--- a/cpp/include/raft/cache/cache_util.cuh
+++ b/cpp/include/raft/cache/cache_util.cuh
@@ -42,15 +42,17 @@ namespace cache {
  * @param [out] out vectors collected from the cache, size [n_vec * n]
  */
 template <typename math_t>
-__global__ void get_vecs(const math_t* cache, int n_vec, const int* cache_idx, int n, math_t* out)
-{
+__global__ void get_vecs(const math_t *cache, int n_vec, const int *cache_idx,
+                         int n, math_t *out) {
   int tid = threadIdx.x + blockIdx.x * blockDim.x;
   int row = tid % n_vec;  // row idx
   if (tid < n_vec * n) {
-    size_t out_col   = tid / n_vec;  // col idx
+    size_t out_col = tid / n_vec;  // col idx
     size_t cache_col = cache_idx[out_col];
     if (cache_idx[out_col] >= 0) {
-      if (row + out_col * n_vec < (size_t)n_vec * n) { out[tid] = cache[row + cache_col * n_vec]; }
+      if (row + out_col * n_vec < (size_t)n_vec * n) {
+        out[tid] = cache[row + cache_col * n_vec];
+      }
     }
   }
 }
@@ -82,26 +84,21 @@ __global__ void get_vecs(const math_t* cache, int n_vec, const int* cache_idx, i
  * @param [in] n_cache_vecs
  */
 template <typename math_t>
-__global__ void store_vecs(const math_t* tile,
-                           int n_tile,
-                           int n_vec,
-                           const int* tile_idx,
-                           int n,
-                           const int* cache_idx,
-                           math_t* cache,
-                           int n_cache_vecs)
-{
+__global__ void store_vecs(const math_t *tile, int n_tile, int n_vec,
+                           const int *tile_idx, int n, const int *cache_idx,
+                           math_t *cache, int n_cache_vecs) {
   int tid = threadIdx.x + blockIdx.x * blockDim.x;
   int row = tid % n_vec;  // row idx
   if (tid < n_vec * n) {
-    int tile_col  = tid / n_vec;  // col idx
-    int data_col  = tile_idx ? tile_idx[tile_col] : tile_col;
+    int tile_col = tid / n_vec;  // col idx
+    int data_col = tile_idx ? tile_idx[tile_col] : tile_col;
     int cache_col = cache_idx[tile_col];
 
     // We ignore negative values. The rest of the checks should be fulfilled
     // if the cache is used properly
     if (cache_col >= 0 && cache_col < n_cache_vecs && data_col < n_tile) {
-      cache[row + (size_t)cache_col * n_vec] = tile[row + (size_t)data_col * n_vec];
+      cache[row + (size_t)cache_col * n_vec] =
+        tile[row + (size_t)data_col * n_vec];
     }
   }
 }
@@ -124,15 +121,14 @@ int DI hash(int key, int n_cache_sets) { return key % n_cache_sets; }
  * @return the index of the first element in the array for which
  * array[idx] >= value. If there is no such value, then return n.
  */
-int DI arg_first_ge(const int* array, int n, int val)
-{
+int DI arg_first_ge(const int *array, int n, int val) {
   int start = 0;
-  int end   = n - 1;
+  int end = n - 1;
   if (array[0] == val) return 0;
   if (array[end] < val) return n;
   while (start + 1 < end) {
     int q = (start + end + 1) / 2;
-    // invariants:
+    //invariants:
     // start < end
     // start < q <=end
     // array[start] < val && array[end] <=val
@@ -161,8 +157,7 @@ int DI arg_first_ge(const int* array, int n, int val)
  * @return the idx of the k-th occurance of val in array, or -1 if
  * the value is not found.
  */
-int DI find_nth_occurrence(const int* array, int n, int val, int k)
-{
+int DI find_nth_occurrence(const int *array, int n, int val, int k) {
   int q = arg_first_ge(array, n, val);
   if (q + k < n && array[q + k] == val) {
     q += k;
@@ -201,10 +196,10 @@ int DI find_nth_occurrence(const int* array, int n, int val, int k)
  *   Each block should give a different pointer for rank.
  */
 template <int nthreads, int associativity>
-DI void rank_set_entries(const int* cache_time, int n_cache_sets, int* rank)
-{
+DI void rank_set_entries(const int *cache_time, int n_cache_sets, int *rank) {
   const int items_per_thread = raft::ceildiv(associativity, nthreads);
-  typedef cub::BlockRadixSort<int, nthreads, items_per_thread, int> BlockRadixSort;
+  typedef cub::BlockRadixSort<int, nthreads, items_per_thread, int>
+    BlockRadixSort;
   __shared__ typename BlockRadixSort::TempStorage temp_storage;
 
   int key[items_per_thread];
@@ -213,8 +208,8 @@ DI void rank_set_entries(const int* cache_time, int n_cache_sets, int* rank)
   int block_offset = blockIdx.x * associativity;
 
   for (int j = 0; j < items_per_thread; j++) {
-    int k  = threadIdx.x + j * nthreads;
-    int t  = (k < associativity) ? cache_time[block_offset + k] : 32768;
+    int k = threadIdx.x + j * nthreads;
+    int t = (k < associativity) ? cache_time[block_offset + k] : 32768;
     key[j] = t;
     val[j] = k;
   }
@@ -222,7 +217,9 @@ DI void rank_set_entries(const int* cache_time, int n_cache_sets, int* rank)
   BlockRadixSort(temp_storage).Sort(key, val);
 
   for (int j = 0; j < items_per_thread; j++) {
-    if (val[j] < associativity) { rank[val[j]] = threadIdx.x * items_per_thread + j; }
+    if (val[j] < associativity) {
+      rank[val[j]] = threadIdx.x * items_per_thread + j;
+    }
   }
   __syncthreads();
 }
@@ -255,15 +252,9 @@ DI void rank_set_entries(const int* cache_time, int n_cache_sets, int* rank)
  *   not be cached, size [n]
  */
 template <int nthreads, int associativity>
-__global__ void assign_cache_idx(const int* keys,
-                                 int n,
-                                 const int* cache_set,
-                                 int* cached_keys,
-                                 int n_cache_sets,
-                                 int* cache_time,
-                                 int time,
-                                 int* cache_idx)
-{
+__global__ void assign_cache_idx(const int *keys, int n, const int *cache_set,
+                                 int *cached_keys, int n_cache_sets,
+                                 int *cache_time, int time, int *cache_idx) {
   int block_offset = blockIdx.x * associativity;
 
   const int items_per_thread = raft::ceildiv(associativity, nthreads);
@@ -282,7 +273,7 @@ __global__ void assign_cache_idx(const int* keys,
   // these elements are assigned -1.
 
   for (int j = 0; j < items_per_thread; j++) {
-    int i     = threadIdx.x + j * nthreads;
+    int i = threadIdx.x + j * nthreads;
     int t_idx = block_offset + i;
     bool mask = (i < associativity);
     // whether this slot is available for writing
@@ -293,10 +284,10 @@ __global__ void assign_cache_idx(const int* keys,
     if (mask) {
       int k = find_nth_occurrence(cache_set, n, blockIdx.x, rank[i]);
       if (k > -1) {
-        int key_val        = keys[k];
+        int key_val = keys[k];
         cached_keys[t_idx] = key_val;
-        cache_idx[k]       = t_idx;
-        cache_time[t_idx]  = time;
+        cache_idx[k] = t_idx;
+        cache_time[t_idx] = time;
       }
     }
   }
@@ -324,28 +315,21 @@ namespace {
  * @param [inout] cached_keys keys stored in the cache, size [n_cache_sets * associativity]
  * @param [in] n_cache_sets number of cache sets
  * @param [in] associativity number of keys in cache set
- * @param [inout] cache_time time stamp when the indices were cached, size [n_cache_sets *
- * associativity]
+ * @param [inout] cache_time time stamp when the indices were cached, size [n_cache_sets * associativity]
  * @param [out] cache_idx cache indices of the working set elements, size [n]
  * @param [out] is_cached whether the element is cached size[n]
  * @param [in] time iteration counter (used for time stamping)
  */
-__global__ void get_cache_idx(int* keys,
-                              int n,
-                              int* cached_keys,
-                              int n_cache_sets,
-                              int associativity,
-                              int* cache_time,
-                              int* cache_idx,
-                              bool* is_cached,
-                              int time)
-{
+__global__ void get_cache_idx(int *keys, int n, int *cached_keys,
+                              int n_cache_sets, int associativity,
+                              int *cache_time, int *cache_idx, bool *is_cached,
+                              int time) {
   int tid = threadIdx.x + blockIdx.x * blockDim.x;
   if (tid < n) {
-    int widx   = keys[tid];
-    int sidx   = hash(widx, n_cache_sets);
-    int cidx   = sidx * associativity;
-    int i      = 0;
+    int widx = keys[tid];
+    int sidx = hash(widx, n_cache_sets);
+    int cidx = sidx * associativity;
+    int i = 0;
     bool found = false;
     // search for empty spot and the least recently used spot
     while (i < associativity && !found) {
@@ -354,9 +338,9 @@ __global__ void get_cache_idx(int* keys,
     }
     is_cached[tid] = found;
     if (found) {
-      cidx             = cidx + i - 1;
-      cache_time[cidx] = time;  // update time stamp
-      cache_idx[tid]   = cidx;  // exact cache idx
+      cidx = cidx + i - 1;
+      cache_time[cidx] = time;  //update time stamp
+      cache_idx[tid] = cidx;    //exact cache idx
     } else {
       cache_idx[tid] = sidx;  // assign cache set
     }
diff --git a/cpp/include/raft/common/cub_wrappers.cuh b/cpp/include/raft/common/cub_wrappers.cuh
index 4767c7f254..8d5b29f700 100644
--- a/cpp/include/raft/common/cub_wrappers.cuh
+++ b/cpp/include/raft/common/cub_wrappers.cuh
@@ -22,32 +22,28 @@
 namespace raft {
 
 /**
- * @brief Convenience wrapper over cub's SortPairs method
- * @tparam KeyT key type
- * @tparam ValueT value type
- * @param workspace workspace buffer which will get resized if not enough space
- * @param inKeys input keys array
- * @param outKeys output keys array
- * @param inVals input values array
- * @param outVals output values array
- * @param len array length
- * @param stream cuda stream
- */
+     * @brief Convenience wrapper over cub's SortPairs method
+     * @tparam KeyT key type
+     * @tparam ValueT value type
+     * @param workspace workspace buffer which will get resized if not enough space
+     * @param inKeys input keys array
+     * @param outKeys output keys array
+     * @param inVals input values array
+     * @param outVals output values array
+     * @param len array length
+     * @param stream cuda stream
+     */
 template <typename KeyT, typename ValueT>
-void sortPairs(raft::mr::device::buffer<char>& workspace,
-               const KeyT* inKeys,
-               KeyT* outKeys,
-               const ValueT* inVals,
-               ValueT* outVals,
-               int len,
-               cudaStream_t stream)
-{
+void sortPairs(raft::mr::device::buffer<char> &workspace, const KeyT *inKeys,
+               KeyT *outKeys, const ValueT *inVals, ValueT *outVals, int len,
+               cudaStream_t stream) {
   size_t worksize;
-  cub::DeviceRadixSort::SortPairs(
-    nullptr, worksize, inKeys, outKeys, inVals, outVals, len, 0, sizeof(KeyT) * 8, stream);
+  cub::DeviceRadixSort::SortPairs(nullptr, worksize, inKeys, outKeys, inVals,
+                                  outVals, len, 0, sizeof(KeyT) * 8, stream);
   workspace.resize(worksize, stream);
-  cub::DeviceRadixSort::SortPairs(
-    workspace.data(), worksize, inKeys, outKeys, inVals, outVals, len, 0, sizeof(KeyT) * 8, stream);
+  cub::DeviceRadixSort::SortPairs(workspace.data(), worksize, inKeys, outKeys,
+                                  inVals, outVals, len, 0, sizeof(KeyT) * 8,
+                                  stream);
 }
 
 }  // namespace raft
diff --git a/cpp/include/raft/common/device_loads_stores.cuh b/cpp/include/raft/common/device_loads_stores.cuh
index 41dc9cab08..bb2b019ecb 100644
--- a/cpp/include/raft/common/device_loads_stores.cuh
+++ b/cpp/include/raft/common/device_loads_stores.cuh
@@ -31,43 +31,40 @@ namespace raft {
  * @param[out] addr shared memory address (should be aligned to vector size)
  * @param[in]  x    data to be stored at this address
  */
-DI void sts(float* addr, const float& x)
-{
+DI void sts(float* addr, const float& x) {
   auto s1 = __cvta_generic_to_shared(reinterpret_cast<float*>(addr));
   asm volatile("st.shared.f32 [%0], {%1};" : : "l"(s1), "f"(x));
 }
-DI void sts(float* addr, const float (&x)[1])
-{
+DI void sts(float* addr, const float (&x)[1]) {
   auto s1 = __cvta_generic_to_shared(reinterpret_cast<float*>(addr));
   asm volatile("st.shared.f32 [%0], {%1};" : : "l"(s1), "f"(x[0]));
 }
-DI void sts(float* addr, const float (&x)[2])
-{
+DI void sts(float* addr, const float (&x)[2]) {
   auto s2 = __cvta_generic_to_shared(reinterpret_cast<float2*>(addr));
-  asm volatile("st.shared.v2.f32 [%0], {%1, %2};" : : "l"(s2), "f"(x[0]), "f"(x[1]));
+  asm volatile("st.shared.v2.f32 [%0], {%1, %2};"
+               :
+               : "l"(s2), "f"(x[0]), "f"(x[1]));
 }
-DI void sts(float* addr, const float (&x)[4])
-{
+DI void sts(float* addr, const float (&x)[4]) {
   auto s4 = __cvta_generic_to_shared(reinterpret_cast<float4*>(addr));
   asm volatile("st.shared.v4.f32 [%0], {%1, %2, %3, %4};"
                :
                : "l"(s4), "f"(x[0]), "f"(x[1]), "f"(x[2]), "f"(x[3]));
 }
 
-DI void sts(double* addr, const double& x)
-{
+DI void sts(double* addr, const double& x) {
   auto s1 = __cvta_generic_to_shared(reinterpret_cast<double*>(addr));
   asm volatile("st.shared.f64 [%0], {%1};" : : "l"(s1), "d"(x));
 }
-DI void sts(double* addr, const double (&x)[1])
-{
+DI void sts(double* addr, const double (&x)[1]) {
   auto s1 = __cvta_generic_to_shared(reinterpret_cast<double*>(addr));
   asm volatile("st.shared.f64 [%0], {%1};" : : "l"(s1), "d"(x[0]));
 }
-DI void sts(double* addr, const double (&x)[2])
-{
+DI void sts(double* addr, const double (&x)[2]) {
   auto s2 = __cvta_generic_to_shared(reinterpret_cast<double2*>(addr));
-  asm volatile("st.shared.v2.f64 [%0], {%1, %2};" : : "l"(s2), "d"(x[0]), "d"(x[1]));
+  asm volatile("st.shared.v2.f64 [%0], {%1, %2};"
+               :
+               : "l"(s2), "d"(x[0]), "d"(x[1]));
 }
 /** @} */
 
@@ -83,42 +80,39 @@ DI void sts(double* addr, const double (&x)[2])
  * @param[in]  addr shared memory address from where to load
  *                  (should be aligned to vector size)
  */
-DI void lds(float& x, float* addr)
-{
+DI void lds(float& x, float* addr) {
   auto s1 = __cvta_generic_to_shared(reinterpret_cast<float*>(addr));
   asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x) : "l"(s1));
 }
-DI void lds(float (&x)[1], float* addr)
-{
+DI void lds(float (&x)[1], float* addr) {
   auto s1 = __cvta_generic_to_shared(reinterpret_cast<float*>(addr));
   asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x[0]) : "l"(s1));
 }
-DI void lds(float (&x)[2], float* addr)
-{
+DI void lds(float (&x)[2], float* addr) {
   auto s2 = __cvta_generic_to_shared(reinterpret_cast<float2*>(addr));
-  asm volatile("ld.shared.v2.f32 {%0, %1}, [%2];" : "=f"(x[0]), "=f"(x[1]) : "l"(s2));
+  asm volatile("ld.shared.v2.f32 {%0, %1}, [%2];"
+               : "=f"(x[0]), "=f"(x[1])
+               : "l"(s2));
 }
-DI void lds(float (&x)[4], float* addr)
-{
+DI void lds(float (&x)[4], float* addr) {
   auto s4 = __cvta_generic_to_shared(reinterpret_cast<float4*>(addr));
   asm volatile("ld.shared.v4.f32 {%0, %1, %2, %3}, [%4];"
                : "=f"(x[0]), "=f"(x[1]), "=f"(x[2]), "=f"(x[3])
                : "l"(s4));
 }
-DI void lds(double& x, double* addr)
-{
+DI void lds(double& x, double* addr) {
   auto s1 = __cvta_generic_to_shared(reinterpret_cast<double*>(addr));
   asm volatile("ld.shared.f64 {%0}, [%1];" : "=d"(x) : "l"(s1));
 }
-DI void lds(double (&x)[1], double* addr)
-{
+DI void lds(double (&x)[1], double* addr) {
   auto s1 = __cvta_generic_to_shared(reinterpret_cast<double*>(addr));
   asm volatile("ld.shared.f64 {%0}, [%1];" : "=d"(x[0]) : "l"(s1));
 }
-DI void lds(double (&x)[2], double* addr)
-{
+DI void lds(double (&x)[2], double* addr) {
   auto s2 = __cvta_generic_to_shared(reinterpret_cast<double2*>(addr));
-  asm volatile("ld.shared.v2.f64 {%0, %1}, [%2];" : "=d"(x[0]), "=d"(x[1]) : "l"(s2));
+  asm volatile("ld.shared.v2.f64 {%0, %1}, [%2];"
+               : "=d"(x[0]), "=d"(x[1])
+               : "l"(s2));
 }
 /** @} */
 
@@ -129,35 +123,32 @@ DI void lds(double (&x)[2], double* addr)
  * @param[out] x    data to be loaded from global memory
  * @param[in]  addr address in global memory from where to load
  */
-DI void ldg(float& x, const float* addr)
-{
+DI void ldg(float& x, const float* addr) {
   asm volatile("ld.global.cg.f32 %0, [%1];" : "=f"(x) : "l"(addr));
 }
-DI void ldg(float (&x)[1], const float* addr)
-{
+DI void ldg(float (&x)[1], const float* addr) {
   asm volatile("ld.global.cg.f32 %0, [%1];" : "=f"(x[0]) : "l"(addr));
 }
-DI void ldg(float (&x)[2], const float* addr)
-{
-  asm volatile("ld.global.cg.v2.f32 {%0, %1}, [%2];" : "=f"(x[0]), "=f"(x[1]) : "l"(addr));
+DI void ldg(float (&x)[2], const float* addr) {
+  asm volatile("ld.global.cg.v2.f32 {%0, %1}, [%2];"
+               : "=f"(x[0]), "=f"(x[1])
+               : "l"(addr));
 }
-DI void ldg(float (&x)[4], const float* addr)
-{
+DI void ldg(float (&x)[4], const float* addr) {
   asm volatile("ld.global.cg.v4.f32 {%0, %1, %2, %3}, [%4];"
                : "=f"(x[0]), "=f"(x[1]), "=f"(x[2]), "=f"(x[3])
                : "l"(addr));
 }
-DI void ldg(double& x, const double* addr)
-{
+DI void ldg(double& x, const double* addr) {
   asm volatile("ld.global.cg.f64 %0, [%1];" : "=d"(x) : "l"(addr));
 }
-DI void ldg(double (&x)[1], const double* addr)
-{
+DI void ldg(double (&x)[1], const double* addr) {
   asm volatile("ld.global.cg.f64 %0, [%1];" : "=d"(x[0]) : "l"(addr));
 }
-DI void ldg(double (&x)[2], const double* addr)
-{
-  asm volatile("ld.global.cg.v2.f64 {%0, %1}, [%2];" : "=d"(x[0]), "=d"(x[1]) : "l"(addr));
+DI void ldg(double (&x)[2], const double* addr) {
+  asm volatile("ld.global.cg.v2.f64 {%0, %1}, [%2];"
+               : "=d"(x[0]), "=d"(x[1])
+               : "l"(addr));
 }
 /** @} */
 
diff --git a/cpp/include/raft/common/scatter.cuh b/cpp/include/raft/common/scatter.cuh
index b228ac5499..785794461e 100644
--- a/cpp/include/raft/common/scatter.cuh
+++ b/cpp/include/raft/common/scatter.cuh
@@ -22,8 +22,8 @@
 namespace raft {
 
 template <typename DataT, int VecLen, typename Lambda, typename IdxT>
-__global__ void scatterKernel(DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op)
-{
+__global__ void scatterKernel(DataT *out, const DataT *in, const IdxT *idx,
+                              IdxT len, Lambda op) {
   typedef TxN_t<DataT, VecLen> DataVec;
   typedef TxN_t<IdxT, VecLen> IdxVec;
   IdxT tid = threadIdx.x + ((IdxT)blockIdx.x * blockDim.x);
@@ -34,60 +34,61 @@ __global__ void scatterKernel(DataT* out, const DataT* in, const IdxT* idx, IdxT
   DataVec dataIn;
 #pragma unroll
   for (int i = 0; i < VecLen; ++i) {
-    auto inPos         = idxIn.val.data[i];
+    auto inPos = idxIn.val.data[i];
     dataIn.val.data[i] = op(in[inPos], tid + i);
   }
   dataIn.store(out, tid);
 }
 
 template <typename DataT, int VecLen, typename Lambda, typename IdxT, int TPB>
-void scatterImpl(
-  DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op, cudaStream_t stream)
-{
+void scatterImpl(DataT *out, const DataT *in, const IdxT *idx, IdxT len,
+                 Lambda op, cudaStream_t stream) {
   const IdxT nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxT)TPB);
-  scatterKernel<DataT, VecLen, Lambda, IdxT><<<nblks, TPB, 0, stream>>>(out, in, idx, len, op);
+  scatterKernel<DataT, VecLen, Lambda, IdxT>
+    <<<nblks, TPB, 0, stream>>>(out, in, idx, len, op);
   CUDA_CHECK(cudaGetLastError());
 }
 
 /**
- * @brief Performs scatter operation based on the input indexing array
- * @tparam DataT data type whose array gets scattered
- * @tparam IdxT indexing type
- * @tparam TPB threads-per-block in the final kernel launched
- * @tparam Lambda the device-lambda performing a unary operation on the loaded
- * data before it gets scattered
- * @param out the output array
- * @param in the input array
- * @param idx the indexing array
- * @param len number of elements in the input array
- * @param stream cuda stream where to launch work
- * @param op the device-lambda with signature `DataT func(DataT, IdxT);`. This
- * will be applied to every element before scattering it to the right location.
- * The second param in this method will be the destination index.
- */
-template <typename DataT, typename IdxT, typename Lambda = raft::Nop<DataT, IdxT>, int TPB = 256>
-void scatter(DataT* out,
-             const DataT* in,
-             const IdxT* idx,
-             IdxT len,
-             cudaStream_t stream,
-             Lambda op = raft::Nop<DataT, IdxT>())
-{
+     * @brief Performs scatter operation based on the input indexing array
+     * @tparam DataT data type whose array gets scattered
+     * @tparam IdxT indexing type
+     * @tparam TPB threads-per-block in the final kernel launched
+     * @tparam Lambda the device-lambda performing a unary operation on the loaded
+     * data before it gets scattered
+     * @param out the output array
+     * @param in the input array
+     * @param idx the indexing array
+     * @param len number of elements in the input array
+     * @param stream cuda stream where to launch work
+     * @param op the device-lambda with signature `DataT func(DataT, IdxT);`. This
+     * will be applied to every element before scattering it to the right location.
+     * The second param in this method will be the destination index.
+     */
+template <typename DataT, typename IdxT,
+          typename Lambda = raft::Nop<DataT, IdxT>, int TPB = 256>
+void scatter(DataT *out, const DataT *in, const IdxT *idx, IdxT len,
+             cudaStream_t stream, Lambda op = raft::Nop<DataT, IdxT>()) {
   if (len <= 0) return;
-  constexpr size_t DataSize   = sizeof(DataT);
-  constexpr size_t IdxSize    = sizeof(IdxT);
+  constexpr size_t DataSize = sizeof(DataT);
+  constexpr size_t IdxSize = sizeof(IdxT);
   constexpr size_t MaxPerElem = DataSize > IdxSize ? DataSize : IdxSize;
-  size_t bytes                = len * MaxPerElem;
+  size_t bytes = len * MaxPerElem;
   if (16 / MaxPerElem && bytes % 16 == 0) {
-    scatterImpl<DataT, 16 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op, stream);
+    scatterImpl<DataT, 16 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len,
+                                                           op, stream);
   } else if (8 / MaxPerElem && bytes % 8 == 0) {
-    scatterImpl<DataT, 8 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op, stream);
+    scatterImpl<DataT, 8 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op,
+                                                          stream);
   } else if (4 / MaxPerElem && bytes % 4 == 0) {
-    scatterImpl<DataT, 4 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op, stream);
+    scatterImpl<DataT, 4 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op,
+                                                          stream);
   } else if (2 / MaxPerElem && bytes % 2 == 0) {
-    scatterImpl<DataT, 2 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op, stream);
+    scatterImpl<DataT, 2 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op,
+                                                          stream);
   } else if (1 / MaxPerElem) {
-    scatterImpl<DataT, 1 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op, stream);
+    scatterImpl<DataT, 1 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op,
+                                                          stream);
   } else {
     scatterImpl<DataT, 1, Lambda, IdxT, TPB>(out, in, idx, len, op, stream);
   }
diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp
index 72c3b3897e..dc172c9503 100644
--- a/cpp/include/raft/comms/comms.hpp
+++ b/cpp/include/raft/comms/comms.hpp
@@ -25,7 +25,16 @@ namespace raft {
 namespace comms {
 
 typedef unsigned int request_t;
-enum class datatype_t { CHAR, UINT8, INT32, UINT32, INT64, UINT64, FLOAT32, FLOAT64 };
+enum class datatype_t {
+  CHAR,
+  UINT8,
+  INT32,
+  UINT32,
+  INT64,
+  UINT64,
+  FLOAT32,
+  FLOAT64
+};
 enum class op_t { SUM, PROD, MIN, MAX };
 
 /**
@@ -41,50 +50,42 @@ template <typename value_t>
 constexpr datatype_t get_type();
 
 template <>
-constexpr datatype_t get_type<char>()
-{
+constexpr datatype_t get_type<char>() {
   return datatype_t::CHAR;
 }
 
 template <>
-constexpr datatype_t get_type<uint8_t>()
-{
+constexpr datatype_t get_type<uint8_t>() {
   return datatype_t::UINT8;
 }
 
 template <>
-constexpr datatype_t get_type<int>()
-{
+constexpr datatype_t get_type<int>() {
   return datatype_t::INT32;
 }
 
 template <>
-constexpr datatype_t get_type<uint32_t>()
-{
+constexpr datatype_t get_type<uint32_t>() {
   return datatype_t::UINT32;
 }
 
 template <>
-constexpr datatype_t get_type<int64_t>()
-{
+constexpr datatype_t get_type<int64_t>() {
   return datatype_t::INT64;
 }
 
 template <>
-constexpr datatype_t get_type<uint64_t>()
-{
+constexpr datatype_t get_type<uint64_t>() {
   return datatype_t::UINT64;
 }
 
 template <>
-constexpr datatype_t get_type<float>()
-{
+constexpr datatype_t get_type<float>() {
   return datatype_t::FLOAT32;
 }
 
 template <>
-constexpr datatype_t get_type<double>()
-{
+constexpr datatype_t get_type<double>() {
   return datatype_t::FLOAT64;
 }
 
@@ -94,99 +95,72 @@ class comms_iface {
   virtual int get_rank() const = 0;
 
   virtual std::unique_ptr<comms_iface> comm_split(int color, int key) const = 0;
-  virtual void barrier() const                                              = 0;
+  virtual void barrier() const = 0;
 
   virtual status_t sync_stream(cudaStream_t stream) const = 0;
 
-  virtual void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const = 0;
+  virtual void isend(const void* buf, size_t size, int dest, int tag,
+                     request_t* request) const = 0;
 
-  virtual void irecv(void* buf, size_t size, int source, int tag, request_t* request) const = 0;
+  virtual void irecv(void* buf, size_t size, int source, int tag,
+                     request_t* request) const = 0;
 
   virtual void waitall(int count, request_t array_of_requests[]) const = 0;
 
-  virtual void allreduce(const void* sendbuff,
-                         void* recvbuff,
-                         size_t count,
-                         datatype_t datatype,
-                         op_t op,
+  virtual void allreduce(const void* sendbuff, void* recvbuff, size_t count,
+                         datatype_t datatype, op_t op,
                          cudaStream_t stream) const = 0;
 
-  virtual void bcast(
-    void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const = 0;
+  virtual void bcast(void* buff, size_t count, datatype_t datatype, int root,
+                     cudaStream_t stream) const = 0;
 
-  virtual void reduce(const void* sendbuff,
-                      void* recvbuff,
-                      size_t count,
-                      datatype_t datatype,
-                      op_t op,
-                      int root,
+  virtual void reduce(const void* sendbuff, void* recvbuff, size_t count,
+                      datatype_t datatype, op_t op, int root,
                       cudaStream_t stream) const = 0;
 
-  virtual void allgather(const void* sendbuff,
-                         void* recvbuff,
-                         size_t sendcount,
-                         datatype_t datatype,
-                         cudaStream_t stream) const = 0;
+  virtual void allgather(const void* sendbuff, void* recvbuff, size_t sendcount,
+                         datatype_t datatype, cudaStream_t stream) const = 0;
+
+  virtual void allgatherv(const void* sendbuf, void* recvbuf,
+                          const size_t* recvcounts, const size_t* displs,
+                          datatype_t datatype, cudaStream_t stream) const = 0;
 
-  virtual void allgatherv(const void* sendbuf,
-                          void* recvbuf,
-                          const size_t* recvcounts,
-                          const size_t* displs,
-                          datatype_t datatype,
-                          cudaStream_t stream) const = 0;
-
-  virtual void gather(const void* sendbuff,
-                      void* recvbuff,
-                      size_t sendcount,
-                      datatype_t datatype,
-                      int root,
+  virtual void gather(const void* sendbuff, void* recvbuff, size_t sendcount,
+                      datatype_t datatype, int root,
                       cudaStream_t stream) const = 0;
 
-  virtual void gatherv(const void* sendbuf,
-                       void* recvbuf,
-                       size_t sendcount,
-                       const size_t* recvcounts,
-                       const size_t* displs,
-                       datatype_t datatype,
-                       int root,
+  virtual void gatherv(const void* sendbuf, void* recvbuf, size_t sendcount,
+                       const size_t* recvcounts, const size_t* displs,
+                       datatype_t datatype, int root,
                        cudaStream_t stream) const = 0;
 
-  virtual void reducescatter(const void* sendbuff,
-                             void* recvbuff,
-                             size_t recvcount,
-                             datatype_t datatype,
-                             op_t op,
+  virtual void reducescatter(const void* sendbuff, void* recvbuff,
+                             size_t recvcount, datatype_t datatype, op_t op,
                              cudaStream_t stream) const = 0;
 
   // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock
-  virtual void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const = 0;
+  virtual void device_send(const void* buf, size_t size, int dest,
+                           cudaStream_t stream) const = 0;
 
   // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock
-  virtual void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const = 0;
-
-  virtual void device_sendrecv(const void* sendbuf,
-                               size_t sendsize,
-                               int dest,
-                               void* recvbuf,
-                               size_t recvsize,
-                               int source,
+  virtual void device_recv(void* buf, size_t size, int source,
+                           cudaStream_t stream) const = 0;
+
+  virtual void device_sendrecv(const void* sendbuf, size_t sendsize, int dest,
+                               void* recvbuf, size_t recvsize, int source,
                                cudaStream_t stream) const = 0;
 
-  virtual void device_multicast_sendrecv(const void* sendbuf,
-                                         std::vector<size_t> const& sendsizes,
-                                         std::vector<size_t> const& sendoffsets,
-                                         std::vector<int> const& dests,
-                                         void* recvbuf,
-                                         std::vector<size_t> const& recvsizes,
-                                         std::vector<size_t> const& recvoffsets,
-                                         std::vector<int> const& sources,
-                                         cudaStream_t stream) const = 0;
+  virtual void device_multicast_sendrecv(
+    const void* sendbuf, std::vector<size_t> const& sendsizes,
+    std::vector<size_t> const& sendoffsets, std::vector<int> const& dests,
+    void* recvbuf, std::vector<size_t> const& recvsizes,
+    std::vector<size_t> const& recvoffsets, std::vector<int> const& sources,
+    cudaStream_t stream) const = 0;
 };
 
 class comms_t {
  public:
-  comms_t(std::unique_ptr<comms_iface> impl) : impl_(impl.release())
-  {
+  comms_t(std::unique_ptr<comms_iface> impl) : impl_(impl.release()) {
     ASSERT(nullptr != impl_.get(), "ERROR: Invalid comms_iface used!");
   }
 
@@ -213,8 +187,7 @@ class comms_t {
    * @param color ranks w/ the same color are placed in the same communicator
    * @param key controls rank assignment
    */
-  std::unique_ptr<comms_iface> comm_split(int color, int key) const
-  {
+  std::unique_ptr<comms_iface> comm_split(int color, int key) const {
     return impl_->comm_split(color, key);
   }
 
@@ -231,7 +204,9 @@ class comms_t {
    *
    * @param stream the cuda stream to sync collective operations on
    */
-  status_t sync_stream(cudaStream_t stream) const { return impl_->sync_stream(stream); }
+  status_t sync_stream(cudaStream_t stream) const {
+    return impl_->sync_stream(stream);
+  }
 
   /**
    * Performs an asynchronous point-to-point send
@@ -244,9 +219,10 @@ class comms_t {
    * 		This will be used in `waitall()` to synchronize until the message is delivered (or fails).
    */
   template <typename value_t>
-  void isend(const value_t* buf, size_t size, int dest, int tag, request_t* request) const
-  {
-    impl_->isend(static_cast<const void*>(buf), size * sizeof(value_t), dest, tag, request);
+  void isend(const value_t* buf, size_t size, int dest, int tag,
+             request_t* request) const {
+    impl_->isend(static_cast<const void*>(buf), size * sizeof(value_t), dest,
+                 tag, request);
   }
 
   /**
@@ -260,9 +236,10 @@ class comms_t {
    * 		This will be used in `waitall()` to synchronize until the message is delivered (or fails).
    */
   template <typename value_t>
-  void irecv(value_t* buf, size_t size, int source, int tag, request_t* request) const
-  {
-    impl_->irecv(static_cast<void*>(buf), size * sizeof(value_t), source, tag, request);
+  void irecv(value_t* buf, size_t size, int source, int tag,
+             request_t* request) const {
+    impl_->irecv(static_cast<void*>(buf), size * sizeof(value_t), source, tag,
+                 request);
   }
 
   /**
@@ -270,8 +247,7 @@ class comms_t {
    * @param count number of requests to synchronize on
    * @param array_of_requests an array of request_t objects returned from isend/irecv
    */
-  void waitall(int count, request_t array_of_requests[]) const
-  {
+  void waitall(int count, request_t array_of_requests[]) const {
     impl_->waitall(count, array_of_requests);
   }
 
@@ -285,15 +261,11 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void allreduce(
-    const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op, cudaStream_t stream) const
-  {
+  void allreduce(const value_t* sendbuff, value_t* recvbuff, size_t count,
+                 op_t op, cudaStream_t stream) const {
     impl_->allreduce(static_cast<const void*>(sendbuff),
-                     static_cast<void*>(recvbuff),
-                     count,
-                     get_type<value_t>(),
-                     op,
-                     stream);
+                     static_cast<void*>(recvbuff), count, get_type<value_t>(),
+                     op, stream);
   }
 
   /**
@@ -305,9 +277,9 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const
-  {
-    impl_->bcast(static_cast<void*>(buff), count, get_type<value_t>(), root, stream);
+  void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const {
+    impl_->bcast(static_cast<void*>(buff), count, get_type<value_t>(), root,
+                 stream);
   }
 
   /**
@@ -321,20 +293,11 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void reduce(const value_t* sendbuff,
-              value_t* recvbuff,
-              size_t count,
-              op_t op,
-              int root,
-              cudaStream_t stream) const
-  {
+  void reduce(const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op,
+              int root, cudaStream_t stream) const {
     impl_->reduce(static_cast<const void*>(sendbuff),
-                  static_cast<void*>(recvbuff),
-                  count,
-                  get_type<value_t>(),
-                  op,
-                  root,
-                  stream);
+                  static_cast<void*>(recvbuff), count, get_type<value_t>(), op,
+                  root, stream);
   }
 
   /**
@@ -346,16 +309,11 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void allgather(const value_t* sendbuff,
-                 value_t* recvbuff,
-                 size_t sendcount,
-                 cudaStream_t stream) const
-  {
+  void allgather(const value_t* sendbuff, value_t* recvbuff, size_t sendcount,
+                 cudaStream_t stream) const {
     impl_->allgather(static_cast<const void*>(sendbuff),
-                     static_cast<void*>(recvbuff),
-                     sendcount,
-                     get_type<value_t>(),
-                     stream);
+                     static_cast<void*>(recvbuff), sendcount,
+                     get_type<value_t>(), stream);
   }
 
   /**
@@ -370,18 +328,12 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void allgatherv(const value_t* sendbuf,
-                  value_t* recvbuf,
-                  const size_t* recvcounts,
-                  const size_t* displs,
-                  cudaStream_t stream) const
-  {
+  void allgatherv(const value_t* sendbuf, value_t* recvbuf,
+                  const size_t* recvcounts, const size_t* displs,
+                  cudaStream_t stream) const {
     impl_->allgatherv(static_cast<const void*>(sendbuf),
-                      static_cast<void*>(recvbuf),
-                      recvcounts,
-                      displs,
-                      get_type<value_t>(),
-                      stream);
+                      static_cast<void*>(recvbuf), recvcounts, displs,
+                      get_type<value_t>(), stream);
   }
 
   /**
@@ -394,18 +346,11 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void gather(const value_t* sendbuff,
-              value_t* recvbuff,
-              size_t sendcount,
-              int root,
-              cudaStream_t stream) const
-  {
+  void gather(const value_t* sendbuff, value_t* recvbuff, size_t sendcount,
+              int root, cudaStream_t stream) const {
     impl_->gather(static_cast<const void*>(sendbuff),
-                  static_cast<void*>(recvbuff),
-                  sendcount,
-                  get_type<value_t>(),
-                  root,
-                  stream);
+                  static_cast<void*>(recvbuff), sendcount, get_type<value_t>(),
+                  root, stream);
   }
 
   /**
@@ -422,22 +367,12 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void gatherv(const value_t* sendbuf,
-               value_t* recvbuf,
-               size_t sendcount,
-               const size_t* recvcounts,
-               const size_t* displs,
-               int root,
-               cudaStream_t stream) const
-  {
+  void gatherv(const value_t* sendbuf, value_t* recvbuf, size_t sendcount,
+               const size_t* recvcounts, const size_t* displs, int root,
+               cudaStream_t stream) const {
     impl_->gatherv(static_cast<const void*>(sendbuf),
-                   static_cast<void*>(recvbuf),
-                   sendcount,
-                   recvcounts,
-                   displs,
-                   get_type<value_t>(),
-                   root,
-                   stream);
+                   static_cast<void*>(recvbuf), sendcount, recvcounts, displs,
+                   get_type<value_t>(), root, stream);
   }
 
   /**
@@ -449,18 +384,11 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void reducescatter(const value_t* sendbuff,
-                     value_t* recvbuff,
-                     size_t recvcount,
-                     op_t op,
-                     cudaStream_t stream) const
-  {
+  void reducescatter(const value_t* sendbuff, value_t* recvbuff,
+                     size_t recvcount, op_t op, cudaStream_t stream) const {
     impl_->reducescatter(static_cast<const void*>(sendbuff),
-                         static_cast<void*>(recvbuff),
-                         recvcount,
-                         get_type<value_t>(),
-                         op,
-                         stream);
+                         static_cast<void*>(recvbuff), recvcount,
+                         get_type<value_t>(), op, stream);
   }
 
   /**
@@ -475,9 +403,10 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void device_send(const value_t* buf, size_t size, int dest, cudaStream_t stream) const
-  {
-    impl_->device_send(static_cast<const void*>(buf), size * sizeof(value_t), dest, stream);
+  void device_send(const value_t* buf, size_t size, int dest,
+                   cudaStream_t stream) const {
+    impl_->device_send(static_cast<const void*>(buf), size * sizeof(value_t),
+                       dest, stream);
   }
 
   /**
@@ -492,9 +421,10 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void device_recv(value_t* buf, size_t size, int source, cudaStream_t stream) const
-  {
-    impl_->device_recv(static_cast<void*>(buf), size * sizeof(value_t), source, stream);
+  void device_recv(value_t* buf, size_t size, int source,
+                   cudaStream_t stream) const {
+    impl_->device_recv(static_cast<void*>(buf), size * sizeof(value_t), source,
+                       stream);
   }
 
   /**
@@ -510,21 +440,12 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void device_sendrecv(const value_t* sendbuf,
-                       size_t sendsize,
-                       int dest,
-                       value_t* recvbuf,
-                       size_t recvsize,
-                       int source,
-                       cudaStream_t stream) const
-  {
-    impl_->device_sendrecv(static_cast<const void*>(sendbuf),
-                           sendsize * sizeof(value_t),
-                           dest,
-                           static_cast<void*>(recvbuf),
-                           recvsize * sizeof(value_t),
-                           source,
-                           stream);
+  void device_sendrecv(const value_t* sendbuf, size_t sendsize, int dest,
+                       value_t* recvbuf, size_t recvsize, int source,
+                       cudaStream_t stream) const {
+    impl_->device_sendrecv(
+      static_cast<const void*>(sendbuf), sendsize * sizeof(value_t), dest,
+      static_cast<void*>(recvbuf), recvsize * sizeof(value_t), source, stream);
   }
 
   /**
@@ -542,37 +463,28 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void device_multicast_sendrecv(const value_t* sendbuf,
-                                 std::vector<size_t> const& sendsizes,
-                                 std::vector<size_t> const& sendoffsets,
-                                 std::vector<int> const& dests,
-                                 value_t* recvbuf,
-                                 std::vector<size_t> const& recvsizes,
-                                 std::vector<size_t> const& recvoffsets,
-                                 std::vector<int> const& sources,
-                                 cudaStream_t stream) const
-  {
-    auto sendbytesizes   = sendsizes;
+  void device_multicast_sendrecv(
+    const value_t* sendbuf, std::vector<size_t> const& sendsizes,
+    std::vector<size_t> const& sendoffsets, std::vector<int> const& dests,
+    value_t* recvbuf, std::vector<size_t> const& recvsizes,
+    std::vector<size_t> const& recvoffsets, std::vector<int> const& sources,
+    cudaStream_t stream) const {
+    auto sendbytesizes = sendsizes;
     auto sendbyteoffsets = sendoffsets;
     for (size_t i = 0; i < sendsizes.size(); ++i) {
       sendbytesizes[i] *= sizeof(value_t);
       sendbyteoffsets[i] *= sizeof(value_t);
     }
-    auto recvbytesizes   = recvsizes;
+    auto recvbytesizes = recvsizes;
     auto recvbyteoffsets = recvoffsets;
     for (size_t i = 0; i < recvsizes.size(); ++i) {
       recvbytesizes[i] *= sizeof(value_t);
       recvbyteoffsets[i] *= sizeof(value_t);
     }
     impl_->device_multicast_sendrecv(static_cast<const void*>(sendbuf),
-                                     sendbytesizes,
-                                     sendbyteoffsets,
-                                     dests,
-                                     static_cast<void*>(recvbuf),
-                                     recvbytesizes,
-                                     recvbyteoffsets,
-                                     sources,
-                                     stream);
+                                     sendbytesizes, sendbyteoffsets, dests,
+                                     static_cast<void*>(recvbuf), recvbytesizes,
+                                     recvbyteoffsets, sources, stream);
   }
 
  private:
diff --git a/cpp/include/raft/comms/helper.hpp b/cpp/include/raft/comms/helper.hpp
index 93e31b4d6a..7b24e31bbe 100644
--- a/cpp/include/raft/comms/helper.hpp
+++ b/cpp/include/raft/comms/helper.hpp
@@ -36,9 +36,9 @@ namespace comms {
  * @param num_ranks number of ranks in communicator clique
  * @param rank rank of local instance
  */
-void build_comms_nccl_only(handle_t* handle, ncclComm_t nccl_comm, int num_ranks, int rank)
-{
-  auto d_alloc        = handle->get_device_allocator();
+void build_comms_nccl_only(handle_t *handle, ncclComm_t nccl_comm,
+                           int num_ranks, int rank) {
+  auto d_alloc = handle->get_device_allocator();
   cudaStream_t stream = handle->get_stream();
 
   auto communicator = std::make_shared<comms_t>(std::unique_ptr<comms_iface>(
@@ -61,41 +61,40 @@ void build_comms_nccl_only(handle_t* handle, ncclComm_t nccl_comm, int num_ranks
  * @param num_ranks number of ranks in communicator clique
  * @param rank rank of local instance
  */
-void build_comms_nccl_ucx(
-  handle_t* handle, ncclComm_t nccl_comm, void* ucp_worker, void* eps, int num_ranks, int rank)
-{
-  auto eps_sp = std::make_shared<ucp_ep_h*>(new ucp_ep_h[num_ranks]);
+void build_comms_nccl_ucx(handle_t *handle, ncclComm_t nccl_comm,
+                          void *ucp_worker, void *eps, int num_ranks,
+                          int rank) {
+  auto eps_sp = std::make_shared<ucp_ep_h *>(new ucp_ep_h[num_ranks]);
 
-  auto size_t_ep_arr = reinterpret_cast<size_t*>(eps);
+  auto size_t_ep_arr = reinterpret_cast<size_t *>(eps);
 
   for (int i = 0; i < num_ranks; i++) {
-    size_t ptr    = size_t_ep_arr[i];
-    auto ucp_ep_v = reinterpret_cast<ucp_ep_h*>(*eps_sp);
+    size_t ptr = size_t_ep_arr[i];
+    auto ucp_ep_v = reinterpret_cast<ucp_ep_h *>(*eps_sp);
 
     if (ptr != 0) {
       auto eps_ptr = reinterpret_cast<ucp_ep_h>(size_t_ep_arr[i]);
-      ucp_ep_v[i]  = eps_ptr;
+      ucp_ep_v[i] = eps_ptr;
     } else {
       ucp_ep_v[i] = nullptr;
     }
   }
 
-  auto d_alloc        = handle->get_device_allocator();
+  auto d_alloc = handle->get_device_allocator();
   cudaStream_t stream = handle->get_stream();
 
-  auto communicator =
-    std::make_shared<comms_t>(std::unique_ptr<comms_iface>(new raft::comms::std_comms(
-      nccl_comm, (ucp_worker_h)ucp_worker, eps_sp, num_ranks, rank, d_alloc, stream)));
+  auto communicator = std::make_shared<comms_t>(std::unique_ptr<comms_iface>(
+    new raft::comms::std_comms(nccl_comm, (ucp_worker_h)ucp_worker, eps_sp,
+                               num_ranks, rank, d_alloc, stream)));
   handle->set_comms(communicator);
 }
 
-inline void nccl_unique_id_from_char(ncclUniqueId* id, char* uniqueId, int size)
-{
+inline void nccl_unique_id_from_char(ncclUniqueId *id, char *uniqueId,
+                                     int size) {
   memcpy(id->internal, uniqueId, size);
 }
 
-inline void get_unique_id(char* uid, int size)
-{
+inline void get_unique_id(char *uid, int size) {
   ncclUniqueId id;
   ncclGetUniqueId(&id);
 
diff --git a/cpp/include/raft/comms/mpi_comms.hpp b/cpp/include/raft/comms/mpi_comms.hpp
index 65f38b2625..8dda74f0a9 100644
--- a/cpp/include/raft/comms/mpi_comms.hpp
+++ b/cpp/include/raft/comms/mpi_comms.hpp
@@ -32,16 +32,16 @@
 #include <raft/error.hpp>
 #include <raft/handle.hpp>
 
-#define MPI_TRY(call)                                                                         \
-  do {                                                                                        \
-    int status = call;                                                                        \
-    if (MPI_SUCCESS != status) {                                                              \
-      int mpi_error_string_lenght = 0;                                                        \
-      char mpi_error_string[MPI_MAX_ERROR_STRING];                                            \
-      MPI_Error_string(status, mpi_error_string, &mpi_error_string_lenght);                   \
-      RAFT_EXPECTS(                                                                           \
-        MPI_SUCCESS == status, "ERROR: MPI call='%s'. Reason:%s\n", #call, mpi_error_string); \
-    }                                                                                         \
+#define MPI_TRY(call)                                                          \
+  do {                                                                         \
+    int status = call;                                                         \
+    if (MPI_SUCCESS != status) {                                               \
+      int mpi_error_string_lenght = 0;                                         \
+      char mpi_error_string[MPI_MAX_ERROR_STRING];                             \
+      MPI_Error_string(status, mpi_error_string, &mpi_error_string_lenght);    \
+      RAFT_EXPECTS(MPI_SUCCESS == status, "ERROR: MPI call='%s'. Reason:%s\n", \
+                   #call, mpi_error_string);                                   \
+    }                                                                          \
   } while (0)
 
 #define MPI_TRY_NO_THROW(call)                                              \
@@ -51,41 +51,48 @@
       int mpi_error_string_lenght = 0;                                      \
       char mpi_error_string[MPI_MAX_ERROR_STRING];                          \
       MPI_Error_string(status, mpi_error_string, &mpi_error_string_lenght); \
-      printf("MPI call='%s' at file=%s line=%d failed with %s ",            \
-             #call,                                                         \
-             __FILE__,                                                      \
-             __LINE__,                                                      \
-             mpi_error_string);                                             \
+      printf("MPI call='%s' at file=%s line=%d failed with %s ", #call,     \
+             __FILE__, __LINE__, mpi_error_string);                         \
     }                                                                       \
   } while (0)
 
 namespace raft {
 namespace comms {
 
-constexpr MPI_Datatype get_mpi_datatype(const datatype_t datatype)
-{
+constexpr MPI_Datatype get_mpi_datatype(const datatype_t datatype) {
   switch (datatype) {
-    case datatype_t::CHAR: return MPI_CHAR;
-    case datatype_t::UINT8: return MPI_UNSIGNED_CHAR;
-    case datatype_t::INT32: return MPI_INT;
-    case datatype_t::UINT32: return MPI_UNSIGNED;
-    case datatype_t::INT64: return MPI_LONG_LONG;
-    case datatype_t::UINT64: return MPI_UNSIGNED_LONG_LONG;
-    case datatype_t::FLOAT32: return MPI_FLOAT;
-    case datatype_t::FLOAT64: return MPI_DOUBLE;
+    case datatype_t::CHAR:
+      return MPI_CHAR;
+    case datatype_t::UINT8:
+      return MPI_UNSIGNED_CHAR;
+    case datatype_t::INT32:
+      return MPI_INT;
+    case datatype_t::UINT32:
+      return MPI_UNSIGNED;
+    case datatype_t::INT64:
+      return MPI_LONG_LONG;
+    case datatype_t::UINT64:
+      return MPI_UNSIGNED_LONG_LONG;
+    case datatype_t::FLOAT32:
+      return MPI_FLOAT;
+    case datatype_t::FLOAT64:
+      return MPI_DOUBLE;
     default:
       // Execution should never reach here. This takes care of compiler warning.
       return MPI_DOUBLE;
   }
 }
 
-constexpr MPI_Op get_mpi_op(const op_t op)
-{
+constexpr MPI_Op get_mpi_op(const op_t op) {
   switch (op) {
-    case op_t::SUM: return MPI_SUM;
-    case op_t::PROD: return MPI_PROD;
-    case op_t::MIN: return MPI_MIN;
-    case op_t::MAX: return MPI_MAX;
+    case op_t::SUM:
+      return MPI_SUM;
+    case op_t::PROD:
+      return MPI_PROD;
+    case op_t::MIN:
+      return MPI_MIN;
+    case op_t::MAX:
+      return MPI_MAX;
     default:
       // Execution should never reach here. This takes care of compiler warning.
       return MPI_MAX;
@@ -95,35 +102,38 @@ constexpr MPI_Op get_mpi_op(const op_t op)
 class mpi_comms : public comms_iface {
  public:
   mpi_comms(MPI_Comm comm, const bool owns_mpi_comm)
-    : owns_mpi_comm_(owns_mpi_comm), mpi_comm_(comm), size_(0), rank_(1), next_request_id_(0)
-  {
+    : owns_mpi_comm_(owns_mpi_comm),
+      mpi_comm_(comm),
+      size_(0),
+      rank_(1),
+      next_request_id_(0) {
     int mpi_is_initialized = 0;
     MPI_TRY(MPI_Initialized(&mpi_is_initialized));
     RAFT_EXPECTS(mpi_is_initialized, "ERROR: MPI is not initialized!");
     MPI_TRY(MPI_Comm_size(mpi_comm_, &size_));
     MPI_TRY(MPI_Comm_rank(mpi_comm_, &rank_));
-    // get NCCL unique ID at rank 0 and broadcast it to all others
+    //get NCCL unique ID at rank 0 and broadcast it to all others
     ncclUniqueId id;
     if (0 == rank_) NCCL_TRY(ncclGetUniqueId(&id));
     MPI_TRY(MPI_Bcast((void*)&id, sizeof(id), MPI_BYTE, 0, mpi_comm_));
 
-    // initializing NCCL
+    //initializing NCCL
     NCCL_TRY(ncclCommInitRank(&nccl_comm_, size_, id, rank_));
   }
 
-  virtual ~mpi_comms()
-  {
-    // finalizing NCCL
+  virtual ~mpi_comms() {
+    //finalizing NCCL
     NCCL_TRY_NO_THROW(ncclCommDestroy(nccl_comm_));
-    if (owns_mpi_comm_) { MPI_TRY_NO_THROW(MPI_Comm_free(&mpi_comm_)); }
+    if (owns_mpi_comm_) {
+      MPI_TRY_NO_THROW(MPI_Comm_free(&mpi_comm_));
+    }
   }
 
   int get_size() const { return size_; }
 
   int get_rank() const { return rank_; }
 
-  std::unique_ptr<comms_iface> comm_split(int color, int key) const
-  {
+  std::unique_ptr<comms_iface> comm_split(int color, int key) const {
     MPI_Comm new_comm;
     MPI_TRY(MPI_Comm_split(mpi_comm_, color, key, &new_comm));
     return std::unique_ptr<comms_iface>(new mpi_comms(new_comm, true));
@@ -131,15 +141,15 @@ class mpi_comms : public comms_iface {
 
   void barrier() const { MPI_TRY(MPI_Barrier(mpi_comm_)); }
 
-  void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const
-  {
+  void isend(const void* buf, size_t size, int dest, int tag,
+             request_t* request) const {
     MPI_Request mpi_req;
     request_t req_id;
     if (free_requests_.empty()) {
       req_id = next_request_id_++;
     } else {
       auto it = free_requests_.begin();
-      req_id  = *it;
+      req_id = *it;
       free_requests_.erase(it);
     }
     MPI_TRY(MPI_Isend(buf, size, MPI_BYTE, dest, tag, mpi_comm_, &mpi_req));
@@ -147,15 +157,15 @@ class mpi_comms : public comms_iface {
     *request = req_id;
   }
 
-  void irecv(void* buf, size_t size, int source, int tag, request_t* request) const
-  {
+  void irecv(void* buf, size_t size, int source, int tag,
+             request_t* request) const {
     MPI_Request mpi_req;
     request_t req_id;
     if (free_requests_.empty()) {
       req_id = next_request_id_++;
     } else {
       auto it = free_requests_.begin();
-      req_id  = *it;
+      req_id = *it;
       free_requests_.erase(it);
     }
 
@@ -164,8 +174,7 @@ class mpi_comms : public comms_iface {
     *request = req_id;
   }
 
-  void waitall(int count, request_t array_of_requests[]) const
-  {
+  void waitall(int count, request_t array_of_requests[]) const {
     std::vector<MPI_Request> requests;
     requests.reserve(count);
     for (int i = 0; i < count; ++i) {
@@ -180,138 +189,87 @@ class mpi_comms : public comms_iface {
     MPI_TRY(MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE));
   }
 
-  void allreduce(const void* sendbuff,
-                 void* recvbuff,
-                 size_t count,
-                 datatype_t datatype,
-                 op_t op,
-                 cudaStream_t stream) const
-  {
-    NCCL_TRY(ncclAllReduce(
-      sendbuff, recvbuff, count, get_nccl_datatype(datatype), get_nccl_op(op), nccl_comm_, stream));
+  void allreduce(const void* sendbuff, void* recvbuff, size_t count,
+                 datatype_t datatype, op_t op, cudaStream_t stream) const {
+    NCCL_TRY(ncclAllReduce(sendbuff, recvbuff, count,
+                           get_nccl_datatype(datatype), get_nccl_op(op),
+                           nccl_comm_, stream));
   }
 
-  void bcast(void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const
-  {
-    NCCL_TRY(
-      ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, nccl_comm_, stream));
+  void bcast(void* buff, size_t count, datatype_t datatype, int root,
+             cudaStream_t stream) const {
+    NCCL_TRY(ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root,
+                           nccl_comm_, stream));
   }
 
-  void reduce(const void* sendbuff,
-              void* recvbuff,
-              size_t count,
-              datatype_t datatype,
-              op_t op,
-              int root,
-              cudaStream_t stream) const
-  {
-    NCCL_TRY(ncclReduce(sendbuff,
-                        recvbuff,
-                        count,
-                        get_nccl_datatype(datatype),
-                        get_nccl_op(op),
-                        root,
-                        nccl_comm_,
-                        stream));
+  void reduce(const void* sendbuff, void* recvbuff, size_t count,
+              datatype_t datatype, op_t op, int root,
+              cudaStream_t stream) const {
+    NCCL_TRY(ncclReduce(sendbuff, recvbuff, count, get_nccl_datatype(datatype),
+                        get_nccl_op(op), root, nccl_comm_, stream));
   }
 
-  void allgather(const void* sendbuff,
-                 void* recvbuff,
-                 size_t sendcount,
-                 datatype_t datatype,
-                 cudaStream_t stream) const
-  {
-    NCCL_TRY(ncclAllGather(
-      sendbuff, recvbuff, sendcount, get_nccl_datatype(datatype), nccl_comm_, stream));
+  void allgather(const void* sendbuff, void* recvbuff, size_t sendcount,
+                 datatype_t datatype, cudaStream_t stream) const {
+    NCCL_TRY(ncclAllGather(sendbuff, recvbuff, sendcount,
+                           get_nccl_datatype(datatype), nccl_comm_, stream));
   }
 
-  void allgatherv(const void* sendbuf,
-                  void* recvbuf,
-                  const size_t* recvcounts,
-                  const size_t* displs,
-                  datatype_t datatype,
-                  cudaStream_t stream) const
-  {
-    // From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" -
-    // https://arxiv.org/pdf/1812.05964.pdf Listing 1 on page 4.
+  void allgatherv(const void* sendbuf, void* recvbuf, const size_t* recvcounts,
+                  const size_t* displs, datatype_t datatype,
+                  cudaStream_t stream) const {
+    //From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - https://arxiv.org/pdf/1812.05964.pdf
+    //Listing 1 on page 4.
     for (int root = 0; root < size_; ++root) {
-      NCCL_TRY(
-        ncclBroadcast(sendbuf,
-                      static_cast<char*>(recvbuf) + displs[root] * get_datatype_size(datatype),
-                      recvcounts[root],
-                      get_nccl_datatype(datatype),
-                      root,
-                      nccl_comm_,
-                      stream));
+      NCCL_TRY(ncclBroadcast(sendbuf,
+                             static_cast<char*>(recvbuf) +
+                               displs[root] * get_datatype_size(datatype),
+                             recvcounts[root], get_nccl_datatype(datatype),
+                             root, nccl_comm_, stream));
     }
   }
 
-  void gather(const void* sendbuff,
-              void* recvbuff,
-              size_t sendcount,
-              datatype_t datatype,
-              int root,
-              cudaStream_t stream) const
-  {
+  void gather(const void* sendbuff, void* recvbuff, size_t sendcount,
+              datatype_t datatype, int root, cudaStream_t stream) const {
     size_t dtype_size = get_datatype_size(datatype);
     NCCL_TRY(ncclGroupStart());
     if (get_rank() == root) {
       for (int r = 0; r < get_size(); ++r) {
-        NCCL_TRY(ncclRecv(static_cast<char*>(recvbuff) + sendcount * r * dtype_size,
-                          sendcount,
-                          get_nccl_datatype(datatype),
-                          r,
-                          nccl_comm_,
-                          stream));
+        NCCL_TRY(ncclRecv(
+          static_cast<char*>(recvbuff) + sendcount * r * dtype_size, sendcount,
+          get_nccl_datatype(datatype), r, nccl_comm_, stream));
       }
     }
-    NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream));
+    NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root,
+                      nccl_comm_, stream));
     NCCL_TRY(ncclGroupEnd());
   }
 
-  void gatherv(const void* sendbuff,
-               void* recvbuff,
-               size_t sendcount,
-               const size_t* recvcounts,
-               const size_t* displs,
-               datatype_t datatype,
-               int root,
-               cudaStream_t stream) const
-  {
+  void gatherv(const void* sendbuff, void* recvbuff, size_t sendcount,
+               const size_t* recvcounts, const size_t* displs,
+               datatype_t datatype, int root, cudaStream_t stream) const {
     size_t dtype_size = get_datatype_size(datatype);
     NCCL_TRY(ncclGroupStart());
     if (get_rank() == root) {
       for (int r = 0; r < get_size(); ++r) {
         NCCL_TRY(ncclRecv(static_cast<char*>(recvbuff) + displs[r] * dtype_size,
-                          recvcounts[r],
-                          get_nccl_datatype(datatype),
-                          r,
-                          nccl_comm_,
-                          stream));
+                          recvcounts[r], get_nccl_datatype(datatype), r,
+                          nccl_comm_, stream));
       }
     }
-    NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream));
+    NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root,
+                      nccl_comm_, stream));
     NCCL_TRY(ncclGroupEnd());
   }
 
-  void reducescatter(const void* sendbuff,
-                     void* recvbuff,
-                     size_t recvcount,
-                     datatype_t datatype,
-                     op_t op,
-                     cudaStream_t stream) const
-  {
-    NCCL_TRY(ncclReduceScatter(sendbuff,
-                               recvbuff,
-                               recvcount,
-                               get_nccl_datatype(datatype),
-                               get_nccl_op(op),
-                               nccl_comm_,
-                               stream));
+  void reducescatter(const void* sendbuff, void* recvbuff, size_t recvcount,
+                     datatype_t datatype, op_t op, cudaStream_t stream) const {
+    NCCL_TRY(ncclReduceScatter(sendbuff, recvbuff, recvcount,
+                               get_nccl_datatype(datatype), get_nccl_op(op),
+                               nccl_comm_, stream));
   }
 
-  status_t sync_stream(cudaStream_t stream) const
-  {
+  status_t sync_stream(cudaStream_t stream) const {
     cudaError_t cudaErr;
     ncclResult_t ncclErr, ncclAsyncErr;
     while (1) {
@@ -344,58 +302,45 @@ class mpi_comms : public comms_iface {
   };
 
   // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock
-  void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const
-  {
+  void device_send(const void* buf, size_t size, int dest,
+                   cudaStream_t stream) const {
     NCCL_TRY(ncclSend(buf, size, ncclUint8, dest, nccl_comm_, stream));
   }
 
   // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock
-  void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const
-  {
+  void device_recv(void* buf, size_t size, int source,
+                   cudaStream_t stream) const {
     NCCL_TRY(ncclRecv(buf, size, ncclUint8, source, nccl_comm_, stream));
   }
 
-  void device_sendrecv(const void* sendbuf,
-                       size_t sendsize,
-                       int dest,
-                       void* recvbuf,
-                       size_t recvsize,
-                       int source,
-                       cudaStream_t stream) const
-  {
+  void device_sendrecv(const void* sendbuf, size_t sendsize, int dest,
+                       void* recvbuf, size_t recvsize, int source,
+                       cudaStream_t stream) const {
     // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock
     NCCL_TRY(ncclGroupStart());
     NCCL_TRY(ncclSend(sendbuf, sendsize, ncclUint8, dest, nccl_comm_, stream));
-    NCCL_TRY(ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream));
+    NCCL_TRY(
+      ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream));
     NCCL_TRY(ncclGroupEnd());
   }
 
   void device_multicast_sendrecv(const void* sendbuf,
                                  std::vector<size_t> const& sendsizes,
                                  std::vector<size_t> const& sendoffsets,
-                                 std::vector<int> const& dests,
-                                 void* recvbuf,
+                                 std::vector<int> const& dests, void* recvbuf,
                                  std::vector<size_t> const& recvsizes,
                                  std::vector<size_t> const& recvoffsets,
                                  std::vector<int> const& sources,
-                                 cudaStream_t stream) const
-  {
+                                 cudaStream_t stream) const {
     // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock
     NCCL_TRY(ncclGroupStart());
     for (size_t i = 0; i < sendsizes.size(); ++i) {
       NCCL_TRY(ncclSend(static_cast<const char*>(sendbuf) + sendoffsets[i],
-                        sendsizes[i],
-                        ncclUint8,
-                        dests[i],
-                        nccl_comm_,
-                        stream));
+                        sendsizes[i], ncclUint8, dests[i], nccl_comm_, stream));
     }
     for (size_t i = 0; i < recvsizes.size(); ++i) {
       NCCL_TRY(ncclRecv(static_cast<char*>(recvbuf) + recvoffsets[i],
-                        recvsizes[i],
-                        ncclUint8,
-                        sources[i],
-                        nccl_comm_,
+                        recvsizes[i], ncclUint8, sources[i], nccl_comm_,
                         stream));
     }
     NCCL_TRY(ncclGroupEnd());
@@ -413,10 +358,9 @@ class mpi_comms : public comms_iface {
   mutable std::unordered_set<request_t> free_requests_;
 };
 
-inline void initialize_mpi_comms(handle_t* handle, MPI_Comm comm)
-{
-  auto communicator =
-    std::make_shared<comms_t>(std::unique_ptr<comms_iface>(new mpi_comms(comm, true)));
+inline void initialize_mpi_comms(handle_t* handle, MPI_Comm comm) {
+  auto communicator = std::make_shared<comms_t>(
+    std::unique_ptr<comms_iface>(new mpi_comms(comm, true)));
   handle->set_comms(communicator);
 };
 
diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp
index 5f80328d3f..765e8741bb 100644
--- a/cpp/include/raft/comms/std_comms.hpp
+++ b/cpp/include/raft/comms/std_comms.hpp
@@ -62,14 +62,10 @@ class std_comms : public comms_iface {
    * @param size size of the cluster
    * @param rank rank of the current worker
    */
-  std_comms(ncclComm_t nccl_comm,
-            ucp_worker_h ucp_worker,
-            std::shared_ptr<ucp_ep_h*> eps,
-            int num_ranks,
-            int rank,
+  std_comms(ncclComm_t nccl_comm, ucp_worker_h ucp_worker,
+            std::shared_ptr<ucp_ep_h *> eps, int num_ranks, int rank,
             const std::shared_ptr<mr::device::allocator> device_allocator,
-            cudaStream_t stream,
-            bool subcomms_ucp = true)
+            cudaStream_t stream, bool subcomms_ucp = true)
     : nccl_comm_(nccl_comm),
       stream_(stream),
       num_ranks_(num_ranks),
@@ -78,8 +74,7 @@ class std_comms : public comms_iface {
       ucp_worker_(ucp_worker),
       ucp_eps_(eps),
       next_request_id_(0),
-      device_allocator_(device_allocator)
-  {
+      device_allocator_(device_allocator) {
     initialize();
   };
 
@@ -89,9 +84,7 @@ class std_comms : public comms_iface {
    * @param size size of the cluster
    * @param rank rank of the current worker
    */
-  std_comms(const ncclComm_t nccl_comm,
-            int num_ranks,
-            int rank,
+  std_comms(const ncclComm_t nccl_comm, int num_ranks, int rank,
             const std::shared_ptr<mr::device::allocator> device_allocator,
             cudaStream_t stream)
     : nccl_comm_(nccl_comm),
@@ -99,37 +92,37 @@ class std_comms : public comms_iface {
       num_ranks_(num_ranks),
       rank_(rank),
       subcomms_ucp_(false),
-      device_allocator_(device_allocator)
-  {
+      device_allocator_(device_allocator) {
     initialize();
   };
 
-  virtual ~std_comms()
-  {
+  virtual ~std_comms() {
     device_allocator_->deallocate(sendbuff_, sizeof(int), stream_);
     device_allocator_->deallocate(recvbuff_, sizeof(int), stream_);
   }
 
-  void initialize()
-  {
-    sendbuff_ = reinterpret_cast<int*>(device_allocator_->allocate(sizeof(int), stream_));
-    recvbuff_ = reinterpret_cast<int*>(device_allocator_->allocate(sizeof(int), stream_));
+  void initialize() {
+    sendbuff_ = reinterpret_cast<int *>(
+      device_allocator_->allocate(sizeof(int), stream_));
+    recvbuff_ = reinterpret_cast<int *>(
+      device_allocator_->allocate(sizeof(int), stream_));
   }
 
   int get_size() const { return num_ranks_; }
 
   int get_rank() const { return rank_; }
 
-  std::unique_ptr<comms_iface> comm_split(int color, int key) const
-  {
+  std::unique_ptr<comms_iface> comm_split(int color, int key) const {
     mr::device::buffer<int> d_colors(device_allocator_, stream_, get_size());
     mr::device::buffer<int> d_keys(device_allocator_, stream_, get_size());
 
     update_device(d_colors.data() + get_rank(), &color, 1, stream_);
     update_device(d_keys.data() + get_rank(), &key, 1, stream_);
 
-    allgather(d_colors.data() + get_rank(), d_colors.data(), 1, datatype_t::INT32, stream_);
-    allgather(d_keys.data() + get_rank(), d_keys.data(), 1, datatype_t::INT32, stream_);
+    allgather(d_colors.data() + get_rank(), d_colors.data(), 1,
+              datatype_t::INT32, stream_);
+    allgather(d_keys.data() + get_rank(), d_keys.data(), 1, datatype_t::INT32,
+              stream_);
     this->sync_stream(stream_);
 
     std::vector<int> h_colors(get_size());
@@ -146,7 +139,9 @@ class std_comms : public comms_iface {
     for (int i = 0; i < get_size(); ++i) {
       if (h_colors[i] == color) {
         subcomm_ranks.push_back(i);
-        if (ucp_worker_ != nullptr && subcomms_ucp_) { new_ucx_ptrs.push_back((*ucp_eps_)[i]); }
+        if (ucp_worker_ != nullptr && subcomms_ucp_) {
+          new_ucx_ptrs.push_back((*ucp_eps_)[i]);
+        }
       }
     }
 
@@ -155,7 +150,8 @@ class std_comms : public comms_iface {
       NCCL_TRY(ncclGetUniqueId(&id));
       std::vector<request_t> requests(subcomm_ranks.size() - 1);
       for (size_t i = 1; i < subcomm_ranks.size(); ++i) {
-        isend(&id, sizeof(ncclUniqueId), subcomm_ranks[i], color, requests.data() + (i - 1));
+        isend(&id, sizeof(ncclUniqueId), subcomm_ranks[i], color,
+              requests.data() + (i - 1));
       }
       waitall(requests.size(), requests.data());
     } else {
@@ -170,23 +166,17 @@ class std_comms : public comms_iface {
     NCCL_TRY(ncclCommInitRank(&nccl_comm, subcomm_ranks.size(), id, key));
 
     if (ucp_worker_ != nullptr && subcomms_ucp_) {
-      auto eps_sp = std::make_shared<ucp_ep_h*>(new_ucx_ptrs.data());
-      return std::unique_ptr<comms_iface>(new std_comms(nccl_comm,
-                                                        (ucp_worker_h)ucp_worker_,
-                                                        eps_sp,
-                                                        subcomm_ranks.size(),
-                                                        key,
-                                                        device_allocator_,
-                                                        stream_,
-                                                        subcomms_ucp_));
+      auto eps_sp = std::make_shared<ucp_ep_h *>(new_ucx_ptrs.data());
+      return std::unique_ptr<comms_iface>(new std_comms(
+        nccl_comm, (ucp_worker_h)ucp_worker_, eps_sp, subcomm_ranks.size(), key,
+        device_allocator_, stream_, subcomms_ucp_));
     } else {
-      return std::unique_ptr<comms_iface>(
-        new std_comms(nccl_comm, subcomm_ranks.size(), key, device_allocator_, stream_));
+      return std::unique_ptr<comms_iface>(new std_comms(
+        nccl_comm, subcomm_ranks.size(), key, device_allocator_, stream_));
     }
   }
 
-  void barrier() const
-  {
+  void barrier() const {
     CUDA_CHECK(cudaMemsetAsync(sendbuff_, 1, sizeof(int), stream_));
     CUDA_CHECK(cudaMemsetAsync(recvbuff_, 1, sizeof(int), stream_));
 
@@ -196,37 +186,39 @@ class std_comms : public comms_iface {
            "ERROR: syncStream failed. This can be caused by a failed rank_.");
   }
 
-  void get_request_id(request_t* req) const
-  {
+  void get_request_id(request_t *req) const {
     request_t req_id;
 
     if (this->free_requests_.empty())
       req_id = this->next_request_id_++;
     else {
       auto it = this->free_requests_.begin();
-      req_id  = *it;
+      req_id = *it;
       this->free_requests_.erase(it);
     }
     *req = req_id;
   }
 
-  void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const
-  {
-    ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator.");
+  void isend(const void *buf, size_t size, int dest, int tag,
+             request_t *request) const {
+    ASSERT(ucp_worker_ != nullptr,
+           "ERROR: UCX comms not initialized on communicator.");
 
     get_request_id(request);
     ucp_ep_h ep_ptr = (*ucp_eps_)[dest];
 
-    ucp_request* ucp_req = (ucp_request*)malloc(sizeof(ucp_request));
+    ucp_request *ucp_req = (ucp_request *)malloc(sizeof(ucp_request));
 
-    this->ucp_handler_.ucp_isend(ucp_req, ep_ptr, buf, size, tag, default_tag_mask, get_rank());
+    this->ucp_handler_.ucp_isend(ucp_req, ep_ptr, buf, size, tag,
+                                 default_tag_mask, get_rank());
 
     requests_in_flight_.insert(std::make_pair(*request, ucp_req));
   }
 
-  void irecv(void* buf, size_t size, int source, int tag, request_t* request) const
-  {
-    ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator.");
+  void irecv(void *buf, size_t size, int source, int tag,
+             request_t *request) const {
+    ASSERT(ucp_worker_ != nullptr,
+           "ERROR: UCX comms not initialized on communicator.");
 
     get_request_id(request);
 
@@ -234,17 +226,18 @@ class std_comms : public comms_iface {
 
     ucp_tag_t tag_mask = default_tag_mask;
 
-    ucp_request* ucp_req = (ucp_request*)malloc(sizeof(ucp_request));
-    ucp_handler_.ucp_irecv(ucp_req, ucp_worker_, ep_ptr, buf, size, tag, tag_mask, source);
+    ucp_request *ucp_req = (ucp_request *)malloc(sizeof(ucp_request));
+    ucp_handler_.ucp_irecv(ucp_req, ucp_worker_, ep_ptr, buf, size, tag,
+                           tag_mask, source);
 
     requests_in_flight_.insert(std::make_pair(*request, ucp_req));
   }
 
-  void waitall(int count, request_t array_of_requests[]) const
-  {
-    ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator.");
+  void waitall(int count, request_t array_of_requests[]) const {
+    ASSERT(ucp_worker_ != nullptr,
+           "ERROR: UCX comms not initialized on communicator.");
 
-    std::vector<ucp_request*> requests;
+    std::vector<ucp_request *> requests;
     requests.reserve(count);
 
     time_t start = time(NULL);
@@ -252,8 +245,7 @@ class std_comms : public comms_iface {
     for (int i = 0; i < count; ++i) {
       auto req_it = requests_in_flight_.find(array_of_requests[i]);
       ASSERT(requests_in_flight_.end() != req_it,
-             "ERROR: waitall on invalid request: %d",
-             array_of_requests[i]);
+             "ERROR: waitall on invalid request: %d", array_of_requests[i]);
       requests.push_back(req_it->second);
       free_requests_.insert(req_it->first);
       requests_in_flight_.erase(req_it);
@@ -266,7 +258,8 @@ class std_comms : public comms_iface {
       // in 10 or more seconds.
       ASSERT(now - start < 10, "Timed out waiting for requests.");
 
-      for (std::vector<ucp_request*>::iterator it = requests.begin(); it != requests.end();) {
+      for (std::vector<ucp_request *>::iterator it = requests.begin();
+           it != requests.end();) {
         bool restart = false;  // resets the timeout when any progress was made
 
         // Causes UCP to progress through the send/recv message queue
@@ -279,8 +272,10 @@ class std_comms : public comms_iface {
         // If the message needs release, we know it will be sent/received
         // asynchronously, so we will need to track and verify its state
         if (req->needs_release) {
-          ASSERT(UCS_PTR_IS_PTR(req->req), "UCX Request Error. Request is not valid UCX pointer");
-          ASSERT(!UCS_PTR_IS_ERR(req->req), "UCX Request Error: %d\n", UCS_PTR_STATUS(req->req));
+          ASSERT(UCS_PTR_IS_PTR(req->req),
+                 "UCX Request Error. Request is not valid UCX pointer");
+          ASSERT(!UCS_PTR_IS_ERR(req->req), "UCX Request Error: %d\n",
+                 UCS_PTR_STATUS(req->req));
           ASSERT(req->req->completed == 1 || req->req->completed == 0,
                  "request->completed not a valid value: %d\n",
                  req->req->completed);
@@ -301,143 +296,94 @@ class std_comms : public comms_iface {
           ++it;
         }
         // if any progress was made, reset the timeout start time
-        if (restart) { start = time(NULL); }
+        if (restart) {
+          start = time(NULL);
+        }
       }
     }
   }
 
-  void allreduce(const void* sendbuff,
-                 void* recvbuff,
-                 size_t count,
-                 datatype_t datatype,
-                 op_t op,
-                 cudaStream_t stream) const
-  {
-    NCCL_TRY(ncclAllReduce(
-      sendbuff, recvbuff, count, get_nccl_datatype(datatype), get_nccl_op(op), nccl_comm_, stream));
+  void allreduce(const void *sendbuff, void *recvbuff, size_t count,
+                 datatype_t datatype, op_t op, cudaStream_t stream) const {
+    NCCL_TRY(ncclAllReduce(sendbuff, recvbuff, count,
+                           get_nccl_datatype(datatype), get_nccl_op(op),
+                           nccl_comm_, stream));
   }
 
-  void bcast(void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const
-  {
-    NCCL_TRY(
-      ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, nccl_comm_, stream));
+  void bcast(void *buff, size_t count, datatype_t datatype, int root,
+             cudaStream_t stream) const {
+    NCCL_TRY(ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root,
+                           nccl_comm_, stream));
   }
 
-  void reduce(const void* sendbuff,
-              void* recvbuff,
-              size_t count,
-              datatype_t datatype,
-              op_t op,
-              int root,
-              cudaStream_t stream) const
-  {
-    NCCL_TRY(ncclReduce(sendbuff,
-                        recvbuff,
-                        count,
-                        get_nccl_datatype(datatype),
-                        get_nccl_op(op),
-                        root,
-                        nccl_comm_,
-                        stream));
+  void reduce(const void *sendbuff, void *recvbuff, size_t count,
+              datatype_t datatype, op_t op, int root,
+              cudaStream_t stream) const {
+    NCCL_TRY(ncclReduce(sendbuff, recvbuff, count, get_nccl_datatype(datatype),
+                        get_nccl_op(op), root, nccl_comm_, stream));
   }
 
-  void allgather(const void* sendbuff,
-                 void* recvbuff,
-                 size_t sendcount,
-                 datatype_t datatype,
-                 cudaStream_t stream) const
-  {
-    NCCL_TRY(ncclAllGather(
-      sendbuff, recvbuff, sendcount, get_nccl_datatype(datatype), nccl_comm_, stream));
+  void allgather(const void *sendbuff, void *recvbuff, size_t sendcount,
+                 datatype_t datatype, cudaStream_t stream) const {
+    NCCL_TRY(ncclAllGather(sendbuff, recvbuff, sendcount,
+                           get_nccl_datatype(datatype), nccl_comm_, stream));
   }
 
-  void allgatherv(const void* sendbuf,
-                  void* recvbuf,
-                  const size_t* recvcounts,
-                  const size_t* displs,
-                  datatype_t datatype,
-                  cudaStream_t stream) const
-  {
-    // From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" -
-    // https://arxiv.org/pdf/1812.05964.pdf Listing 1 on page 4.
+  void allgatherv(const void *sendbuf, void *recvbuf, const size_t *recvcounts,
+                  const size_t *displs, datatype_t datatype,
+                  cudaStream_t stream) const {
+    //From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - https://arxiv.org/pdf/1812.05964.pdf
+    //Listing 1 on page 4.
     for (int root = 0; root < num_ranks_; ++root) {
       size_t dtype_size = get_datatype_size(datatype);
-      NCCL_TRY(ncclBroadcast(sendbuf,
-                             static_cast<char*>(recvbuf) + displs[root] * dtype_size,
-                             recvcounts[root],
-                             get_nccl_datatype(datatype),
-                             root,
-                             nccl_comm_,
-                             stream));
+      NCCL_TRY(ncclBroadcast(
+        sendbuf, static_cast<char *>(recvbuf) + displs[root] * dtype_size,
+        recvcounts[root], get_nccl_datatype(datatype), root, nccl_comm_,
+        stream));
     }
   }
 
-  void gather(const void* sendbuff,
-              void* recvbuff,
-              size_t sendcount,
-              datatype_t datatype,
-              int root,
-              cudaStream_t stream) const
-  {
+  void gather(const void *sendbuff, void *recvbuff, size_t sendcount,
+              datatype_t datatype, int root, cudaStream_t stream) const {
     size_t dtype_size = get_datatype_size(datatype);
     NCCL_TRY(ncclGroupStart());
     if (get_rank() == root) {
       for (int r = 0; r < get_size(); ++r) {
-        NCCL_TRY(ncclRecv(static_cast<char*>(recvbuff) + sendcount * r * dtype_size,
-                          sendcount,
-                          get_nccl_datatype(datatype),
-                          r,
-                          nccl_comm_,
-                          stream));
+        NCCL_TRY(ncclRecv(
+          static_cast<char *>(recvbuff) + sendcount * r * dtype_size, sendcount,
+          get_nccl_datatype(datatype), r, nccl_comm_, stream));
       }
     }
-    NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream));
+    NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root,
+                      nccl_comm_, stream));
     NCCL_TRY(ncclGroupEnd());
   }
 
-  void gatherv(const void* sendbuff,
-               void* recvbuff,
-               size_t sendcount,
-               const size_t* recvcounts,
-               const size_t* displs,
-               datatype_t datatype,
-               int root,
-               cudaStream_t stream) const
-  {
+  void gatherv(const void *sendbuff, void *recvbuff, size_t sendcount,
+               const size_t *recvcounts, const size_t *displs,
+               datatype_t datatype, int root, cudaStream_t stream) const {
     size_t dtype_size = get_datatype_size(datatype);
     NCCL_TRY(ncclGroupStart());
     if (get_rank() == root) {
       for (int r = 0; r < get_size(); ++r) {
-        NCCL_TRY(ncclRecv(static_cast<char*>(recvbuff) + displs[r] * dtype_size,
-                          recvcounts[r],
-                          get_nccl_datatype(datatype),
-                          r,
-                          nccl_comm_,
-                          stream));
+        NCCL_TRY(ncclRecv(
+          static_cast<char *>(recvbuff) + displs[r] * dtype_size, recvcounts[r],
+          get_nccl_datatype(datatype), r, nccl_comm_, stream));
       }
     }
-    NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream));
+    NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root,
+                      nccl_comm_, stream));
     NCCL_TRY(ncclGroupEnd());
   }
 
-  void reducescatter(const void* sendbuff,
-                     void* recvbuff,
-                     size_t recvcount,
-                     datatype_t datatype,
-                     op_t op,
-                     cudaStream_t stream) const
-  {
-    NCCL_TRY(ncclReduceScatter(sendbuff,
-                               recvbuff,
-                               recvcount,
-                               get_nccl_datatype(datatype),
-                               get_nccl_op(op),
-                               nccl_comm_,
-                               stream));
+  void reducescatter(const void *sendbuff, void *recvbuff, size_t recvcount,
+                     datatype_t datatype, op_t op, cudaStream_t stream) const {
+    NCCL_TRY(ncclReduceScatter(sendbuff, recvbuff, recvcount,
+                               get_nccl_datatype(datatype), get_nccl_op(op),
+                               nccl_comm_, stream));
   }
 
-  status_t sync_stream(cudaStream_t stream) const
-  {
+  status_t sync_stream(cudaStream_t stream) const {
     cudaError_t cudaErr;
     ncclResult_t ncclErr, ncclAsyncErr;
     while (1) {
@@ -470,58 +416,45 @@ class std_comms : public comms_iface {
   }
 
   // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock
-  void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const
-  {
+  void device_send(const void *buf, size_t size, int dest,
+                   cudaStream_t stream) const {
     NCCL_TRY(ncclSend(buf, size, ncclUint8, dest, nccl_comm_, stream));
   }
 
   // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock
-  void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const
-  {
+  void device_recv(void *buf, size_t size, int source,
+                   cudaStream_t stream) const {
     NCCL_TRY(ncclRecv(buf, size, ncclUint8, source, nccl_comm_, stream));
   }
 
-  void device_sendrecv(const void* sendbuf,
-                       size_t sendsize,
-                       int dest,
-                       void* recvbuf,
-                       size_t recvsize,
-                       int source,
-                       cudaStream_t stream) const
-  {
+  void device_sendrecv(const void *sendbuf, size_t sendsize, int dest,
+                       void *recvbuf, size_t recvsize, int source,
+                       cudaStream_t stream) const {
     // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock
     NCCL_TRY(ncclGroupStart());
     NCCL_TRY(ncclSend(sendbuf, sendsize, ncclUint8, dest, nccl_comm_, stream));
-    NCCL_TRY(ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream));
+    NCCL_TRY(
+      ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream));
     NCCL_TRY(ncclGroupEnd());
   }
 
-  void device_multicast_sendrecv(const void* sendbuf,
-                                 std::vector<size_t> const& sendsizes,
-                                 std::vector<size_t> const& sendoffsets,
-                                 std::vector<int> const& dests,
-                                 void* recvbuf,
-                                 std::vector<size_t> const& recvsizes,
-                                 std::vector<size_t> const& recvoffsets,
-                                 std::vector<int> const& sources,
-                                 cudaStream_t stream) const
-  {
+  void device_multicast_sendrecv(const void *sendbuf,
+                                 std::vector<size_t> const &sendsizes,
+                                 std::vector<size_t> const &sendoffsets,
+                                 std::vector<int> const &dests, void *recvbuf,
+                                 std::vector<size_t> const &recvsizes,
+                                 std::vector<size_t> const &recvoffsets,
+                                 std::vector<int> const &sources,
+                                 cudaStream_t stream) const {
     // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock
     NCCL_TRY(ncclGroupStart());
     for (size_t i = 0; i < sendsizes.size(); ++i) {
-      NCCL_TRY(ncclSend(static_cast<const char*>(sendbuf) + sendoffsets[i],
-                        sendsizes[i],
-                        ncclUint8,
-                        dests[i],
-                        nccl_comm_,
-                        stream));
+      NCCL_TRY(ncclSend(static_cast<const char *>(sendbuf) + sendoffsets[i],
+                        sendsizes[i], ncclUint8, dests[i], nccl_comm_, stream));
     }
     for (size_t i = 0; i < recvsizes.size(); ++i) {
-      NCCL_TRY(ncclRecv(static_cast<char*>(recvbuf) + recvoffsets[i],
-                        recvsizes[i],
-                        ncclUint8,
-                        sources[i],
-                        nccl_comm_,
+      NCCL_TRY(ncclRecv(static_cast<char *>(recvbuf) + recvoffsets[i],
+                        recvsizes[i], ncclUint8, sources[i], nccl_comm_,
                         stream));
     }
     NCCL_TRY(ncclGroupEnd());
@@ -540,9 +473,10 @@ class std_comms : public comms_iface {
 
   comms_ucp_handler ucp_handler_;
   ucp_worker_h ucp_worker_;
-  std::shared_ptr<ucp_ep_h*> ucp_eps_;
+  std::shared_ptr<ucp_ep_h *> ucp_eps_;
   mutable request_t next_request_id_;
-  mutable std::unordered_map<request_t, struct ucp_request*> requests_in_flight_;
+  mutable std::unordered_map<request_t, struct ucp_request *>
+    requests_in_flight_;
   mutable std::unordered_set<request_t> free_requests_;
 
   std::shared_ptr<mr::device::allocator> device_allocator_;
diff --git a/cpp/include/raft/comms/test.hpp b/cpp/include/raft/comms/test.hpp
index 86827a294e..4e95c4eef0 100644
--- a/cpp/include/raft/comms/test.hpp
+++ b/cpp/include/raft/comms/test.hpp
@@ -37,9 +37,8 @@ namespace comms {
  * @param the raft handle to use. This is expected to already have an
  *        initialized comms instance.
  */
-bool test_collective_allreduce(const handle_t& handle, int root)
-{
-  comms_t const& communicator = handle.get_comms();
+bool test_collective_allreduce(const handle_t &handle, int root) {
+  comms_t const &communicator = handle.get_comms();
 
   int const send = 1;
 
@@ -47,12 +46,14 @@ bool test_collective_allreduce(const handle_t& handle, int root)
 
   raft::mr::device::buffer<int> temp_d(handle.get_device_allocator(), stream);
   temp_d.resize(1, stream);
-  CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, 1, cudaMemcpyHostToDevice, stream));
+  CUDA_CHECK(
+    cudaMemcpyAsync(temp_d.data(), &send, 1, cudaMemcpyHostToDevice, stream));
 
   communicator.allreduce(temp_d.data(), temp_d.data(), 1, op_t::SUM, stream);
 
   int temp_h = 0;
-  CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), 1, cudaMemcpyDeviceToHost, stream));
+  CUDA_CHECK(
+    cudaMemcpyAsync(&temp_h, temp_d.data(), 1, cudaMemcpyDeviceToHost, stream));
   CUDA_CHECK(cudaStreamSynchronize(stream));
   communicator.barrier();
 
@@ -68,9 +69,8 @@ bool test_collective_allreduce(const handle_t& handle, int root)
  * @param the raft handle to use. This is expected to already have an
  *        initialized comms instance.
  */
-bool test_collective_broadcast(const handle_t& handle, int root)
-{
-  comms_t const& communicator = handle.get_comms();
+bool test_collective_broadcast(const handle_t &handle, int root) {
+  comms_t const &communicator = handle.get_comms();
 
   int const send = root;
 
@@ -80,12 +80,14 @@ bool test_collective_broadcast(const handle_t& handle, int root)
   temp_d.resize(1, stream);
 
   if (communicator.get_rank() == root)
-    CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream));
+    CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int),
+                               cudaMemcpyHostToDevice, stream));
 
   communicator.bcast(temp_d.data(), 1, root, stream);
   communicator.sync_stream(stream);
   int temp_h = -1;  // Verify more than one byte is being sent
-  CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream));
+  CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int),
+                             cudaMemcpyDeviceToHost, stream));
   CUDA_CHECK(cudaStreamSynchronize(stream));
   communicator.barrier();
 
@@ -95,9 +97,8 @@ bool test_collective_broadcast(const handle_t& handle, int root)
   return temp_h == root;
 }
 
-bool test_collective_reduce(const handle_t& handle, int root)
-{
-  comms_t const& communicator = handle.get_comms();
+bool test_collective_reduce(const handle_t &handle, int root) {
+  comms_t const &communicator = handle.get_comms();
 
   int const send = root;
 
@@ -106,12 +107,14 @@ bool test_collective_reduce(const handle_t& handle, int root)
   raft::mr::device::buffer<int> temp_d(handle.get_device_allocator(), stream);
   temp_d.resize(1, stream);
 
-  CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream));
+  CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int),
+                             cudaMemcpyHostToDevice, stream));
 
   communicator.reduce(temp_d.data(), temp_d.data(), 1, op_t::SUM, root, stream);
   communicator.sync_stream(stream);
   int temp_h = -1;  // Verify more than one byte is being sent
-  CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream));
+  CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int),
+                             cudaMemcpyDeviceToHost, stream));
   CUDA_CHECK(cudaStreamSynchronize(stream));
   communicator.barrier();
 
@@ -124,9 +127,8 @@ bool test_collective_reduce(const handle_t& handle, int root)
     return true;
 }
 
-bool test_collective_allgather(const handle_t& handle, int root)
-{
-  comms_t const& communicator = handle.get_comms();
+bool test_collective_allgather(const handle_t &handle, int root) {
+  comms_t const &communicator = handle.get_comms();
 
   int const send = communicator.get_rank();
 
@@ -135,16 +137,19 @@ bool test_collective_allgather(const handle_t& handle, int root)
   raft::mr::device::buffer<int> temp_d(handle.get_device_allocator(), stream);
   temp_d.resize(1, stream);
 
-  raft::mr::device::buffer<int> recv_d(
-    handle.get_device_allocator(), stream, communicator.get_size());
+  raft::mr::device::buffer<int> recv_d(handle.get_device_allocator(), stream,
+                                       communicator.get_size());
 
-  CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream));
+  CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int),
+                             cudaMemcpyHostToDevice, stream));
 
   communicator.allgather(temp_d.data(), recv_d.data(), 1, stream);
   communicator.sync_stream(stream);
-  int temp_h[communicator.get_size()];  // Verify more than one byte is being sent
-  CUDA_CHECK(cudaMemcpyAsync(
-    &temp_h, recv_d.data(), sizeof(int) * communicator.get_size(), cudaMemcpyDeviceToHost, stream));
+  int
+    temp_h[communicator.get_size()];  // Verify more than one byte is being sent
+  CUDA_CHECK(cudaMemcpyAsync(&temp_h, recv_d.data(),
+                             sizeof(int) * communicator.get_size(),
+                             cudaMemcpyDeviceToHost, stream));
   CUDA_CHECK(cudaStreamSynchronize(stream));
   communicator.barrier();
 
@@ -157,9 +162,8 @@ bool test_collective_allgather(const handle_t& handle, int root)
   return true;
 }
 
-bool test_collective_gather(const handle_t& handle, int root)
-{
-  comms_t const& communicator = handle.get_comms();
+bool test_collective_gather(const handle_t &handle, int root) {
+  comms_t const &communicator = handle.get_comms();
 
   int const send = communicator.get_rank();
 
@@ -169,19 +173,20 @@ bool test_collective_gather(const handle_t& handle, int root)
   temp_d.resize(1, stream);
 
   raft::mr::device::buffer<int> recv_d(
-    handle.get_device_allocator(),
-    stream,
+    handle.get_device_allocator(), stream,
     communicator.get_rank() == root ? communicator.get_size() : 0);
 
-  CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream));
+  CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int),
+                             cudaMemcpyHostToDevice, stream));
 
   communicator.gather(temp_d.data(), recv_d.data(), 1, root, stream);
   communicator.sync_stream(stream);
 
   if (communicator.get_rank() == root) {
     std::vector<int> temp_h(communicator.get_size(), 0);
-    CUDA_CHECK(cudaMemcpyAsync(
-      temp_h.data(), recv_d.data(), sizeof(int) * temp_h.size(), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaMemcpyAsync(temp_h.data(), recv_d.data(),
+                               sizeof(int) * temp_h.size(),
+                               cudaMemcpyDeviceToHost, stream));
     CUDA_CHECK(cudaStreamSynchronize(stream));
 
     for (int i = 0; i < communicator.get_size(); i++) {
@@ -191,48 +196,46 @@ bool test_collective_gather(const handle_t& handle, int root)
   return true;
 }
 
-bool test_collective_gatherv(const handle_t& handle, int root)
-{
-  comms_t const& communicator = handle.get_comms();
+bool test_collective_gatherv(const handle_t &handle, int root) {
+  comms_t const &communicator = handle.get_comms();
 
   std::vector<size_t> sendcounts(communicator.get_size());
   std::iota(sendcounts.begin(), sendcounts.end(), size_t{1});
   std::vector<size_t> displacements(communicator.get_size() + 1, 0);
-  std::partial_sum(sendcounts.begin(), sendcounts.end(), displacements.begin() + 1);
+  std::partial_sum(sendcounts.begin(), sendcounts.end(),
+                   displacements.begin() + 1);
 
-  std::vector<int> sends(
-    displacements[communicator.get_rank() + 1] - displacements[communicator.get_rank()],
-    communicator.get_rank());
+  std::vector<int> sends(displacements[communicator.get_rank() + 1] -
+                           displacements[communicator.get_rank()],
+                         communicator.get_rank());
 
   cudaStream_t stream = handle.get_stream();
 
   raft::mr::device::buffer<int> temp_d(handle.get_device_allocator(), stream);
   temp_d.resize(sends.size(), stream);
 
-  raft::mr::device::buffer<int> recv_d(handle.get_device_allocator(),
-                                       stream,
-                                       communicator.get_rank() == root ? displacements.back() : 0);
+  raft::mr::device::buffer<int> recv_d(
+    handle.get_device_allocator(), stream,
+    communicator.get_rank() == root ? displacements.back() : 0);
 
-  CUDA_CHECK(cudaMemcpyAsync(
-    temp_d.data(), sends.data(), sends.size() * sizeof(int), cudaMemcpyHostToDevice, stream));
+  CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(),
+                             sends.size() * sizeof(int), cudaMemcpyHostToDevice,
+                             stream));
 
   communicator.gatherv(
-    temp_d.data(),
-    recv_d.data(),
-    temp_d.size(),
-    communicator.get_rank() == root ? sendcounts.data() : static_cast<size_t*>(nullptr),
-    communicator.get_rank() == root ? displacements.data() : static_cast<size_t*>(nullptr),
-    root,
-    stream);
+    temp_d.data(), recv_d.data(), temp_d.size(),
+    communicator.get_rank() == root ? sendcounts.data()
+                                    : static_cast<size_t *>(nullptr),
+    communicator.get_rank() == root ? displacements.data()
+                                    : static_cast<size_t *>(nullptr),
+    root, stream);
   communicator.sync_stream(stream);
 
   if (communicator.get_rank() == root) {
     std::vector<int> temp_h(displacements.back(), 0);
-    CUDA_CHECK(cudaMemcpyAsync(temp_h.data(),
-                               recv_d.data(),
+    CUDA_CHECK(cudaMemcpyAsync(temp_h.data(), recv_d.data(),
                                sizeof(int) * displacements.back(),
-                               cudaMemcpyDeviceToHost,
-                               stream));
+                               cudaMemcpyDeviceToHost, stream));
     CUDA_CHECK(cudaStreamSynchronize(stream));
 
     for (int i = 0; i < communicator.get_size(); i++) {
@@ -246,24 +249,28 @@ bool test_collective_gatherv(const handle_t& handle, int root)
   return true;
 }
 
-bool test_collective_reducescatter(const handle_t& handle, int root)
-{
-  comms_t const& communicator = handle.get_comms();
+bool test_collective_reducescatter(const handle_t &handle, int root) {
+  comms_t const &communicator = handle.get_comms();
 
   std::vector<int> sends(communicator.get_size(), 1);
 
   cudaStream_t stream = handle.get_stream();
 
-  raft::mr::device::buffer<int> temp_d(handle.get_device_allocator(), stream, sends.size());
-  raft::mr::device::buffer<int> recv_d(handle.get_device_allocator(), stream, 1);
+  raft::mr::device::buffer<int> temp_d(handle.get_device_allocator(), stream,
+                                       sends.size());
+  raft::mr::device::buffer<int> recv_d(handle.get_device_allocator(), stream,
+                                       1);
 
-  CUDA_CHECK(cudaMemcpyAsync(
-    temp_d.data(), sends.data(), sends.size() * sizeof(int), cudaMemcpyHostToDevice, stream));
+  CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(),
+                             sends.size() * sizeof(int), cudaMemcpyHostToDevice,
+                             stream));
 
-  communicator.reducescatter(temp_d.data(), recv_d.data(), 1, op_t::SUM, stream);
+  communicator.reducescatter(temp_d.data(), recv_d.data(), 1, op_t::SUM,
+                             stream);
   communicator.sync_stream(stream);
   int temp_h = -1;  // Verify more than one byte is being sent
-  CUDA_CHECK(cudaMemcpyAsync(&temp_h, recv_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream));
+  CUDA_CHECK(cudaMemcpyAsync(&temp_h, recv_d.data(), sizeof(int),
+                             cudaMemcpyDeviceToHost, stream));
   CUDA_CHECK(cudaStreamSynchronize(stream));
   communicator.barrier();
 
@@ -280,10 +287,9 @@ bool test_collective_reducescatter(const handle_t& handle, int root)
  *        initialized comms instance.
  * @param number of iterations of all-to-all messaging to perform
  */
-bool test_pointToPoint_simple_send_recv(const handle_t& h, int numTrials)
-{
-  comms_t const& communicator = h.get_comms();
-  int const rank              = communicator.get_rank();
+bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) {
+  comms_t const &communicator = h.get_comms();
+  int const rank = communicator.get_rank();
 
   bool ret = true;
   for (int i = 0; i < numTrials; i++) {
@@ -292,11 +298,11 @@ bool test_pointToPoint_simple_send_recv(const handle_t& h, int numTrials)
     std::vector<request_t> requests;
     requests.resize(2 * (communicator.get_size() - 1));
     int request_idx = 0;
-    // post receives
+    // post receives
     for (int r = 0; r < communicator.get_size(); ++r) {
       if (r != rank) {
-        communicator.irecv(
-          received_data.data() + request_idx, 1, r, 0, requests.data() + request_idx);
+        communicator.irecv(received_data.data() + request_idx, 1, r, 0,
+                           requests.data() + request_idx);
         ++request_idx;
       }
     }
@@ -332,7 +338,8 @@ bool test_pointToPoint_simple_send_recv(const handle_t& h, int numTrials)
       communicator.barrier();
     }
 
-    if (communicator.get_rank() == 0) std::cout << "=========================" << std::endl;
+    if (communicator.get_rank() == 0)
+      std::cout << "=========================" << std::endl;
   }
 
   return ret;
@@ -345,11 +352,10 @@ bool test_pointToPoint_simple_send_recv(const handle_t& h, int numTrials)
  *        initialized comms instance.
  * @param number of iterations of send or receive messaging to perform
  */
-bool test_pointToPoint_device_send_or_recv(const handle_t& h, int numTrials)
-{
-  comms_t const& communicator = h.get_comms();
-  int const rank              = communicator.get_rank();
-  cudaStream_t stream         = h.get_stream();
+bool test_pointToPoint_device_send_or_recv(const handle_t &h, int numTrials) {
+  comms_t const &communicator = h.get_comms();
+  int const rank = communicator.get_rank();
+  cudaStream_t stream = h.get_stream();
 
   bool ret = true;
   for (int i = 0; i < numTrials; i++) {
@@ -372,9 +378,13 @@ bool test_pointToPoint_device_send_or_recv(const handle_t& h, int numTrials)
 
     communicator.sync_stream(stream);
 
-    if (!sender && received_data.value(stream) != rank - 1) { ret = false; }
+    if (!sender && received_data.value(stream) != rank - 1) {
+      ret = false;
+    }
 
-    if (communicator.get_rank() == 0) { std::cout << "=========================" << std::endl; }
+    if (communicator.get_rank() == 0) {
+      std::cout << "=========================" << std::endl;
+    }
   }
 
   return ret;
@@ -387,11 +397,10 @@ bool test_pointToPoint_device_send_or_recv(const handle_t& h, int numTrials)
  *        initialized comms instance.
  * @param number of iterations of send or receive messaging to perform
  */
-bool test_pointToPoint_device_sendrecv(const handle_t& h, int numTrials)
-{
-  comms_t const& communicator = h.get_comms();
-  int const rank              = communicator.get_rank();
-  cudaStream_t stream         = h.get_stream();
+bool test_pointToPoint_device_sendrecv(const handle_t &h, int numTrials) {
+  comms_t const &communicator = h.get_comms();
+  int const rank = communicator.get_rank();
+  cudaStream_t stream = h.get_stream();
 
   bool ret = true;
   for (int i = 0; i < numTrials; i++) {
@@ -405,12 +414,12 @@ bool test_pointToPoint_device_sendrecv(const handle_t& h, int numTrials)
 
     if (rank % 2 == 0) {
       if (rank + 1 < communicator.get_size()) {
-        communicator.device_sendrecv(
-          sent_data.data(), 1, rank + 1, received_data.data(), 1, rank + 1, stream);
+        communicator.device_sendrecv(sent_data.data(), 1, rank + 1,
+                                     received_data.data(), 1, rank + 1, stream);
       }
     } else {
-      communicator.device_sendrecv(
-        sent_data.data(), 1, rank - 1, received_data.data(), 1, rank - 1, stream);
+      communicator.device_sendrecv(sent_data.data(), 1, rank - 1,
+                                   received_data.data(), 1, rank - 1, stream);
     }
 
     communicator.sync_stream(stream);
@@ -420,7 +429,9 @@ bool test_pointToPoint_device_sendrecv(const handle_t& h, int numTrials)
       ret = false;
     }
 
-    if (communicator.get_rank() == 0) { std::cout << "=========================" << std::endl; }
+    if (communicator.get_rank() == 0) {
+      std::cout << "=========================" << std::endl;
+    }
   }
 
   return ret;
@@ -433,11 +444,11 @@ bool test_pointToPoint_device_sendrecv(const handle_t& h, int numTrials)
  *        initialized comms instance.
  * @param number of iterations of send or receive messaging to perform
  */
-bool test_pointToPoint_device_multicast_sendrecv(const handle_t& h, int numTrials)
-{
-  comms_t const& communicator = h.get_comms();
-  int const rank              = communicator.get_rank();
-  cudaStream_t stream         = h.get_stream();
+bool test_pointToPoint_device_multicast_sendrecv(const handle_t &h,
+                                                 int numTrials) {
+  comms_t const &communicator = h.get_comms();
+  int const rank = communicator.get_rank();
+  cudaStream_t stream = h.get_stream();
 
   bool ret = true;
   for (int i = 0; i < numTrials; i++) {
@@ -460,26 +471,25 @@ bool test_pointToPoint_device_multicast_sendrecv(const handle_t& h, int numTrial
     std::vector<int> srcs(communicator.get_size());
     std::iota(srcs.begin(), srcs.end(), int{0});
 
-    communicator.device_multicast_sendrecv(sent_data.data(),
-                                           sendsizes,
-                                           sendoffsets,
-                                           dests,
-                                           received_data.data(),
-                                           recvsizes,
-                                           recvoffsets,
-                                           srcs,
-                                           stream);
+    communicator.device_multicast_sendrecv(
+      sent_data.data(), sendsizes, sendoffsets, dests, received_data.data(),
+      recvsizes, recvoffsets, srcs, stream);
 
     communicator.sync_stream(stream);
 
     std::vector<int> h_received_data(communicator.get_size());
-    raft::update_host(h_received_data.data(), received_data.data(), received_data.size(), stream);
+    raft::update_host(h_received_data.data(), received_data.data(),
+                      received_data.size(), stream);
     CUDA_TRY(cudaStreamSynchronize(stream));
     for (int i = 0; i < communicator.get_size(); ++i) {
-      if (h_received_data[i] != i) { ret = false; }
+      if (h_received_data[i] != i) {
+        ret = false;
+      }
     }
 
-    if (communicator.get_rank() == 0) { std::cout << "=========================" << std::endl; }
+    if (communicator.get_rank() == 0) {
+      std::cout << "=========================" << std::endl;
+    }
   }
 
   return ret;
@@ -492,20 +502,20 @@ bool test_pointToPoint_device_multicast_sendrecv(const handle_t& h, int numTrial
  *        initialized comms instance.
  * @param n_colors number of different colors to test
  */
-bool test_commsplit(const handle_t& h, int n_colors)
-{
-  comms_t const& communicator = h.get_comms();
-  int const rank              = communicator.get_rank();
-  int const size              = communicator.get_size();
+bool test_commsplit(const handle_t &h, int n_colors) {
+  comms_t const &communicator = h.get_comms();
+  int const rank = communicator.get_rank();
+  int const size = communicator.get_size();
 
   if (n_colors > size) n_colors = size;
 
   // first we need to assign to a color, then assign the rank within the color
   int color = rank % n_colors;
-  int key   = rank / n_colors;
+  int key = rank / n_colors;
 
   handle_t new_handle(1);
-  auto shared_comm = std::make_shared<comms_t>(communicator.comm_split(color, key));
+  auto shared_comm =
+    std::make_shared<comms_t>(communicator.comm_split(color, key));
   new_handle.set_comms(shared_comm);
 
   return test_collective_allreduce(new_handle, 0);
diff --git a/cpp/include/raft/comms/ucp_helper.hpp b/cpp/include/raft/comms/ucp_helper.hpp
index 89c7b25630..226b6f0527 100644
--- a/cpp/include/raft/comms/ucp_helper.hpp
+++ b/cpp/include/raft/comms/ucp_helper.hpp
@@ -25,19 +25,16 @@
 namespace raft {
 namespace comms {
 
-typedef void (*dlsym_print_info)(ucp_ep_h, FILE*);
-typedef void (*dlsym_rec_free)(void*);
+typedef void (*dlsym_print_info)(ucp_ep_h, FILE *);
+typedef void (*dlsym_rec_free)(void *);
 typedef int (*dlsym_worker_progress)(ucp_worker_h);
 
-typedef ucs_status_ptr_t (*dlsym_send)(
-  ucp_ep_h, const void*, size_t, ucp_datatype_t, ucp_tag_t, ucp_send_callback_t);
-typedef ucs_status_ptr_t (*dlsym_recv)(ucp_worker_h,
-                                       void*,
-                                       size_t count,
-                                       ucp_datatype_t datatype,
-                                       ucp_tag_t,
-                                       ucp_tag_t,
-                                       ucp_tag_recv_callback_t);
+typedef ucs_status_ptr_t (*dlsym_send)(ucp_ep_h, const void *, size_t,
+                                       ucp_datatype_t, ucp_tag_t,
+                                       ucp_send_callback_t);
+typedef ucs_status_ptr_t (*dlsym_recv)(ucp_worker_h, void *, size_t count,
+                                       ucp_datatype_t datatype, ucp_tag_t,
+                                       ucp_tag_t, ucp_tag_recv_callback_t);
 
 /**
  * Standard UCX request object that will be passed
@@ -58,9 +55,9 @@ struct ucx_context {
  */
 class ucp_request {
  public:
-  struct ucx_context* req;
-  bool needs_release   = true;
-  int other_rank       = -1;
+  struct ucx_context *req;
+  bool needs_release = true;
+  int other_rank = -1;
   bool is_send_request = false;
 };
 
@@ -70,19 +67,18 @@ static const ucp_tag_t default_tag_mask = -1;
 /**
  * @brief Asynchronous send callback sets request to completed
  */
-static void send_callback(void* request, ucs_status_t status)
-{
-  struct ucx_context* context = (struct ucx_context*)request;
-  context->completed          = 1;
+static void send_callback(void *request, ucs_status_t status) {
+  struct ucx_context *context = (struct ucx_context *)request;
+  context->completed = 1;
 }
 
 /**
  * @brief Asynchronous recv callback sets request to completed
  */
-static void recv_callback(void* request, ucs_status_t status, ucp_tag_recv_info_t* info)
-{
-  struct ucx_context* context = (struct ucx_context*)request;
-  context->completed          = 1;
+static void recv_callback(void *request, ucs_status_t status,
+                          ucp_tag_recv_info_t *info) {
+  struct ucx_context *context = (struct ucx_context *)request;
+  context->completed = 1;
 }
 
 /**
@@ -91,8 +87,7 @@ static void recv_callback(void* request, ucs_status_t status, ucp_tag_recv_info_
  */
 class comms_ucp_handler {
  public:
-  comms_ucp_handler()
-  {
+  comms_ucp_handler() {
     load_ucp_handle();
     load_send_func();
     load_recv_func();
@@ -104,7 +99,7 @@ class comms_ucp_handler {
   ~comms_ucp_handler() { dlclose(ucp_handle); }
 
  private:
-  void* ucp_handle;
+  void *ucp_handle;
 
   dlsym_print_info print_info_func;
   dlsym_rec_free req_free_func;
@@ -112,8 +107,7 @@ class comms_ucp_handler {
   dlsym_send send_func;
   dlsym_recv recv_func;
 
-  void load_ucp_handle()
-  {
+  void load_ucp_handle() {
     ucp_handle = dlopen("libucp.so", RTLD_LAZY | RTLD_NOLOAD | RTLD_NODELETE);
     if (!ucp_handle) {
       ucp_handle = dlopen("libucp.so", RTLD_LAZY | RTLD_NODELETE);
@@ -123,56 +117,51 @@ class comms_ucp_handler {
     dlerror();
   }
 
-  void assert_dlerror()
-  {
-    char* error = dlerror();
+  void assert_dlerror() {
+    char *error = dlerror();
     ASSERT(error == NULL, "Error loading function symbol: %s\n", error);
   }
 
-  void load_send_func()
-  {
+  void load_send_func() {
     send_func = (dlsym_send)dlsym(ucp_handle, "ucp_tag_send_nb");
     assert_dlerror();
   }
 
-  void load_free_req_func()
-  {
+  void load_free_req_func() {
     req_free_func = (dlsym_rec_free)dlsym(ucp_handle, "ucp_request_free");
     assert_dlerror();
   }
 
-  void load_print_info_func()
-  {
+  void load_print_info_func() {
     print_info_func = (dlsym_print_info)dlsym(ucp_handle, "ucp_ep_print_info");
     assert_dlerror();
   }
 
-  void load_worker_progress_func()
-  {
-    worker_progress_func = (dlsym_worker_progress)dlsym(ucp_handle, "ucp_worker_progress");
+  void load_worker_progress_func() {
+    worker_progress_func =
+      (dlsym_worker_progress)dlsym(ucp_handle, "ucp_worker_progress");
     assert_dlerror();
   }
 
-  void load_recv_func()
-  {
+  void load_recv_func() {
     recv_func = (dlsym_recv)dlsym(ucp_handle, "ucp_tag_recv_nb");
     assert_dlerror();
   }
 
-  ucp_tag_t build_message_tag(int rank, int tag) const
-  {
+  ucp_tag_t build_message_tag(int rank, int tag) const {
     // keeping the rank in the lower bits enables debugging.
     return ((uint32_t)tag << 31) | (uint32_t)rank;
   }
 
  public:
-  int ucp_progress(ucp_worker_h worker) const { return (*(worker_progress_func))(worker); }
+  int ucp_progress(ucp_worker_h worker) const {
+    return (*(worker_progress_func))(worker);
+  }
 
   /**
    * @brief Frees any memory underlying the given ucp request object
    */
-  void free_ucp_request(ucp_request* request) const
-  {
+  void free_ucp_request(ucp_request *request) const {
     if (request->needs_release) {
       request->req->completed = 0;
       (*(req_free_func))(request->req);
@@ -183,67 +172,56 @@ class comms_ucp_handler {
   /**
    * @brief Asynchronously send data to the given endpoint using the given tag
    */
-  void ucp_isend(ucp_request* req,
-                 ucp_ep_h ep_ptr,
-                 const void* buf,
-                 size_t size,
-                 int tag,
-                 ucp_tag_t tag_mask,
-                 int rank) const
-  {
+  void ucp_isend(ucp_request *req, ucp_ep_h ep_ptr, const void *buf,
+                 size_t size, int tag, ucp_tag_t tag_mask, int rank) const {
     ucp_tag_t ucp_tag = build_message_tag(rank, tag);
 
-    ucs_status_ptr_t send_result =
-      (*(send_func))(ep_ptr, buf, size, ucp_dt_make_contig(1), ucp_tag, send_callback);
-    struct ucx_context* ucp_req = (struct ucx_context*)send_result;
+    ucs_status_ptr_t send_result = (*(send_func))(
+      ep_ptr, buf, size, ucp_dt_make_contig(1), ucp_tag, send_callback);
+    struct ucx_context *ucp_req = (struct ucx_context *)send_result;
 
     if (UCS_PTR_IS_ERR(send_result)) {
       ASSERT(!UCS_PTR_IS_ERR(send_result),
              "unable to send UCX data message (%d)\n",
              UCS_PTR_STATUS(send_result));
       /**
-       * If the request didn't fail, but it's not OK, it is in flight.
-       * Expect the handler to be invoked
-       */
+       * If the request didn't fail, but it's not OK, it is in flight.
+       * Expect the handler to be invoked
+       */
     } else if (UCS_PTR_STATUS(send_result) != UCS_OK) {
       /**
-       * If the request is OK, it's already been completed and we don't need to wait on it.
-       * The request will be a nullptr, however, so we need to create a new request
-       * and set it to completed to make the "waitall()" function work properly.
-       */
+       * If the request is OK, it's already been completed and we don't need to wait on it.
+       * The request will be a nullptr, however, so we need to create a new request
+       * and set it to completed to make the "waitall()" function work properly.
+       */
       req->needs_release = true;
     } else {
       req->needs_release = false;
     }
 
-    req->other_rank      = rank;
+    req->other_rank = rank;
     req->is_send_request = true;
-    req->req             = ucp_req;
+    req->req = ucp_req;
   }
 
   /**
    * @brief Asynchronously receive data from given endpoint with the given tag.
    */
-  void ucp_irecv(ucp_request* req,
-                 ucp_worker_h worker,
-                 ucp_ep_h ep_ptr,
-                 void* buf,
-                 size_t size,
-                 int tag,
-                 ucp_tag_t tag_mask,
-                 int sender_rank) const
-  {
+  void ucp_irecv(ucp_request *req, ucp_worker_h worker, ucp_ep_h ep_ptr,
+                 void *buf, size_t size, int tag, ucp_tag_t tag_mask,
+                 int sender_rank) const {
     ucp_tag_t ucp_tag = build_message_tag(sender_rank, tag);
 
     ucs_status_ptr_t recv_result =
-      (*(recv_func))(worker, buf, size, ucp_dt_make_contig(1), ucp_tag, tag_mask, recv_callback);
+      (*(recv_func))(worker, buf, size, ucp_dt_make_contig(1), ucp_tag,
+                     tag_mask, recv_callback);
 
-    struct ucx_context* ucp_req = (struct ucx_context*)recv_result;
+    struct ucx_context *ucp_req = (struct ucx_context *)recv_result;
 
-    req->req             = ucp_req;
-    req->needs_release   = true;
+    req->req = ucp_req;
+    req->needs_release = true;
     req->is_send_request = false;
-    req->other_rank      = sender_rank;
+    req->other_rank = sender_rank;
 
     ASSERT(!UCS_PTR_IS_ERR(recv_result),
            "unable to receive UCX data message (%d)\n",
diff --git a/cpp/include/raft/comms/util.hpp b/cpp/include/raft/comms/util.hpp
index 1b0548fc00..f3216abc37 100644
--- a/cpp/include/raft/comms/util.hpp
+++ b/cpp/include/raft/comms/util.hpp
@@ -26,70 +26,88 @@
  * Invokes a NCCL runtime API function call, if the call does not return ncclSuccess, throws an
  * exception detailing the NCCL error that occurred
  */
-#define NCCL_TRY(call)                             \
-  do {                                             \
-    ncclResult_t const status = (call);            \
-    if (ncclSuccess != status) {                   \
-      std::string msg{};                           \
-      SET_ERROR_MSG(msg,                           \
-                    "NCCL error encountered at: ", \
-                    "call='%s', Reason=%d:%s",     \
-                    #call,                         \
-                    status,                        \
-                    ncclGetErrorString(status));   \
-      throw raft::logic_error(msg);                \
-    }                                              \
+#define NCCL_TRY(call)                                                        \
+  do {                                                                        \
+    ncclResult_t const status = (call);                                       \
+    if (ncclSuccess != status) {                                              \
+      std::string msg{};                                                      \
+      SET_ERROR_MSG(msg,                                                      \
+                    "NCCL error encountered at: ", "call='%s', Reason=%d:%s", \
+                    #call, status, ncclGetErrorString(status));               \
+      throw raft::logic_error(msg);                                           \
+    }                                                                         \
   } while (0);
 
-#define NCCL_TRY_NO_THROW(call)                                                        \
-  do {                                                                                 \
-    ncclResult_t status = call;                                                        \
-    if (ncclSuccess != status) {                                                       \
-      printf("NCCL call='%s' failed. Reason:%s\n", #call, ncclGetErrorString(status)); \
-    }                                                                                  \
+#define NCCL_TRY_NO_THROW(call)                           \
+  do {                                                    \
+    ncclResult_t status = call;                           \
+    if (ncclSuccess != status) {                          \
+      printf("NCCL call='%s' failed. Reason:%s\n", #call, \
+             ncclGetErrorString(status));                 \
+    }                                                     \
   } while (0)
 
 namespace raft {
 namespace comms {
 
-constexpr size_t get_datatype_size(const datatype_t datatype)
-{
+constexpr size_t get_datatype_size(const datatype_t datatype) {
   switch (datatype) {
-    case datatype_t::CHAR: return sizeof(char);
-    case datatype_t::UINT8: return sizeof(uint8_t);
-    case datatype_t::INT32: return sizeof(int);
-    case datatype_t::UINT32: return sizeof(unsigned int);
-    case datatype_t::INT64: return sizeof(int64_t);
-    case datatype_t::UINT64: return sizeof(uint64_t);
-    case datatype_t::FLOAT32: return sizeof(float);
-    case datatype_t::FLOAT64: return sizeof(double);
-    default: throw "Unsupported datatype";
+    case datatype_t::CHAR:
+      return sizeof(char);
+    case datatype_t::UINT8:
+      return sizeof(uint8_t);
+    case datatype_t::INT32:
+      return sizeof(int);
+    case datatype_t::UINT32:
+      return sizeof(unsigned int);
+    case datatype_t::INT64:
+      return sizeof(int64_t);
+    case datatype_t::UINT64:
+      return sizeof(uint64_t);
+    case datatype_t::FLOAT32:
+      return sizeof(float);
+    case datatype_t::FLOAT64:
+      return sizeof(double);
+    default:
+      throw "Unsupported datatype";
   }
 }
 
-constexpr ncclDataType_t get_nccl_datatype(const datatype_t datatype)
-{
+constexpr ncclDataType_t get_nccl_datatype(const datatype_t datatype) {
   switch (datatype) {
-    case datatype_t::CHAR: return ncclChar;
-    case datatype_t::UINT8: return ncclUint8;
-    case datatype_t::INT32: return ncclInt;
-    case datatype_t::UINT32: return ncclUint32;
-    case datatype_t::INT64: return ncclInt64;
-    case datatype_t::UINT64: return ncclUint64;
-    case datatype_t::FLOAT32: return ncclFloat;
-    case datatype_t::FLOAT64: return ncclDouble;
-    default: throw "Unsupported datatype";
+    case datatype_t::CHAR:
+      return ncclChar;
+    case datatype_t::UINT8:
+      return ncclUint8;
+    case datatype_t::INT32:
+      return ncclInt;
+    case datatype_t::UINT32:
+      return ncclUint32;
+    case datatype_t::INT64:
+      return ncclInt64;
+    case datatype_t::UINT64:
+      return ncclUint64;
+    case datatype_t::FLOAT32:
+      return ncclFloat;
+    case datatype_t::FLOAT64:
+      return ncclDouble;
+    default:
+      throw "Unsupported datatype";
   }
 }
 
-constexpr ncclRedOp_t get_nccl_op(const op_t op)
-{
+constexpr ncclRedOp_t get_nccl_op(const op_t op) {
   switch (op) {
-    case op_t::SUM: return ncclSum;
-    case op_t::PROD: return ncclProd;
-    case op_t::MIN: return ncclMin;
-    case op_t::MAX: return ncclMax;
-    default: throw "Unsupported datatype";
+    case op_t::SUM:
+      return ncclSum;
+    case op_t::PROD:
+      return ncclProd;
+    case op_t::MIN:
+      return ncclMin;
+    case op_t::MAX:
+      return ncclMax;
+    default:
+      throw "Unsupported datatype";
   }
 }
 };  // namespace comms
diff --git a/cpp/include/raft/cuda_utils.cuh b/cpp/include/raft/cuda_utils.cuh
index 8a66eff242..14274043f5 100644
--- a/cpp/include/raft/cuda_utils.cuh
+++ b/cpp/include/raft/cuda_utils.cuh
@@ -36,17 +36,16 @@
 namespace raft {
 
 /** helper macro for device inlined functions */
-#define DI  inline __device__
+#define DI inline __device__
 #define HDI inline __host__ __device__
-#define HD  __host__ __device__
+#define HD __host__ __device__
 
 /**
  * @brief Provide a ceiling division operation ie. ceil(a / b)
  * @tparam IntType supposed to be only integers for now!
  */
 template <typename IntType>
-constexpr HDI IntType ceildiv(IntType a, IntType b)
-{
+constexpr HDI IntType ceildiv(IntType a, IntType b) {
   return (a + b - 1) / b;
 }
 
@@ -55,8 +54,7 @@ constexpr HDI IntType ceildiv(IntType a, IntType b)
  * @tparam IntType supposed to be only integers for now!
  */
 template <typename IntType>
-constexpr HDI IntType alignTo(IntType a, IntType b)
-{
+constexpr HDI IntType alignTo(IntType a, IntType b) {
   return ceildiv(a, b) * b;
 }
 
@@ -65,8 +63,7 @@ constexpr HDI IntType alignTo(IntType a, IntType b)
  * @tparam IntType supposed to be only integers for now!
  */
 template <typename IntType>
-constexpr HDI IntType alignDown(IntType a, IntType b)
-{
+constexpr HDI IntType alignDown(IntType a, IntType b) {
   return (a / b) * b;
 }
 
@@ -75,8 +72,7 @@ constexpr HDI IntType alignDown(IntType a, IntType b)
  * @tparam IntType data type (checked only for integers)
  */
 template <typename IntType>
-constexpr HDI bool isPo2(IntType num)
-{
+constexpr HDI bool isPo2(IntType num) {
   return (num && !(num & (num - 1)));
 }
 
@@ -85,16 +81,14 @@ constexpr HDI bool isPo2(IntType num)
  * @tparam IntType data type (checked only for integers)
  */
 template <typename IntType>
-constexpr HDI IntType log2(IntType num, IntType ret = IntType(0))
-{
+constexpr HDI IntType log2(IntType num, IntType ret = IntType(0)) {
   return num <= IntType(1) ? ret : log2(num >> IntType(1), ++ret);
 }
 
 /** Device function to apply the input lambda across threads in the grid */
 template <int ItemsPerThread, typename L>
-DI void forEach(int num, L lambda)
-{
-  int idx              = (blockDim.x * blockIdx.x) + threadIdx.x;
+DI void forEach(int num, L lambda) {
+  int idx = (blockDim.x * blockIdx.x) + threadIdx.x;
   const int numThreads = blockDim.x * gridDim.x;
 #pragma unroll
   for (int itr = 0; itr < ItemsPerThread; ++itr, idx += numThreads) {
@@ -106,8 +100,7 @@ DI void forEach(int num, L lambda)
 static const int WarpSize = 32;
 
 /** get the laneId of the current thread */
-DI int laneId()
-{
+DI int laneId() {
   int id;
   asm("mov.s32 %0, %laneid;" : "=r"(id));
   return id;
@@ -120,17 +113,15 @@ DI int laneId()
  * @param b second input
  */
 template <typename T>
-HDI void swapVals(T& a, T& b)
-{
+HDI void swapVals(T &a, T &b) {
   T tmp = a;
-  a     = b;
-  b     = tmp;
+  a = b;
+  b = tmp;
 }
 
 /** Device function to have atomic add support for older archs */
 template <typename Type>
-DI void myAtomicAdd(Type* address, Type val)
-{
+DI void myAtomicAdd(Type *address, Type val) {
   atomicAdd(address, val);
 }
 
@@ -138,114 +129,105 @@ DI void myAtomicAdd(Type* address, Type val)
 // Ref:
 // http://on-demand.gputechconf.com/gtc/2013/presentations/S3101-Atomic-Memory-Operations.pdf
 template <>
-DI void myAtomicAdd(double* address, double val)
-{
-  unsigned long long int* address_as_ull = (unsigned long long int*)address;
-  unsigned long long int old             = *address_as_ull, assumed;
+DI void myAtomicAdd(double *address, double val) {
+  unsigned long long int *address_as_ull = (unsigned long long int *)address;
+  unsigned long long int old = *address_as_ull, assumed;
   do {
     assumed = old;
-    old =
-      atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
+    old = atomicCAS(address_as_ull, assumed,
+                    __double_as_longlong(val + __longlong_as_double(assumed)));
   } while (assumed != old);
 }
 #endif
 
 template <typename T, typename ReduceLambda>
-DI void myAtomicReduce(T* address, T val, ReduceLambda op);
+DI void myAtomicReduce(T *address, T val, ReduceLambda op);
 
 template <typename ReduceLambda>
-DI void myAtomicReduce(double* address, double val, ReduceLambda op)
-{
-  unsigned long long int* address_as_ull = (unsigned long long int*)address;
-  unsigned long long int old             = *address_as_ull, assumed;
+DI void myAtomicReduce(double *address, double val, ReduceLambda op) {
+  unsigned long long int *address_as_ull = (unsigned long long int *)address;
+  unsigned long long int old = *address_as_ull, assumed;
   do {
     assumed = old;
-    old     = atomicCAS(
-      address_as_ull, assumed, __double_as_longlong(op(val, __longlong_as_double(assumed))));
+    old =
+      atomicCAS(address_as_ull, assumed,
+                __double_as_longlong(op(val, __longlong_as_double(assumed))));
   } while (assumed != old);
 }
 
 template <typename ReduceLambda>
-DI void myAtomicReduce(float* address, float val, ReduceLambda op)
-{
-  unsigned int* address_as_uint = (unsigned int*)address;
-  unsigned int old              = *address_as_uint, assumed;
+DI void myAtomicReduce(float *address, float val, ReduceLambda op) {
+  unsigned int *address_as_uint = (unsigned int *)address;
+  unsigned int old = *address_as_uint, assumed;
   do {
     assumed = old;
-    old = atomicCAS(address_as_uint, assumed, __float_as_uint(op(val, __uint_as_float(assumed))));
+    old = atomicCAS(address_as_uint, assumed,
+                    __float_as_uint(op(val, __uint_as_float(assumed))));
   } while (assumed != old);
 }
 
 template <typename ReduceLambda>
-DI void myAtomicReduce(int* address, int val, ReduceLambda op)
-{
+DI void myAtomicReduce(int *address, int val, ReduceLambda op) {
   int old = *address, assumed;
   do {
     assumed = old;
-    old     = atomicCAS(address, assumed, op(val, assumed));
+    old = atomicCAS(address, assumed, op(val, assumed));
   } while (assumed != old);
 }
 
 template <typename ReduceLambda>
-DI void myAtomicReduce(long long* address, long long val, ReduceLambda op)
-{
+DI void myAtomicReduce(long long *address, long long val, ReduceLambda op) {
   long long old = *address, assumed;
   do {
     assumed = old;
-    old     = atomicCAS(address, assumed, op(val, assumed));
+    old = atomicCAS(address, assumed, op(val, assumed));
   } while (assumed != old);
 }
 
 template <typename ReduceLambda>
-DI void myAtomicReduce(unsigned long long* address, unsigned long long val, ReduceLambda op)
-{
+DI void myAtomicReduce(unsigned long long *address, unsigned long long val,
+                       ReduceLambda op) {
   unsigned long long old = *address, assumed;
   do {
     assumed = old;
-    old     = atomicCAS(address, assumed, op(val, assumed));
+    old = atomicCAS(address, assumed, op(val, assumed));
   } while (assumed != old);
 }
 
 /**
  * @brief Provide atomic min operation.
  * @tparam T: data type for input data (float or double).
- * @param[in] address: address to read old value from, and to atomically update w/ min(old value,
- * val)
+ * @param[in] address: address to read old value from, and to atomically update w/ min(old value, val)
  * @param[in] val: new value to compare with old
  */
 template <typename T>
-DI T myAtomicMin(T* address, T val);
+DI T myAtomicMin(T *address, T val);
 
 /**
  * @brief Provide atomic max operation.
  * @tparam T: data type for input data (float or double).
- * @param[in] address: address to read old value from, and to atomically update w/ max(old value,
- * val)
+ * @param[in] address: address to read old value from, and to atomically update w/ max(old value, val)
  * @param[in] val: new value to compare with old
  */
 template <typename T>
-DI T myAtomicMax(T* address, T val);
+DI T myAtomicMax(T *address, T val);
 
-DI float myAtomicMin(float* address, float val)
-{
+DI float myAtomicMin(float *address, float val) {
   myAtomicReduce(address, val, fminf);
   return *address;
 }
 
-DI float myAtomicMax(float* address, float val)
-{
+DI float myAtomicMax(float *address, float val) {
   myAtomicReduce(address, val, fmaxf);
   return *address;
 }
 
-DI double myAtomicMin(double* address, double val)
-{
+DI double myAtomicMin(double *address, double val) {
   myAtomicReduce<double(double, double)>(address, val, fmin);
   return *address;
 }
 
-DI double myAtomicMax(double* address, double val)
-{
+DI double myAtomicMax(double *address, double val) {
   myAtomicReduce<double(double, double)>(address, val, fmax);
   return *address;
 }
@@ -257,13 +239,11 @@ DI double myAtomicMax(double* address, double val)
 template <typename T>
 HDI T myMax(T x, T y);
 template <>
-HDI float myMax<float>(float x, float y)
-{
+HDI float myMax<float>(float x, float y) {
   return fmaxf(x, y);
 }
 template <>
-HDI double myMax<double>(double x, double y)
-{
+HDI double myMax<double>(double x, double y) {
   return fmax(x, y);
 }
 /** @} */
@@ -275,13 +255,11 @@ HDI double myMax<double>(double x, double y)
 template <typename T>
 HDI T myMin(T x, T y);
 template <>
-HDI float myMin<float>(float x, float y)
-{
+HDI float myMin<float>(float x, float y) {
   return fminf(x, y);
 }
 template <>
-HDI double myMin<double>(double x, double y)
-{
+HDI double myMin<double>(double x, double y) {
   return fmin(x, y);
 }
 /** @} */
@@ -289,13 +267,11 @@ HDI double myMin<double>(double x, double y)
 /**
  * @brief Provide atomic min operation.
  * @tparam T: data type for input data (float or double).
- * @param[in] address: address to read old value from, and to atomically update w/ min(old value,
- * val)
+ * @param[in] address: address to read old value from, and to atomically update w/ min(old value, val)
  * @param[in] val: new value to compare with old
  */
 template <typename T>
-DI T myAtomicMin(T* address, T val)
-{
+DI T myAtomicMin(T *address, T val) {
   myAtomicReduce(address, val, myMin<T>);
   return *address;
 }
@@ -303,13 +279,11 @@ DI T myAtomicMin(T* address, T val)
 /**
  * @brief Provide atomic max operation.
  * @tparam T: data type for input data (float or double).
- * @param[in] address: address to read old value from, and to atomically update w/ max(old value,
- * val)
+ * @param[in] address: address to read old value from, and to atomically update w/ max(old value, val)
  * @param[in] val: new value to compare with old
  */
 template <typename T>
-DI T myAtomicMax(T* address, T val)
-{
+DI T myAtomicMax(T *address, T val) {
   myAtomicReduce(address, val, myMax<T>);
   return *address;
 }
@@ -318,8 +292,7 @@ DI T myAtomicMax(T* address, T val)
  * Sign function
  */
 template <typename T>
-HDI int sgn(const T val)
-{
+HDI int sgn(const T val) {
   return (T(0) < val) - (val < T(0));
 }
 
@@ -330,13 +303,11 @@ HDI int sgn(const T val)
 template <typename T>
 HDI T myExp(T x);
 template <>
-HDI float myExp(float x)
-{
+HDI float myExp(float x) {
   return expf(x);
 }
 template <>
-HDI double myExp(double x)
-{
+HDI double myExp(double x) {
   return exp(x);
 }
 /** @} */
@@ -348,13 +319,11 @@ HDI double myExp(double x)
 template <typename T>
 inline __device__ T myInf();
 template <>
-inline __device__ float myInf<float>()
-{
+inline __device__ float myInf<float>() {
   return CUDART_INF_F;
 }
 template <>
-inline __device__ double myInf<double>()
-{
+inline __device__ double myInf<double>() {
   return CUDART_INF;
 }
 /** @} */
@@ -366,13 +335,11 @@ inline __device__ double myInf<double>()
 template <typename T>
 HDI T myLog(T x);
 template <>
-HDI float myLog(float x)
-{
+HDI float myLog(float x) {
   return logf(x);
 }
 template <>
-HDI double myLog(double x)
-{
+HDI double myLog(double x) {
   return log(x);
 }
 /** @} */
@@ -384,13 +351,11 @@ HDI double myLog(double x)
 template <typename T>
 HDI T mySqrt(T x);
 template <>
-HDI float mySqrt(float x)
-{
+HDI float mySqrt(float x) {
   return sqrtf(x);
 }
 template <>
-HDI double mySqrt(double x)
-{
+HDI double mySqrt(double x) {
   return sqrt(x);
 }
 /** @} */
@@ -400,15 +365,13 @@ HDI double mySqrt(double x)
  * @{
  */
 template <typename T>
-DI void mySinCos(T x, T& s, T& c);
+DI void mySinCos(T x, T &s, T &c);
 template <>
-DI void mySinCos(float x, float& s, float& c)
-{
+DI void mySinCos(float x, float &s, float &c) {
   sincosf(x, &s, &c);
 }
 template <>
-DI void mySinCos(double x, double& s, double& c)
-{
+DI void mySinCos(double x, double &s, double &c) {
   sincos(x, &s, &c);
 }
 /** @} */
@@ -420,13 +383,11 @@ DI void mySinCos(double x, double& s, double& c)
 template <typename T>
 DI T mySin(T x);
 template <>
-DI float mySin(float x)
-{
+DI float mySin(float x) {
   return sinf(x);
 }
 template <>
-DI double mySin(double x)
-{
+DI double mySin(double x) {
   return sin(x);
 }
 /** @} */
@@ -436,18 +397,15 @@ DI double mySin(double x)
  * @{
  */
 template <typename T>
-DI T myAbs(T x)
-{
+DI T myAbs(T x) {
   return x < 0 ? -x : x;
 }
 template <>
-DI float myAbs(float x)
-{
+DI float myAbs(float x) {
   return fabsf(x);
 }
 template <>
-DI double myAbs(double x)
-{
+DI double myAbs(double x) {
   return fabs(x);
 }
 /** @} */
@@ -459,13 +417,11 @@ DI double myAbs(double x)
 template <typename T>
 HDI T myPow(T x, T power);
 template <>
-HDI float myPow(float x, float power)
-{
+HDI float myPow(float x, float power) {
   return powf(x, power);
 }
 template <>
-HDI double myPow(double x, double power)
-{
+HDI double myPow(double x, double power) {
   return pow(x, power);
 }
 /** @} */
@@ -477,13 +433,11 @@ HDI double myPow(double x, double power)
 template <typename T>
 HDI T myTanh(T x);
 template <>
-HDI float myTanh(float x)
-{
+HDI float myTanh(float x) {
   return tanhf(x);
 }
 template <>
-HDI double myTanh(double x)
-{
+HDI double myTanh(double x) {
   return tanh(x);
 }
 /** @} */
@@ -495,13 +449,11 @@ HDI double myTanh(double x)
 template <typename T>
 HDI T myATanh(T x);
 template <>
-HDI float myATanh(float x)
-{
+HDI float myATanh(float x) {
   return atanhf(x);
 }
 template <>
-HDI double myATanh(double x)
-{
+HDI double myATanh(double x) {
   return atanh(x);
 }
 /** @} */
@@ -540,18 +492,15 @@ struct Sum {
  * @{
  */
 template <typename T>
-DI T signPrim(T x)
-{
+DI T signPrim(T x) {
   return x < 0 ? -1 : +1;
 }
 template <>
-DI float signPrim(float x)
-{
+DI float signPrim(float x) {
   return signbit(x) == true ? -1.0f : +1.0f;
 }
 template <>
-DI double signPrim(double x)
-{
+DI double signPrim(double x) {
   return signbit(x) == true ? -1.0 : +1.0;
 }
 /** @} */
@@ -565,33 +514,28 @@ DI double signPrim(double x)
  * @{
  */
 template <typename T>
-DI T maxPrim(T x, T y)
-{
+DI T maxPrim(T x, T y) {
   return x > y ? x : y;
 }
 template <>
-DI float maxPrim(float x, float y)
-{
+DI float maxPrim(float x, float y) {
   return fmaxf(x, y);
 }
 template <>
-DI double maxPrim(double x, double y)
-{
+DI double maxPrim(double x, double y) {
   return fmax(x, y);
 }
 /** @} */
 
 /** apply a warp-wide fence (useful from Volta+ archs) */
-DI void warpFence()
-{
+DI void warpFence() {
 #if __CUDA_ARCH__ >= 700
   __syncwarp();
 #endif
 }
 
 /** warp-wide any boolean aggregator */
-DI bool any(bool inFlag, uint32_t mask = 0xffffffffu)
-{
+DI bool any(bool inFlag, uint32_t mask = 0xffffffffu) {
 #if CUDART_VERSION >= 9000
   inFlag = __any_sync(mask, inFlag);
 #else
@@ -601,8 +545,7 @@ DI bool any(bool inFlag, uint32_t mask = 0xffffffffu)
 }
 
 /** warp-wide all boolean aggregator */
-DI bool all(bool inFlag, uint32_t mask = 0xffffffffu)
-{
+DI bool all(bool inFlag, uint32_t mask = 0xffffffffu) {
 #if CUDART_VERSION >= 9000
   inFlag = __all_sync(mask, inFlag);
 #else
@@ -621,8 +564,8 @@ DI bool all(bool inFlag, uint32_t mask = 0xffffffffu)
  * @return the shuffled data
  */
 template <typename T>
-DI T shfl(T val, int srcLane, int width = WarpSize, uint32_t mask = 0xffffffffu)
-{
+DI T shfl(T val, int srcLane, int width = WarpSize,
+          uint32_t mask = 0xffffffffu) {
 #if CUDART_VERSION >= 9000
   return __shfl_sync(mask, val, srcLane, width);
 #else
@@ -640,8 +583,8 @@ DI T shfl(T val, int srcLane, int width = WarpSize, uint32_t mask = 0xffffffffu)
  * @return the shuffled data
  */
 template <typename T>
-DI T shfl_xor(T val, int laneMask, int width = WarpSize, uint32_t mask = 0xffffffffu)
-{
+DI T shfl_xor(T val, int laneMask, int width = WarpSize,
+              uint32_t mask = 0xffffffffu) {
 #if CUDART_VERSION >= 9000
   return __shfl_xor_sync(mask, val, laneMask, width);
 #else
@@ -659,8 +602,7 @@ DI T shfl_xor(T val, int laneMask, int width = WarpSize, uint32_t mask = 0xfffff
  * @todo Expand this to support arbitrary reduction ops
  */
 template <typename T>
-DI T warpReduce(T val)
-{
+DI T warpReduce(T val) {
 #pragma unroll
   for (int i = WarpSize / 2; i > 0; i >>= 1) {
     T tmp = shfl(val, laneId() + i);
@@ -681,13 +623,12 @@ DI T warpReduce(T val)
  * @todo Expand this to support arbitrary reduction ops
  */
 template <typename T>
-DI T blockReduce(T val, char* smem)
-{
-  auto* sTemp = reinterpret_cast<T*>(smem);
-  int nWarps  = (blockDim.x + WarpSize - 1) / WarpSize;
-  int lid     = laneId();
-  int wid     = threadIdx.x / WarpSize;
-  val         = warpReduce(val);
+DI T blockReduce(T val, char *smem) {
+  auto *sTemp = reinterpret_cast<T *>(smem);
+  int nWarps = (blockDim.x + WarpSize - 1) / WarpSize;
+  int lid = laneId();
+  int wid = threadIdx.x / WarpSize;
+  val = warpReduce(val);
   if (lid == 0) sTemp[wid] = val;
   __syncthreads();
   val = lid < nWarps ? sTemp[lid] : T(0);
@@ -703,10 +644,8 @@ DI T blockReduce(T val, char* smem)
  * @param idx the index for which to query the stream
  */
 inline cudaStream_t select_stream(cudaStream_t user_stream,
-                                  cudaStream_t* int_streams,
-                                  int n_int_streams,
-                                  int idx)
-{
+                                  cudaStream_t *int_streams, int n_int_streams,
+                                  int idx) {
   return n_int_streams > 0 ? int_streams[idx % n_int_streams] : user_stream;
 }
 
diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h
index 872dab7d82..86c60addf2 100644
--- a/cpp/include/raft/cudart_utils.h
+++ b/cpp/include/raft/cudart_utils.h
@@ -49,20 +49,17 @@ struct cuda_error : public raft::exception {
  * exception detailing the CUDA error that occurred
  *
  */
-#define CUDA_TRY(call)                             \
-  do {                                             \
-    cudaError_t const status = call;               \
-    if (status != cudaSuccess) {                   \
-      cudaGetLastError();                          \
-      std::string msg{};                           \
-      SET_ERROR_MSG(msg,                           \
-                    "CUDA error encountered at: ", \
-                    "call='%s', Reason=%s:%s",     \
-                    #call,                         \
-                    cudaGetErrorName(status),      \
-                    cudaGetErrorString(status));   \
-      throw raft::cuda_error(msg);                 \
-    }                                              \
+#define CUDA_TRY(call)                                                        \
+  do {                                                                        \
+    cudaError_t const status = call;                                          \
+    if (status != cudaSuccess) {                                              \
+      cudaGetLastError();                                                     \
+      std::string msg{};                                                      \
+      SET_ERROR_MSG(                                                          \
+        msg, "CUDA error encountered at: ", "call='%s', Reason=%s:%s", #call, \
+        cudaGetErrorName(status), cudaGetErrorString(status));                \
+      throw raft::cuda_error(msg);                                            \
+    }                                                                         \
   } while (0)
 
 /**
@@ -92,16 +89,13 @@ struct cuda_error : public raft::exception {
 //  * @brief check for cuda runtime API errors but log error instead of raising
 //  *        exception.
 //  */
-#define CUDA_CHECK_NO_THROW(call)                                  \
-  do {                                                             \
-    cudaError_t const status = call;                               \
-    if (cudaSuccess != status) {                                   \
-      printf("CUDA call='%s' at file=%s line=%d failed with %s\n", \
-             #call,                                                \
-             __FILE__,                                             \
-             __LINE__,                                             \
-             cudaGetErrorString(status));                          \
-    }                                                              \
+#define CUDA_CHECK_NO_THROW(call)                                         \
+  do {                                                                    \
+    cudaError_t const status = call;                                      \
+    if (cudaSuccess != status) {                                          \
+      printf("CUDA call='%s' at file=%s line=%d failed with %s\n", #call, \
+             __FILE__, __LINE__, cudaGetErrorString(status));             \
+    }                                                                     \
   } while (0)
 
 namespace raft {
@@ -109,7 +103,9 @@ namespace raft {
 /** Helper method to get to know warp size in device code */
 __host__ __device__ constexpr inline int warp_size() { return 32; }
 
-__host__ __device__ constexpr inline unsigned int warp_full_mask() { return 0xffffffff; }
+__host__ __device__ constexpr inline unsigned int warp_full_mask() {
+  return 0xffffffff;
+}
 
 /**
  * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping
@@ -128,16 +124,13 @@ class grid_1d_thread_t {
    * @param elements_per_thread Typically, a single kernel thread processes more than a single
    * element; this affects the number of threads the grid must contain
    */
-  grid_1d_thread_t(size_t overall_num_elements,
-                   size_t num_threads_per_block,
-                   size_t max_num_blocks_1d,
-                   size_t elements_per_thread = 1)
+  grid_1d_thread_t(size_t overall_num_elements, size_t num_threads_per_block,
+                   size_t max_num_blocks_1d, size_t elements_per_thread = 1)
     : block_size(num_threads_per_block),
-      num_blocks(
-        std::min((overall_num_elements + (elements_per_thread * num_threads_per_block) - 1) /
-                   (elements_per_thread * num_threads_per_block),
-                 max_num_blocks_1d))
-  {
+      num_blocks(std::min((overall_num_elements +
+                           (elements_per_thread * num_threads_per_block) - 1) /
+                            (elements_per_thread * num_threads_per_block),
+                          max_num_blocks_1d)) {
     RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0");
     RAFT_EXPECTS(num_threads_per_block / warp_size() > 0,
                  "num_threads_per_block / warp_size() must be > 0");
@@ -160,14 +153,13 @@ class grid_1d_warp_t {
    * specific features (amount of shared memory necessary, SM functional units use pattern etc.);
    * this can't be determined generically/automatically (as opposed to the number of blocks)
    */
-  grid_1d_warp_t(size_t overall_num_elements,
-                 size_t num_threads_per_block,
+  grid_1d_warp_t(size_t overall_num_elements, size_t num_threads_per_block,
                  size_t max_num_blocks_1d)
     : block_size(num_threads_per_block),
-      num_blocks(std::min((overall_num_elements + (num_threads_per_block / warp_size()) - 1) /
-                            (num_threads_per_block / warp_size()),
-                          max_num_blocks_1d))
-  {
+      num_blocks(std::min(
+        (overall_num_elements + (num_threads_per_block / warp_size()) - 1) /
+          (num_threads_per_block / warp_size()),
+        max_num_blocks_1d)) {
     RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0");
     RAFT_EXPECTS(num_threads_per_block / warp_size() > 0,
                  "num_threads_per_block / warp_size() must be > 0");
@@ -189,12 +181,10 @@ class grid_1d_block_t {
    * specific features (amount of shared memory necessary, SM functional units use pattern etc.);
    * this can't be determined generically/automatically (as opposed to the number of blocks)
    */
-  grid_1d_block_t(size_t overall_num_elements,
-                  size_t num_threads_per_block,
+  grid_1d_block_t(size_t overall_num_elements, size_t num_threads_per_block,
                   size_t max_num_blocks_1d)
     : block_size(num_threads_per_block),
-      num_blocks(std::min(overall_num_elements, max_num_blocks_1d))
-  {
+      num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) {
     RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0");
     RAFT_EXPECTS(num_threads_per_block / warp_size() > 0,
                  "num_threads_per_block / warp_size() must be > 0");
@@ -210,9 +200,9 @@ class grid_1d_block_t {
  * @param stream cuda stream
  */
 template <typename Type>
-void copy(Type* dst, const Type* src, size_t len, cudaStream_t stream)
-{
-  CUDA_CHECK(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream));
+void copy(Type* dst, const Type* src, size_t len, cudaStream_t stream) {
+  CUDA_CHECK(
+    cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream));
 }
 
 /**
@@ -223,22 +213,23 @@ void copy(Type* dst, const Type* src, size_t len, cudaStream_t stream)
  */
 /** performs a host to device copy */
 template <typename Type>
-void update_device(Type* d_ptr, const Type* h_ptr, size_t len, cudaStream_t stream)
-{
+void update_device(Type* d_ptr, const Type* h_ptr, size_t len,
+                   cudaStream_t stream) {
   copy(d_ptr, h_ptr, len, stream);
 }
 
 /** performs a device to host copy */
 template <typename Type>
-void update_host(Type* h_ptr, const Type* d_ptr, size_t len, cudaStream_t stream)
-{
+void update_host(Type* h_ptr, const Type* d_ptr, size_t len,
+                 cudaStream_t stream) {
   copy(h_ptr, d_ptr, len, stream);
 }
 
 template <typename Type>
-void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, cudaStream_t stream)
-{
-  CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream));
+void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len,
+                cudaStream_t stream) {
+  CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type),
+                             cudaMemcpyDeviceToDevice, stream));
 }
 /** @} */
 
@@ -247,11 +238,8 @@ void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, cudaStream_t strea
  * @{
  */
 template <class T, class OutStream>
-void print_host_vector(const char* variable_name,
-                       const T* host_mem,
-                       size_t componentsCount,
-                       OutStream& out)
-{
+void print_host_vector(const char* variable_name, const T* host_mem,
+                       size_t componentsCount, OutStream& out) {
   out << variable_name << "=[";
   for (size_t i = 0; i < componentsCount; ++i) {
     if (i != 0) out << ",";
@@ -261,13 +249,11 @@ void print_host_vector(const char* variable_name,
 }
 
 template <class T, class OutStream>
-void print_device_vector(const char* variable_name,
-                         const T* devMem,
-                         size_t componentsCount,
-                         OutStream& out)
-{
+void print_device_vector(const char* variable_name, const T* devMem,
+                         size_t componentsCount, OutStream& out) {
   T* host_mem = new T[componentsCount];
-  CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost));
+  CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T),
+                        cudaMemcpyDeviceToHost));
   print_host_vector(variable_name, host_mem, componentsCount, out);
   delete[] host_mem;
 }
@@ -275,36 +261,35 @@ void print_device_vector(const char* variable_name,
 
 /** cuda malloc */
 template <typename Type>
-void allocate(Type*& ptr, size_t len, bool setZero = false)
-{
+void allocate(Type*& ptr, size_t len, bool setZero = false) {
   CUDA_CHECK(cudaMalloc((void**)&ptr, sizeof(Type) * len));
   if (setZero) CUDA_CHECK(cudaMemset(ptr, 0, sizeof(Type) * len));
 }
 
 /** helper method to get max usable shared mem per block parameter */
-inline int getSharedMemPerBlock()
-{
+inline int getSharedMemPerBlock() {
   int devId;
   CUDA_CHECK(cudaGetDevice(&devId));
   int smemPerBlk;
-  CUDA_CHECK(cudaDeviceGetAttribute(&smemPerBlk, cudaDevAttrMaxSharedMemoryPerBlock, devId));
+  CUDA_CHECK(cudaDeviceGetAttribute(&smemPerBlk,
+                                    cudaDevAttrMaxSharedMemoryPerBlock, devId));
   return smemPerBlk;
 }
 
 /** helper method to get multi-processor count parameter */
-inline int getMultiProcessorCount()
-{
+inline int getMultiProcessorCount() {
   int devId;
   CUDA_CHECK(cudaGetDevice(&devId));
   int mpCount;
-  CUDA_CHECK(cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId));
+  CUDA_CHECK(
+    cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId));
   return mpCount;
 }
 
 /** helper method to convert an array on device to a string on host */
 template <typename T>
-std::string arr2Str(const T* arr, int size, std::string name, cudaStream_t stream, int width = 4)
-{
+std::string arr2Str(const T* arr, int size, std::string name,
+                    cudaStream_t stream, int width = 4) {
   std::stringstream ss;
 
   T* arr_h = (T*)malloc(size * sizeof(T));
@@ -326,54 +311,53 @@ std::string arr2Str(const T* arr, int size, std::string name, cudaStream_t strea
 
 /** this seems to be unused, but may be useful in the future */
 template <typename T>
-void ASSERT_DEVICE_MEM(T* ptr, std::string name)
-{
+void ASSERT_DEVICE_MEM(T* ptr, std::string name) {
   cudaPointerAttributes s_att;
   cudaError_t s_err = cudaPointerGetAttributes(&s_att, ptr);
 
   if (s_err != 0 || s_att.device == -1)
-    std::cout << "Invalid device pointer encountered in " << name << ". device=" << s_att.device
-              << ", err=" << s_err << std::endl;
+    std::cout << "Invalid device pointer encountered in " << name
+              << ". device=" << s_att.device << ", err=" << s_err << std::endl;
 }
 
-inline uint32_t curTimeMillis()
-{
-  auto now      = std::chrono::high_resolution_clock::now();
+inline uint32_t curTimeMillis() {
+  auto now = std::chrono::high_resolution_clock::now();
   auto duration = now.time_since_epoch();
-  return std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
+  return std::chrono::duration_cast<std::chrono::milliseconds>(duration)
+    .count();
 }
 
 /** Helper function to calculate need memory for allocate to store dense matrix.
- * @param rows number of rows in matrix
- * @param columns number of columns in matrix
- * @return need number of items to allocate via allocate()
- * @sa allocate()
- */
-inline size_t allocLengthForMatrix(size_t rows, size_t columns) { return rows * columns; }
+    * @param rows number of rows in matrix
+    * @param columns number of columns in matrix
+    * @return need number of items to allocate via allocate()
+    * @sa allocate()
+    */
+inline size_t allocLengthForMatrix(size_t rows, size_t columns) {
+  return rows * columns;
+}
 
 /** Helper function to check alignment of pointer.
- * @param ptr the pointer to check
- * @param alignment to be checked for
- * @return true if address in bytes is a multiple of alignment
- */
+    * @param ptr the pointer to check
+    * @param alignment to be checked for
+    * @return true if address in bytes is a multiple of alignment
+    */
 template <typename Type>
-bool is_aligned(Type* ptr, size_t alignment)
-{
+bool is_aligned(Type* ptr, size_t alignment) {
   return reinterpret_cast<uintptr_t>(ptr) % alignment == 0;
 }
 
 /** calculate greatest common divisor of two numbers
- * @a integer
- * @b integer
- * @ return gcd of a and b
- */
+* @a integer
+* @b integer
+* @ return gcd of a and b
+*/
 template <typename IntType>
-IntType gcd(IntType a, IntType b)
-{
+IntType gcd(IntType a, IntType b) {
   while (b != 0) {
     IntType tmp = b;
-    b           = a % b;
-    a           = tmp;
+    b = a % b;
+    a = tmp;
   }
   return a;
 }
diff --git a/cpp/include/raft/device_atomics.cuh b/cpp/include/raft/device_atomics.cuh
index e113ca92eb..dc8093ca1d 100644
--- a/cpp/include/raft/device_atomics.cuh
+++ b/cpp/include/raft/device_atomics.cuh
@@ -39,9 +39,9 @@ namespace detail {
 
 /* @brief binary `sum` operator */
 struct DeviceSum {
-  template <typename T, typename std::enable_if_t<std::is_arithmetic<T>::value>* = nullptr>
-  __device__ T operator()(const T& lhs, const T& rhs)
-  {
+  template <typename T,
+            typename std::enable_if_t<std::is_arithmetic<T>::value>* = nullptr>
+  __device__ T operator()(const T& lhs, const T& rhs) {
     return lhs + rhs;
   }
 };
@@ -49,8 +49,7 @@ struct DeviceSum {
 /* @brief binary `min` operator */
 struct DeviceMin {
   template <typename T>
-  __device__ T operator()(const T& lhs, const T& rhs)
-  {
+  __device__ T operator()(const T& lhs, const T& rhs) {
     return lhs < rhs ? lhs : rhs;
   }
 };
@@ -58,44 +57,43 @@ struct DeviceMin {
 /* @brief binary `max` operator */
 struct DeviceMax {
   template <typename T>
-  __device__ T operator()(const T& lhs, const T& rhs)
-  {
+  __device__ T operator()(const T& lhs, const T& rhs) {
     return lhs > rhs ? lhs : rhs;
   }
 };
 
 /* @brief binary `product` operator */
 struct DeviceProduct {
-  template <typename T, typename std::enable_if_t<std::is_arithmetic<T>::value>* = nullptr>
-  __device__ T operator()(const T& lhs, const T& rhs)
-  {
+  template <typename T,
+            typename std::enable_if_t<std::is_arithmetic<T>::value>* = nullptr>
+  __device__ T operator()(const T& lhs, const T& rhs) {
     return lhs * rhs;
   }
 };
 
 /* @brief binary `and` operator */
 struct DeviceAnd {
-  template <typename T, typename std::enable_if_t<std::is_integral<T>::value>* = nullptr>
-  __device__ T operator()(const T& lhs, const T& rhs)
-  {
+  template <typename T,
+            typename std::enable_if_t<std::is_integral<T>::value>* = nullptr>
+  __device__ T operator()(const T& lhs, const T& rhs) {
     return (lhs & rhs);
   }
 };
 
 /* @brief binary `or` operator */
 struct DeviceOr {
-  template <typename T, typename std::enable_if_t<std::is_integral<T>::value>* = nullptr>
-  __device__ T operator()(const T& lhs, const T& rhs)
-  {
+  template <typename T,
+            typename std::enable_if_t<std::is_integral<T>::value>* = nullptr>
+  __device__ T operator()(const T& lhs, const T& rhs) {
     return (lhs | rhs);
   }
 };
 
 /* @brief binary `xor` operator */
 struct DeviceXor {
-  template <typename T, typename std::enable_if_t<std::is_integral<T>::value>* = nullptr>
-  __device__ T operator()(const T& lhs, const T& rhs)
-  {
+  template <typename T,
+            typename std::enable_if_t<std::is_integral<T>::value>* = nullptr>
+  __device__ T operator()(const T& lhs, const T& rhs) {
     return (lhs ^ rhs);
   }
 };
@@ -105,9 +103,9 @@ struct DeviceXor {
 #define errmsg_cast "size mismatch."
 
 template <typename T_output, typename T_input>
-__forceinline__ __device__ T_output type_reinterpret(T_input value)
-{
-  static_assert(sizeof(T_output) == sizeof(T_input), "type_reinterpret for different size");
+__forceinline__ __device__ T_output type_reinterpret(T_input value) {
+  static_assert(sizeof(T_output) == sizeof(T_input),
+                "type_reinterpret for different size");
   return *(reinterpret_cast<T_output*>(&value));
 }
 
@@ -120,22 +118,25 @@ struct genericAtomicOperationImpl;
 // single byte atomic operation
 template <typename T, typename Op>
 struct genericAtomicOperationImpl<T, Op, 1> {
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op)
-  {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
+                                          Op op) {
     using T_int = unsigned int;
 
-    T_int* address_uint32 = reinterpret_cast<T_int*>(addr - (reinterpret_cast<size_t>(addr) & 3));
-    T_int shift           = ((reinterpret_cast<size_t>(addr) & 3) * 8);
+    T_int* address_uint32 =
+      reinterpret_cast<T_int*>(addr - (reinterpret_cast<size_t>(addr) & 3));
+    T_int shift = ((reinterpret_cast<size_t>(addr) & 3) * 8);
 
     T_int old = *address_uint32;
     T_int assumed;
 
     do {
-      assumed                = old;
-      T target_value         = T((old >> shift) & 0xff);
-      uint8_t updating_value = type_reinterpret<uint8_t, T>(op(target_value, update_value));
-      T_int new_value        = (old & ~(0x000000ff << shift)) | (T_int(updating_value) << shift);
-      old                    = atomicCAS(address_uint32, assumed, new_value);
+      assumed = old;
+      T target_value = T((old >> shift) & 0xff);
+      uint8_t updating_value =
+        type_reinterpret<uint8_t, T>(op(target_value, update_value));
+      T_int new_value =
+        (old & ~(0x000000ff << shift)) | (T_int(updating_value) << shift);
+      old = atomicCAS(address_uint32, assumed, new_value);
     } while (assumed != old);
 
     return T((old >> shift) & 0xff);
@@ -145,24 +146,26 @@ struct genericAtomicOperationImpl<T, Op, 1> {
 // 2 bytes atomic operation
 template <typename T, typename Op>
 struct genericAtomicOperationImpl<T, Op, 2> {
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op)
-  {
-    using T_int      = unsigned int;
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
+                                          Op op) {
+    using T_int = unsigned int;
     bool is_32_align = (reinterpret_cast<size_t>(addr) & 2) ? false : true;
-    T_int* address_uint32 =
-      reinterpret_cast<T_int*>(reinterpret_cast<size_t>(addr) - (is_32_align ? 0 : 2));
+    T_int* address_uint32 = reinterpret_cast<T_int*>(
+      reinterpret_cast<size_t>(addr) - (is_32_align ? 0 : 2));
 
     T_int old = *address_uint32;
     T_int assumed;
 
     do {
-      assumed                 = old;
-      T target_value          = (is_32_align) ? T(old & 0xffff) : T(old >> 16);
-      uint16_t updating_value = type_reinterpret<uint16_t, T>(op(target_value, update_value));
-
-      T_int new_value = (is_32_align) ? (old & 0xffff0000) | updating_value
-                                      : (old & 0xffff) | (T_int(updating_value) << 16);
-      old             = atomicCAS(address_uint32, assumed, new_value);
+      assumed = old;
+      T target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16);
+      uint16_t updating_value =
+        type_reinterpret<uint16_t, T>(op(target_value, update_value));
+
+      T_int new_value = (is_32_align)
+                          ? (old & 0xffff0000) | updating_value
+                          : (old & 0xffff) | (T_int(updating_value) << 16);
+      old = atomicCAS(address_uint32, assumed, new_value);
     } while (assumed != old);
 
     return (is_32_align) ? T(old & 0xffff) : T(old >> 16);
@@ -173,15 +176,15 @@ struct genericAtomicOperationImpl<T, Op, 2> {
 // 4 bytes atomic operation
 template <typename T, typename Op>
 struct genericAtomicOperationImpl<T, Op, 4> {
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op)
-  {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
+                                          Op op) {
     using T_int = unsigned int;
 
     T old_value = *addr;
     T assumed{old_value};
 
     do {
-      assumed           = old_value;
+      assumed = old_value;
       const T new_value = op(old_value, update_value);
 
       T_int ret = atomicCAS(reinterpret_cast<T_int*>(addr),
@@ -198,8 +201,8 @@ struct genericAtomicOperationImpl<T, Op, 4> {
 // 8 bytes atomic operation
 template <typename T, typename Op>
 struct genericAtomicOperationImpl<T, Op, 8> {
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op)
-  {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
+                                          Op op) {
     using T_int = unsigned long long int;
     static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
 
@@ -207,7 +210,7 @@ struct genericAtomicOperationImpl<T, Op, 8> {
     T assumed{old_value};
 
     do {
-      assumed           = old_value;
+      assumed = old_value;
       const T new_value = op(old_value, update_value);
 
       T_int ret = atomicCAS(reinterpret_cast<T_int*>(addr),
@@ -223,8 +226,8 @@ struct genericAtomicOperationImpl<T, Op, 8> {
 
 // -------------------------------------------------------------------------------------------------
 // specialized functions for operators
-// `atomicAdd` supports int, unsigned int, unsigend long long int, float, double (long long int is
-// not supproted.) `atomicMin`, `atomicMax` support int, unsigned int, unsigned long long int
+// `atomicAdd` supports int, unsigned int, unsigend long long int, float, double (long long int is not supproted.)
+// `atomicMin`, `atomicMax` support int, unsigned int, unsigned long long int
 // `atomicAnd`, `atomicOr`, `atomicXor` support int, unsigned int, unsigned long long int
 
 // CUDA natively supports `unsigned long long int` for `atomicAdd`,
@@ -237,11 +240,12 @@ struct genericAtomicOperationImpl<T, Op, 8> {
 template <>
 struct genericAtomicOperationImpl<long int, DeviceSum, 8> {
   using T = long int;
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op)
-  {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
+                                          DeviceSum op) {
     using T_int = unsigned long long int;
     static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
-    T_int ret = atomicAdd(reinterpret_cast<T_int*>(addr), type_reinterpret<T_int, T>(update_value));
+    T_int ret = atomicAdd(reinterpret_cast<T_int*>(addr),
+                          type_reinterpret<T_int, T>(update_value));
     return type_reinterpret<T, T_int>(ret);
   }
 };
@@ -249,11 +253,12 @@ struct genericAtomicOperationImpl<long int, DeviceSum, 8> {
 template <>
 struct genericAtomicOperationImpl<unsigned long int, DeviceSum, 8> {
   using T = unsigned long int;
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op)
-  {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
+                                          DeviceSum op) {
     using T_int = unsigned long long int;
     static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
-    T_int ret = atomicAdd(reinterpret_cast<T_int*>(addr), type_reinterpret<T_int, T>(update_value));
+    T_int ret = atomicAdd(reinterpret_cast<T_int*>(addr),
+                          type_reinterpret<T_int, T>(update_value));
     return type_reinterpret<T, T_int>(ret);
   }
 };
@@ -268,11 +273,12 @@ struct genericAtomicOperationImpl<unsigned long int, DeviceSum, 8> {
 template <>
 struct genericAtomicOperationImpl<long long int, DeviceSum, 8> {
   using T = long long int;
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op)
-  {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
+                                          DeviceSum op) {
     using T_int = unsigned long long int;
     static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
-    T_int ret = atomicAdd(reinterpret_cast<T_int*>(addr), type_reinterpret<T_int, T>(update_value));
+    T_int ret = atomicAdd(reinterpret_cast<T_int*>(addr),
+                          type_reinterpret<T_int, T>(update_value));
     return type_reinterpret<T, T_int>(ret);
   }
 };
@@ -280,11 +286,12 @@ struct genericAtomicOperationImpl<long long int, DeviceSum, 8> {
 template <>
 struct genericAtomicOperationImpl<unsigned long int, DeviceMin, 8> {
   using T = unsigned long int;
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMin op)
-  {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
+                                          DeviceMin op) {
     using T_int = unsigned long long int;
     static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
-    T ret = atomicMin(reinterpret_cast<T_int*>(addr), type_reinterpret<T_int, T>(update_value));
+    T ret = atomicMin(reinterpret_cast<T_int*>(addr),
+                      type_reinterpret<T_int, T>(update_value));
     return type_reinterpret<T, T_int>(ret);
   }
 };
@@ -292,44 +299,48 @@ struct genericAtomicOperationImpl<unsigned long int, DeviceMin, 8> {
 template <>
 struct genericAtomicOperationImpl<unsigned long int, DeviceMax, 8> {
   using T = unsigned long int;
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMax op)
-  {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
+                                          DeviceMax op) {
     using T_int = unsigned long long int;
     static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
-    T ret = atomicMax(reinterpret_cast<T_int*>(addr), type_reinterpret<T_int, T>(update_value));
+    T ret = atomicMax(reinterpret_cast<T_int*>(addr),
+                      type_reinterpret<T_int, T>(update_value));
     return type_reinterpret<T, T_int>(ret);
   }
 };
 
 template <typename T>
 struct genericAtomicOperationImpl<T, DeviceAnd, 8> {
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceAnd op)
-  {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
+                                          DeviceAnd op) {
     using T_int = unsigned long long int;
     static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
-    T_int ret = atomicAnd(reinterpret_cast<T_int*>(addr), type_reinterpret<T_int, T>(update_value));
+    T_int ret = atomicAnd(reinterpret_cast<T_int*>(addr),
+                          type_reinterpret<T_int, T>(update_value));
     return type_reinterpret<T, T_int>(ret);
   }
 };
 
 template <typename T>
 struct genericAtomicOperationImpl<T, DeviceOr, 8> {
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceOr op)
-  {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
+                                          DeviceOr op) {
     using T_int = unsigned long long int;
     static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
-    T_int ret = atomicOr(reinterpret_cast<T_int*>(addr), type_reinterpret<T_int, T>(update_value));
+    T_int ret = atomicOr(reinterpret_cast<T_int*>(addr),
+                         type_reinterpret<T_int, T>(update_value));
     return type_reinterpret<T, T_int>(ret);
   }
 };
 
 template <typename T>
 struct genericAtomicOperationImpl<T, DeviceXor, 8> {
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceXor op)
-  {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
+                                          DeviceXor op) {
     using T_int = unsigned long long int;
     static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
-    T_int ret = atomicXor(reinterpret_cast<T_int*>(addr), type_reinterpret<T_int, T>(update_value));
+    T_int ret = atomicXor(reinterpret_cast<T_int*>(addr),
+                          type_reinterpret<T_int, T>(update_value));
     return type_reinterpret<T, T_int>(ret);
   }
 };
@@ -342,12 +353,13 @@ struct typesAtomicCASImpl;
 
 template <typename T>
 struct typesAtomicCASImpl<T, 1> {
-  __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value)
-  {
+  __forceinline__ __device__ T operator()(T* addr, T const& compare,
+                                          T const& update_value) {
     using T_int = unsigned int;
 
-    T_int shift           = ((reinterpret_cast<size_t>(addr) & 3) * 8);
-    T_int* address_uint32 = reinterpret_cast<T_int*>(addr - (reinterpret_cast<size_t>(addr) & 3));
+    T_int shift = ((reinterpret_cast<size_t>(addr) & 3) * 8);
+    T_int* address_uint32 =
+      reinterpret_cast<T_int*>(addr - (reinterpret_cast<size_t>(addr) & 3));
 
     // the 'target_value' in `old` can be different from `compare`
     // because other thread may update the value
@@ -358,14 +370,15 @@ struct typesAtomicCASImpl<T, 1> {
     uint8_t u_val = type_reinterpret<uint8_t, T>(update_value);
 
     do {
-      assumed      = old;
+      assumed = old;
       target_value = T((old >> shift) & 0xff);
       // have to compare `target_value` and `compare` before calling atomicCAS
       // the `target_value` in `old` can be different with `compare`
       if (target_value != compare) break;
 
-      T_int new_value = (old & ~(0x000000ff << shift)) | (T_int(u_val) << shift);
-      old             = atomicCAS(address_uint32, assumed, new_value);
+      T_int new_value =
+        (old & ~(0x000000ff << shift)) | (T_int(u_val) << shift);
+      old = atomicCAS(address_uint32, assumed, new_value);
     } while (assumed != old);
 
     return target_value;
@@ -374,13 +387,13 @@ struct typesAtomicCASImpl<T, 1> {
 
 template <typename T>
 struct typesAtomicCASImpl<T, 2> {
-  __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value)
-  {
+  __forceinline__ __device__ T operator()(T* addr, T const& compare,
+                                          T const& update_value) {
     using T_int = unsigned int;
 
     bool is_32_align = (reinterpret_cast<size_t>(addr) & 2) ? false : true;
-    T_int* address_uint32 =
-      reinterpret_cast<T_int*>(reinterpret_cast<size_t>(addr) - (is_32_align ? 0 : 2));
+    T_int* address_uint32 = reinterpret_cast<T_int*>(
+      reinterpret_cast<size_t>(addr) - (is_32_align ? 0 : 2));
 
     T_int old = *address_uint32;
     T_int assumed;
@@ -388,12 +401,12 @@ struct typesAtomicCASImpl<T, 2> {
     uint16_t u_val = type_reinterpret<uint16_t, T>(update_value);
 
     do {
-      assumed      = old;
+      assumed = old;
       target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16);
       if (target_value != compare) break;
 
-      T_int new_value =
-        (is_32_align) ? (old & 0xffff0000) | u_val : (old & 0xffff) | (T_int(u_val) << 16);
+      T_int new_value = (is_32_align) ? (old & 0xffff0000) | u_val
+                                      : (old & 0xffff) | (T_int(u_val) << 16);
       old = atomicCAS(address_uint32, assumed, new_value);
     } while (assumed != old);
 
@@ -403,8 +416,8 @@ struct typesAtomicCASImpl<T, 2> {
 
 template <typename T>
 struct typesAtomicCASImpl<T, 4> {
-  __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value)
-  {
+  __forceinline__ __device__ T operator()(T* addr, T const& compare,
+                                          T const& update_value) {
     using T_int = unsigned int;
 
     T_int ret = atomicCAS(reinterpret_cast<T_int*>(addr),
@@ -418,8 +431,8 @@ struct typesAtomicCASImpl<T, 4> {
 // 8 bytes atomic operation
 template <typename T>
 struct typesAtomicCASImpl<T, 8> {
-  __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value)
-  {
+  __forceinline__ __device__ T operator()(T* addr, T const& compare,
+                                          T const& update_value) {
     using T_int = unsigned long long int;
     static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
 
@@ -451,10 +464,11 @@ struct typesAtomicCASImpl<T, 8> {
  * @returns The old value at `address`
  * -------------------------------------------------------------------------**/
 template <typename T, typename BinaryOp>
-typename std::enable_if_t<std::is_arithmetic<T>::value, T> __forceinline__ __device__
-genericAtomicOperation(T* address, T const& update_value, BinaryOp op)
-{
-  auto fun = raft::device_atomics::detail::genericAtomicOperationImpl<T, BinaryOp>{};
+typename std::enable_if_t<std::is_arithmetic<T>::value, T> __forceinline__
+  __device__
+  genericAtomicOperation(T* address, T const& update_value, BinaryOp op) {
+  auto fun =
+    raft::device_atomics::detail::genericAtomicOperationImpl<T, BinaryOp>{};
   return T(fun(address, update_value, op));
 }
 
@@ -462,11 +476,11 @@ genericAtomicOperation(T* address, T const& update_value, BinaryOp op)
 template <typename BinaryOp>
 __forceinline__ __device__ bool genericAtomicOperation(bool* address,
                                                        bool const& update_value,
-                                                       BinaryOp op)
-{
+                                                       BinaryOp op) {
   using T = bool;
   // don't use underlying type to apply operation for bool
-  auto fun = raft::device_atomics::detail::genericAtomicOperationImpl<T, BinaryOp>{};
+  auto fun =
+    raft::device_atomics::detail::genericAtomicOperationImpl<T, BinaryOp>{};
   return T(fun(address, update_value, op));
 }
 
@@ -488,9 +502,9 @@ __forceinline__ __device__ bool genericAtomicOperation(bool* address,
  * @returns The old value at `address`
  */
 template <typename T>
-__forceinline__ __device__ T atomicAdd(T* address, T val)
-{
-  return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceSum{});
+__forceinline__ __device__ T atomicAdd(T* address, T val) {
+  return raft::genericAtomicOperation(
+    address, val, raft::device_atomics::detail::DeviceSum{});
 }
 
 /**
@@ -509,9 +523,9 @@ __forceinline__ __device__ T atomicAdd(T* address, T val)
  * @returns The old value at `address`
  */
 template <typename T>
-__forceinline__ __device__ T atomicMin(T* address, T val)
-{
-  return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceMin{});
+__forceinline__ __device__ T atomicMin(T* address, T val) {
+  return raft::genericAtomicOperation(
+    address, val, raft::device_atomics::detail::DeviceMin{});
 }
 
 /**
@@ -530,9 +544,9 @@ __forceinline__ __device__ T atomicMin(T* address, T val)
  * @returns The old value at `address`
  */
 template <typename T>
-__forceinline__ __device__ T atomicMax(T* address, T val)
-{
-  return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceMax{});
+__forceinline__ __device__ T atomicMax(T* address, T val) {
+  return raft::genericAtomicOperation(
+    address, val, raft::device_atomics::detail::DeviceMax{});
 }
 
 /**
@@ -552,9 +566,9 @@ __forceinline__ __device__ T atomicMax(T* address, T val)
  * @returns The old value at `address`
  */
 template <typename T>
-__forceinline__ __device__ T atomicCAS(T* address, T compare, T val)
-{
-  return raft::device_atomics::detail::typesAtomicCASImpl<T>()(address, compare, val);
+__forceinline__ __device__ T atomicCAS(T* address, T compare, T val) {
+  return raft::device_atomics::detail::typesAtomicCASImpl<T>()(address, compare,
+                                                               val);
 }
 
 /**
@@ -572,10 +586,11 @@ __forceinline__ __device__ T atomicCAS(T* address, T compare, T val)
  *
  * @returns The old value at `address`
  */
-template <typename T, typename std::enable_if_t<std::is_integral<T>::value, T>* = nullptr>
-__forceinline__ __device__ T atomicAnd(T* address, T val)
-{
-  return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceAnd{});
+template <typename T,
+          typename std::enable_if_t<std::is_integral<T>::value, T>* = nullptr>
+__forceinline__ __device__ T atomicAnd(T* address, T val) {
+  return raft::genericAtomicOperation(
+    address, val, raft::device_atomics::detail::DeviceAnd{});
 }
 
 /**
@@ -593,10 +608,11 @@ __forceinline__ __device__ T atomicAnd(T* address, T val)
  *
  * @returns The old value at `address`
  */
-template <typename T, typename std::enable_if_t<std::is_integral<T>::value, T>* = nullptr>
-__forceinline__ __device__ T atomicOr(T* address, T val)
-{
-  return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceOr{});
+template <typename T,
+          typename std::enable_if_t<std::is_integral<T>::value, T>* = nullptr>
+__forceinline__ __device__ T atomicOr(T* address, T val) {
+  return raft::genericAtomicOperation(address, val,
+                                      raft::device_atomics::detail::DeviceOr{});
 }
 
 /**
@@ -614,8 +630,9 @@ __forceinline__ __device__ T atomicOr(T* address, T val)
  *
  * @returns The old value at `address`
  */
-template <typename T, typename std::enable_if_t<std::is_integral<T>::value, T>* = nullptr>
-__forceinline__ __device__ T atomicXor(T* address, T val)
-{
-  return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceXor{});
+template <typename T,
+          typename std::enable_if_t<std::is_integral<T>::value, T>* = nullptr>
+__forceinline__ __device__ T atomicXor(T* address, T val) {
+  return raft::genericAtomicOperation(
+    address, val, raft::device_atomics::detail::DeviceXor{});
 }
diff --git a/cpp/include/raft/distance/canberra.cuh b/cpp/include/raft/distance/canberra.cuh
index 61622d7c87..b87c295eb0 100644
--- a/cpp/include/raft/distance/canberra.cuh
+++ b/cpp/include/raft/distance/canberra.cuh
@@ -44,108 +44,75 @@ namespace distance {
  * @param fin_op    the final gemm epilogue lambda
  * @param stream    cuda stream to launch work
  */
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          int VecLen,
-          typename FinalLambda,
-          bool isRowMajor>
-static void canberraImpl(const DataT* x,
-                         const DataT* y,
-                         IdxT m,
-                         IdxT n,
-                         IdxT k,
-                         IdxT lda,
-                         IdxT ldb,
-                         IdxT ldd,
-                         OutT* dOutput,
-                         FinalLambda fin_op,
-                         cudaStream_t stream)
-{
+template <typename DataT, typename AccT, typename OutT, typename IdxT,
+          int VecLen, typename FinalLambda, bool isRowMajor>
+static void canberraImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
+                         IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput,
+                         FinalLambda fin_op, cudaStream_t stream) {
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
 
-  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+  typedef
+    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
 
   dim3 blk(KPolicy::Nthreads);
 
   // Accumulation operation lambda
   auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
     const auto diff = raft::L1Op<AccT, IdxT>()(x - y);
-    const auto add  = raft::myAbs(x) + raft::myAbs(y);
+    const auto add = raft::myAbs(x) + raft::myAbs(y);
     // deal with potential for 0 in denominator by
     // forcing 1/0 instead
     acc += ((add != 0) * diff / (add + (add == 0)));
   };
 
   // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                                     DataT * regxn,
-                                     DataT * regyn,
-                                     IdxT gridStrideX,
-                                     IdxT gridStrideY) { return; };
+  auto epilog_lambda = [] __device__(
+                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
+                         IdxT gridStrideY) { return; };
 
   if (isRowMajor) {
-    auto canberraRowMajor = pairwiseDistanceMatKernel<false,
-                                                      DataT,
-                                                      AccT,
-                                                      OutT,
-                                                      IdxT,
-                                                      KPolicy,
-                                                      decltype(core_lambda),
-                                                      decltype(epilog_lambda),
-                                                      FinalLambda,
-                                                      true>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, canberraRowMajor);
+    auto canberraRowMajor =
+      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
+                                decltype(core_lambda), decltype(epilog_lambda),
+                                FinalLambda, true>;
+    dim3 grid =
+      launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, canberraRowMajor);
 
     canberraRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
+      epilog_lambda, fin_op);
   } else {
-    auto canberraColMajor = pairwiseDistanceMatKernel<false,
-                                                      DataT,
-                                                      AccT,
-                                                      OutT,
-                                                      IdxT,
-                                                      KPolicy,
-                                                      decltype(core_lambda),
-                                                      decltype(epilog_lambda),
-                                                      FinalLambda,
-                                                      false>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, canberraColMajor);
+    auto canberraColMajor =
+      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
+                                decltype(core_lambda), decltype(epilog_lambda),
+                                FinalLambda, false>;
+    dim3 grid =
+      launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, canberraColMajor);
     canberraColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
+      epilog_lambda, fin_op);
   }
 
   CUDA_CHECK(cudaGetLastError());
 }
 
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          typename FinalLambda,
-          bool isRowMajor>
-void canberra(IdxT m,
-              IdxT n,
-              IdxT k,
-              IdxT lda,
-              IdxT ldb,
-              IdxT ldd,
-              const DataT* x,
-              const DataT* y,
-              OutT* dOutput,
-              FinalLambda fin_op,
-              cudaStream_t stream)
-{
+template <typename DataT, typename AccT, typename OutT, typename IdxT,
+          typename FinalLambda, bool isRowMajor>
+void canberra(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
+              const DataT *x, const DataT *y, OutT *dOutput, FinalLambda fin_op,
+              cudaStream_t stream) {
   size_t bytesA = sizeof(DataT) * lda;
   size_t bytesB = sizeof(DataT) * ldb;
   if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    canberraImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
+    canberraImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
+                 isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
+                             stream);
   } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    canberraImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
+    canberraImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda,
+                 isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
+                             stream);
   } else {
     canberraImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
       x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
@@ -170,25 +137,16 @@ void canberra(IdxT m,
  * @param[in] stream cuda stream to launch work
  * @param[in] isRowMajor whether the input and output matrices are row major
  */
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_ = int>
-void canberraImpl(int m,
-                  int n,
-                  int k,
-                  const InType* pA,
-                  const InType* pB,
-                  OutType* pD,
-                  FinalLambda fin_op,
-                  cudaStream_t stream,
-                  bool isRowMajor)
-{
+template <typename InType, typename AccType, typename OutType,
+          typename FinalLambda, typename Index_ = int>
+void canberraImpl(int m, int n, int k, const InType *pA, const InType *pB,
+                  OutType *pD, FinalLambda fin_op, cudaStream_t stream,
+                  bool isRowMajor) {
   typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type canberraOutType;
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type
+    canberraOutType;
   Index_ lda, ldb, ldd;
-  canberraOutType* pDcast = reinterpret_cast<canberraOutType*>(pD);
+  canberraOutType *pDcast = reinterpret_cast<canberraOutType *>(pD);
   if (isRowMajor) {
     lda = k, ldb = k, ldd = n;
     canberra<InType, AccType, canberraOutType, Index_, FinalLambda, true>(
diff --git a/cpp/include/raft/distance/chebyshev.cuh b/cpp/include/raft/distance/chebyshev.cuh
index b7ecdb945b..8d53408cf8 100644
--- a/cpp/include/raft/distance/chebyshev.cuh
+++ b/cpp/include/raft/distance/chebyshev.cuh
@@ -44,105 +44,72 @@ namespace distance {
  * @param[in]       fin_op the final gemm epilogue lambda
  * @param[in]       stream cuda stream to launch work
  */
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          int VecLen,
-          typename FinalLambda,
-          bool isRowMajor>
-static void chebyshevImpl(const DataT* x,
-                          const DataT* y,
-                          IdxT m,
-                          IdxT n,
-                          IdxT k,
-                          IdxT lda,
-                          IdxT ldb,
-                          IdxT ldd,
-                          OutT* dOutput,
-                          FinalLambda fin_op,
-                          cudaStream_t stream)
-{
+template <typename DataT, typename AccT, typename OutT, typename IdxT,
+          int VecLen, typename FinalLambda, bool isRowMajor>
+static void chebyshevImpl(const DataT *x, const DataT *y, IdxT m, IdxT n,
+                          IdxT k, IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput,
+                          FinalLambda fin_op, cudaStream_t stream) {
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
 
-  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+  typedef
+    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
 
   dim3 blk(KPolicy::Nthreads);
 
   // Accumulation operation lambda
   auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
     const auto diff = raft::L1Op<AccT, IdxT>()(x - y);
-    acc             = raft::myMax(acc, diff);
+    acc = raft::myMax(acc, diff);
   };
 
   // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                                     DataT * regxn,
-                                     DataT * regyn,
-                                     IdxT gridStrideX,
-                                     IdxT gridStrideY) { return; };
+  auto epilog_lambda = [] __device__(
+                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
+                         IdxT gridStrideY) { return; };
 
   if (isRowMajor) {
-    auto chebyshevRowMajor = pairwiseDistanceMatKernel<false,
-                                                       DataT,
-                                                       AccT,
-                                                       OutT,
-                                                       IdxT,
-                                                       KPolicy,
-                                                       decltype(core_lambda),
-                                                       decltype(epilog_lambda),
-                                                       FinalLambda,
-                                                       true>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, chebyshevRowMajor);
+    auto chebyshevRowMajor =
+      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
+                                decltype(core_lambda), decltype(epilog_lambda),
+                                FinalLambda, true>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
+                                               chebyshevRowMajor);
 
     chebyshevRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
+      epilog_lambda, fin_op);
   } else {
-    auto chebyshevColMajor = pairwiseDistanceMatKernel<false,
-                                                       DataT,
-                                                       AccT,
-                                                       OutT,
-                                                       IdxT,
-                                                       KPolicy,
-                                                       decltype(core_lambda),
-                                                       decltype(epilog_lambda),
-                                                       FinalLambda,
-                                                       false>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, chebyshevColMajor);
+    auto chebyshevColMajor =
+      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
+                                decltype(core_lambda), decltype(epilog_lambda),
+                                FinalLambda, false>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
+                                               chebyshevColMajor);
     chebyshevColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
+      epilog_lambda, fin_op);
   }
 
   CUDA_CHECK(cudaGetLastError());
 }
 
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          typename FinalLambda,
-          bool isRowMajor>
-void chebyshev(IdxT m,
-               IdxT n,
-               IdxT k,
-               IdxT lda,
-               IdxT ldb,
-               IdxT ldd,
-               const DataT* x,
-               const DataT* y,
-               OutT* dOutput,
-               FinalLambda fin_op,
-               cudaStream_t stream)
-{
+template <typename DataT, typename AccT, typename OutT, typename IdxT,
+          typename FinalLambda, bool isRowMajor>
+void chebyshev(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
+               const DataT *x, const DataT *y, OutT *dOutput,
+               FinalLambda fin_op, cudaStream_t stream) {
   size_t bytesA = sizeof(DataT) * lda;
   size_t bytesB = sizeof(DataT) * ldb;
   if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    chebyshevImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
+    chebyshevImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
+                  isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
+                              stream);
   } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    chebyshevImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
+    chebyshevImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda,
+                  isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
+                              stream);
   } else {
     chebyshevImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
       x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
@@ -167,25 +134,16 @@ void chebyshev(IdxT m,
  * @param[in] stream cuda stream to launch work
  * @param[in] isRowMajor whether the input and output matrices are row major
  */
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_ = int>
-void chebyshevImpl(int m,
-                   int n,
-                   int k,
-                   const InType* pA,
-                   const InType* pB,
-                   OutType* pD,
-                   FinalLambda fin_op,
-                   cudaStream_t stream,
-                   bool isRowMajor)
-{
+template <typename InType, typename AccType, typename OutType,
+          typename FinalLambda, typename Index_ = int>
+void chebyshevImpl(int m, int n, int k, const InType *pA, const InType *pB,
+                   OutType *pD, FinalLambda fin_op, cudaStream_t stream,
+                   bool isRowMajor) {
   typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type chebyshevOutType;
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type
+    chebyshevOutType;
   Index_ lda, ldb, ldd;
-  chebyshevOutType* pDcast = reinterpret_cast<chebyshevOutType*>(pD);
+  chebyshevOutType *pDcast = reinterpret_cast<chebyshevOutType *>(pD);
   if (isRowMajor) {
     lda = k, ldb = k, ldd = n;
     chebyshev<InType, AccType, chebyshevOutType, Index_, FinalLambda, true>(
diff --git a/cpp/include/raft/distance/cosine.cuh b/cpp/include/raft/distance/cosine.cuh
index 3e034e15d2..ed9bd28b7f 100644
--- a/cpp/include/raft/distance/cosine.cuh
+++ b/cpp/include/raft/distance/cosine.cuh
@@ -24,7 +24,7 @@ namespace distance {
 
 /**
  * @brief the cosine distance matrix calculation implementer
- *  It computes the following equation:
+ *  It computes the following equation: 
  *    C = 1 - op(A * B / sqrt(A^2) * sqrt(B^2)))
  * @tparam DataT input data-type (for A and B matrices)
  * @tparam AccT   accumulation data-type
@@ -49,43 +49,30 @@ namespace distance {
  * @param fin_op  the final gemm epilogue lambda
 *  @param stream  cuda stream to launch cuda operations.
  */
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          int VecLen,
-          typename FinalLambda,
-          bool isRowMajor>
-void cosineImpl(const DataT* x,
-                const DataT* y,
-                const DataT* xn,
-                const DataT* yn,
-                IdxT m,
-                IdxT n,
-                IdxT k,
-                IdxT lda,
-                IdxT ldb,
-                IdxT ldd,
-                OutT* dOutput,
-                FinalLambda fin_op,
-                cudaStream_t stream)
-{
+template <typename DataT, typename AccT, typename OutT, typename IdxT,
+          int VecLen, typename FinalLambda, bool isRowMajor>
+void cosineImpl(const DataT *x, const DataT *y, const DataT *xn,
+                const DataT *yn, IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb,
+                IdxT ldd, OutT *dOutput, FinalLambda fin_op,
+                cudaStream_t stream) {
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
 
-  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+  typedef
+    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
 
   dim3 blk(KPolicy::Nthreads);
 
   // Accumulation operation lambda
-  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; };
+  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
+    acc += x * y;
+  };
 
   // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                                     DataT * regxn,
-                                     DataT * regyn,
-                                     IdxT gridStrideX,
-                                     IdxT gridStrideY) {
+  auto epilog_lambda = [] __device__(
+                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
+                         IdxT gridStrideY) {
 #pragma unroll
     for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
 #pragma unroll
@@ -98,66 +85,43 @@ void cosineImpl(const DataT* x,
   constexpr size_t shmemSize =
     KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT));
   if (isRowMajor) {
-    auto cosineRowMajor = pairwiseDistanceMatKernel<true,
-                                                    DataT,
-                                                    AccT,
-                                                    OutT,
-                                                    IdxT,
-                                                    KPolicy,
-                                                    decltype(core_lambda),
-                                                    decltype(epilog_lambda),
-                                                    FinalLambda,
-                                                    true>;
-    dim3 grid           = launchConfigGenerator<KPolicy>(m, n, shmemSize, cosineRowMajor);
+    auto cosineRowMajor =
+      pairwiseDistanceMatKernel<true, DataT, AccT, OutT, IdxT, KPolicy,
+                                decltype(core_lambda), decltype(epilog_lambda),
+                                FinalLambda, true>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, shmemSize, cosineRowMajor);
     cosineRowMajor<<<grid, blk, shmemSize, stream>>>(
-      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
+      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda,
+      fin_op);
   } else {
-    auto cosineColMajor = pairwiseDistanceMatKernel<true,
-                                                    DataT,
-                                                    AccT,
-                                                    OutT,
-                                                    IdxT,
-                                                    KPolicy,
-                                                    decltype(core_lambda),
-                                                    decltype(epilog_lambda),
-                                                    FinalLambda,
-                                                    false>;
-    dim3 grid           = launchConfigGenerator<KPolicy>(m, n, shmemSize, cosineColMajor);
+    auto cosineColMajor =
+      pairwiseDistanceMatKernel<true, DataT, AccT, OutT, IdxT, KPolicy,
+                                decltype(core_lambda), decltype(epilog_lambda),
+                                FinalLambda, false>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, shmemSize, cosineColMajor);
     cosineColMajor<<<grid, blk, shmemSize, stream>>>(
-      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
+      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda,
+      fin_op);
   }
 
   CUDA_CHECK(cudaGetLastError());
 }
 
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          typename FinalLambda,
-          bool isRowMajor>
-void cosine(IdxT m,
-            IdxT n,
-            IdxT k,
-            IdxT lda,
-            IdxT ldb,
-            IdxT ldd,
-            const DataT* x,
-            const DataT* y,
-            const DataT* xn,
-            const DataT* yn,
-            OutT* dOutput,
-            FinalLambda fin_op,
-            cudaStream_t stream)
-{
+template <typename DataT, typename AccT, typename OutT, typename IdxT,
+          typename FinalLambda, bool isRowMajor>
+void cosine(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
+            const DataT *x, const DataT *y, const DataT *xn, const DataT *yn,
+            OutT *dOutput, FinalLambda fin_op, cudaStream_t stream) {
   size_t bytesA = sizeof(DataT) * lda;
   size_t bytesB = sizeof(DataT) * ldb;
   if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    cosineImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
+    cosineImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
+               isRowMajor>(x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput,
+                           fin_op, stream);
   } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    cosineImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
+    cosineImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda,
+               isRowMajor>(x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput,
+                           fin_op, stream);
   } else {
     cosineImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
       x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
@@ -166,7 +130,7 @@ void cosine(IdxT m,
 
 /**
  * @brief the expanded cosine distance matrix calculation
- *  It computes the following equation:
+ *  It computes the following equation: 
  *              C = 1 - op(A * B / sqrt(A^2) * sqrt(B^2)))
  * @tparam IType input data-type (for A and B matrices)
  * @tparam AccType accumulation data-type
@@ -187,23 +151,12 @@ void cosine(IdxT m,
  * @param stream cuda stream where to launch work
  * @param isRowMajor whether the input and output matrices are row major
  */
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_ = int>
-void cosineAlgo1(Index_ m,
-                 Index_ n,
-                 Index_ k,
-                 const InType* pA,
-                 const InType* pB,
-                 OutType* pD,
-                 AccType* workspace,
-                 size_t worksize,
-                 FinalLambda fin_op,
-                 cudaStream_t stream,
-                 bool isRowMajor)
-{
+template <typename InType, typename AccType, typename OutType,
+          typename FinalLambda, typename Index_ = int>
+void cosineAlgo1(Index_ m, Index_ n, Index_ k, const InType *pA,
+                 const InType *pB, OutType *pD, AccType *workspace,
+                 size_t worksize, FinalLambda fin_op, cudaStream_t stream,
+                 bool isRowMajor) {
   auto norm_op = [] __device__(AccType in) { return raft::mySqrt(in); };
 
   // Wrap fin_op to allow computing 1 - pA before calling fin_op
@@ -212,33 +165,39 @@ void cosineAlgo1(Index_ m,
   };
 
   typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type CosOutType;
-  CosOutType* pDcast = reinterpret_cast<CosOutType*>(pD);
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type
+    CosOutType;
+  CosOutType *pDcast = reinterpret_cast<CosOutType *>(pD);
 
-  ASSERT(
-    !(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || (worksize < m * sizeof(AccType))),
-    "workspace size error");
+  ASSERT(!(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) ||
+           (worksize < m * sizeof(AccType))),
+         "workspace size error");
   ASSERT(workspace != nullptr, "workspace is null");
 
   Index_ lda, ldb, ldd;
-  InType* col_vec = workspace;
-  InType* row_vec = workspace;
+  InType *col_vec = workspace;
+  InType *row_vec = workspace;
   if (pA != pB) {
     row_vec += m;
-    raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op);
-    raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream, norm_op);
+    raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor,
+                          stream, norm_op);
+    raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor,
+                          stream, norm_op);
   } else {
-    raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op);
+    raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor,
+                          stream, norm_op);
   }
 
   if (isRowMajor) {
     lda = k, ldb = k, ldd = n;
     cosine<InType, AccType, CosOutType, Index_, decltype(wrapped_fin_op), true>(
-      m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, pDcast, wrapped_fin_op, stream);
+      m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, pDcast, wrapped_fin_op,
+      stream);
   } else {
     lda = n, ldb = m, ldd = m;
-    cosine<InType, AccType, CosOutType, Index_, decltype(wrapped_fin_op), false>(
-      n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, pDcast, wrapped_fin_op, stream);
+    cosine<InType, AccType, CosOutType, Index_, decltype(wrapped_fin_op),
+           false>(n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, pDcast,
+                  wrapped_fin_op, stream);
   }
 }
 
diff --git a/cpp/include/raft/distance/distance.cuh b/cpp/include/raft/distance/distance.cuh
index 1627753b43..1b39a6ec18 100644
--- a/cpp/include/raft/distance/distance.cuh
+++ b/cpp/include/raft/distance/distance.cuh
@@ -32,314 +32,140 @@ namespace raft {
 namespace distance {
 
 namespace {
-template <raft::distance::DistanceType distanceType,
-          typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
+template <raft::distance::DistanceType distanceType, typename InType,
+          typename AccType, typename OutType, typename FinalLambda,
           typename Index_>
 struct DistanceImpl {
-  void run(const InType* x,
-           const InType* y,
-           OutType* dist,
-           Index_ m,
-           Index_ n,
-           Index_ k,
-           void* workspace,
-           size_t worksize,
-           FinalLambda fin_op,
-           cudaStream_t stream,
-           bool isRowMajor,
-           InType metric_arg = 2.0f)
-  {
-  }
+  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
+           Index_ k, void *workspace, size_t worksize, FinalLambda fin_op,
+           cudaStream_t stream, bool isRowMajor, InType metric_arg = 2.0f) {}
 };
 
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::L2Expanded,
-                    InType,
-                    AccType,
-                    OutType,
-                    FinalLambda,
-                    Index_> {
-  void run(const InType* x,
-           const InType* y,
-           OutType* dist,
-           Index_ m,
-           Index_ n,
-           Index_ k,
-           void* workspace,
-           size_t worksize,
-           FinalLambda fin_op,
-           cudaStream_t stream,
-           bool isRowMajor,
-           InType metric_arg)
-  {
-    raft::distance::euclideanAlgo1<InType, AccType, OutType, FinalLambda, Index_>(
-      m, n, k, x, y, dist, false, (AccType*)workspace, worksize, fin_op, stream, isRowMajor);
+template <typename InType, typename AccType, typename OutType,
+          typename FinalLambda, typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::L2Expanded, InType, AccType,
+                    OutType, FinalLambda, Index_> {
+  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
+           Index_ k, void *workspace, size_t worksize, FinalLambda fin_op,
+           cudaStream_t stream, bool isRowMajor, InType metric_arg) {
+    raft::distance::euclideanAlgo1<InType, AccType, OutType, FinalLambda,
+                                   Index_>(m, n, k, x, y, dist, false,
+                                           (AccType *)workspace, worksize,
+                                           fin_op, stream, isRowMajor);
   }
 };
 
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::L2SqrtExpanded,
-                    InType,
-                    AccType,
-                    OutType,
-                    FinalLambda,
-                    Index_> {
-  void run(const InType* x,
-           const InType* y,
-           OutType* dist,
-           Index_ m,
-           Index_ n,
-           Index_ k,
-           void* workspace,
-           size_t worksize,
-           FinalLambda fin_op,
-           cudaStream_t stream,
-           bool isRowMajor,
-           InType metric_arg)
-  {
-    raft::distance::euclideanAlgo1<InType, AccType, OutType, FinalLambda, Index_>(
-      m, n, k, x, y, dist, true, (AccType*)workspace, worksize, fin_op, stream, isRowMajor);
+template <typename InType, typename AccType, typename OutType,
+          typename FinalLambda, typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::L2SqrtExpanded, InType,
+                    AccType, OutType, FinalLambda, Index_> {
+  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
+           Index_ k, void *workspace, size_t worksize, FinalLambda fin_op,
+           cudaStream_t stream, bool isRowMajor, InType metric_arg) {
+    raft::distance::euclideanAlgo1<InType, AccType, OutType, FinalLambda,
+                                   Index_>(m, n, k, x, y, dist, true,
+                                           (AccType *)workspace, worksize,
+                                           fin_op, stream, isRowMajor);
   }
 };
 
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::CosineExpanded,
-                    InType,
-                    AccType,
-                    OutType,
-                    FinalLambda,
-                    Index_> {
-  void run(const InType* x,
-           const InType* y,
-           OutType* dist,
-           Index_ m,
-           Index_ n,
-           Index_ k,
-           void* workspace,
-           size_t worksize,
-           FinalLambda fin_op,
-           cudaStream_t stream,
-           bool isRowMajor,
-           InType metric_arg)
-  {
+template <typename InType, typename AccType, typename OutType,
+          typename FinalLambda, typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::CosineExpanded, InType,
+                    AccType, OutType, FinalLambda, Index_> {
+  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
+           Index_ k, void *workspace, size_t worksize, FinalLambda fin_op,
+           cudaStream_t stream, bool isRowMajor, InType metric_arg) {
     raft::distance::cosineAlgo1<InType, AccType, OutType, FinalLambda, Index_>(
-      m, n, k, x, y, dist, (AccType*)workspace, worksize, fin_op, stream, isRowMajor);
+      m, n, k, x, y, dist, (AccType *)workspace, worksize, fin_op, stream,
+      isRowMajor);
   }
 };
 
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::L2Unexpanded,
-                    InType,
-                    AccType,
-                    OutType,
-                    FinalLambda,
-                    Index_> {
-  void run(const InType* x,
-           const InType* y,
-           OutType* dist,
-           Index_ m,
-           Index_ n,
-           Index_ k,
-           void* workspace,
-           size_t worksize,
-           FinalLambda fin_op,
-           cudaStream_t stream,
-           bool isRowMajor,
-           InType metric_arg)
-  {
-    raft::distance::euclideanAlgo2<InType, AccType, OutType, FinalLambda, Index_>(
-      m, n, k, x, y, dist, false, fin_op, stream, isRowMajor);
+template <typename InType, typename AccType, typename OutType,
+          typename FinalLambda, typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::L2Unexpanded, InType, AccType,
+                    OutType, FinalLambda, Index_> {
+  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
+           Index_ k, void *workspace, size_t worksize, FinalLambda fin_op,
+           cudaStream_t stream, bool isRowMajor, InType metric_arg) {
+    raft::distance::euclideanAlgo2<InType, AccType, OutType, FinalLambda,
+                                   Index_>(m, n, k, x, y, dist, false, fin_op,
+                                           stream, isRowMajor);
   }
 };
 
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::L2SqrtUnexpanded,
-                    InType,
-                    AccType,
-                    OutType,
-                    FinalLambda,
-                    Index_> {
-  void run(const InType* x,
-           const InType* y,
-           OutType* dist,
-           Index_ m,
-           Index_ n,
-           Index_ k,
-           void* workspace,
-           size_t worksize,
-           FinalLambda fin_op,
-           cudaStream_t stream,
-           bool isRowMajor,
-           InType metric_arg)
-  {
-    raft::distance::euclideanAlgo2<InType, AccType, OutType, FinalLambda, Index_>(
-      m, n, k, x, y, dist, true, fin_op, stream, isRowMajor);
+template <typename InType, typename AccType, typename OutType,
+          typename FinalLambda, typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::L2SqrtUnexpanded, InType,
+                    AccType, OutType, FinalLambda, Index_> {
+  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
+           Index_ k, void *workspace, size_t worksize, FinalLambda fin_op,
+           cudaStream_t stream, bool isRowMajor, InType metric_arg) {
+    raft::distance::euclideanAlgo2<InType, AccType, OutType, FinalLambda,
+                                   Index_>(m, n, k, x, y, dist, true, fin_op,
+                                           stream, isRowMajor);
   }
 };
 
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::L1,
-                    InType,
-                    AccType,
-                    OutType,
-                    FinalLambda,
-                    Index_> {
-  void run(const InType* x,
-           const InType* y,
-           OutType* dist,
-           Index_ m,
-           Index_ n,
-           Index_ k,
-           void* workspace,
-           size_t worksize,
-           FinalLambda fin_op,
-           cudaStream_t stream,
-           bool isRowMajor,
-           InType metric_arg)
-  {
+template <typename InType, typename AccType, typename OutType,
+          typename FinalLambda, typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::L1, InType, AccType, OutType,
+                    FinalLambda, Index_> {
+  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
+           Index_ k, void *workspace, size_t worksize, FinalLambda fin_op,
+           cudaStream_t stream, bool isRowMajor, InType metric_arg) {
     raft::distance::l1Impl<InType, AccType, OutType, FinalLambda, Index_>(
       m, n, k, x, y, dist, fin_op, stream, isRowMajor);
   }
 };
 
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::Linf,
-                    InType,
-                    AccType,
-                    OutType,
-                    FinalLambda,
-                    Index_> {
-  void run(const InType* x,
-           const InType* y,
-           OutType* dist,
-           Index_ m,
-           Index_ n,
-           Index_ k,
-           void* workspace,
-           size_t worksize,
-           FinalLambda fin_op,
-           cudaStream_t stream,
-           bool isRowMajor,
-           InType metric_arg)
-  {
-    raft::distance::chebyshevImpl<InType, AccType, OutType, FinalLambda, Index_>(
-      m, n, k, x, y, dist, fin_op, stream, isRowMajor);
+template <typename InType, typename AccType, typename OutType,
+          typename FinalLambda, typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::Linf, InType, AccType,
+                    OutType, FinalLambda, Index_> {
+  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
+           Index_ k, void *workspace, size_t worksize, FinalLambda fin_op,
+           cudaStream_t stream, bool isRowMajor, InType metric_arg) {
+    raft::distance::chebyshevImpl<InType, AccType, OutType, FinalLambda,
+                                  Index_>(m, n, k, x, y, dist, fin_op, stream,
+                                          isRowMajor);
   }
 };
 
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::HellingerExpanded,
-                    InType,
-                    AccType,
-                    OutType,
-                    FinalLambda,
-                    Index_> {
-  void run(const InType* x,
-           const InType* y,
-           OutType* dist,
-           Index_ m,
-           Index_ n,
-           Index_ k,
-           void* workspace,
-           size_t worksize,
-           FinalLambda fin_op,
-           cudaStream_t stream,
-           bool isRowMajor,
-           InType metric_arg)
-  {
-    raft::distance::hellingerImpl<InType, AccType, OutType, FinalLambda, Index_>(
-      m, n, k, x, y, dist, fin_op, stream, isRowMajor);
+template <typename InType, typename AccType, typename OutType,
+          typename FinalLambda, typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::HellingerExpanded, InType,
+                    AccType, OutType, FinalLambda, Index_> {
+  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
+           Index_ k, void *workspace, size_t worksize, FinalLambda fin_op,
+           cudaStream_t stream, bool isRowMajor, InType metric_arg) {
+    raft::distance::hellingerImpl<InType, AccType, OutType, FinalLambda,
+                                  Index_>(m, n, k, x, y, dist, fin_op, stream,
+                                          isRowMajor);
   }
 };
 
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::LpUnexpanded,
-                    InType,
-                    AccType,
-                    OutType,
-                    FinalLambda,
-                    Index_> {
-  void run(const InType* x,
-           const InType* y,
-           OutType* dist,
-           Index_ m,
-           Index_ n,
-           Index_ k,
-           void* workspace,
-           size_t worksize,
-           FinalLambda fin_op,
-           cudaStream_t stream,
-           bool isRowMajor,
-           InType metric_arg)
-  {
-    raft::distance::minkowskiImpl<InType, AccType, OutType, FinalLambda, Index_>(
-      m, n, k, x, y, dist, fin_op, stream, isRowMajor, metric_arg);
+template <typename InType, typename AccType, typename OutType,
+          typename FinalLambda, typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::LpUnexpanded, InType, AccType,
+                    OutType, FinalLambda, Index_> {
+  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
+           Index_ k, void *workspace, size_t worksize, FinalLambda fin_op,
+           cudaStream_t stream, bool isRowMajor, InType metric_arg) {
+    raft::distance::minkowskiImpl<InType, AccType, OutType, FinalLambda,
+                                  Index_>(m, n, k, x, y, dist, fin_op, stream,
+                                          isRowMajor, metric_arg);
   }
 };
 
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::Canberra,
-                    InType,
-                    AccType,
-                    OutType,
-                    FinalLambda,
-                    Index_> {
-  void run(const InType* x,
-           const InType* y,
-           OutType* dist,
-           Index_ m,
-           Index_ n,
-           Index_ k,
-           void* workspace,
-           size_t worksize,
-           FinalLambda fin_op,
-           cudaStream_t stream,
-           bool isRowMajor,
-           InType metric_arg)
-  {
+template <typename InType, typename AccType, typename OutType,
+          typename FinalLambda, typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::Canberra, InType, AccType,
+                    OutType, FinalLambda, Index_> {
+  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
+           Index_ k, void *workspace, size_t worksize, FinalLambda fin_op,
+           cudaStream_t stream, bool isRowMajor, InType metric_arg) {
     raft::distance::canberraImpl<InType, AccType, OutType, FinalLambda, Index_>(
       m, n, k, x, y, dist, fin_op, stream, isRowMajor);
   }
@@ -363,15 +189,13 @@ struct DistanceImpl<raft::distance::DistanceType::Canberra,
  * @note If the specifed distanceType doesn't need the workspace at all, it
  * returns 0.
  */
-template <raft::distance::DistanceType distanceType,
-          typename InType,
-          typename AccType,
-          typename OutType,
-          typename Index_ = int>
-size_t getWorkspaceSize(const InType* x, const InType* y, Index_ m, Index_ n, Index_ k)
-{
-  size_t worksize             = 0;
-  constexpr bool is_allocated = distanceType <= raft::distance::DistanceType::CosineExpanded;
+template <raft::distance::DistanceType distanceType, typename InType,
+          typename AccType, typename OutType, typename Index_ = int>
+size_t getWorkspaceSize(const InType *x, const InType *y, Index_ m, Index_ n,
+                        Index_ k) {
+  size_t worksize = 0;
+  constexpr bool is_allocated =
+    distanceType <= raft::distance::DistanceType::CosineExpanded;
   if (is_allocated) {
     worksize += m * sizeof(AccType);
     if (x != y) worksize += n * sizeof(AccType);
@@ -404,27 +228,17 @@ size_t getWorkspaceSize(const InType* x, const InType* y, Index_ m, Index_ n, In
  * as follows:  <pre>OutType fin_op(AccType in, int g_idx);</pre>. If one needs
  * any other parameters, feel free to pass them via closure.
  */
-template <raft::distance::DistanceType distanceType,
-          typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
+template <raft::distance::DistanceType distanceType, typename InType,
+          typename AccType, typename OutType, typename FinalLambda,
           typename Index_ = int>
-void distance(const InType* x,
-              const InType* y,
-              OutType* dist,
-              Index_ m,
-              Index_ n,
-              Index_ k,
-              void* workspace,
-              size_t worksize,
-              FinalLambda fin_op,
-              cudaStream_t stream,
-              bool isRowMajor   = true,
-              InType metric_arg = 2.0f)
-{
-  DistanceImpl<distanceType, InType, AccType, OutType, FinalLambda, Index_> distImpl;
-  distImpl.run(x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, metric_arg);
+void distance(const InType *x, const InType *y, OutType *dist, Index_ m,
+              Index_ n, Index_ k, void *workspace, size_t worksize,
+              FinalLambda fin_op, cudaStream_t stream, bool isRowMajor = true,
+              InType metric_arg = 2.0f) {
+  DistanceImpl<distanceType, InType, AccType, OutType, FinalLambda, Index_>
+    distImpl;
+  distImpl.run(x, y, dist, m, n, k, workspace, worksize, fin_op, stream,
+               isRowMajor, metric_arg);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -449,26 +263,18 @@ void distance(const InType* x,
  * @note if workspace is passed as nullptr, this will return in
  *  worksize, the number of bytes of workspace required
  */
-template <raft::distance::DistanceType distanceType,
-          typename InType,
-          typename AccType,
-          typename OutType,
-          typename Index_ = int>
-void distance(const InType* x,
-              const InType* y,
-              OutType* dist,
-              Index_ m,
-              Index_ n,
-              Index_ k,
-              void* workspace,
-              size_t worksize,
-              cudaStream_t stream,
-              bool isRowMajor   = true,
-              InType metric_arg = 2.0f)
-{
-  auto default_fin_op = [] __device__(AccType d_val, Index_ g_d_idx) { return d_val; };
-  distance<distanceType, InType, AccType, OutType, decltype(default_fin_op), Index_>(
-    x, y, dist, m, n, k, workspace, worksize, default_fin_op, stream, isRowMajor, metric_arg);
+template <raft::distance::DistanceType distanceType, typename InType,
+          typename AccType, typename OutType, typename Index_ = int>
+void distance(const InType *x, const InType *y, OutType *dist, Index_ m,
+              Index_ n, Index_ k, void *workspace, size_t worksize,
+              cudaStream_t stream, bool isRowMajor = true,
+              InType metric_arg = 2.0f) {
+  auto default_fin_op = [] __device__(AccType d_val, Index_ g_d_idx) {
+    return d_val;
+  };
+  distance<distanceType, InType, AccType, OutType, decltype(default_fin_op),
+           Index_>(x, y, dist, m, n, k, workspace, worksize, default_fin_op,
+                   stream, isRowMajor, metric_arg);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -492,47 +298,39 @@ void distance(const InType* x,
  * @param isRowMajor whether the matrices are row-major or col-major
  */
 template <typename Type, typename Index_, raft::distance::DistanceType DistType>
-void pairwise_distance_impl(const Type* x,
-                            const Type* y,
-                            Type* dist,
-                            Index_ m,
-                            Index_ n,
-                            Index_ k,
-                            raft::mr::device::buffer<char>& workspace,
-                            cudaStream_t stream,
-                            bool isRowMajor,
-                            Type metric_arg = 2.0f)
-{
-  auto worksize = getWorkspaceSize<DistType, Type, Type, Type, Index_>(x, y, m, n, k);
+void pairwise_distance_impl(const Type *x, const Type *y, Type *dist, Index_ m,
+                            Index_ n, Index_ k,
+                            raft::mr::device::buffer<char> &workspace,
+                            cudaStream_t stream, bool isRowMajor,
+                            Type metric_arg = 2.0f) {
+  auto worksize =
+    getWorkspaceSize<DistType, Type, Type, Type, Index_>(x, y, m, n, k);
   workspace.resize(worksize, stream);
-  distance<DistType, Type, Type, Type, Index_>(
-    x, y, dist, m, n, k, workspace.data(), worksize, stream, isRowMajor, metric_arg);
+  distance<DistType, Type, Type, Type, Index_>(x, y, dist, m, n, k,
+                                               workspace.data(), worksize,
+                                               stream, isRowMajor, metric_arg);
 }
 
 template <typename Type, typename Index_ = int>
-void pairwise_distance(const Type* x,
-                       const Type* y,
-                       Type* dist,
-                       Index_ m,
-                       Index_ n,
-                       Index_ k,
-                       raft::mr::device::buffer<char>& workspace,
-                       raft::distance::DistanceType metric,
-                       cudaStream_t stream,
-                       bool isRowMajor = true,
-                       Type metric_arg = 2.0f)
-{
+void pairwise_distance(const Type *x, const Type *y, Type *dist, Index_ m,
+                       Index_ n, Index_ k,
+                       raft::mr::device::buffer<char> &workspace,
+                       raft::distance::DistanceType metric, cudaStream_t stream,
+                       bool isRowMajor = true, Type metric_arg = 2.0f) {
   switch (metric) {
     case raft::distance::DistanceType::L2Expanded:
-      pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::L2Expanded>(
+      pairwise_distance_impl<Type, Index_,
+                             raft::distance::DistanceType::L2Expanded>(
         x, y, dist, m, n, k, workspace, stream, isRowMajor);
       break;
     case raft::distance::DistanceType::L2SqrtExpanded:
-      pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::L2SqrtExpanded>(
+      pairwise_distance_impl<Type, Index_,
+                             raft::distance::DistanceType::L2SqrtExpanded>(
         x, y, dist, m, n, k, workspace, stream, isRowMajor);
       break;
     case raft::distance::DistanceType::CosineExpanded:
-      pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::CosineExpanded>(
+      pairwise_distance_impl<Type, Index_,
+                             raft::distance::DistanceType::CosineExpanded>(
         x, y, dist, m, n, k, workspace, stream, isRowMajor);
       break;
     case raft::distance::DistanceType::L1:
@@ -540,11 +338,13 @@ void pairwise_distance(const Type* x,
         x, y, dist, m, n, k, workspace, stream, isRowMajor);
       break;
     case raft::distance::DistanceType::L2Unexpanded:
-      pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::L2Unexpanded>(
+      pairwise_distance_impl<Type, Index_,
+                             raft::distance::DistanceType::L2Unexpanded>(
         x, y, dist, m, n, k, workspace, stream, isRowMajor);
       break;
     case raft::distance::DistanceType::L2SqrtUnexpanded:
-      pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::L2SqrtUnexpanded>(
+      pairwise_distance_impl<Type, Index_,
+                             raft::distance::DistanceType::L2SqrtUnexpanded>(
         x, y, dist, m, n, k, workspace, stream, isRowMajor);
       break;
     case raft::distance::DistanceType::Linf:
@@ -552,18 +352,22 @@ void pairwise_distance(const Type* x,
         x, y, dist, m, n, k, workspace, stream, isRowMajor);
       break;
     case raft::distance::DistanceType::HellingerExpanded:
-      pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::HellingerExpanded>(
+      pairwise_distance_impl<Type, Index_,
+                             raft::distance::DistanceType::HellingerExpanded>(
         x, y, dist, m, n, k, workspace, stream, isRowMajor);
       break;
     case raft::distance::DistanceType::LpUnexpanded:
-      pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::LpUnexpanded>(
+      pairwise_distance_impl<Type, Index_,
+                             raft::distance::DistanceType::LpUnexpanded>(
         x, y, dist, m, n, k, workspace, stream, isRowMajor, metric_arg);
       break;
     case raft::distance::DistanceType::Canberra:
-      pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::Canberra>(
+      pairwise_distance_impl<Type, Index_,
+                             raft::distance::DistanceType::Canberra>(
         x, y, dist, m, n, k, workspace, stream, isRowMajor);
       break;
-    default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric);
+    default:
+      THROW("Unknown or unsupported distance metric '%d'!", (int)metric);
   };
 }
 /** @} */
diff --git a/cpp/include/raft/distance/euclidean.cuh b/cpp/include/raft/distance/euclidean.cuh
index 46d0a1a4a9..484da0e5bf 100644
--- a/cpp/include/raft/distance/euclidean.cuh
+++ b/cpp/include/raft/distance/euclidean.cuh
@@ -48,44 +48,30 @@ namespace distance {
  * @param fin_op  the final gemm epilogue lambda
 *  @param stream  cuda stream to launch cuda operations.
  */
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          int VecLen,
-          typename FinalLambda,
-          bool isRowMajor>
-void euclideanExpImpl(const DataT* x,
-                      const DataT* y,
-                      const DataT* xn,
-                      const DataT* yn,
-                      IdxT m,
-                      IdxT n,
-                      IdxT k,
-                      IdxT lda,
-                      IdxT ldb,
-                      IdxT ldd,
-                      bool sqrt,
-                      OutT* dOutput,
-                      FinalLambda fin_op,
-                      cudaStream_t stream)
-{
+template <typename DataT, typename AccT, typename OutT, typename IdxT,
+          int VecLen, typename FinalLambda, bool isRowMajor>
+void euclideanExpImpl(const DataT *x, const DataT *y, const DataT *xn,
+                      const DataT *yn, IdxT m, IdxT n, IdxT k, IdxT lda,
+                      IdxT ldb, IdxT ldd, bool sqrt, OutT *dOutput,
+                      FinalLambda fin_op, cudaStream_t stream) {
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
 
-  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+  typedef
+    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
 
   dim3 blk(KPolicy::Nthreads);
 
   // Accumulation operation lambda
-  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; };
+  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
+    acc += x * y;
+  };
 
   // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [sqrt] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                                         DataT * regxn,
-                                         DataT * regyn,
-                                         IdxT gridStrideX,
-                                         IdxT gridStrideY) {
+  auto epilog_lambda = [sqrt] __device__(
+                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
+                         IdxT gridStrideY) {
 #pragma unroll
     for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
 #pragma unroll
@@ -107,68 +93,47 @@ void euclideanExpImpl(const DataT* x,
   constexpr size_t shmemSize =
     KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT));
   if (isRowMajor) {
-    auto euclideanExpRowMajor = pairwiseDistanceMatKernel<true,
-                                                          DataT,
-                                                          AccT,
-                                                          OutT,
-                                                          IdxT,
-                                                          KPolicy,
-                                                          decltype(core_lambda),
-                                                          decltype(epilog_lambda),
-                                                          FinalLambda,
-                                                          true>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, shmemSize, euclideanExpRowMajor);
+    auto euclideanExpRowMajor =
+      pairwiseDistanceMatKernel<true, DataT, AccT, OutT, IdxT, KPolicy,
+                                decltype(core_lambda), decltype(epilog_lambda),
+                                FinalLambda, true>;
+    dim3 grid =
+      launchConfigGenerator<KPolicy>(m, n, shmemSize, euclideanExpRowMajor);
 
     euclideanExpRowMajor<<<grid, blk, shmemSize, stream>>>(
-      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
+      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda,
+      fin_op);
   } else {
-    auto euclideanExpColMajor = pairwiseDistanceMatKernel<true,
-                                                          DataT,
-                                                          AccT,
-                                                          OutT,
-                                                          IdxT,
-                                                          KPolicy,
-                                                          decltype(core_lambda),
-                                                          decltype(epilog_lambda),
-                                                          FinalLambda,
-                                                          false>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, shmemSize, euclideanExpColMajor);
+    auto euclideanExpColMajor =
+      pairwiseDistanceMatKernel<true, DataT, AccT, OutT, IdxT, KPolicy,
+                                decltype(core_lambda), decltype(epilog_lambda),
+                                FinalLambda, false>;
+    dim3 grid =
+      launchConfigGenerator<KPolicy>(m, n, shmemSize, euclideanExpColMajor);
     euclideanExpColMajor<<<grid, blk, shmemSize, stream>>>(
-      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
+      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda,
+      fin_op);
   }
 
   CUDA_CHECK(cudaGetLastError());
 }
 
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          typename FinalLambda,
-          bool isRowMajor>
-void euclideanExp(IdxT m,
-                  IdxT n,
-                  IdxT k,
-                  IdxT lda,
-                  IdxT ldb,
-                  IdxT ldd,
-                  const DataT* x,
-                  const DataT* y,
-                  const DataT* xn,
-                  const DataT* yn,
-                  bool sqrt,
-                  OutT* dOutput,
-                  FinalLambda fin_op,
-                  cudaStream_t stream)
-{
+template <typename DataT, typename AccT, typename OutT, typename IdxT,
+          typename FinalLambda, bool isRowMajor>
+void euclideanExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
+                  const DataT *x, const DataT *y, const DataT *xn,
+                  const DataT *yn, bool sqrt, OutT *dOutput, FinalLambda fin_op,
+                  cudaStream_t stream) {
   size_t bytesA = sizeof(DataT) * lda;
   size_t bytesB = sizeof(DataT) * ldb;
   if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    euclideanExpImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream);
+    euclideanExpImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
+                     isRowMajor>(x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt,
+                                 dOutput, fin_op, stream);
   } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    euclideanExpImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream);
+    euclideanExpImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda,
+                     isRowMajor>(x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt,
+                                 dOutput, fin_op, stream);
   } else {
     euclideanExpImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
       x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream);
@@ -196,59 +161,53 @@ void euclideanExp(IdxT m,
  * @param stream cuda stream where to launch work
  * @param isRowMajor whether the input and output matrices are row major
  */
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_ = int>
-void euclideanAlgo1(Index_ m,
-                    Index_ n,
-                    Index_ k,
-                    const InType* pA,
-                    const InType* pB,
-                    OutType* pD,
-                    bool enable_sqrt,
-                    AccType* workspace,
-                    size_t& worksize,
-                    FinalLambda fin_op,
-                    cudaStream_t stream,
-                    bool isRowMajor)
-{
+template <typename InType, typename AccType, typename OutType,
+          typename FinalLambda, typename Index_ = int>
+void euclideanAlgo1(Index_ m, Index_ n, Index_ k, const InType *pA,
+                    const InType *pB, OutType *pD, bool enable_sqrt,
+                    AccType *workspace, size_t &worksize, FinalLambda fin_op,
+                    cudaStream_t stream, bool isRowMajor) {
   auto norm_op = [] __device__(InType in) { return in; };
 
   typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type ExpOutType;
-  ExpOutType* pDcast = reinterpret_cast<ExpOutType*>(pD);
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type
+    ExpOutType;
+  ExpOutType *pDcast = reinterpret_cast<ExpOutType *>(pD);
 
-  ASSERT(
-    !(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || (worksize < m * sizeof(AccType))),
-    "workspace size error");
+  ASSERT(!(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) ||
+           (worksize < m * sizeof(AccType))),
+         "workspace size error");
   ASSERT(workspace != nullptr, "workspace is null");
 
   Index_ lda, ldb, ldd;
-  InType* col_vec = workspace;
-  InType* row_vec = workspace;
+  InType *col_vec = workspace;
+  InType *row_vec = workspace;
   if (pA != pB) {
     row_vec += m;
-    raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op);
-    raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream, norm_op);
+    raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor,
+                          stream, norm_op);
+    raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor,
+                          stream, norm_op);
   } else {
-    raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op);
+    raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor,
+                          stream, norm_op);
   }
 
   if (isRowMajor) {
     lda = k, ldb = k, ldd = n;
     euclideanExp<InType, AccType, ExpOutType, Index_, FinalLambda, true>(
-      m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, enable_sqrt, pDcast, fin_op, stream);
+      m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, enable_sqrt, pDcast,
+      fin_op, stream);
   } else {
     lda = n, ldb = m, ldd = m;
     euclideanExp<InType, AccType, ExpOutType, Index_, FinalLambda, false>(
-      n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, enable_sqrt, pDcast, fin_op, stream);
+      n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, enable_sqrt, pDcast,
+      fin_op, stream);
   }
 }
 
 /**
- * @brief the unexpanded euclidean distance matrix calculation
+ * @brief the unexpanded euclidean distance matrix calculation 
  *  It computes the following equation: cij = op((ai-bj)^2)
  * @tparam DataT          input data-type (for A and B matrices)
  * @tparam AccT           accumulation data-type
@@ -268,30 +227,16 @@ void euclideanAlgo1(Index_ m,
  * @param[output]   pD output matrix
  * @param fin_op    the final gemm epilogue lambda
  */
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          int VecLen,
-          typename FinalLambda,
-          bool isRowMajor>
-void euclideanUnExpImpl(const DataT* x,
-                        const DataT* y,
-                        IdxT m,
-                        IdxT n,
-                        IdxT k,
-                        IdxT lda,
-                        IdxT ldb,
-                        IdxT ldd,
-                        bool sqrt,
-                        OutT* dOutput,
-                        FinalLambda fin_op,
-                        cudaStream_t stream)
-{
+template <typename DataT, typename AccT, typename OutT, typename IdxT,
+          int VecLen, typename FinalLambda, bool isRowMajor>
+void euclideanUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
+                        IdxT lda, IdxT ldb, IdxT ldd, bool sqrt, OutT *dOutput,
+                        FinalLambda fin_op, cudaStream_t stream) {
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
 
-  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+  typedef
+    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
 
   dim3 blk(KPolicy::Nthreads);
 
@@ -302,11 +247,10 @@ void euclideanUnExpImpl(const DataT* x,
   };
 
   // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [sqrt] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                                         DataT * regxn,
-                                         DataT * regyn,
-                                         IdxT gridStrideX,
-                                         IdxT gridStrideY) {
+  auto epilog_lambda = [sqrt] __device__(
+                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
+                         IdxT gridStrideY) {
     if (sqrt) {
 #pragma unroll
       for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
@@ -319,68 +263,48 @@ void euclideanUnExpImpl(const DataT* x,
   };
 
   if (isRowMajor) {
-    auto euclideanUnExpRowMajor = pairwiseDistanceMatKernel<false,
-                                                            DataT,
-                                                            AccT,
-                                                            OutT,
-                                                            IdxT,
-                                                            KPolicy,
-                                                            decltype(core_lambda),
-                                                            decltype(epilog_lambda),
-                                                            FinalLambda,
-                                                            true>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, euclideanUnExpRowMajor);
+    auto euclideanUnExpRowMajor =
+      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
+                                decltype(core_lambda), decltype(epilog_lambda),
+                                FinalLambda, true>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
+                                               euclideanUnExpRowMajor);
 
     euclideanUnExpRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
+      epilog_lambda, fin_op);
 
   } else {
-    auto euclideanUnExpColMajor = pairwiseDistanceMatKernel<false,
-                                                            DataT,
-                                                            AccT,
-                                                            OutT,
-                                                            IdxT,
-                                                            KPolicy,
-                                                            decltype(core_lambda),
-                                                            decltype(epilog_lambda),
-                                                            FinalLambda,
-                                                            false>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, euclideanUnExpColMajor);
+    auto euclideanUnExpColMajor =
+      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
+                                decltype(core_lambda), decltype(epilog_lambda),
+                                FinalLambda, false>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
+                                               euclideanUnExpColMajor);
 
     euclideanUnExpColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
+      epilog_lambda, fin_op);
   }
 
   CUDA_CHECK(cudaGetLastError());
 }
 
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          typename FinalLambda,
-          bool isRowMajor>
-void euclideanUnExp(IdxT m,
-                    IdxT n,
-                    IdxT k,
-                    IdxT lda,
-                    IdxT ldb,
-                    IdxT ldd,
-                    const DataT* x,
-                    const DataT* y,
-                    bool sqrt,
-                    OutT* dOutput,
-                    FinalLambda fin_op,
-                    cudaStream_t stream)
-{
+template <typename DataT, typename AccT, typename OutT, typename IdxT,
+          typename FinalLambda, bool isRowMajor>
+void euclideanUnExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
+                    const DataT *x, const DataT *y, bool sqrt, OutT *dOutput,
+                    FinalLambda fin_op, cudaStream_t stream) {
   size_t bytesA = sizeof(DataT) * lda;
   size_t bytesB = sizeof(DataT) * ldb;
   if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    euclideanUnExpImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream);
+    euclideanUnExpImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
+                       isRowMajor>(x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput,
+                                   fin_op, stream);
   } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    euclideanUnExpImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream);
+    euclideanUnExpImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda,
+                       isRowMajor>(x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput,
+                                   fin_op, stream);
   } else {
     euclideanUnExpImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
       x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream);
@@ -406,25 +330,15 @@ void euclideanUnExp(IdxT m,
  * @param stream cuda stream where to launch work
  * @param isRowMajor whether the input and output matrices are row major
  */
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_ = int>
-void euclideanAlgo2(Index_ m,
-                    Index_ n,
-                    Index_ k,
-                    const InType* pA,
-                    const InType* pB,
-                    OutType* pD,
-                    bool enable_sqrt,
-                    FinalLambda fin_op,
-                    cudaStream_t stream,
-                    bool isRowMajor)
-{
+template <typename InType, typename AccType, typename OutType,
+          typename FinalLambda, typename Index_ = int>
+void euclideanAlgo2(Index_ m, Index_ n, Index_ k, const InType *pA,
+                    const InType *pB, OutType *pD, bool enable_sqrt,
+                    FinalLambda fin_op, cudaStream_t stream, bool isRowMajor) {
   typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type UnExpOutType;
-  UnExpOutType* pDcast = reinterpret_cast<UnExpOutType*>(pD);
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type
+    UnExpOutType;
+  UnExpOutType *pDcast = reinterpret_cast<UnExpOutType *>(pD);
   Index_ lda, ldb, ldd;
 
   if (isRowMajor) {
diff --git a/cpp/include/raft/distance/fused_l2_nn.cuh b/cpp/include/raft/distance/fused_l2_nn.cuh
index f80b4eb8f7..b96a536e38 100644
--- a/cpp/include/raft/distance/fused_l2_nn.cuh
+++ b/cpp/include/raft/distance/fused_l2_nn.cuh
@@ -35,24 +35,24 @@ template <typename LabelT, typename DataT>
 struct KVPMinReduce {
   typedef cub::KeyValuePair<LabelT, DataT> KVP;
 
-  DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { return b.value < a.value ? b : a; }
+  DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) {
+    return b.value < a.value ? b : a;
+  }
 
 };  // KVPMinReduce
 
 template <typename LabelT, typename DataT>
 struct MinAndDistanceReduceOp {
   typedef typename cub::KeyValuePair<LabelT, DataT> KVP;
-  DI void operator()(LabelT rid, KVP* out, const KVP& other)
-  {
+  DI void operator()(LabelT rid, KVP* out, const KVP& other) {
     if (other.value < out->value) {
-      out->key   = other.key;
+      out->key = other.key;
       out->value = other.value;
     }
   }
 
-  DI void init(KVP* out, DataT maxVal)
-  {
-    out->key   = -1;
+  DI void init(KVP* out, DataT maxVal) {
+    out->key = -1;
     out->value = maxVal;
   }
 };
@@ -60,28 +60,30 @@ struct MinAndDistanceReduceOp {
 template <typename LabelT, typename DataT>
 struct MinReduceOp {
   typedef typename cub::KeyValuePair<LabelT, DataT> KVP;
-  DI void operator()(LabelT rid, DataT* out, const KVP& other)
-  {
-    if (other.value < *out) { *out = other.value; }
+  DI void operator()(LabelT rid, DataT* out, const KVP& other) {
+    if (other.value < *out) {
+      *out = other.value;
+    }
   }
 
   DI void init(DataT* out, DataT maxVal) { *out = maxVal; }
 };
 
 template <typename DataT, typename OutT, typename IdxT, typename ReduceOpT>
-__global__ void initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp)
-{
+__global__ void initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) {
   auto tid = IdxT(blockIdx.x) * blockDim.x + threadIdx.x;
-  if (tid < m) { redOp.init(min + tid, maxVal); }
+  if (tid < m) {
+    redOp.init(min + tid, maxVal);
+  }
 }
 
 // TODO: specialize this function for MinAndDistanceReduceOp<int, float>
 // with atomicCAS of 64 bit which will eliminate mutex and shfls
-template <typename P, typename OutT, typename IdxT, typename KVPair, typename ReduceOpT>
-DI void updateReducedVal(
-  int* mutex, OutT* min, KVPair* val, ReduceOpT red_op, IdxT m, IdxT gridStrideY)
-{
-  const auto lid      = threadIdx.x % raft::WarpSize;
+template <typename P, typename OutT, typename IdxT, typename KVPair,
+          typename ReduceOpT>
+DI void updateReducedVal(int* mutex, OutT* min, KVPair* val, ReduceOpT red_op,
+                         IdxT m, IdxT gridStrideY) {
+  const auto lid = threadIdx.x % raft::WarpSize;
   const auto accrowid = threadIdx.x / P::AccThCols;
 
   // for now have first lane from each warp update a unique output row. This
@@ -106,38 +108,21 @@ DI void updateReducedVal(
     if (j < (raft::WarpSize / P::AccThCols) - 1) {
 #pragma unroll
       for (int i = 0; i < P::AccRowsPerTh; ++i) {
-        auto tmpkey   = raft::shfl(val[i].key, (j + 1) * P::AccThCols);
+        auto tmpkey = raft::shfl(val[i].key, (j + 1) * P::AccThCols);
         auto tmpvalue = raft::shfl(val[i].value, (j + 1) * P::AccThCols);
-        val[i]        = {tmpkey, tmpvalue};
+        val[i] = {tmpkey, tmpvalue};
       }
     }
   }
 }
 
-template <typename DataT,
-          typename OutT,
-          typename IdxT,
-          bool Sqrt,
-          typename P,
-          typename ReduceOpT,
-          typename KVPReduceOpT,
-          typename CoreLambda,
+template <typename DataT, typename OutT, typename IdxT, bool Sqrt, typename P,
+          typename ReduceOpT, typename KVPReduceOpT, typename CoreLambda,
           typename FinalLambda>
-__global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min,
-                                                                  const DataT* x,
-                                                                  const DataT* y,
-                                                                  const DataT* xn,
-                                                                  const DataT* yn,
-                                                                  IdxT m,
-                                                                  IdxT n,
-                                                                  IdxT k,
-                                                                  DataT maxVal,
-                                                                  int* mutex,
-                                                                  ReduceOpT redOp,
-                                                                  KVPReduceOpT pairRedOp,
-                                                                  CoreLambda core_op,
-                                                                  FinalLambda fin_op)
-{
+__global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(
+  OutT* min, const DataT* x, const DataT* y, const DataT* xn, const DataT* yn,
+  IdxT m, IdxT n, IdxT k, DataT maxVal, int* mutex, ReduceOpT redOp,
+  KVPReduceOpT pairRedOp, CoreLambda core_op, FinalLambda fin_op) {
   extern __shared__ char smem[];
 
   typedef cub::KeyValuePair<IdxT, DataT> KVPair;
@@ -150,9 +135,7 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min,
   // epilogue operation lambda for final value calculation
   auto epilog_lambda = [n, pairRedOp, &val, maxVal] __device__(
                          DataT acc[P::AccRowsPerTh][P::AccColsPerTh],
-                         DataT * regxn,
-                         DataT * regyn,
-                         IdxT gridStrideX,
+                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
                          IdxT gridStrideY) {
     KVPReduceOpT pairRed_op(pairRedOp);
 
@@ -181,105 +164,72 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min,
 #pragma unroll
       for (int j = 0; j < P::AccColsPerTh; ++j) {
         auto tmpkey = acccolid + j * P::AccThCols + gridStrideX;
-        KVPair tmp  = {tmpkey, acc[i][j]};
+        KVPair tmp = {tmpkey, acc[i][j]};
         if (tmpkey < n) {
-          val[i] = pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]);
+          val[i] =
+            pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]);
         }
       }
     }
   };
 
-  auto rowEpilog_lambda =
-    [m, mutex, min, pairRedOp, redOp, &val, maxVal] __device__(IdxT gridStrideY) {
-      KVPReduceOpT pairRed_op(pairRedOp);
-      ReduceOpT red_op(redOp);
+  auto rowEpilog_lambda = [m, mutex, min, pairRedOp, redOp, &val,
+                           maxVal] __device__(IdxT gridStrideY) {
+    KVPReduceOpT pairRed_op(pairRedOp);
+    ReduceOpT red_op(redOp);
 
-      const auto accrowid = threadIdx.x / P::AccThCols;
-      const auto lid      = raft::laneId();
+    const auto accrowid = threadIdx.x / P::AccThCols;
+    const auto lid = raft::laneId();
 
     // reduce
 #pragma unroll
-      for (int i = 0; i < P::AccRowsPerTh; ++i) {
+    for (int i = 0; i < P::AccRowsPerTh; ++i) {
 #pragma unroll
-        for (int j = P::AccThCols / 2; j > 0; j >>= 1) {
-          auto tmpkey   = raft::shfl(val[i].key, lid + j);
-          auto tmpvalue = raft::shfl(val[i].value, lid + j);
-          KVPair tmp    = {tmpkey, tmpvalue};
-          val[i]        = pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]);
-        }
+      for (int j = P::AccThCols / 2; j > 0; j >>= 1) {
+        auto tmpkey = raft::shfl(val[i].key, lid + j);
+        auto tmpvalue = raft::shfl(val[i].value, lid + j);
+        KVPair tmp = {tmpkey, tmpvalue};
+        val[i] =
+          pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]);
       }
+    }
 
-      updateReducedVal<P, OutT, IdxT, KVPair, ReduceOpT>(mutex, min, val, red_op, m, gridStrideY);
+    updateReducedVal<P, OutT, IdxT, KVPair, ReduceOpT>(mutex, min, val, red_op,
+                                                       m, gridStrideY);
 
     // reset the val array.
 #pragma unroll
-      for (int i = 0; i < P::AccRowsPerTh; ++i) {
-        val[i] = {-1, maxVal};
-      }
-    };
+    for (int i = 0; i < P::AccRowsPerTh; ++i) {
+      val[i] = {-1, maxVal};
+    }
+  };
 
   IdxT lda = k, ldb = k, ldd = n;
-  PairwiseDistances<true,
-                    DataT,
-                    DataT,
-                    DataT,
-                    IdxT,
-                    P,
-                    CoreLambda,
-                    decltype(epilog_lambda),
-                    FinalLambda,
-                    decltype(rowEpilog_lambda),
-                    true,
-                    false>
-    obj(x,
-        y,
-        m,
-        n,
-        k,
-        lda,
-        ldb,
-        ldd,
-        xn,
-        yn,
-        nullptr,
-        smem,
-        core_op,
-        epilog_lambda,
-        fin_op,
-        rowEpilog_lambda);
+  PairwiseDistances<true, DataT, DataT, DataT, IdxT, P, CoreLambda,
+                    decltype(epilog_lambda), FinalLambda,
+                    decltype(rowEpilog_lambda), true, false>
+    obj(x, y, m, n, k, lda, ldb, ldd, xn, yn, nullptr, smem, core_op,
+        epilog_lambda, fin_op, rowEpilog_lambda);
   obj.run();
 }
 
-template <typename DataT,
-          typename OutT,
-          typename IdxT,
-          int VecLen,
-          typename ReduceOpT,
-          typename KVPReduceOpT>
-void fusedL2NNImpl(OutT* min,
-                   const DataT* x,
-                   const DataT* y,
-                   const DataT* xn,
-                   const DataT* yn,
-                   IdxT m,
-                   IdxT n,
-                   IdxT k,
-                   int* workspace,
-                   ReduceOpT redOp,
-                   KVPReduceOpT pairRedOp,
-                   bool sqrt,
-                   bool initOutBuffer,
-                   cudaStream_t stream)
-{
+template <typename DataT, typename OutT, typename IdxT, int VecLen,
+          typename ReduceOpT, typename KVPReduceOpT>
+void fusedL2NNImpl(OutT* min, const DataT* x, const DataT* y, const DataT* xn,
+                   const DataT* yn, IdxT m, IdxT n, IdxT k, int* workspace,
+                   ReduceOpT redOp, KVPReduceOpT pairRedOp, bool sqrt,
+                   bool initOutBuffer, cudaStream_t stream) {
   typedef typename linalg::Policy4x4<DataT, VecLen>::Policy P;
 
   dim3 blk(P::Nthreads);
-  auto nblks            = raft::ceildiv<int>(m, P::Nthreads);
+  auto nblks = raft::ceildiv<int>(m, P::Nthreads);
   constexpr auto maxVal = std::numeric_limits<DataT>::max();
   typedef cub::KeyValuePair<IdxT, DataT> KVPair;
 
   // Accumulation operation lambda
-  auto core_lambda = [] __device__(DataT & acc, DataT & x, DataT & y) { acc += x * y; };
+  auto core_lambda = [] __device__(DataT & acc, DataT & x, DataT & y) {
+    acc += x * y;
+  };
 
   CUDA_CHECK(cudaMemsetAsync(workspace, 0, sizeof(int) * m, stream));
   if (initOutBuffer) {
@@ -290,34 +240,25 @@ void fusedL2NNImpl(OutT* min,
 
   auto fin_op = [] __device__(DataT d_val, int g_d_idx) { return d_val; };
 
-  constexpr size_t shmemSize = P::SmemSize + ((P::Mblk + P::Nblk) * sizeof(DataT));
+  constexpr size_t shmemSize =
+    P::SmemSize + ((P::Mblk + P::Nblk) * sizeof(DataT));
   if (sqrt) {
-    auto fusedL2NNSqrt = fusedL2NNkernel<DataT,
-                                         OutT,
-                                         IdxT,
-                                         true,
-                                         P,
-                                         ReduceOpT,
-                                         KVPReduceOpT,
-                                         decltype(core_lambda),
-                                         decltype(fin_op)>;
-    dim3 grid          = launchConfigGenerator<P>(m, n, shmemSize, fusedL2NNSqrt);
+    auto fusedL2NNSqrt =
+      fusedL2NNkernel<DataT, OutT, IdxT, true, P, ReduceOpT, KVPReduceOpT,
+                      decltype(core_lambda), decltype(fin_op)>;
+    dim3 grid = launchConfigGenerator<P>(m, n, shmemSize, fusedL2NNSqrt);
 
     fusedL2NNSqrt<<<grid, blk, shmemSize, stream>>>(
-      min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, core_lambda, fin_op);
+      min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp,
+      core_lambda, fin_op);
   } else {
-    auto fusedL2NN = fusedL2NNkernel<DataT,
-                                     OutT,
-                                     IdxT,
-                                     false,
-                                     P,
-                                     ReduceOpT,
-                                     KVPReduceOpT,
-                                     decltype(core_lambda),
-                                     decltype(fin_op)>;
-    dim3 grid      = launchConfigGenerator<P>(m, n, shmemSize, fusedL2NN);
-    fusedL2NN<<<grid, blk, shmemSize, stream>>>(
-      min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, core_lambda, fin_op);
+    auto fusedL2NN =
+      fusedL2NNkernel<DataT, OutT, IdxT, false, P, ReduceOpT, KVPReduceOpT,
+                      decltype(core_lambda), decltype(fin_op)>;
+    dim3 grid = launchConfigGenerator<P>(m, n, shmemSize, fusedL2NN);
+    fusedL2NN<<<grid, blk, shmemSize, stream>>>(min, x, y, xn, yn, m, n, k,
+                                                maxVal, workspace, redOp,
+                                                pairRedOp, core_lambda, fin_op);
   }
 
   CUDA_CHECK(cudaGetLastError());
@@ -358,32 +299,25 @@ void fusedL2NNImpl(OutT* min,
  *                           main kernel launch
  * @param[in]  stream        cuda stream
  */
-template <typename DataT, typename OutT, typename IdxT, typename ReduceOpT, typename KVPReduceOpT>
-void fusedL2NN(OutT* min,
-               const DataT* x,
-               const DataT* y,
-               const DataT* xn,
-               const DataT* yn,
-               IdxT m,
-               IdxT n,
-               IdxT k,
-               void* workspace,
-               ReduceOpT redOp,
-               KVPReduceOpT pairRedOp,
-               bool sqrt,
-               bool initOutBuffer,
-               cudaStream_t stream)
-{
+template <typename DataT, typename OutT, typename IdxT, typename ReduceOpT,
+          typename KVPReduceOpT>
+void fusedL2NN(OutT* min, const DataT* x, const DataT* y, const DataT* xn,
+               const DataT* yn, IdxT m, IdxT n, IdxT k, void* workspace,
+               ReduceOpT redOp, KVPReduceOpT pairRedOp, bool sqrt,
+               bool initOutBuffer, cudaStream_t stream) {
   size_t bytes = sizeof(DataT) * k;
   if (16 % sizeof(DataT) == 0 && bytes % 16 == 0) {
     fusedL2NNImpl<DataT, OutT, IdxT, 16 / sizeof(DataT), ReduceOpT>(
-      min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream);
+      min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt,
+      initOutBuffer, stream);
   } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0) {
     fusedL2NNImpl<DataT, OutT, IdxT, 8 / sizeof(DataT), ReduceOpT>(
-      min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream);
+      min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt,
+      initOutBuffer, stream);
   } else {
     fusedL2NNImpl<DataT, OutT, IdxT, 1, ReduceOpT>(
-      min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream);
+      min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt,
+      initOutBuffer, stream);
   }
 }
 
diff --git a/cpp/include/raft/distance/hellinger.cuh b/cpp/include/raft/distance/hellinger.cuh
index c8c7dad7d4..f7ad3ed1ba 100644
--- a/cpp/include/raft/distance/hellinger.cuh
+++ b/cpp/include/raft/distance/hellinger.cuh
@@ -23,7 +23,7 @@ namespace distance {
 
 /**
  * @brief the Hellinger distance matrix using the expanded form:
- *  It computes the following equation:
+ *  It computes the following equation: 
     cij = sqrt(1 - sum(sqrt(x_k * y_k)))
  * This distance computation modifies A and B by computing a sqrt
  * and then performing a `pow(x, 2)` to convert it back. Because of this,
@@ -51,40 +51,29 @@ namespace distance {
  * @param[in]       fin_op the final gemm epilogue lambda
  * @param[in]       stream cuda stream to launch work
  */
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          int VecLen,
-          typename FinalLambda,
-          bool isRowMajor>
-static void hellingerImpl(const DataT* x,
-                          const DataT* y,
-                          IdxT m,
-                          IdxT n,
-                          IdxT k,
-                          IdxT lda,
-                          IdxT ldb,
-                          IdxT ldd,
-                          OutT* dOutput,
-                          FinalLambda fin_op,
-                          cudaStream_t stream)
-{
+template <typename DataT, typename AccT, typename OutT, typename IdxT,
+          int VecLen, typename FinalLambda, bool isRowMajor>
+static void hellingerImpl(const DataT *x, const DataT *y, IdxT m, IdxT n,
+                          IdxT k, IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput,
+                          FinalLambda fin_op, cudaStream_t stream) {
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
 
-  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+  typedef
+    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
 
   dim3 blk(KPolicy::Nthreads);
 
-  auto unaryOp_lambda = [] __device__(DataT input) { return raft::mySqrt(input); };
+  auto unaryOp_lambda = [] __device__(DataT input) {
+    return raft::mySqrt(input);
+  };
   // First sqrt x and y
   raft::linalg::unaryOp<DataT, decltype(unaryOp_lambda), IdxT>(
-    (DataT*)x, x, m * k, unaryOp_lambda, stream);
+    (DataT *)x, x, m * k, unaryOp_lambda, stream);
 
   if (x != y) {
     raft::linalg::unaryOp<DataT, decltype(unaryOp_lambda), IdxT>(
-      (DataT*)y, y, n * k, unaryOp_lambda, stream);
+      (DataT *)y, y, n * k, unaryOp_lambda, stream);
   }
 
   // Accumulation operation lambda
@@ -95,91 +84,71 @@ static void hellingerImpl(const DataT* x,
   };
 
   // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                                     DataT * regxn,
-                                     DataT * regyn,
-                                     IdxT gridStrideX,
-                                     IdxT gridStrideY) {
+  auto epilog_lambda = [] __device__(
+                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
+                         IdxT gridStrideY) {
 #pragma unroll
     for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
 #pragma unroll
       for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
         // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative
-        const auto finalVal  = (1 - acc[i][j]);
+        const auto finalVal = (1 - acc[i][j]);
         const auto rectifier = (!signbit(finalVal));
-        acc[i][j]            = raft::mySqrt(rectifier * finalVal);
+        acc[i][j] = raft::mySqrt(rectifier * finalVal);
       }
     }
   };
 
   if (isRowMajor) {
-    auto hellingerRowMajor = pairwiseDistanceMatKernel<false,
-                                                       DataT,
-                                                       AccT,
-                                                       OutT,
-                                                       IdxT,
-                                                       KPolicy,
-                                                       decltype(core_lambda),
-                                                       decltype(epilog_lambda),
-                                                       FinalLambda,
-                                                       true>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, hellingerRowMajor);
+    auto hellingerRowMajor =
+      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
+                                decltype(core_lambda), decltype(epilog_lambda),
+                                FinalLambda, true>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
+                                               hellingerRowMajor);
 
     hellingerRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
+      epilog_lambda, fin_op);
   } else {
-    auto hellingerColMajor = pairwiseDistanceMatKernel<false,
-                                                       DataT,
-                                                       AccT,
-                                                       OutT,
-                                                       IdxT,
-                                                       KPolicy,
-                                                       decltype(core_lambda),
-                                                       decltype(epilog_lambda),
-                                                       FinalLambda,
-                                                       false>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, hellingerColMajor);
+    auto hellingerColMajor =
+      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
+                                decltype(core_lambda), decltype(epilog_lambda),
+                                FinalLambda, false>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
+                                               hellingerColMajor);
     hellingerColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
+      epilog_lambda, fin_op);
   }
 
   // Revert sqrt of x and y
   raft::linalg::unaryOp<DataT, decltype(unaryOp_lambda), IdxT>(
-    (DataT*)x, x, m * k, unaryOp_lambda, stream);
+    (DataT *)x, x, m * k, unaryOp_lambda, stream);
   if (x != y) {
     raft::linalg::unaryOp<DataT, decltype(unaryOp_lambda), IdxT>(
-      (DataT*)y, y, n * k, unaryOp_lambda, stream);
+      (DataT *)y, y, n * k, unaryOp_lambda, stream);
   }
 
   CUDA_CHECK(cudaGetLastError());
 }
 
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          typename FinalLambda,
-          bool isRowMajor>
-void hellinger(IdxT m,
-               IdxT n,
-               IdxT k,
-               IdxT lda,
-               IdxT ldb,
-               IdxT ldd,
-               const DataT* x,
-               const DataT* y,
-               OutT* dOutput,
-               FinalLambda fin_op,
-               cudaStream_t stream)
-{
+template <typename DataT, typename AccT, typename OutT, typename IdxT,
+          typename FinalLambda, bool isRowMajor>
+void hellinger(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
+               const DataT *x, const DataT *y, OutT *dOutput,
+               FinalLambda fin_op, cudaStream_t stream) {
   size_t bytesA = sizeof(DataT) * lda;
   size_t bytesB = sizeof(DataT) * ldb;
   if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    hellingerImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
+    hellingerImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
+                  isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
+                              stream);
   } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    hellingerImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
+    hellingerImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda,
+                  isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
+                              stream);
   } else {
     hellingerImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
       x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
@@ -188,7 +157,7 @@ void hellinger(IdxT m,
 
 /**
  * @brief the Hellinger distance matrix calculation
- *  It computes the following equation:
+ *  It computes the following equation: 
     sqrt(1 - sum(sqrt(x_k * y_k))
  * This distance computation modifies A and B by computing a sqrt
  * and then performing a `pow(x, 2)` to convert it back. Because of this,
@@ -210,25 +179,16 @@ void hellinger(IdxT m,
  * @param stream cuda stream where to launch work
  * @param isRowMajor whether the input and output matrices are row major
  */
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_ = int>
-void hellingerImpl(int m,
-                   int n,
-                   int k,
-                   const InType* pA,
-                   const InType* pB,
-                   OutType* pD,
-                   FinalLambda fin_op,
-                   cudaStream_t stream,
-                   bool isRowMajor)
-{
+template <typename InType, typename AccType, typename OutType,
+          typename FinalLambda, typename Index_ = int>
+void hellingerImpl(int m, int n, int k, const InType *pA, const InType *pB,
+                   OutType *pD, FinalLambda fin_op, cudaStream_t stream,
+                   bool isRowMajor) {
   typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type hellingerOutType;
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type
+    hellingerOutType;
   Index_ lda, ldb, ldd;
-  hellingerOutType* pDcast = reinterpret_cast<hellingerOutType*>(pD);
+  hellingerOutType *pDcast = reinterpret_cast<hellingerOutType *>(pD);
   if (isRowMajor) {
     lda = k, ldb = k, ldd = n;
     hellinger<InType, AccType, hellingerOutType, Index_, FinalLambda, true>(
diff --git a/cpp/include/raft/distance/l1.cuh b/cpp/include/raft/distance/l1.cuh
index 268e269391..6ab084f041 100644
--- a/cpp/include/raft/distance/l1.cuh
+++ b/cpp/include/raft/distance/l1.cuh
@@ -42,29 +42,16 @@ namespace distance {
  * @param[output]   pD output matrix
  * @param fin_op    the final gemm epilogue lambda
  */
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          int VecLen,
-          typename FinalLambda,
-          bool isRowMajor>
-static void l1Impl(const DataT* x,
-                   const DataT* y,
-                   IdxT m,
-                   IdxT n,
-                   IdxT k,
-                   IdxT lda,
-                   IdxT ldb,
-                   IdxT ldd,
-                   OutT* dOutput,
-                   FinalLambda fin_op,
-                   cudaStream_t stream)
-{
+template <typename DataT, typename AccT, typename OutT, typename IdxT,
+          int VecLen, typename FinalLambda, bool isRowMajor>
+static void l1Impl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
+                   IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput,
+                   FinalLambda fin_op, cudaStream_t stream) {
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
 
-  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+  typedef
+    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
 
   dim3 blk(KPolicy::Nthreads);
 
@@ -75,69 +62,47 @@ static void l1Impl(const DataT* x,
   };
 
   // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                                     DataT * regxn,
-                                     DataT * regyn,
-                                     IdxT gridStrideX,
-                                     IdxT gridStrideY) { return; };
+  auto epilog_lambda = [] __device__(
+                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
+                         IdxT gridStrideY) { return; };
 
   if (isRowMajor) {
-    auto l1RowMajor = pairwiseDistanceMatKernel<false,
-                                                DataT,
-                                                AccT,
-                                                OutT,
-                                                IdxT,
-                                                KPolicy,
-                                                decltype(core_lambda),
-                                                decltype(epilog_lambda),
-                                                FinalLambda,
-                                                true>;
-    dim3 grid       = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, l1RowMajor);
+    auto l1RowMajor =
+      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
+                                decltype(core_lambda), decltype(epilog_lambda),
+                                FinalLambda, true>;
+    dim3 grid =
+      launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, l1RowMajor);
 
     l1RowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
+      epilog_lambda, fin_op);
   } else {
-    auto l1ColMajor = pairwiseDistanceMatKernel<false,
-                                                DataT,
-                                                AccT,
-                                                OutT,
-                                                IdxT,
-                                                KPolicy,
-                                                decltype(core_lambda),
-                                                decltype(epilog_lambda),
-                                                FinalLambda,
-                                                false>;
-    dim3 grid       = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, l1ColMajor);
+    auto l1ColMajor =
+      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
+                                decltype(core_lambda), decltype(epilog_lambda),
+                                FinalLambda, false>;
+    dim3 grid =
+      launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, l1ColMajor);
     l1ColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
+      epilog_lambda, fin_op);
   }
 
   CUDA_CHECK(cudaGetLastError());
 }
 
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          typename FinalLambda,
-          bool isRowMajor>
-void l1(IdxT m,
-        IdxT n,
-        IdxT k,
-        IdxT lda,
-        IdxT ldb,
-        IdxT ldd,
-        const DataT* x,
-        const DataT* y,
-        OutT* dOutput,
-        FinalLambda fin_op,
-        cudaStream_t stream)
-{
+template <typename DataT, typename AccT, typename OutT, typename IdxT,
+          typename FinalLambda, bool isRowMajor>
+void l1(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, const DataT *x,
+        const DataT *y, OutT *dOutput, FinalLambda fin_op,
+        cudaStream_t stream) {
   size_t bytesA = sizeof(DataT) * lda;
   size_t bytesB = sizeof(DataT) * ldb;
   if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    l1Impl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
+    l1Impl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
+           isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
   } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
     l1Impl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
       x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
@@ -165,25 +130,16 @@ void l1(IdxT m,
  * @param stream cuda stream where to launch work
  * @param isRowMajor whether the input and output matrices are row major
  */
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_ = int>
-void l1Impl(int m,
-            int n,
-            int k,
-            const InType* pA,
-            const InType* pB,
-            OutType* pD,
-            FinalLambda fin_op,
-            cudaStream_t stream,
-            bool isRowMajor)
-{
+template <typename InType, typename AccType, typename OutType,
+          typename FinalLambda, typename Index_ = int>
+void l1Impl(int m, int n, int k, const InType *pA, const InType *pB,
+            OutType *pD, FinalLambda fin_op, cudaStream_t stream,
+            bool isRowMajor) {
   typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type L1OutType;
+  typedef
+    typename std::conditional<is_bool::value, OutType, AccType>::type L1OutType;
   Index_ lda, ldb, ldd;
-  L1OutType* pDcast = reinterpret_cast<L1OutType*>(pD);
+  L1OutType *pDcast = reinterpret_cast<L1OutType *>(pD);
   if (isRowMajor) {
     lda = k, ldb = k, ldd = n;
     l1<InType, AccType, L1OutType, Index_, FinalLambda, true>(
diff --git a/cpp/include/raft/distance/minkowski.cuh b/cpp/include/raft/distance/minkowski.cuh
index c021954f32..803f5fc78a 100644
--- a/cpp/include/raft/distance/minkowski.cuh
+++ b/cpp/include/raft/distance/minkowski.cuh
@@ -21,7 +21,7 @@ namespace raft {
 namespace distance {
 
 /**
- * @brief the unexpanded Minkowski distance matrix calculation
+ * @brief the unexpanded Minkowski distance matrix calculation 
  *  It computes the following equation: cij = sum(|x - y|^p)^(1/p)
  * @tparam DataT          input data-type (for A and B matrices)
  * @tparam AccT           accumulation data-type
@@ -44,30 +44,16 @@ namespace distance {
  * @param[in]       stream cuda stream to launch work
  * @param[in]       the value of `p` for Minkowski (l-p) distances.
  */
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          int VecLen,
-          typename FinalLambda,
-          bool isRowMajor>
-void minkowskiUnExpImpl(const DataT* x,
-                        const DataT* y,
-                        IdxT m,
-                        IdxT n,
-                        IdxT k,
-                        IdxT lda,
-                        IdxT ldb,
-                        IdxT ldd,
-                        OutT* dOutput,
-                        FinalLambda fin_op,
-                        cudaStream_t stream,
-                        DataT p)
-{
+template <typename DataT, typename AccT, typename OutT, typename IdxT,
+          int VecLen, typename FinalLambda, bool isRowMajor>
+void minkowskiUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
+                        IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput,
+                        FinalLambda fin_op, cudaStream_t stream, DataT p) {
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
 
-  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+  typedef
+    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
 
   dim3 blk(KPolicy::Nthreads);
 
@@ -78,11 +64,10 @@ void minkowskiUnExpImpl(const DataT* x,
   };
 
   // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [p] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                                      DataT * regxn,
-                                      DataT * regyn,
-                                      IdxT gridStrideX,
-                                      IdxT gridStrideY) {
+  auto epilog_lambda = [p] __device__(
+                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
+                         IdxT gridStrideY) {
     const auto one_over_p = 1.0f / p;
 #pragma unroll
     for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
@@ -94,68 +79,48 @@ void minkowskiUnExpImpl(const DataT* x,
   };
 
   if (isRowMajor) {
-    auto minkowskiUnExpRowMajor = pairwiseDistanceMatKernel<false,
-                                                            DataT,
-                                                            AccT,
-                                                            OutT,
-                                                            IdxT,
-                                                            KPolicy,
-                                                            decltype(core_lambda),
-                                                            decltype(epilog_lambda),
-                                                            FinalLambda,
-                                                            true>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, minkowskiUnExpRowMajor);
+    auto minkowskiUnExpRowMajor =
+      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
+                                decltype(core_lambda), decltype(epilog_lambda),
+                                FinalLambda, true>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
+                                               minkowskiUnExpRowMajor);
 
     minkowskiUnExpRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
+      epilog_lambda, fin_op);
 
   } else {
-    auto minkowskiUnExpColMajor = pairwiseDistanceMatKernel<false,
-                                                            DataT,
-                                                            AccT,
-                                                            OutT,
-                                                            IdxT,
-                                                            KPolicy,
-                                                            decltype(core_lambda),
-                                                            decltype(epilog_lambda),
-                                                            FinalLambda,
-                                                            false>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, minkowskiUnExpColMajor);
+    auto minkowskiUnExpColMajor =
+      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
+                                decltype(core_lambda), decltype(epilog_lambda),
+                                FinalLambda, false>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
+                                               minkowskiUnExpColMajor);
 
     minkowskiUnExpColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
+      epilog_lambda, fin_op);
   }
 
   CUDA_CHECK(cudaGetLastError());
 }
 
-template <typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          typename FinalLambda,
-          bool isRowMajor>
-void minkowskiUnExp(IdxT m,
-                    IdxT n,
-                    IdxT k,
-                    IdxT lda,
-                    IdxT ldb,
-                    IdxT ldd,
-                    const DataT* x,
-                    const DataT* y,
-                    OutT* dOutput,
-                    FinalLambda fin_op,
-                    cudaStream_t stream,
-                    DataT metric_arg)
-{
+template <typename DataT, typename AccT, typename OutT, typename IdxT,
+          typename FinalLambda, bool isRowMajor>
+void minkowskiUnExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
+                    const DataT *x, const DataT *y, OutT *dOutput,
+                    FinalLambda fin_op, cudaStream_t stream, DataT metric_arg) {
   size_t bytesA = sizeof(DataT) * lda;
   size_t bytesB = sizeof(DataT) * ldb;
   if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    minkowskiUnExpImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg);
+    minkowskiUnExpImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
+                       isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput,
+                                   fin_op, stream, metric_arg);
   } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    minkowskiUnExpImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg);
+    minkowskiUnExpImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda,
+                       isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput,
+                                   fin_op, stream, metric_arg);
   } else {
     minkowskiUnExpImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
       x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg);
@@ -181,25 +146,15 @@ void minkowskiUnExp(IdxT m,
  * @param[in] isRowMajor whether the input and output matrices are row major
  * @param[in] metric_arg the value of `p` for Minkowski (l-p) distances.
  */
-template <typename InType,
-          typename AccType,
-          typename OutType,
-          typename FinalLambda,
-          typename Index_ = int>
-void minkowskiImpl(Index_ m,
-                   Index_ n,
-                   Index_ k,
-                   const InType* pA,
-                   const InType* pB,
-                   OutType* pD,
-                   FinalLambda fin_op,
-                   cudaStream_t stream,
-                   bool isRowMajor,
-                   InType metric_arg)
-{
+template <typename InType, typename AccType, typename OutType,
+          typename FinalLambda, typename Index_ = int>
+void minkowskiImpl(Index_ m, Index_ n, Index_ k, const InType *pA,
+                   const InType *pB, OutType *pD, FinalLambda fin_op,
+                   cudaStream_t stream, bool isRowMajor, InType metric_arg) {
   typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type LpUnexpOutType;
-  LpUnexpOutType* pDcast = reinterpret_cast<LpUnexpOutType*>(pD);
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type
+    LpUnexpOutType;
+  LpUnexpOutType *pDcast = reinterpret_cast<LpUnexpOutType *>(pD);
   Index_ lda, ldb, ldd;
 
   if (isRowMajor) {
diff --git a/cpp/include/raft/distance/pairwise_distance_base.cuh b/cpp/include/raft/distance/pairwise_distance_base.cuh
index 3db4dc0131..43abc9eb65 100644
--- a/cpp/include/raft/distance/pairwise_distance_base.cuh
+++ b/cpp/include/raft/distance/pairwise_distance_base.cuh
@@ -31,11 +31,11 @@ namespace distance {
  * @tparam OutT           output data-type (for C and D matrices)
  * @tparam IdxT           index data-type
  * @tparam Policy         struct which tunes the Contraction kernel
- * @tparam CoreLambda     tells how to accumulate an x and y into
+ * @tparam CoreLambda     tells how to accumulate an x and y into 
                           acc. its signature:
     template <typename AccT, typename DataT> void core_lambda(AccT& acc,
       const DataT& x, const DataT& y)
- * @tparam EpilogueLambda applies an elementwise function to compute final
+ * @tparam EpilogueLambda applies an elementwise function to compute final 
     values. Its signature is:
     template <typename AccT, typename DataT> void epilogue_lambda
     (AccT acc[][], DataT* regxn, DataT* regyn);
@@ -57,19 +57,13 @@ namespace distance {
  * @param fin_op the final gemm epilogue lambda
  */
 
-template <bool useNorms,
-          typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          typename Policy,
-          typename CoreLambda,
-          typename EpilogueLambda,
-          typename FinalLambda,
-          typename rowEpilogueLambda,
-          bool isRowMajor    = true,
-          bool writeOut      = true,
-          typename BaseClass = raft::linalg::Contractions_NT<DataT, IdxT, Policy, isRowMajor>>
+template <bool useNorms, typename DataT, typename AccT, typename OutT,
+          typename IdxT, typename Policy, typename CoreLambda,
+          typename EpilogueLambda, typename FinalLambda,
+          typename rowEpilogueLambda, bool isRowMajor = true,
+          bool writeOut = true,
+          typename BaseClass =
+            raft::linalg::Contractions_NT<DataT, IdxT, Policy, isRowMajor>>
 struct PairwiseDistances : public BaseClass {
  private:
   typedef Policy P;
@@ -87,21 +81,11 @@ struct PairwiseDistances : public BaseClass {
 
  public:
   // Constructor
-  DI PairwiseDistances(const DataT* _x,
-                       const DataT* _y,
-                       IdxT _m,
-                       IdxT _n,
-                       IdxT _k,
-                       IdxT _lda,
-                       IdxT _ldb,
-                       IdxT _ldd,
-                       const DataT* _xn,
-                       const DataT* _yn,
-                       OutT* _dOutput,
-                       char* _smem,
-                       CoreLambda _core_op,
-                       EpilogueLambda _epilog_op,
-                       FinalLambda _fin_op,
+  DI PairwiseDistances(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n,
+                       IdxT _k, IdxT _lda, IdxT _ldb, IdxT _ldd,
+                       const DataT* _xn, const DataT* _yn, OutT* _dOutput,
+                       char* _smem, CoreLambda _core_op,
+                       EpilogueLambda _epilog_op, FinalLambda _fin_op,
                        rowEpilogueLambda _rowEpilog_op)
     : BaseClass(_x, _y, _m, _n, _k, _lda, _ldb, _ldd, _smem),
       xn(_xn),
@@ -112,12 +96,9 @@ struct PairwiseDistances : public BaseClass {
       core_op(_core_op),
       epilog_op(_epilog_op),
       fin_op(_fin_op),
-      rowEpilog_op(_rowEpilog_op)
-  {
-  }
+      rowEpilog_op(_rowEpilog_op) {}
 
-  DI void run()
-  {
+  DI void run() {
     for (auto gridStrideY = blockIdx.y * P::Mblk; gridStrideY < this->m;
          gridStrideY += P::Mblk * gridDim.y) {
       for (auto gridStrideX = blockIdx.x * P::Nblk; gridStrideX < this->n;
@@ -131,8 +112,7 @@ struct PairwiseDistances : public BaseClass {
   }
 
  private:
-  DI void updateIndicesY()
-  {
+  DI void updateIndicesY() {
     const auto stride = P::Nblk * gridDim.x;
     if (isRowMajor) {
       this->y += stride * this->ldb;
@@ -142,23 +122,21 @@ struct PairwiseDistances : public BaseClass {
     this->yrowid += stride;
   }
 
-  DI void updateIndicesXY()
-  {
+  DI void updateIndicesXY() {
     const auto stride = P::Mblk * gridDim.y;
     if (isRowMajor) {
       this->x += stride * this->lda;
       this->yrowid = IdxT(blockIdx.x) * P::Nblk + this->srowid;
-      this->y      = yBase + this->yrowid * this->ldb;
+      this->y = yBase + this->yrowid * this->ldb;
     } else {
       this->x += stride;
       this->yrowid = IdxT(blockIdx.x) * P::Nblk;
-      this->y      = yBase + this->yrowid + this->srowid * this->ldb;
+      this->y = yBase + this->yrowid + this->srowid * this->ldb;
     }
     this->xrowid += stride;
   }
 
-  DI void ldgNextGridStride(IdxT gridStrideX, IdxT gridStrideY)
-  {
+  DI void ldgNextGridStride(IdxT gridStrideX, IdxT gridStrideY) {
     // Fetch next grid stride ldg if within range
     if ((gridStrideX + gridDim.x * P::Nblk) < this->n) {
       updateIndicesY();
@@ -169,9 +147,10 @@ struct PairwiseDistances : public BaseClass {
     }
   }
 
-  DI void prolog(IdxT gridStrideX, IdxT gridStrideY)
-  {
-    if (gridStrideX == blockIdx.x * P::Nblk) { this->ldgXY(0); }
+  DI void prolog(IdxT gridStrideX, IdxT gridStrideY) {
+    if (gridStrideX == blockIdx.x * P::Nblk) {
+      this->ldgXY(0);
+    }
 
 #pragma unroll
     for (int i = 0; i < P::AccRowsPerTh; ++i) {
@@ -186,8 +165,7 @@ struct PairwiseDistances : public BaseClass {
     this->pageWr ^= 1;
   }
 
-  DI void loop()
-  {
+  DI void loop() {
     for (int kidx = P::Kblk; kidx < this->k; kidx += P::Kblk) {
       this->ldgXY(kidx);
       accumulate();  // on the previous k-block
@@ -204,8 +182,7 @@ struct PairwiseDistances : public BaseClass {
     this->pageRd ^= 1;
   }
 
-  DI void accumulate()
-  {
+  DI void accumulate() {
 #pragma unroll
     for (int ki = 0; ki < P::Kblk; ki += P::Veclen) {
       this->ldsXY(ki);
@@ -222,8 +199,7 @@ struct PairwiseDistances : public BaseClass {
     }
   }
 
-  DI void epilog(IdxT gridStrideX, IdxT gridStrideY)
-  {
+  DI void epilog(IdxT gridStrideX, IdxT gridStrideY) {
     if (useNorms) {
       DataT* sxNorm = (DataT*)(&smem[P::SmemSize]);
       DataT* syNorm = (&sxNorm[P::Mblk]);
@@ -231,13 +207,13 @@ struct PairwiseDistances : public BaseClass {
       // Load x & y norms required by this threadblock in shmem buffer
       if (gridStrideX == blockIdx.x * P::Nblk) {
         for (int i = threadIdx.x; i < P::Mblk; i += P::Nthreads) {
-          auto idx  = gridStrideY + i;
+          auto idx = gridStrideY + i;
           sxNorm[i] = idx < this->m ? xn[idx] : 0;
         }
       }
 
       for (int i = threadIdx.x; i < P::Nblk; i += P::Nthreads) {
-        auto idx  = gridStrideX + i;
+        auto idx = gridStrideX + i;
         syNorm[i] = idx < this->n ? yn[idx] : 0;
       }
 
@@ -312,67 +288,42 @@ struct PairwiseDistances : public BaseClass {
  * @param fin_op    the final gemm epilogue lambda
  */
 
-template <bool useNorms,
-          typename DataT,
-          typename AccT,
-          typename OutT,
-          typename IdxT,
-          typename Policy,
-          typename CoreLambda,
-          typename EpilogueLambda,
-          typename FinalLambda,
-          bool isRowMajor = true,
-          bool writeOut   = true>
-__global__ __launch_bounds__(Policy::Nthreads,
-                             2) void pairwiseDistanceMatKernel(const DataT* x,
-                                                               const DataT* y,
-                                                               const DataT* _xn,
-                                                               const DataT* _yn,
-                                                               IdxT m,
-                                                               IdxT n,
-                                                               IdxT k,
-                                                               IdxT lda,
-                                                               IdxT ldb,
-                                                               IdxT ldd,
-                                                               OutT* dOutput,
-                                                               CoreLambda core_op,
-                                                               EpilogueLambda epilog_op,
-                                                               FinalLambda fin_op)
-{
+template <bool useNorms, typename DataT, typename AccT, typename OutT,
+          typename IdxT, typename Policy, typename CoreLambda,
+          typename EpilogueLambda, typename FinalLambda, bool isRowMajor = true,
+          bool writeOut = true>
+__global__ __launch_bounds__(
+  Policy::Nthreads,
+  2) void pairwiseDistanceMatKernel(const DataT* x, const DataT* y,
+                                    const DataT* _xn, const DataT* _yn, IdxT m,
+                                    IdxT n, IdxT k, IdxT lda, IdxT ldb,
+                                    IdxT ldd, OutT* dOutput, CoreLambda core_op,
+                                    EpilogueLambda epilog_op,
+                                    FinalLambda fin_op) {
   extern __shared__ char smem[];
   auto rowEpilog = [] __device__(IdxT starty) { return; };
 
-  PairwiseDistances<useNorms,
-                    DataT,
-                    AccT,
-                    OutT,
-                    IdxT,
-                    Policy,
-                    CoreLambda,
-                    EpilogueLambda,
-                    FinalLambda,
-                    decltype(rowEpilog),
-                    isRowMajor,
-                    writeOut>
-    obj(
-      x, y, m, n, k, lda, ldb, ldd, _xn, _yn, dOutput, smem, core_op, epilog_op, fin_op, rowEpilog);
+  PairwiseDistances<useNorms, DataT, AccT, OutT, IdxT, Policy, CoreLambda,
+                    EpilogueLambda, FinalLambda, decltype(rowEpilog),
+                    isRowMajor, writeOut>
+    obj(x, y, m, n, k, lda, ldb, ldd, _xn, _yn, dOutput, smem, core_op,
+        epilog_op, fin_op, rowEpilog);
   obj.run();
 }
 
 template <typename P, typename IdxT, typename T>
-dim3 launchConfigGenerator(IdxT m, IdxT n, size_t sMemSize, T func)
-{
-  const auto numSMs  = raft::getMultiProcessorCount();
+dim3 launchConfigGenerator(IdxT m, IdxT n, size_t sMemSize, T func) {
+  const auto numSMs = raft::getMultiProcessorCount();
   int numBlocksPerSm = 0;
   dim3 grid;
 
-  CUDA_CHECK(
-    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, func, P::Nthreads, sMemSize));
+  CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+    &numBlocksPerSm, func, P::Nthreads, sMemSize));
   int minGridSize = numSMs * numBlocksPerSm;
-  int yChunks     = raft::ceildiv<int>(m, P::Mblk);
-  int xChunks     = raft::ceildiv<int>(n, P::Nblk);
-  grid.y          = yChunks > minGridSize ? minGridSize : yChunks;
-  grid.x          = (minGridSize - grid.y) <= 0 ? 1 : xChunks;
+  int yChunks = raft::ceildiv<int>(m, P::Mblk);
+  int xChunks = raft::ceildiv<int>(n, P::Nblk);
+  grid.y = yChunks > minGridSize ? minGridSize : yChunks;
+  grid.x = (minGridSize - grid.y) <= 0 ? 1 : xChunks;
   if (grid.x != 1) {
     int i = 1;
     while (grid.y * i < minGridSize) {
diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp
index 773b83ab13..c62f2e5f79 100644
--- a/cpp/include/raft/error.hpp
+++ b/cpp/include/raft/error.hpp
@@ -31,14 +31,14 @@ class exception : public std::exception {
   explicit exception() noexcept : std::exception(), msg_() {}
 
   /** copy ctor */
-  exception(exception const& src) noexcept : std::exception(), msg_(src.what())
-  {
+  exception(exception const& src) noexcept
+    : std::exception(), msg_(src.what()) {
     collect_call_stack();
   }
 
   /** ctor from an input message */
-  explicit exception(std::string const msg) noexcept : std::exception(), msg_(std::move(msg))
-  {
+  explicit exception(std::string const msg) noexcept
+    : std::exception(), msg_(std::move(msg)) {
     collect_call_stack();
   }
 
@@ -51,8 +51,7 @@ class exception : public std::exception {
 
   /** append call stack info to this exception's message for ease of debug */
   // Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html
-  void collect_call_stack() noexcept
-  {
+  void collect_call_stack() noexcept {
 #ifdef __GNUC__
     constexpr int kMaxStackDepth = 64;
     void* stack[kMaxStackDepth];  // NOLINT
@@ -91,16 +90,16 @@ struct logic_error : public raft::exception {
 
 // FIXME: Need to be replaced with RAFT_FAIL
 /** macro to throw a runtime error */
-#define THROW(fmt, ...)                                                                    \
-  do {                                                                                     \
-    std::string msg;                                                                       \
-    char errMsg[2048]; /* NOLINT */                                                        \
-    std::snprintf(                                                                         \
-      errMsg, sizeof(errMsg), "exception occured! file=%s line=%d: ", __FILE__, __LINE__); \
-    msg += errMsg;                                                                         \
-    std::snprintf(errMsg, sizeof(errMsg), fmt, ##__VA_ARGS__);                             \
-    msg += errMsg;                                                                         \
-    throw raft::exception(msg);                                                            \
+#define THROW(fmt, ...)                                                        \
+  do {                                                                         \
+    std::string msg;                                                           \
+    char errMsg[2048]; /* NOLINT */                                            \
+    std::snprintf(errMsg, sizeof(errMsg),                                      \
+                  "exception occured! file=%s line=%d: ", __FILE__, __LINE__); \
+    msg += errMsg;                                                             \
+    std::snprintf(errMsg, sizeof(errMsg), fmt, ##__VA_ARGS__);                 \
+    msg += errMsg;                                                             \
+    throw raft::exception(msg);                                                \
   } while (0)
 
 // FIXME: Need to be replaced with RAFT_EXPECTS
@@ -110,15 +109,16 @@ struct logic_error : public raft::exception {
     if (!(check)) THROW(fmt, ##__VA_ARGS__); \
   } while (0)
 
-#define SET_ERROR_MSG(msg, location_prefix, fmt, ...)                                 \
-  do {                                                                                \
-    char err_msg[2048]; /* NOLINT */                                                  \
-    std::snprintf(err_msg, sizeof(err_msg), location_prefix);                         \
-    msg += err_msg;                                                                   \
-    std::snprintf(err_msg, sizeof(err_msg), "file=%s line=%d: ", __FILE__, __LINE__); \
-    msg += err_msg;                                                                   \
-    std::snprintf(err_msg, sizeof(err_msg), fmt, ##__VA_ARGS__);                      \
-    msg += err_msg;                                                                   \
+#define SET_ERROR_MSG(msg, location_prefix, fmt, ...)                      \
+  do {                                                                     \
+    char err_msg[2048]; /* NOLINT */                                       \
+    std::snprintf(err_msg, sizeof(err_msg), location_prefix);              \
+    msg += err_msg;                                                        \
+    std::snprintf(err_msg, sizeof(err_msg), "file=%s line=%d: ", __FILE__, \
+                  __LINE__);                                               \
+    msg += err_msg;                                                        \
+    std::snprintf(err_msg, sizeof(err_msg), fmt, ##__VA_ARGS__);           \
+    msg += err_msg;                                                        \
   } while (0)
 
 /**
diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp
index bb7d22e079..dbe7e83189 100644
--- a/cpp/include/raft/handle.hpp
+++ b/cpp/include/raft/handle.hpp
@@ -65,29 +65,29 @@ class handle_t {
       }()),
       streams_(n_streams),
       device_allocator_(std::make_shared<mr::device::default_allocator>()),
-      host_allocator_(std::make_shared<mr::host::default_allocator>())
-  {
+      host_allocator_(std::make_shared<mr::host::default_allocator>()) {
     create_resources();
   }
 
   /**
-   * @brief Construct a light handle copy from another
+   * @brief Construct a light handle copy from another 
    * user stream, cuda handles, comms and worker pool are not copied
-   * The user_stream of the returned handle is set to the specified stream
-   * of the other handle worker pool
-   * @param[in] stream_id stream id in `other` worker streams
+   * The user_stream of the returned handle is set to the specified stream 
+   * of the other handle worker pool 
+   * @param[in] stream_id stream id in `other` worker streams 
    * to be set as user stream in the constructed handle
    * @param[in] n_streams number worker streams to be created
    */
-  handle_t(const handle_t& other, int stream_id, int n_streams = kNumDefaultWorkerStreams)
-    : dev_id_(other.get_device()), streams_(n_streams)
-  {
-    RAFT_EXPECTS(other.get_num_internal_streams() > 0,
-                 "ERROR: the main handle must have at least one worker stream\n");
-    prop_                    = other.get_device_properties();
+  handle_t(const handle_t& other, int stream_id,
+           int n_streams = kNumDefaultWorkerStreams)
+    : dev_id_(other.get_device()), streams_(n_streams) {
+    RAFT_EXPECTS(
+      other.get_num_internal_streams() > 0,
+      "ERROR: the main handle must have at least one worker stream\n");
+    prop_ = other.get_device_properties();
     device_prop_initialized_ = true;
-    device_allocator_        = other.get_device_allocator();
-    host_allocator_          = other.get_host_allocator();
+    device_allocator_ = other.get_device_allocator();
+    host_allocator_ = other.get_host_allocator();
     create_resources();
     set_stream(other.get_internal_stream(stream_id));
   }
@@ -99,22 +99,25 @@ class handle_t {
 
   void set_stream(cudaStream_t stream) { user_stream_ = stream; }
   cudaStream_t get_stream() const { return user_stream_; }
-  rmm::cuda_stream_view get_stream_view() const { return rmm::cuda_stream_view(user_stream_); }
+  rmm::cuda_stream_view get_stream_view() const {
+    return rmm::cuda_stream_view(user_stream_);
+  }
 
-  void set_device_allocator(std::shared_ptr<mr::device::allocator> allocator)
-  {
+  void set_device_allocator(std::shared_ptr<mr::device::allocator> allocator) {
     device_allocator_ = allocator;
   }
-  std::shared_ptr<mr::device::allocator> get_device_allocator() const { return device_allocator_; }
+  std::shared_ptr<mr::device::allocator> get_device_allocator() const {
+    return device_allocator_;
+  }
 
-  void set_host_allocator(std::shared_ptr<mr::host::allocator> allocator)
-  {
+  void set_host_allocator(std::shared_ptr<mr::host::allocator> allocator) {
     host_allocator_ = allocator;
   }
-  std::shared_ptr<mr::host::allocator> get_host_allocator() const { return host_allocator_; }
+  std::shared_ptr<mr::host::allocator> get_host_allocator() const {
+    return host_allocator_;
+  }
 
-  cublasHandle_t get_cublas_handle() const
-  {
+  cublasHandle_t get_cublas_handle() const {
     std::lock_guard<std::mutex> _(mutex_);
     if (!cublas_initialized_) {
       CUBLAS_CHECK(cublasCreate(&cublas_handle_));
@@ -123,8 +126,7 @@ class handle_t {
     return cublas_handle_;
   }
 
-  cusolverDnHandle_t get_cusolver_dn_handle() const
-  {
+  cusolverDnHandle_t get_cusolver_dn_handle() const {
     std::lock_guard<std::mutex> _(mutex_);
     if (!cusolver_dn_initialized_) {
       CUSOLVER_CHECK(cusolverDnCreate(&cusolver_dn_handle_));
@@ -133,8 +135,7 @@ class handle_t {
     return cusolver_dn_handle_;
   }
 
-  cusolverSpHandle_t get_cusolver_sp_handle() const
-  {
+  cusolverSpHandle_t get_cusolver_sp_handle() const {
     std::lock_guard<std::mutex> _(mutex_);
     if (!cusolver_sp_initialized_) {
       CUSOLVER_CHECK(cusolverSpCreate(&cusolver_sp_handle_));
@@ -143,8 +144,7 @@ class handle_t {
     return cusolver_sp_handle_;
   }
 
-  cusparseHandle_t get_cusparse_handle() const
-  {
+  cusparseHandle_t get_cusparse_handle() const {
     std::lock_guard<std::mutex> _(mutex_);
     if (!cusparse_initialized_) {
       CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_));
@@ -154,13 +154,16 @@ class handle_t {
   }
 
   // legacy compatibility for cuML
-  cudaStream_t get_internal_stream(int sid) const { return streams_.get_stream(sid).value(); }
+  cudaStream_t get_internal_stream(int sid) const {
+    return streams_.get_stream(sid).value();
+  }
   // new accessor return rmm::cuda_stream_view
-  rmm::cuda_stream_view get_internal_stream_view(int sid) const { return streams_.get_stream(sid); }
+  rmm::cuda_stream_view get_internal_stream_view(int sid) const {
+    return streams_.get_stream(sid);
+  }
 
   int get_num_internal_streams() const { return streams_.get_pool_size(); }
-  std::vector<cudaStream_t> get_internal_streams() const
-  {
+  std::vector<cudaStream_t> get_internal_streams() const {
     std::vector<cudaStream_t> int_streams_vec;
     for (int i = 0; i < get_num_internal_streams(); i++) {
       int_streams_vec.push_back(get_internal_stream(i));
@@ -168,51 +171,49 @@ class handle_t {
     return int_streams_vec;
   }
 
-  void wait_on_user_stream() const
-  {
+  void wait_on_user_stream() const {
     CUDA_CHECK(cudaEventRecord(event_, user_stream_));
     for (int i = 0; i < get_num_internal_streams(); i++) {
       CUDA_CHECK(cudaStreamWaitEvent(get_internal_stream(i), event_, 0));
     }
   }
 
-  void wait_on_internal_streams() const
-  {
+  void wait_on_internal_streams() const {
     for (int i = 0; i < get_num_internal_streams(); i++) {
       CUDA_CHECK(cudaEventRecord(event_, get_internal_stream(i)));
       CUDA_CHECK(cudaStreamWaitEvent(user_stream_, event_, 0));
     }
   }
 
-  void set_comms(std::shared_ptr<comms::comms_t> communicator) { communicator_ = communicator; }
+  void set_comms(std::shared_ptr<comms::comms_t> communicator) {
+    communicator_ = communicator;
+  }
 
-  const comms::comms_t& get_comms() const
-  {
-    RAFT_EXPECTS(this->comms_initialized(), "ERROR: Communicator was not initialized\n");
+  const comms::comms_t& get_comms() const {
+    RAFT_EXPECTS(this->comms_initialized(),
+                 "ERROR: Communicator was not initialized\n");
     return *communicator_;
   }
 
-  void set_subcomm(std::string key, std::shared_ptr<comms::comms_t> subcomm)
-  {
+  void set_subcomm(std::string key, std::shared_ptr<comms::comms_t> subcomm) {
     subcomms_[key] = subcomm;
   }
 
-  const comms::comms_t& get_subcomm(std::string key) const
-  {
-    RAFT_EXPECTS(
-      subcomms_.find(key) != subcomms_.end(), "%s was not found in subcommunicators.", key.c_str());
+  const comms::comms_t& get_subcomm(std::string key) const {
+    RAFT_EXPECTS(subcomms_.find(key) != subcomms_.end(),
+                 "%s was not found in subcommunicators.", key.c_str());
 
     auto subcomm = subcomms_.at(key);
 
-    RAFT_EXPECTS(nullptr != subcomm.get(), "ERROR: Subcommunicator was not initialized");
+    RAFT_EXPECTS(nullptr != subcomm.get(),
+                 "ERROR: Subcommunicator was not initialized");
 
     return *subcomm;
   }
 
   bool comms_initialized() const { return (nullptr != communicator_.get()); }
 
-  const cudaDeviceProp& get_device_properties() const
-  {
+  const cudaDeviceProp& get_device_properties() const {
     std::lock_guard<std::mutex> _(mutex_);
     if (!device_prop_initialized_) {
       CUDA_CHECK(cudaGetDeviceProperties(&prop_, dev_id_));
@@ -243,28 +244,29 @@ class handle_t {
   mutable bool device_prop_initialized_{false};
   mutable std::mutex mutex_;
 
-  void create_resources() { CUDA_CHECK(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); }
+  void create_resources() {
+    CUDA_CHECK(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
+  }
 
-  void destroy_resources()
-  {
+  void destroy_resources() {
     ///@todo: enable *_NO_THROW variants once we have enabled logging
     if (cusparse_initialized_) {
-      // CUSPARSE_CHECK_NO_THROW(cusparseDestroy(cusparse_handle_));
+      //CUSPARSE_CHECK_NO_THROW(cusparseDestroy(cusparse_handle_));
       CUSPARSE_CHECK(cusparseDestroy(cusparse_handle_));
     }
     if (cusolver_dn_initialized_) {
-      // CUSOLVER_CHECK_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_));
+      //CUSOLVER_CHECK_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_));
       CUSOLVER_CHECK(cusolverDnDestroy(cusolver_dn_handle_));
     }
     if (cusolver_sp_initialized_) {
-      // CUSOLVER_CHECK_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_));
+      //CUSOLVER_CHECK_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_));
       CUSOLVER_CHECK(cusolverSpDestroy(cusolver_sp_handle_));
     }
     if (cublas_initialized_) {
-      // CUBLAS_CHECK_NO_THROW(cublasDestroy(cublas_handle_));
+      //CUBLAS_CHECK_NO_THROW(cublasDestroy(cublas_handle_));
       CUBLAS_CHECK(cublasDestroy(cublas_handle_));
     }
-    // CUDA_CHECK_NO_THROW(cudaEventDestroy(event_));
+    //CUDA_CHECK_NO_THROW(cudaEventDestroy(event_));
     CUDA_CHECK(cudaEventDestroy(event_));
   }
 };  // class handle_t
@@ -274,8 +276,7 @@ class handle_t {
  */
 class stream_syncer {
  public:
-  explicit stream_syncer(const handle_t& handle) : handle_(handle)
-  {
+  explicit stream_syncer(const handle_t& handle) : handle_(handle) {
     handle_.wait_on_user_stream();
   }
   ~stream_syncer() { handle_.wait_on_internal_streams(); }
diff --git a/cpp/include/raft/integer_utils.h b/cpp/include/raft/integer_utils.h
index 5fc56de14b..a7cfb9287b 100644
--- a/cpp/include/raft/integer_utils.h
+++ b/cpp/include/raft/integer_utils.h
@@ -34,13 +34,15 @@ namespace raft {
  * `modulus` is positive.
  */
 template <typename S>
-inline S round_up_safe(S number_to_round, S modulus)
-{
+inline S round_up_safe(S number_to_round, S modulus) {
   auto remainder = number_to_round % modulus;
-  if (remainder == 0) { return number_to_round; }
+  if (remainder == 0) {
+    return number_to_round;
+  }
   auto rounded_up = number_to_round - remainder + modulus;
   if (rounded_up < number_to_round) {
-    throw std::invalid_argument("Attempt to round up beyond the type's maximum value");
+    throw std::invalid_argument(
+      "Attempt to round up beyond the type's maximum value");
   }
   return rounded_up;
 }
@@ -51,9 +53,8 @@ inline S round_up_safe(S number_to_round, S modulus)
  * `modulus` is positive.
  */
 template <typename S>
-inline S round_down_safe(S number_to_round, S modulus)
-{
-  auto remainder    = number_to_round % modulus;
+inline S round_down_safe(S number_to_round, S modulus) {
+  auto remainder = number_to_round % modulus;
   auto rounded_down = number_to_round - remainder;
   return rounded_down;
 }
@@ -71,28 +72,25 @@ inline S round_down_safe(S number_to_round, S modulus)
  * the result will be incorrect
  */
 template <typename S, typename T>
-constexpr inline S div_rounding_up_unsafe(const S& dividend, const T& divisor) noexcept
-{
+constexpr inline S div_rounding_up_unsafe(const S& dividend,
+                                          const T& divisor) noexcept {
   return (dividend + divisor - 1) / divisor;
 }
 
 namespace detail {
 template <typename I>
 constexpr inline I div_rounding_up_safe(std::integral_constant<bool, false>,
-                                        I dividend,
-                                        I divisor) noexcept
-{
+                                        I dividend, I divisor) noexcept {
   // TODO: This could probably be implemented faster
-  return (dividend > divisor) ? 1 + div_rounding_up_unsafe(dividend - divisor, divisor)
-                              : (dividend > 0);
+  return (dividend > divisor)
+           ? 1 + div_rounding_up_unsafe(dividend - divisor, divisor)
+           : (dividend > 0);
 }
 
 template <typename I>
 constexpr inline I div_rounding_up_safe(std::integral_constant<bool, true>,
-                                        I dividend,
-                                        I divisor) noexcept
-{
-  auto quotient  = dividend / divisor;
+                                        I dividend, I divisor) noexcept {
+  auto quotient = dividend / divisor;
   auto remainder = dividend % divisor;
   return quotient + (remainder != 0);
 }
@@ -112,17 +110,16 @@ constexpr inline I div_rounding_up_safe(std::integral_constant<bool, true>,
  * approach of using (dividend + divisor - 1) / divisor
  */
 template <typename I>
-constexpr inline std::enable_if_t<std::is_integral<I>::value, I> div_rounding_up_safe(
-  I dividend, I divisor) noexcept
-{
-  using i_is_a_signed_type = std::integral_constant<bool, std::is_signed<I>::value>;
+constexpr inline std::enable_if_t<std::is_integral<I>::value, I>
+div_rounding_up_safe(I dividend, I divisor) noexcept {
+  using i_is_a_signed_type =
+    std::integral_constant<bool, std::is_signed<I>::value>;
   return detail::div_rounding_up_safe(i_is_a_signed_type{}, dividend, divisor);
 }
 
 template <typename I>
-constexpr inline std::enable_if_t<std::is_integral<I>::value, bool> is_a_power_of_two(
-  I val) noexcept
-{
+constexpr inline std::enable_if_t<std::is_integral<I>::value, bool>
+is_a_power_of_two(I val) noexcept {
   return ((val - 1) & val) == 0;
 }
 
@@ -150,14 +147,14 @@ constexpr inline std::enable_if_t<std::is_integral<I>::value, bool> is_a_power_o
  * @return Absolute value if value type is signed.
  */
 template <typename T>
-std::enable_if_t<std::is_signed<T>::value, T> constexpr inline absolute_value(T value)
-{
+std::enable_if_t<std::is_signed<T>::value, T> constexpr inline absolute_value(
+  T value) {
   return std::abs(value);
 }
 // Unsigned type just returns itself.
 template <typename T>
-std::enable_if_t<!std::is_signed<T>::value, T> constexpr inline absolute_value(T value)
-{
+std::enable_if_t<!std::is_signed<T>::value, T> constexpr inline absolute_value(
+  T value) {
   return value;
 }
 
diff --git a/cpp/include/raft/label/classlabels.cuh b/cpp/include/raft/label/classlabels.cuh
index 0bbfa2bb3c..0da7da2eb6 100644
--- a/cpp/include/raft/label/classlabels.cuh
+++ b/cpp/include/raft/label/classlabels.cuh
@@ -43,35 +43,33 @@ namespace label {
  * \param [in] allocator device allocator
  */
 template <typename value_t>
-void getUniquelabels(value_t* y,
-                     size_t n,
-                     value_t** y_unique,
-                     int* n_unique,
+void getUniquelabels(value_t *y, size_t n, value_t **y_unique, int *n_unique,
                      cudaStream_t stream,
-                     std::shared_ptr<raft::mr::device::allocator> allocator)
-{
+                     std::shared_ptr<raft::mr::device::allocator> allocator) {
   raft::mr::device::buffer<value_t> y2(allocator, stream, n);
   raft::mr::device::buffer<value_t> y3(allocator, stream, n);
   raft::mr::device::buffer<int> d_num_selected(allocator, stream, 1);
-  size_t bytes  = 0;
+  size_t bytes = 0;
   size_t bytes2 = 0;
 
   // Query how much temporary storage we will need for cub operations
   // and allocate it
   cub::DeviceRadixSort::SortKeys(NULL, bytes, y, y2.data(), n);
-  cub::DeviceSelect::Unique(NULL, bytes2, y2.data(), y3.data(), d_num_selected.data(), n);
+  cub::DeviceSelect::Unique(NULL, bytes2, y2.data(), y3.data(),
+                            d_num_selected.data(), n);
   bytes = max(bytes, bytes2);
   raft::mr::device::buffer<char> cub_storage(allocator, stream, bytes);
 
   // Select Unique classes
   cub::DeviceRadixSort::SortKeys(cub_storage.data(), bytes, y, y2.data(), n);
-  cub::DeviceSelect::Unique(
-    cub_storage.data(), bytes, y2.data(), y3.data(), d_num_selected.data(), n);
+  cub::DeviceSelect::Unique(cub_storage.data(), bytes, y2.data(), y3.data(),
+                            d_num_selected.data(), n);
   raft::update_host(n_unique, d_num_selected.data(), 1, stream);
   CUDA_CHECK(cudaStreamSynchronize(stream));
 
   // Copy unique classes to output
-  *y_unique = (value_t*)allocator->allocate(*n_unique * sizeof(value_t), stream);
+  *y_unique =
+    (value_t *)allocator->allocate(*n_unique * sizeof(value_t), stream);
   raft::copy(*y_unique, y3.data(), *n_unique, stream);
 }
 
@@ -94,17 +92,16 @@ void getUniquelabels(value_t* y,
  * \param [in] stream cuda stream
  */
 template <typename value_t>
-void getOvrlabels(
-  value_t* y, int n, value_t* y_unique, int n_classes, value_t* y_out, int idx, cudaStream_t stream)
-{
+void getOvrlabels(value_t *y, int n, value_t *y_unique, int n_classes,
+                  value_t *y_out, int idx, cudaStream_t stream) {
   ASSERT(idx < n_classes,
          "Parameter idx should not be larger than the number "
          "of classes");
   raft::linalg::unaryOp(
-    y_out,
-    y,
-    n,
-    [idx, y_unique] __device__(value_t y) { return y == y_unique[idx] ? +1 : -1; },
+    y_out, y, n,
+    [idx, y_unique] __device__(value_t y) {
+      return y == y_unique[idx] ? +1 : -1;
+    },
     stream);
   CUDA_CHECK(cudaPeekAtLastError());
 }
@@ -113,14 +110,9 @@ void getOvrlabels(
 // +/-1, return array with the new class labels and corresponding indices.
 
 template <typename Type, int TPB_X, typename Lambda>
-__global__ void map_label_kernel(Type* map_ids,
-                                 size_t N_labels,
-                                 Type* in,
-                                 Type* out,
-                                 size_t N,
-                                 Lambda filter_op,
-                                 bool zero_based = false)
-{
+__global__ void map_label_kernel(Type *map_ids, size_t N_labels, Type *in,
+                                 Type *out, size_t N, Lambda filter_op,
+                                 bool zero_based = false) {
   int tid = threadIdx.x + blockIdx.x * TPB_X;
   if (tid < N) {
     if (!filter_op(in[tid])) {
@@ -135,75 +127,68 @@ __global__ void map_label_kernel(Type* map_ids,
 }
 
 /**
- * Maps an input array containing a series of numbers into a new array
- * where numbers have been mapped to a monotonically increasing set
- * of labels. This can be useful in machine learning algorithms, for instance,
- * where a given set of labels is not taken from a monotonically increasing
- * set. This can happen if they are filtered or if only a subset of the
- * total labels are used in a dataset. This is also useful in graph algorithms
- * where a set of vertices need to be labeled in a monotonically increasing
- * order.
- * @tparam Type the numeric type of the input and output arrays
- * @tparam Lambda the type of an optional filter function, which determines
- * which items in the array to map.
- * @param out the output monotonic array
- * @param in input label array
- * @param N number of elements in the input array
- * @param stream cuda stream to use
- * @param filter_op an optional function for specifying which values
- * should have monotonically increasing labels applied to them.
- */
+   * Maps an input array containing a series of numbers into a new array
+   * where numbers have been mapped to a monotonically increasing set
+   * of labels. This can be useful in machine learning algorithms, for instance,
+   * where a given set of labels is not taken from a monotonically increasing
+   * set. This can happen if they are filtered or if only a subset of the
+   * total labels are used in a dataset. This is also useful in graph algorithms
+   * where a set of vertices need to be labeled in a monotonically increasing
+   * order.
+   * @tparam Type the numeric type of the input and output arrays
+   * @tparam Lambda the type of an optional filter function, which determines
+   * which items in the array to map.
+   * @param out the output monotonic array
+   * @param in input label array
+   * @param N number of elements in the input array
+   * @param stream cuda stream to use
+   * @param filter_op an optional function for specifying which values
+   * should have monotonically increasing labels applied to them.
+   */
 template <typename Type, typename Lambda>
-void make_monotonic(Type* out,
-                    Type* in,
-                    size_t N,
-                    cudaStream_t stream,
+void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream,
                     Lambda filter_op,
                     std::shared_ptr<raft::mr::device::allocator> allocator,
-                    bool zero_based = false)
-{
+                    bool zero_based = false) {
   static const size_t TPB_X = 256;
 
   dim3 blocks(raft::ceildiv(N, TPB_X));
   dim3 threads(TPB_X);
 
-  Type* map_ids;
+  Type *map_ids;
   int num_clusters;
   getUniquelabels(in, N, &map_ids, &num_clusters, stream, allocator);
 
-  map_label_kernel<Type, TPB_X>
-    <<<blocks, threads, 0, stream>>>(map_ids, num_clusters, in, out, N, filter_op, zero_based);
+  map_label_kernel<Type, TPB_X><<<blocks, threads, 0, stream>>>(
+    map_ids, num_clusters, in, out, N, filter_op, zero_based);
 
   allocator->deallocate(map_ids, num_clusters * sizeof(Type), stream);
 }
 
 /**
- * Maps an input array containing a series of numbers into a new array
- * where numbers have been mapped to a monotonically increasing set
- * of labels. This can be useful in machine learning algorithms, for instance,
- * where a given set of labels is not taken from a monotonically increasing
- * set. This can happen if they are filtered or if only a subset of the
- * total labels are used in a dataset. This is also useful in graph algorithms
- * where a set of vertices need to be labeled in a monotonically increasing
- * order.
- * @tparam Type the numeric type of the input and output arrays
- * @tparam Lambda the type of an optional filter function, which determines
- * which items in the array to map.
- * @param out output label array with labels assigned monotonically
- * @param in input label array
- * @param N number of elements in the input array
- * @param stream cuda stream to use
- */
+   * Maps an input array containing a series of numbers into a new array
+   * where numbers have been mapped to a monotonically increasing set
+   * of labels. This can be useful in machine learning algorithms, for instance,
+   * where a given set of labels is not taken from a monotonically increasing
+   * set. This can happen if they are filtered or if only a subset of the
+   * total labels are used in a dataset. This is also useful in graph algorithms
+   * where a set of vertices need to be labeled in a monotonically increasing
+   * order.
+   * @tparam Type the numeric type of the input and output arrays
+   * @tparam Lambda the type of an optional filter function, which determines
+   * which items in the array to map.
+   * @param out output label array with labels assigned monotonically
+   * @param in input label array
+   * @param N number of elements in the input array
+   * @param stream cuda stream to use
+   */
 template <typename Type>
-void make_monotonic(Type* out,
-                    Type* in,
-                    size_t N,
-                    cudaStream_t stream,
+void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream,
                     std::shared_ptr<raft::mr::device::allocator> allocator,
-                    bool zero_based = false)
-{
+                    bool zero_based = false) {
   make_monotonic<Type>(
-    out, in, N, stream, [] __device__(Type val) { return false; }, allocator, zero_based);
+    out, in, N, stream, [] __device__(Type val) { return false; }, allocator,
+    zero_based);
 }
 };  // namespace label
 };  // end namespace raft
diff --git a/cpp/include/raft/label/merge_labels.cuh b/cpp/include/raft/label/merge_labels.cuh
index 1ee0659b0d..bed74581a2 100644
--- a/cpp/include/raft/label/merge_labels.cuh
+++ b/cpp/include/raft/label/merge_labels.cuh
@@ -35,10 +35,8 @@ __global__ void __launch_bounds__(TPB_X)
   propagate_label_kernel(const value_idx* __restrict__ labels_a,
                          const value_idx* __restrict__ labels_b,
                          value_idx* __restrict__ R,
-                         const bool* __restrict__ mask,
-                         bool* __restrict__ m,
-                         value_idx N)
-{
+                         const bool* __restrict__ mask, bool* __restrict__ m,
+                         value_idx N) {
   value_idx tid = threadIdx.x + blockIdx.x * TPB_X;
   if (tid < N) {
     if (__ldg((char*)mask + tid)) {
@@ -67,17 +65,15 @@ template <typename value_idx, int TPB_X = 256>
 __global__ void __launch_bounds__(TPB_X)
   reassign_label_kernel(value_idx* __restrict__ labels_a,
                         const value_idx* __restrict__ labels_b,
-                        const value_idx* __restrict__ R,
-                        value_idx N,
-                        value_idx MAX_LABEL)
-{
+                        const value_idx* __restrict__ R, value_idx N,
+                        value_idx MAX_LABEL) {
   value_idx tid = threadIdx.x + blockIdx.x * TPB_X;
   if (tid < N) {
     // Note: labels are from 1 to N
-    value_idx la  = labels_a[tid];
-    value_idx lb  = __ldg(labels_b + tid);
-    value_idx ra  = (la == MAX_LABEL) ? MAX_LABEL : __ldg(R + (la - 1)) + 1;
-    value_idx rb  = (lb == MAX_LABEL) ? MAX_LABEL : __ldg(R + (lb - 1)) + 1;
+    value_idx la = labels_a[tid];
+    value_idx lb = __ldg(labels_b + tid);
+    value_idx ra = (la == MAX_LABEL) ? MAX_LABEL : __ldg(R + (la - 1)) + 1;
+    value_idx rb = (lb == MAX_LABEL) ? MAX_LABEL : __ldg(R + (lb - 1)) + 1;
     labels_a[tid] = min(ra, rb);
   }
 }
@@ -112,14 +108,9 @@ __global__ void __launch_bounds__(TPB_X)
  * @param[in]    stream      CUDA stream
  */
 template <typename value_idx = int, int TPB_X = 256>
-void merge_labels(value_idx* labels_a,
-                  const value_idx* labels_b,
-                  const bool* mask,
-                  value_idx* R,
-                  bool* m,
-                  value_idx N,
-                  cudaStream_t stream)
-{
+void merge_labels(value_idx* labels_a, const value_idx* labels_b,
+                  const bool* mask, value_idx* R, bool* m, value_idx N,
+                  cudaStream_t stream) {
   dim3 blocks(raft::ceildiv(N, value_idx(TPB_X)));
   dim3 threads(TPB_X);
   value_idx MAX_LABEL = std::numeric_limits<value_idx>::max();
diff --git a/cpp/include/raft/lap/d_structs.h b/cpp/include/raft/lap/d_structs.h
index e488dc528f..ed545b7198 100644
--- a/cpp/include/raft/lap/d_structs.h
+++ b/cpp/include/raft/lap/d_structs.h
@@ -26,18 +26,18 @@
 
 template <typename vertex_t, typename weight_t>
 struct Vertices {
-  vertex_t* row_assignments;
-  vertex_t* col_assignments;
-  int* row_covers;
-  int* col_covers;
-  weight_t* row_duals;
-  weight_t* col_duals;
-  weight_t* col_slacks;
+  vertex_t *row_assignments;
+  vertex_t *col_assignments;
+  int *row_covers;
+  int *col_covers;
+  weight_t *row_duals;
+  weight_t *col_duals;
+  weight_t *col_slacks;
 };
 
 template <typename vertex_t>
 struct VertexData {
-  vertex_t* parents;
-  vertex_t* children;
-  int* is_visited;
+  vertex_t *parents;
+  vertex_t *children;
+  int *is_visited;
 };
diff --git a/cpp/include/raft/lap/lap.cuh b/cpp/include/raft/lap/lap.cuh
index 64b6a31efb..6bc1c08029 100644
--- a/cpp/include/raft/lap/lap.cuh
+++ b/cpp/include/raft/lap/lap.cuh
@@ -38,12 +38,12 @@ class LinearAssignmentProblem {
   vertex_t batchsize_;
   weight_t epsilon_;
 
-  weight_t const* d_costs_;
+  weight_t const *d_costs_;
 
   Vertices<vertex_t, weight_t> d_vertices_dev;
   VertexData<vertex_t> d_row_data_dev, d_col_data_dev;
 
-  raft::handle_t const& handle_;
+  raft::handle_t const &handle_;
   raft::mr::device::buffer<int> row_covers_v;
   raft::mr::device::buffer<int> col_covers_v;
   raft::mr::device::buffer<weight_t> row_duals_v;
@@ -59,10 +59,8 @@ class LinearAssignmentProblem {
   raft::mr::device::buffer<weight_t> obj_val_dual_v;
 
  public:
-  LinearAssignmentProblem(raft::handle_t const& handle,
-                          vertex_t size,
-                          vertex_t batchsize,
-                          weight_t epsilon)
+  LinearAssignmentProblem(raft::handle_t const &handle, vertex_t size,
+                          vertex_t batchsize, weight_t epsilon)
     : handle_(handle),
       size_(size),
       batchsize_(batchsize),
@@ -80,13 +78,11 @@ class LinearAssignmentProblem {
       row_children_v(handle_.get_device_allocator(), handle_.get_stream(), 0),
       col_children_v(handle_.get_device_allocator(), handle_.get_stream(), 0),
       obj_val_primal_v(handle_.get_device_allocator(), handle_.get_stream(), 0),
-      obj_val_dual_v(handle_.get_device_allocator(), handle_.get_stream(), 0)
-  {
-  }
+      obj_val_dual_v(handle_.get_device_allocator(), handle_.get_stream(), 0) {}
 
   // Executes Hungarian algorithm on the input cost matrix.
-  void solve(weight_t const* d_cost_matrix, vertex_t* d_row_assignment, vertex_t* d_col_assignment)
-  {
+  void solve(weight_t const *d_cost_matrix, vertex_t *d_row_assignment,
+             vertex_t *d_col_assignment) {
     initializeDevice();
 
     d_vertices_dev.row_assignments = d_row_assignment;
@@ -98,13 +94,27 @@ class LinearAssignmentProblem {
 
     while (step != 100) {
       switch (step) {
-        case 0: step = hungarianStep0(); break;
-        case 1: step = hungarianStep1(); break;
-        case 2: step = hungarianStep2(); break;
-        case 3: step = hungarianStep3(); break;
-        case 4: step = hungarianStep4(); break;
-        case 5: step = hungarianStep5(); break;
-        case 6: step = hungarianStep6(); break;
+        case 0:
+          step = hungarianStep0();
+          break;
+        case 1:
+          step = hungarianStep1();
+          break;
+        case 2:
+          step = hungarianStep2();
+          break;
+        case 3:
+          step = hungarianStep3();
+          break;
+        case 4:
+          step = hungarianStep4();
+          break;
+        case 5:
+          step = hungarianStep5();
+          break;
+        case 6:
+          step = hungarianStep6();
+          break;
       }
     }
 
@@ -112,39 +122,36 @@ class LinearAssignmentProblem {
   }
 
   // Function for getting optimal row dual vector for subproblem spId.
-  std::pair<const weight_t*, vertex_t> getRowDualVector(int spId) const
-  {
+  std::pair<const weight_t *, vertex_t> getRowDualVector(int spId) const {
     return std::make_pair(row_duals_v.data() + spId * size_, size_);
   }
 
   // Function for getting optimal col dual vector for subproblem spId.
-  std::pair<const weight_t*, vertex_t> getColDualVector(int spId)
-  {
+  std::pair<const weight_t *, vertex_t> getColDualVector(int spId) {
     return std::make_pair(col_duals_v.data() + spId * size_, size_);
   }
 
   // Function for getting optimal primal objective value for subproblem spId.
-  weight_t getPrimalObjectiveValue(int spId)
-  {
+  weight_t getPrimalObjectiveValue(int spId) {
     weight_t result;
-    raft::update_host(&result, obj_val_primal_v.data() + spId, 1, handle_.get_stream());
+    raft::update_host(&result, obj_val_primal_v.data() + spId, 1,
+                      handle_.get_stream());
     CHECK_CUDA(handle_.get_stream());
     return result;
   }
 
   // Function for getting optimal dual objective value for subproblem spId.
-  weight_t getDualObjectiveValue(int spId)
-  {
+  weight_t getDualObjectiveValue(int spId) {
     weight_t result;
-    raft::update_host(&result, obj_val_dual_v.data() + spId, 1, handle_.get_stream());
+    raft::update_host(&result, obj_val_dual_v.data() + spId, 1,
+                      handle_.get_stream());
     CHECK_CUDA(handle_.get_stream());
     return result;
   }
 
  private:
   // Helper function for initializing global variables and arrays on a single host.
-  void initializeDevice()
-  {
+  void initializeDevice() {
     row_covers_v.resize(batchsize_ * size_);
     col_covers_v.resize(batchsize_ * size_);
     row_duals_v.resize(batchsize_ * size_);
@@ -162,36 +169,39 @@ class LinearAssignmentProblem {
     d_vertices_dev.row_covers = row_covers_v.data();
     d_vertices_dev.col_covers = col_covers_v.data();
 
-    d_vertices_dev.row_duals  = row_duals_v.data();
-    d_vertices_dev.col_duals  = col_duals_v.data();
+    d_vertices_dev.row_duals = row_duals_v.data();
+    d_vertices_dev.col_duals = col_duals_v.data();
     d_vertices_dev.col_slacks = col_slacks_v.data();
 
     d_row_data_dev.is_visited = row_is_visited_v.data();
     d_col_data_dev.is_visited = col_is_visited_v.data();
-    d_row_data_dev.parents    = row_parents_v.data();
-    d_row_data_dev.children   = row_children_v.data();
-    d_col_data_dev.parents    = col_parents_v.data();
-    d_col_data_dev.children   = col_children_v.data();
-
-    thrust::fill(thrust::device, row_covers_v.begin(), row_covers_v.end(), int{0});
-    thrust::fill(thrust::device, col_covers_v.begin(), col_covers_v.end(), int{0});
-    thrust::fill(thrust::device, row_duals_v.begin(), row_duals_v.end(), weight_t{0});
-    thrust::fill(thrust::device, col_duals_v.begin(), col_duals_v.end(), weight_t{0});
+    d_row_data_dev.parents = row_parents_v.data();
+    d_row_data_dev.children = row_children_v.data();
+    d_col_data_dev.parents = col_parents_v.data();
+    d_col_data_dev.children = col_children_v.data();
+
+    thrust::fill(thrust::device, row_covers_v.begin(), row_covers_v.end(),
+                 int{0});
+    thrust::fill(thrust::device, col_covers_v.begin(), col_covers_v.end(),
+                 int{0});
+    thrust::fill(thrust::device, row_duals_v.begin(), row_duals_v.end(),
+                 weight_t{0});
+    thrust::fill(thrust::device, col_duals_v.begin(), col_duals_v.end(),
+                 weight_t{0});
   }
 
   // Function for calculating initial zeros by subtracting row and column minima from each element.
-  int hungarianStep0()
-  {
-    detail::initialReduction(handle_, d_costs_, d_vertices_dev, batchsize_, size_);
+  int hungarianStep0() {
+    detail::initialReduction(handle_, d_costs_, d_vertices_dev, batchsize_,
+                             size_);
 
     return 1;
   }
 
   // Function for calculating initial zeros by subtracting row and column minima from each element.
-  int hungarianStep1()
-  {
-    detail::computeInitialAssignments(
-      handle_, d_costs_, d_vertices_dev, batchsize_, size_, epsilon_);
+  int hungarianStep1() {
+    detail::computeInitialAssignments(handle_, d_costs_, d_vertices_dev,
+                                      batchsize_, size_, epsilon_);
 
     int next = 2;
 
@@ -207,10 +217,10 @@ class LinearAssignmentProblem {
   }
 
   // Function for checking optimality and constructing predicates and covers.
-  int hungarianStep2()
-  {
-    int cover_count = detail::computeRowCovers(
-      handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_);
+  int hungarianStep2() {
+    int cover_count =
+      detail::computeRowCovers(handle_, d_vertices_dev, d_row_data_dev,
+                               d_col_data_dev, batchsize_, size_);
 
     int next = (cover_count == batchsize_ * size_) ? 6 : 3;
 
@@ -218,23 +228,17 @@ class LinearAssignmentProblem {
   }
 
   // Function for building alternating tree rooted at unassigned rows.
-  int hungarianStep3()
-  {
+  int hungarianStep3() {
     int next;
 
-    raft::mr::device::buffer<bool> flag_v(handle_.get_device_allocator(), handle_.get_stream(), 1);
+    raft::mr::device::buffer<bool> flag_v(handle_.get_device_allocator(),
+                                          handle_.get_stream(), 1);
 
     bool h_flag = false;
     raft::update_device(flag_v.data(), &h_flag, 1, handle_.get_stream());
 
-    detail::executeZeroCover(handle_,
-                             d_costs_,
-                             d_vertices_dev,
-                             d_row_data_dev,
-                             d_col_data_dev,
-                             flag_v.data(),
-                             batchsize_,
-                             size_,
+    detail::executeZeroCover(handle_, d_costs_, d_vertices_dev, d_row_data_dev,
+                             d_col_data_dev, flag_v.data(), batchsize_, size_,
                              epsilon_);
 
     raft::update_host(&h_flag, flag_v.data(), 1, handle_.get_stream());
@@ -245,36 +249,31 @@ class LinearAssignmentProblem {
   }
 
   // Function for augmenting the solution along multiple node-disjoint alternating trees.
-  int hungarianStep4()
-  {
-    detail::reversePass(handle_, d_row_data_dev, d_col_data_dev, batchsize_, size_);
+  int hungarianStep4() {
+    detail::reversePass(handle_, d_row_data_dev, d_col_data_dev, batchsize_,
+                        size_);
 
-    detail::augmentationPass(
-      handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_);
+    detail::augmentationPass(handle_, d_vertices_dev, d_row_data_dev,
+                             d_col_data_dev, batchsize_, size_);
 
     return 2;
   }
 
   // Function for updating dual solution to introduce new zero-cost arcs.
-  int hungarianStep5()
-  {
-    detail::dualUpdate(
-      handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_, epsilon_);
+  int hungarianStep5() {
+    detail::dualUpdate(handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev,
+                       batchsize_, size_, epsilon_);
 
     return 3;
   }
 
   // Function for calculating primal and dual objective values at optimality.
-  int hungarianStep6()
-  {
-    detail::calcObjValPrimal(handle_,
-                             obj_val_primal_v.data(),
-                             d_costs_,
-                             d_vertices_dev.row_assignments,
-                             batchsize_,
-                             size_);
+  int hungarianStep6() {
+    detail::calcObjValPrimal(handle_, obj_val_primal_v.data(), d_costs_,
+                             d_vertices_dev.row_assignments, batchsize_, size_);
 
-    detail::calcObjValDual(handle_, obj_val_dual_v.data(), d_vertices_dev, batchsize_, size_);
+    detail::calcObjValDual(handle_, obj_val_dual_v.data(), d_vertices_dev,
+                           batchsize_, size_);
 
     return 100;
   }
diff --git a/cpp/include/raft/lap/lap_functions.cuh b/cpp/include/raft/lap/lap_functions.cuh
index 9bbd44bf09..0079f50e82 100644
--- a/cpp/include/raft/lap/lap_functions.cuh
+++ b/cpp/include/raft/lap/lap_functions.cuh
@@ -46,26 +46,20 @@ const int BLOCKDIMX{64};
 const int BLOCKDIMY{1};
 
 // Function for calculating grid and block dimensions from the given input size.
-inline void calculateLinearDims(dim3& blocks_per_grid,
-                                dim3& threads_per_block,
-                                int& total_blocks,
-                                int size)
-{
+inline void calculateLinearDims(dim3 &blocks_per_grid, dim3 &threads_per_block,
+                                int &total_blocks, int size) {
   threads_per_block.x = BLOCKDIMX * BLOCKDIMY;
 
   int value = size / threads_per_block.x;
   if (size % threads_per_block.x > 0) value++;
 
-  total_blocks      = value;
+  total_blocks = value;
   blocks_per_grid.x = value;
 }
 
 // Function for calculating grid and block dimensions from the given input size for square grid.
-inline void calculateSquareDims(dim3& blocks_per_grid,
-                                dim3& threads_per_block,
-                                int& total_blocks,
-                                int size)
-{
+inline void calculateSquareDims(dim3 &blocks_per_grid, dim3 &threads_per_block,
+                                int &total_blocks, int size) {
   threads_per_block.x = BLOCKDIMX;
   threads_per_block.y = BLOCKDIMY;
 
@@ -74,16 +68,15 @@ inline void calculateSquareDims(dim3& blocks_per_grid,
   int valuex = (int)ceil((float)(sq_size) / BLOCKDIMX);
   int valuey = (int)ceil((float)(sq_size) / BLOCKDIMY);
 
-  total_blocks      = valuex * valuey;
+  total_blocks = valuex * valuey;
   blocks_per_grid.x = valuex;
   blocks_per_grid.y = valuey;
 }
 
-// Function for calculating grid and block dimensions from the given input size for rectangular
-// grid.
-inline void calculateRectangularDims(
-  dim3& blocks_per_grid, dim3& threads_per_block, int& total_blocks, int xsize, int ysize)
-{
+// Function for calculating grid and block dimensions from the given input size for rectangular grid.
+inline void calculateRectangularDims(dim3 &blocks_per_grid,
+                                     dim3 &threads_per_block, int &total_blocks,
+                                     int xsize, int ysize) {
   threads_per_block.x = BLOCKDIMX;
   threads_per_block.y = BLOCKDIMY;
 
@@ -93,18 +86,16 @@ inline void calculateRectangularDims(
   int valuey = ysize / threads_per_block.y;
   if (ysize % threads_per_block.y > 0) valuey++;
 
-  total_blocks      = valuex * valuey;
+  total_blocks = valuex * valuey;
   blocks_per_grid.x = valuex;
   blocks_per_grid.y = valuey;
 }
 
 template <typename vertex_t, typename weight_t>
-inline void initialReduction(raft::handle_t const& handle,
-                             weight_t const* d_costs,
-                             Vertices<vertex_t, weight_t>& d_vertices_dev,
-                             int SP,
-                             vertex_t N)
-{
+inline void initialReduction(raft::handle_t const &handle,
+                             weight_t const *d_costs,
+                             Vertices<vertex_t, weight_t> &d_vertices_dev,
+                             int SP, vertex_t N) {
   dim3 blocks_per_grid;
   dim3 threads_per_block;
   int total_blocks = 0;
@@ -112,38 +103,34 @@ inline void initialReduction(raft::handle_t const& handle,
   raft::lap::detail::calculateRectangularDims(
     blocks_per_grid, threads_per_block, total_blocks, N, SP);
 
-  kernel_rowReduction<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
-    d_costs, d_vertices_dev.row_duals, SP, N, std::numeric_limits<weight_t>::max());
+  kernel_rowReduction<<<blocks_per_grid, threads_per_block, 0,
+                        handle.get_stream()>>>(
+    d_costs, d_vertices_dev.row_duals, SP, N,
+    std::numeric_limits<weight_t>::max());
 
   CHECK_CUDA(handle.get_stream());
-  kernel_columnReduction<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
-    d_costs,
-    d_vertices_dev.row_duals,
-    d_vertices_dev.col_duals,
-    SP,
-    N,
+  kernel_columnReduction<<<blocks_per_grid, threads_per_block, 0,
+                           handle.get_stream()>>>(
+    d_costs, d_vertices_dev.row_duals, d_vertices_dev.col_duals, SP, N,
     std::numeric_limits<weight_t>::max());
   CHECK_CUDA(handle.get_stream());
 }
 
 template <typename vertex_t, typename weight_t>
-inline void computeInitialAssignments(raft::handle_t const& handle,
-                                      weight_t const* d_costs,
-                                      Vertices<vertex_t, weight_t>& d_vertices,
-                                      int SP,
-                                      vertex_t N,
-                                      weight_t epsilon)
-{
+inline void computeInitialAssignments(raft::handle_t const &handle,
+                                      weight_t const *d_costs,
+                                      Vertices<vertex_t, weight_t> &d_vertices,
+                                      int SP, vertex_t N, weight_t epsilon) {
   dim3 blocks_per_grid;
   dim3 threads_per_block;
   int total_blocks = 0;
 
   std::size_t size = SP * N;
 
-  raft::mr::device::buffer<int> row_lock_v(
-    handle.get_device_allocator(), handle.get_stream(), size);
-  raft::mr::device::buffer<int> col_lock_v(
-    handle.get_device_allocator(), handle.get_stream(), size);
+  raft::mr::device::buffer<int> row_lock_v(handle.get_device_allocator(),
+                                           handle.get_stream(), size);
+  raft::mr::device::buffer<int> col_lock_v(handle.get_device_allocator(),
+                                           handle.get_stream(), size);
 
   thrust::fill_n(thrust::device, d_vertices.row_assignments, size, -1);
   thrust::fill_n(thrust::device, d_vertices.col_assignments, size, -1);
@@ -153,29 +140,21 @@ inline void computeInitialAssignments(raft::handle_t const& handle,
   raft::lap::detail::calculateRectangularDims(
     blocks_per_grid, threads_per_block, total_blocks, N, SP);
 
-  kernel_computeInitialAssignments<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
-    d_costs,
-    d_vertices.row_duals,
-    d_vertices.col_duals,
-    d_vertices.row_assignments,
-    d_vertices.col_assignments,
-    row_lock_v.data(),
-    col_lock_v.data(),
-    SP,
-    N,
-    epsilon);
+  kernel_computeInitialAssignments<<<blocks_per_grid, threads_per_block, 0,
+                                     handle.get_stream()>>>(
+    d_costs, d_vertices.row_duals, d_vertices.col_duals,
+    d_vertices.row_assignments, d_vertices.col_assignments, row_lock_v.data(),
+    col_lock_v.data(), SP, N, epsilon);
   CHECK_CUDA(handle.get_stream());
 }
 
 // Function for finding row cover on individual devices.
 template <typename vertex_t, typename weight_t>
-inline int computeRowCovers(raft::handle_t const& handle,
-                            Vertices<vertex_t, weight_t>& d_vertices,
-                            VertexData<vertex_t>& d_row_data,
-                            VertexData<vertex_t>& d_col_data,
-                            int SP,
-                            vertex_t N)
-{
+inline int computeRowCovers(raft::handle_t const &handle,
+                            Vertices<vertex_t, weight_t> &d_vertices,
+                            VertexData<vertex_t> &d_row_data,
+                            VertexData<vertex_t> &d_col_data, int SP,
+                            vertex_t N) {
   dim3 blocks_per_grid;
   dim3 threads_per_block;
   int total_blocks = 0;
@@ -184,7 +163,8 @@ inline int computeRowCovers(raft::handle_t const& handle,
 
   thrust::fill_n(thrust::device, d_vertices.row_covers, size, int{0});
   thrust::fill_n(thrust::device, d_vertices.col_covers, size, int{0});
-  thrust::fill_n(thrust::device, d_vertices.col_slacks, size, std::numeric_limits<weight_t>::max());
+  thrust::fill_n(thrust::device, d_vertices.col_slacks, size,
+                 std::numeric_limits<weight_t>::max());
   thrust::fill_n(thrust::device, d_row_data.is_visited, size, DORMANT);
   thrust::fill_n(thrust::device, d_col_data.is_visited, size, DORMANT);
   thrust::fill_n(thrust::device, d_row_data.parents, size, vertex_t{-1});
@@ -194,28 +174,25 @@ inline int computeRowCovers(raft::handle_t const& handle,
 
   raft::lap::detail::calculateRectangularDims(
     blocks_per_grid, threads_per_block, total_blocks, N, SP);
-  kernel_computeRowCovers<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
-    d_vertices.row_assignments, d_vertices.row_covers, d_row_data.is_visited, SP, N);
+  kernel_computeRowCovers<<<blocks_per_grid, threads_per_block, 0,
+                            handle.get_stream()>>>(
+    d_vertices.row_assignments, d_vertices.row_covers, d_row_data.is_visited,
+    SP, N);
 
   CHECK_CUDA(handle.get_stream());
 
-  return thrust::reduce(thrust::device, d_vertices.row_covers, d_vertices.row_covers + size);
+  return thrust::reduce(thrust::device, d_vertices.row_covers,
+                        d_vertices.row_covers + size);
 }
 
 // Function for covering the zeros in uncovered rows and expanding the frontier.
 template <typename vertex_t, typename weight_t>
-inline void coverZeroAndExpand(raft::handle_t const& handle,
-                               weight_t const* d_costs_dev,
-                               vertex_t const* d_rows_csr_neighbors,
-                               vertex_t const* d_rows_csr_ptrs,
-                               Vertices<vertex_t, weight_t>& d_vertices_dev,
-                               VertexData<vertex_t>& d_row_data_dev,
-                               VertexData<vertex_t>& d_col_data_dev,
-                               bool* d_flag,
-                               int SP,
-                               vertex_t N,
-                               weight_t epsilon)
-{
+inline void coverZeroAndExpand(
+  raft::handle_t const &handle, weight_t const *d_costs_dev,
+  vertex_t const *d_rows_csr_neighbors, vertex_t const *d_rows_csr_ptrs,
+  Vertices<vertex_t, weight_t> &d_vertices_dev,
+  VertexData<vertex_t> &d_row_data_dev, VertexData<vertex_t> &d_col_data_dev,
+  bool *d_flag, int SP, vertex_t N, weight_t epsilon) {
   int total_blocks = 0;
   dim3 blocks_per_grid;
   dim3 threads_per_block;
@@ -223,34 +200,24 @@ inline void coverZeroAndExpand(raft::handle_t const& handle,
   raft::lap::detail::calculateRectangularDims(
     blocks_per_grid, threads_per_block, total_blocks, N, SP);
 
-  kernel_coverAndExpand<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
-    d_flag,
-    d_rows_csr_ptrs,
-    d_rows_csr_neighbors,
-    d_costs_dev,
-    d_vertices_dev,
-    d_row_data_dev,
-    d_col_data_dev,
-    SP,
-    N,
-    epsilon);
+  kernel_coverAndExpand<<<blocks_per_grid, threads_per_block, 0,
+                          handle.get_stream()>>>(
+    d_flag, d_rows_csr_ptrs, d_rows_csr_neighbors, d_costs_dev, d_vertices_dev,
+    d_row_data_dev, d_col_data_dev, SP, N, epsilon);
 }
 
 template <typename vertex_t, typename weight_t>
-inline vertex_t zeroCoverIteration(raft::handle_t const& handle,
-                                   weight_t const* d_costs_dev,
-                                   Vertices<vertex_t, weight_t>& d_vertices_dev,
-                                   VertexData<vertex_t>& d_row_data_dev,
-                                   VertexData<vertex_t>& d_col_data_dev,
-                                   bool* d_flag,
-                                   int SP,
-                                   vertex_t N,
-                                   weight_t epsilon)
-{
+inline vertex_t zeroCoverIteration(raft::handle_t const &handle,
+                                   weight_t const *d_costs_dev,
+                                   Vertices<vertex_t, weight_t> &d_vertices_dev,
+                                   VertexData<vertex_t> &d_row_data_dev,
+                                   VertexData<vertex_t> &d_col_data_dev,
+                                   bool *d_flag, int SP, vertex_t N,
+                                   weight_t epsilon) {
   vertex_t M;
 
-  raft::mr::device::buffer<vertex_t> csr_ptrs_v(
-    handle.get_device_allocator(), handle.get_stream(), 0);
+  raft::mr::device::buffer<vertex_t> csr_ptrs_v(handle.get_device_allocator(),
+                                                handle.get_stream(), 0);
   raft::mr::device::buffer<vertex_t> csr_neighbors_v(
     handle.get_device_allocator(), handle.get_stream(), 0);
 
@@ -259,8 +226,8 @@ inline vertex_t zeroCoverIteration(raft::handle_t const& handle,
     dim3 threads_per_block;
     int total_blocks = 0;
 
-    raft::mr::device::buffer<bool> predicates_v(
-      handle.get_device_allocator(), handle.get_stream(), SP * N);
+    raft::mr::device::buffer<bool> predicates_v(handle.get_device_allocator(),
+                                                handle.get_stream(), SP * N);
     raft::mr::device::buffer<vertex_t> addresses_v(
       handle.get_device_allocator(), handle.get_stream(), SP * N);
 
@@ -275,108 +242,87 @@ inline vertex_t zeroCoverIteration(raft::handle_t const& handle,
       blocks_per_grid, threads_per_block, total_blocks, N, SP);
 
     // construct predicate matrix for edges.
-    kernel_rowPredicateConstructionCSR<<<blocks_per_grid,
-                                         threads_per_block,
-                                         0,
+    kernel_rowPredicateConstructionCSR<<<blocks_per_grid, threads_per_block, 0,
                                          handle.get_stream()>>>(
-      predicates_v.data(), addresses_v.data(), d_row_data_dev.is_visited, SP, N);
+      predicates_v.data(), addresses_v.data(), d_row_data_dev.is_visited, SP,
+      N);
     CHECK_CUDA(handle.get_stream());
 
     M = thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end());
-    thrust::exclusive_scan(
-      thrust::device, addresses_v.begin(), addresses_v.end(), addresses_v.begin());
+    thrust::exclusive_scan(thrust::device, addresses_v.begin(),
+                           addresses_v.end(), addresses_v.begin());
 
     if (M > 0) {
       csr_neighbors_v.resize(M);
 
-      kernel_rowScatterCSR<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
-        predicates_v.data(),
-        addresses_v.data(),
-        csr_neighbors_v.data(),
-        csr_ptrs_v.data(),
-        M,
-        SP,
-        N);
+      kernel_rowScatterCSR<<<blocks_per_grid, threads_per_block, 0,
+                             handle.get_stream()>>>(
+        predicates_v.data(), addresses_v.data(), csr_neighbors_v.data(),
+        csr_ptrs_v.data(), M, SP, N);
 
       CHECK_CUDA(handle.get_stream());
     }
   }
 
   if (M > 0) {
-    coverZeroAndExpand(handle,
-                       d_costs_dev,
-                       csr_neighbors_v.data(),
-                       csr_ptrs_v.data(),
-                       d_vertices_dev,
-                       d_row_data_dev,
-                       d_col_data_dev,
-                       d_flag,
-                       SP,
-                       N,
-                       epsilon);
+    coverZeroAndExpand(handle, d_costs_dev, csr_neighbors_v.data(),
+                       csr_ptrs_v.data(), d_vertices_dev, d_row_data_dev,
+                       d_col_data_dev, d_flag, SP, N, epsilon);
   }
 
   return M;
 }
 
-// Function for executing recursive zero cover. Returns the next step (Step 4 or Step 5) depending
-// on the presence of uncovered zeros.
+// Function for executing recursive zero cover. Returns the next step (Step 4 or Step 5) depending on the presence of uncovered zeros.
 template <typename vertex_t, typename weight_t>
-inline void executeZeroCover(raft::handle_t const& handle,
-                             weight_t const* d_costs_dev,
-                             Vertices<vertex_t, weight_t>& d_vertices_dev,
-                             VertexData<vertex_t>& d_row_data_dev,
-                             VertexData<vertex_t>& d_col_data_dev,
-                             bool* d_flag,
-                             int SP,
-                             vertex_t N,
-                             weight_t epsilon)
-{
+inline void executeZeroCover(raft::handle_t const &handle,
+                             weight_t const *d_costs_dev,
+                             Vertices<vertex_t, weight_t> &d_vertices_dev,
+                             VertexData<vertex_t> &d_row_data_dev,
+                             VertexData<vertex_t> &d_col_data_dev, bool *d_flag,
+                             int SP, vertex_t N, weight_t epsilon) {
   vertex_t M = 1;
   while (M > 0) {
-    M = zeroCoverIteration(
-      handle, d_costs_dev, d_vertices_dev, d_row_data_dev, d_col_data_dev, d_flag, SP, N, epsilon);
+    M = zeroCoverIteration(handle, d_costs_dev, d_vertices_dev, d_row_data_dev,
+                           d_col_data_dev, d_flag, SP, N, epsilon);
   }
 }
 
 // Function for executing reverse pass of the maximum matching.
 template <typename vertex_t>
-inline void reversePass(raft::handle_t const& handle,
-                        VertexData<vertex_t>& d_row_data_dev,
-                        VertexData<vertex_t>& d_col_data_dev,
-                        int SP,
-                        int N)
-{
+inline void reversePass(raft::handle_t const &handle,
+                        VertexData<vertex_t> &d_row_data_dev,
+                        VertexData<vertex_t> &d_col_data_dev, int SP, int N) {
   int total_blocks = 0;
   dim3 blocks_per_grid;
   dim3 threads_per_block;
 
   std::size_t size = SP * N;
 
-  raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, size);
+  raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block,
+                                         total_blocks, size);
 
-  raft::mr::device::buffer<bool> predicates_v(
-    handle.get_device_allocator(), handle.get_stream(), size);
-  raft::mr::device::buffer<vertex_t> addresses_v(
-    handle.get_device_allocator(), handle.get_stream(), size);
+  raft::mr::device::buffer<bool> predicates_v(handle.get_device_allocator(),
+                                              handle.get_stream(), size);
+  raft::mr::device::buffer<vertex_t> addresses_v(handle.get_device_allocator(),
+                                                 handle.get_stream(), size);
 
   thrust::fill_n(thrust::device, predicates_v.data(), size, false);
   thrust::fill_n(thrust::device, addresses_v.data(), size, vertex_t{0});
 
   // compact the reverse pass row vertices.
-  kernel_augmentPredicateConstruction<<<blocks_per_grid,
-                                        threads_per_block,
-                                        0,
+  kernel_augmentPredicateConstruction<<<blocks_per_grid, threads_per_block, 0,
                                         handle.get_stream()>>>(
     predicates_v.data(), addresses_v.data(), d_col_data_dev.is_visited, size);
 
   CHECK_CUDA(handle.get_stream());
 
   // calculate total number of vertices.
-  std::size_t csr_size = thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end());
+  std::size_t csr_size =
+    thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end());
   // exclusive scan for calculating the scatter addresses.
-  thrust::exclusive_scan(
-    thrust::device, addresses_v.begin(), addresses_v.end(), addresses_v.begin());
+  thrust::exclusive_scan(thrust::device, addresses_v.begin(), addresses_v.end(),
+                         addresses_v.begin());
 
   if (csr_size > 0) {
     int total_blocks_1 = 0;
@@ -388,12 +334,14 @@ inline void reversePass(raft::handle_t const& handle,
     raft::mr::device::buffer<vertex_t> elements_v(
       handle.get_device_allocator(), handle.get_stream(), csr_size);
 
-    kernel_augmentScatter<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
+    kernel_augmentScatter<<<blocks_per_grid, threads_per_block, 0,
+                            handle.get_stream()>>>(
       elements_v.data(), predicates_v.data(), addresses_v.data(), size);
 
     CHECK_CUDA(handle.get_stream());
 
-    kernel_reverseTraversal<<<blocks_per_grid_1, threads_per_block_1, 0, handle.get_stream()>>>(
+    kernel_reverseTraversal<<<blocks_per_grid_1, threads_per_block_1, 0,
+                              handle.get_stream()>>>(
       elements_v.data(), d_row_data_dev, d_col_data_dev, csr_size);
     CHECK_CUDA(handle.get_stream());
   }
@@ -401,30 +349,27 @@ inline void reversePass(raft::handle_t const& handle,
 
 // Function for executing augmentation pass of the maximum matching.
 template <typename vertex_t, typename weight_t>
-inline void augmentationPass(raft::handle_t const& handle,
-                             Vertices<vertex_t, weight_t>& d_vertices_dev,
-                             VertexData<vertex_t>& d_row_data_dev,
-                             VertexData<vertex_t>& d_col_data_dev,
-                             int SP,
-                             int N)
-{
+inline void augmentationPass(raft::handle_t const &handle,
+                             Vertices<vertex_t, weight_t> &d_vertices_dev,
+                             VertexData<vertex_t> &d_row_data_dev,
+                             VertexData<vertex_t> &d_col_data_dev, int SP,
+                             int N) {
   int total_blocks = 0;
   dim3 blocks_per_grid;
   dim3 threads_per_block;
-  raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP * N);
+  raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block,
+                                         total_blocks, SP * N);
 
-  raft::mr::device::buffer<bool> predicates_v(
-    handle.get_device_allocator(), handle.get_stream(), SP * N);
-  raft::mr::device::buffer<vertex_t> addresses_v(
-    handle.get_device_allocator(), handle.get_stream(), SP * N);
+  raft::mr::device::buffer<bool> predicates_v(handle.get_device_allocator(),
+                                              handle.get_stream(), SP * N);
+  raft::mr::device::buffer<vertex_t> addresses_v(handle.get_device_allocator(),
+                                                 handle.get_stream(), SP * N);
 
   thrust::fill_n(thrust::device, predicates_v.data(), SP * N, false);
   thrust::fill_n(thrust::device, addresses_v.data(), SP * N, vertex_t{0});
 
   // compact the reverse pass row vertices.
-  kernel_augmentPredicateConstruction<<<blocks_per_grid,
-                                        threads_per_block,
-                                        0,
+  kernel_augmentPredicateConstruction<<<blocks_per_grid, threads_per_block, 0,
                                         handle.get_stream()>>>(
     predicates_v.data(), addresses_v.data(), d_row_data_dev.is_visited, SP * N);
 
@@ -435,8 +380,8 @@ inline void augmentationPass(raft::handle_t const& handle,
   vertex_t row_ids_csr_size =
     thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end());
   // exclusive scan for calculating the scatter addresses.
-  thrust::exclusive_scan(
-    thrust::device, addresses_v.begin(), addresses_v.end(), addresses_v.begin());
+  thrust::exclusive_scan(thrust::device, addresses_v.begin(), addresses_v.end(),
+                         addresses_v.begin());
 
   if (row_ids_csr_size > 0) {
     int total_blocks_1 = 0;
@@ -448,18 +393,17 @@ inline void augmentationPass(raft::handle_t const& handle,
     raft::mr::device::buffer<vertex_t> elements_v(
       handle.get_device_allocator(), handle.get_stream(), row_ids_csr_size);
 
-    kernel_augmentScatter<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
-      elements_v.data(), predicates_v.data(), addresses_v.data(), vertex_t{SP * N});
+    kernel_augmentScatter<<<blocks_per_grid, threads_per_block, 0,
+                            handle.get_stream()>>>(
+      elements_v.data(), predicates_v.data(), addresses_v.data(),
+      vertex_t{SP * N});
 
     CHECK_CUDA(handle.get_stream());
 
-    kernel_augmentation<<<blocks_per_grid_1, threads_per_block_1, 0, handle.get_stream()>>>(
-      d_vertices_dev.row_assignments,
-      d_vertices_dev.col_assignments,
-      elements_v.data(),
-      d_row_data_dev,
-      d_col_data_dev,
-      vertex_t{N},
+    kernel_augmentation<<<blocks_per_grid_1, threads_per_block_1, 0,
+                          handle.get_stream()>>>(
+      d_vertices_dev.row_assignments, d_vertices_dev.col_assignments,
+      elements_v.data(), d_row_data_dev, d_col_data_dev, vertex_t{N},
       row_ids_csr_size);
 
     CHECK_CUDA(handle.get_stream());
@@ -467,46 +411,35 @@ inline void augmentationPass(raft::handle_t const& handle,
 }
 
 template <typename vertex_t, typename weight_t>
-inline void dualUpdate(raft::handle_t const& handle,
-                       Vertices<vertex_t, weight_t>& d_vertices_dev,
-                       VertexData<vertex_t>& d_row_data_dev,
-                       VertexData<vertex_t>& d_col_data_dev,
-                       int SP,
-                       vertex_t N,
-                       weight_t epsilon)
-{
+inline void dualUpdate(raft::handle_t const &handle,
+                       Vertices<vertex_t, weight_t> &d_vertices_dev,
+                       VertexData<vertex_t> &d_row_data_dev,
+                       VertexData<vertex_t> &d_col_data_dev, int SP, vertex_t N,
+                       weight_t epsilon) {
   dim3 blocks_per_grid;
   dim3 threads_per_block;
   int total_blocks;
 
-  raft::mr::device::buffer<weight_t> sp_min_v(
-    handle.get_device_allocator(), handle.get_stream(), 1);
+  raft::mr::device::buffer<weight_t> sp_min_v(handle.get_device_allocator(),
+                                              handle.get_stream(), 1);
 
-  raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP);
-  kernel_dualUpdate_1<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
-    sp_min_v.data(),
-    d_vertices_dev.col_slacks,
-    d_vertices_dev.col_covers,
-    SP,
-    N,
-    std::numeric_limits<weight_t>::max());
+  raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block,
+                                         total_blocks, SP);
+  kernel_dualUpdate_1<<<blocks_per_grid, threads_per_block, 0,
+                        handle.get_stream()>>>(
+    sp_min_v.data(), d_vertices_dev.col_slacks, d_vertices_dev.col_covers, SP,
+    N, std::numeric_limits<weight_t>::max());
 
   CHECK_CUDA(handle.get_stream());
 
   raft::lap::detail::calculateRectangularDims(
     blocks_per_grid, threads_per_block, total_blocks, N, SP);
-  kernel_dualUpdate_2<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
-    sp_min_v.data(),
-    d_vertices_dev.row_duals,
-    d_vertices_dev.col_duals,
-    d_vertices_dev.col_slacks,
-    d_vertices_dev.row_covers,
-    d_vertices_dev.col_covers,
-    d_row_data_dev.is_visited,
-    d_col_data_dev.parents,
-    SP,
-    N,
-    std::numeric_limits<weight_t>::max(),
+  kernel_dualUpdate_2<<<blocks_per_grid, threads_per_block, 0,
+                        handle.get_stream()>>>(
+    sp_min_v.data(), d_vertices_dev.row_duals, d_vertices_dev.col_duals,
+    d_vertices_dev.col_slacks, d_vertices_dev.row_covers,
+    d_vertices_dev.col_covers, d_row_data_dev.is_visited,
+    d_col_data_dev.parents, SP, N, std::numeric_limits<weight_t>::max(),
     epsilon);
 
   CHECK_CUDA(handle.get_stream());
@@ -514,19 +447,18 @@ inline void dualUpdate(raft::handle_t const& handle,
 
 // Function for calculating optimal objective function value using dual variables.
 template <typename vertex_t, typename weight_t>
-inline void calcObjValDual(raft::handle_t const& handle,
-                           weight_t* d_obj_val,
-                           Vertices<vertex_t, weight_t>& d_vertices_dev,
-                           int SP,
-                           int N)
-{
+inline void calcObjValDual(raft::handle_t const &handle, weight_t *d_obj_val,
+                           Vertices<vertex_t, weight_t> &d_vertices_dev, int SP,
+                           int N) {
   dim3 blocks_per_grid;
   dim3 threads_per_block;
   int total_blocks = 0;
 
-  raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP);
+  raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block,
+                                         total_blocks, SP);
 
-  kernel_calcObjValDual<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
+  kernel_calcObjValDual<<<blocks_per_grid, threads_per_block, 0,
+                          handle.get_stream()>>>(
     d_obj_val, d_vertices_dev.row_duals, d_vertices_dev.col_duals, SP, N);
 
   CHECK_CUDA(handle.get_stream());
@@ -534,21 +466,20 @@ inline void calcObjValDual(raft::handle_t const& handle,
 
 // Function for calculating optimal objective function value using dual variables.
 template <typename vertex_t, typename weight_t>
-inline void calcObjValPrimal(raft::handle_t const& handle,
-                             weight_t* d_obj_val,
-                             weight_t const* d_costs,
-                             vertex_t const* d_row_assignments,
-                             int SP,
-                             vertex_t N)
-{
+inline void calcObjValPrimal(raft::handle_t const &handle, weight_t *d_obj_val,
+                             weight_t const *d_costs,
+                             vertex_t const *d_row_assignments, int SP,
+                             vertex_t N) {
   dim3 blocks_per_grid;
   dim3 threads_per_block;
   int total_blocks = 0;
 
-  raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP);
+  raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block,
+                                         total_blocks, SP);
 
-  kernel_calcObjValPrimal<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
-    d_obj_val, d_costs, d_row_assignments, SP, N);
+  kernel_calcObjValPrimal<<<blocks_per_grid, threads_per_block, 0,
+                            handle.get_stream()>>>(d_obj_val, d_costs,
+                                                   d_row_assignments, SP, N);
 
   CHECK_CUDA(handle.get_stream());
 }
diff --git a/cpp/include/raft/lap/lap_kernels.cuh b/cpp/include/raft/lap/lap_kernels.cuh
index 45ad23afd1..8c9012ed72 100644
--- a/cpp/include/raft/lap/lap_kernels.cuh
+++ b/cpp/include/raft/lap/lap_kernels.cuh
@@ -48,57 +48,42 @@ const int AUGMENT{4};
 const int MODIFIED{5};
 
 template <typename weight_t>
-bool __device__ near_zero(weight_t w, weight_t epsilon)
-{
+bool __device__ near_zero(weight_t w, weight_t epsilon) {
   return ((w > -epsilon) && (w < epsilon));
 }
 
 template <>
-bool __device__ near_zero<int32_t>(int32_t w, int32_t epsilon)
-{
+bool __device__ near_zero<int32_t>(int32_t w, int32_t epsilon) {
   return (w == 0);
 }
 
 template <>
-bool __device__ near_zero<int64_t>(int64_t w, int64_t epsilon)
-{
+bool __device__ near_zero<int64_t>(int64_t w, int64_t epsilon) {
   return (w == 0);
 }
 
-// Device function for traversing the neighbors from start pointer to end pointer and updating the
-// covers. The function sets d_next to 4 if there are uncovered zeros, indicating the requirement of
-// Step 4 execution.
+// Device function for traversing the neighbors from start pointer to end pointer and updating the covers.
+// The function sets d_next to 4 if there are uncovered zeros, indicating the requirement of Step 4 execution.
 template <typename vertex_t, typename weight_t>
-__device__ void cover_and_expand_row(weight_t const* d_elements,
-                                     weight_t const* d_row_duals,
-                                     weight_t const* d_col_duals,
-                                     weight_t* d_col_slacks,
-                                     int* d_row_covers,
-                                     int* d_col_covers,
-                                     vertex_t const* d_col_assignments,
-                                     bool* d_flag,
-                                     vertex_t* d_row_parents,
-                                     vertex_t* d_col_parents,
-                                     int* d_row_visited,
-                                     int* d_col_visited,
-                                     vertex_t rowid,
-                                     int spid,
-                                     int colid,
-                                     vertex_t N,
-                                     weight_t epsilon)
-{
+__device__ void cover_and_expand_row(
+  weight_t const *d_elements, weight_t const *d_row_duals,
+  weight_t const *d_col_duals, weight_t *d_col_slacks, int *d_row_covers,
+  int *d_col_covers, vertex_t const *d_col_assignments, bool *d_flag,
+  vertex_t *d_row_parents, vertex_t *d_col_parents, int *d_row_visited,
+  int *d_col_visited, vertex_t rowid, int spid, int colid, vertex_t N,
+  weight_t epsilon) {
   int ROWID = spid * N + rowid;
   int COLID = spid * N + colid;
 
-  weight_t slack =
-    d_elements[spid * N * N + rowid * N + colid] - d_row_duals[ROWID] - d_col_duals[COLID];
+  weight_t slack = d_elements[spid * N * N + rowid * N + colid] -
+                   d_row_duals[ROWID] - d_col_duals[COLID];
 
   int nxt_rowid = d_col_assignments[COLID];
   int NXT_ROWID = spid * N + nxt_rowid;
 
   if (rowid != nxt_rowid && d_col_covers[COLID] == 0) {
     if (slack < d_col_slacks[COLID]) {
-      d_col_slacks[COLID]  = slack;
+      d_col_slacks[COLID] = slack;
       d_col_parents[COLID] = ROWID;
     }
 
@@ -107,12 +92,13 @@ __device__ void cover_and_expand_row(weight_t const* d_elements,
         d_row_parents[NXT_ROWID] = COLID;  // update parent info
 
         d_row_covers[NXT_ROWID] = 0;
-        d_col_covers[COLID]     = 1;
+        d_col_covers[COLID] = 1;
 
-        if (d_row_visited[NXT_ROWID] != VISITED) d_row_visited[NXT_ROWID] = ACTIVE;
+        if (d_row_visited[NXT_ROWID] != VISITED)
+          d_row_visited[NXT_ROWID] = ACTIVE;
       } else {
         d_col_visited[COLID] = REVERSE;
-        *d_flag              = true;
+        *d_flag = true;
       }
     }
   }
@@ -121,34 +107,28 @@ __device__ void cover_and_expand_row(weight_t const* d_elements,
 
 // Device function for traversing an alternating path from unassigned row to unassigned column.
 template <typename vertex_t>
-__device__ void __reverse_traversal(int* d_row_visited,
-                                    vertex_t* d_row_children,
-                                    vertex_t* d_col_children,
-                                    vertex_t const* d_row_parents,
-                                    vertex_t const* d_col_parents,
-                                    int cur_colid)
-{
+__device__ void __reverse_traversal(
+  int *d_row_visited, vertex_t *d_row_children, vertex_t *d_col_children,
+  vertex_t const *d_row_parents, vertex_t const *d_col_parents, int cur_colid) {
   int cur_rowid = -1;
 
   while (cur_colid != -1) {
     d_col_children[cur_colid] = cur_rowid;
-    cur_rowid                 = d_col_parents[cur_colid];
+    cur_rowid = d_col_parents[cur_colid];
 
     d_row_children[cur_rowid] = cur_colid;
-    cur_colid                 = d_row_parents[cur_rowid];
+    cur_colid = d_row_parents[cur_rowid];
   }
   d_row_visited[cur_rowid] = AUGMENT;
 }
 
 // Device function for augmenting the alternating path from unassigned column to unassigned row.
 template <typename vertex_t>
-__device__ void __augment(vertex_t* d_row_assignments,
-                          vertex_t* d_col_assignments,
-                          vertex_t const* d_row_children,
-                          vertex_t const* d_col_children,
-                          vertex_t cur_rowid,
-                          vertex_t N)
-{
+__device__ void __augment(vertex_t *d_row_assignments,
+                          vertex_t *d_col_assignments,
+                          vertex_t const *d_row_children,
+                          vertex_t const *d_col_children, vertex_t cur_rowid,
+                          vertex_t N) {
   int cur_colid = -1;
 
   while (cur_rowid != -1) {
@@ -165,18 +145,20 @@ __device__ void __augment(vertex_t* d_row_assignments,
 //  FIXME:  Once cuda 10.2 is the standard should replace passing infinity
 //          here with using cuda::std::numeric_limits<weight_t>::max()
 template <typename vertex_t, typename weight_t>
-__global__ void kernel_rowReduction(
-  weight_t const* d_costs, weight_t* d_row_duals, int SP, vertex_t N, weight_t infinity)
-{
-  int spid     = blockIdx.y * blockDim.y + threadIdx.y;
-  int rowid    = blockIdx.x * blockDim.x + threadIdx.x;
+__global__ void kernel_rowReduction(weight_t const *d_costs,
+                                    weight_t *d_row_duals, int SP, vertex_t N,
+                                    weight_t infinity) {
+  int spid = blockIdx.y * blockDim.y + threadIdx.y;
+  int rowid = blockIdx.x * blockDim.x + threadIdx.x;
   weight_t min = infinity;
 
   if (spid < SP && rowid < N) {
     for (int colid = 0; colid < N; colid++) {
       weight_t slack = d_costs[spid * N * N + rowid * N + colid];
 
-      if (slack < min) { min = slack; }
+      if (slack < min) {
+        min = slack;
+      }
     }
 
     d_row_duals[spid * N + rowid] = min;
@@ -187,26 +169,25 @@ __global__ void kernel_rowReduction(
 //  FIXME:  Once cuda 10.2 is the standard should replace passing infinity
 //          here with using cuda::std::numeric_limits<weight_t>::max()
 template <typename vertex_t, typename weight_t>
-__global__ void kernel_columnReduction(weight_t const* d_costs,
-                                       weight_t const* d_row_duals,
-                                       weight_t* d_col_duals,
-                                       int SP,
-                                       vertex_t N,
-                                       weight_t infinity)
-{
-  int spid  = blockIdx.y * blockDim.y + threadIdx.y;
+__global__ void kernel_columnReduction(weight_t const *d_costs,
+                                       weight_t const *d_row_duals,
+                                       weight_t *d_col_duals, int SP,
+                                       vertex_t N, weight_t infinity) {
+  int spid = blockIdx.y * blockDim.y + threadIdx.y;
   int colid = blockIdx.x * blockDim.x + threadIdx.x;
 
   weight_t min = infinity;
 
   if (spid < SP && colid < N) {
     for (int rowid = 0; rowid < N; rowid++) {
-      weight_t cost     = d_costs[spid * N * N + rowid * N + colid];
+      weight_t cost = d_costs[spid * N * N + rowid * N + colid];
       weight_t row_dual = d_row_duals[spid * N + rowid];
 
       weight_t slack = cost - row_dual;
 
-      if (slack < min) { min = slack; }
+      if (slack < min) {
+        min = slack;
+      }
     }
 
     d_col_duals[spid * N + colid] = min;
@@ -215,18 +196,12 @@ __global__ void kernel_columnReduction(weight_t const* d_costs,
 
 // Kernel for calculating initial assignments.
 template <typename vertex_t, typename weight_t>
-__global__ void kernel_computeInitialAssignments(weight_t const* d_costs,
-                                                 weight_t const* d_row_duals,
-                                                 weight_t const* d_col_duals,
-                                                 vertex_t* d_row_assignments,
-                                                 vertex_t* d_col_assignments,
-                                                 int* d_row_lock,
-                                                 int* d_col_lock,
-                                                 int SP,
-                                                 vertex_t N,
-                                                 weight_t epsilon)
-{
-  int spid  = blockIdx.y * blockDim.y + threadIdx.y;
+__global__ void kernel_computeInitialAssignments(
+  weight_t const *d_costs, weight_t const *d_row_duals,
+  weight_t const *d_col_duals, vertex_t *d_row_assignments,
+  vertex_t *d_col_assignments, int *d_row_lock, int *d_col_lock, int SP,
+  vertex_t N, weight_t epsilon) {
+  int spid = blockIdx.y * blockDim.y + threadIdx.y;
   int colid = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (spid < SP && colid < N) {
@@ -238,15 +213,15 @@ __global__ void kernel_computeInitialAssignments(weight_t const* d_costs,
 
       if (d_col_lock[overall_colid] == 1) break;
 
-      weight_t cost     = d_costs[spid * N * N + rowid * N + colid];
+      weight_t cost = d_costs[spid * N * N + rowid * N + colid];
       weight_t row_dual = d_row_duals[overall_rowid];
-      weight_t slack    = cost - row_dual - col_dual;
+      weight_t slack = cost - row_dual - col_dual;
 
       if (near_zero(slack, epsilon)) {
         if (atomicCAS(&d_row_lock[overall_rowid], 0, 1) == 0) {
           d_row_assignments[overall_rowid] = colid;
           d_col_assignments[overall_colid] = rowid;
-          d_col_lock[overall_colid]        = 1;
+          d_col_lock[overall_colid] = 1;
         }
       }
     }
@@ -255,10 +230,10 @@ __global__ void kernel_computeInitialAssignments(weight_t const* d_costs,
 
 // Kernel for populating the cover arrays and initializing alternating tree.
 template <typename vertex_t>
-__global__ void kernel_computeRowCovers(
-  vertex_t* d_row_assignments, int* d_row_covers, int* d_row_visited, int SP, vertex_t N)
-{
-  int spid  = blockIdx.y * blockDim.y + threadIdx.y;
+__global__ void kernel_computeRowCovers(vertex_t *d_row_assignments,
+                                        int *d_row_covers, int *d_row_visited,
+                                        int SP, vertex_t N) {
+  int spid = blockIdx.y * blockDim.y + threadIdx.y;
   int rowid = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (spid < SP && rowid < N) {
@@ -274,10 +249,11 @@ __global__ void kernel_computeRowCovers(
 
 // Kernel for populating the predicate matrix for edges in row major format.
 template <typename vertex_t>
-__global__ void kernel_rowPredicateConstructionCSR(
-  bool* d_predicates, vertex_t* d_addresses, int* d_row_visited, int SP, vertex_t N)
-{
-  int spid  = blockIdx.y * blockDim.y + threadIdx.y;
+__global__ void kernel_rowPredicateConstructionCSR(bool *d_predicates,
+                                                   vertex_t *d_addresses,
+                                                   int *d_row_visited, int SP,
+                                                   vertex_t N) {
+  int spid = blockIdx.y * blockDim.y + threadIdx.y;
   int rowid = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (spid < SP && rowid < N) {
@@ -285,160 +261,130 @@ __global__ void kernel_rowPredicateConstructionCSR(
 
     if (d_row_visited[index] == ACTIVE) {
       d_predicates[index] = true;
-      d_addresses[index]  = 1;
+      d_addresses[index] = 1;
     } else {
       d_predicates[index] = false;
-      d_addresses[index]  = 0;
+      d_addresses[index] = 0;
     }
   }
 }
 
 // Kernel for scattering the edges based on the scatter addresses.
 template <typename vertex_t>
-__global__ void kernel_rowScatterCSR(bool const* d_predicates,
-                                     vertex_t const* d_addresses,
-                                     vertex_t* d_neighbors,
-                                     vertex_t* d_ptrs,
-                                     vertex_t M,
-                                     int SP,
-                                     vertex_t N)
-{
-  int spid  = blockIdx.y * blockDim.y + threadIdx.y;
+__global__ void kernel_rowScatterCSR(bool const *d_predicates,
+                                     vertex_t const *d_addresses,
+                                     vertex_t *d_neighbors, vertex_t *d_ptrs,
+                                     vertex_t M, int SP, vertex_t N) {
+  int spid = blockIdx.y * blockDim.y + threadIdx.y;
   int rowid = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (spid < SP && rowid < N) {
     int index = spid * N + rowid;
 
-    bool predicate  = d_predicates[index];
+    bool predicate = d_predicates[index];
     vertex_t compid = d_addresses[index];
 
-    if (predicate) { d_neighbors[compid] = rowid; }
+    if (predicate) {
+      d_neighbors[compid] = rowid;
+    }
     if (rowid == 0) {
       d_ptrs[spid] = compid;
-      d_ptrs[SP]   = M;
+      d_ptrs[SP] = M;
     }
   }
 }
 
 // Kernel for finding the minimum zero cover.
 template <typename vertex_t, typename weight_t>
-__global__ void kernel_coverAndExpand(bool* d_flag,
-                                      vertex_t const* d_ptrs,
-                                      vertex_t const* d_neighbors,
-                                      weight_t const* d_elements,
+__global__ void kernel_coverAndExpand(bool *d_flag, vertex_t const *d_ptrs,
+                                      vertex_t const *d_neighbors,
+                                      weight_t const *d_elements,
                                       Vertices<vertex_t, weight_t> d_vertices,
                                       VertexData<vertex_t> d_row_data,
-                                      VertexData<vertex_t> d_col_data,
-                                      int SP,
-                                      vertex_t N,
-                                      weight_t epsilon)
-{
-  int spid  = blockIdx.y * blockDim.y + threadIdx.y;
+                                      VertexData<vertex_t> d_col_data, int SP,
+                                      vertex_t N, weight_t epsilon) {
+  int spid = blockIdx.y * blockDim.y + threadIdx.y;
   int colid = blockIdx.x * blockDim.x + threadIdx.x;
 
   // Load values into local memory
 
   if (spid < SP && colid < N) {
     thrust::for_each(
-      thrust::seq,
-      d_neighbors + d_ptrs[spid],
-      d_neighbors + d_ptrs[spid + 1],
-      [d_elements, d_vertices, d_flag, d_row_data, d_col_data, spid, colid, N, epsilon] __device__(
-        vertex_t rowid) {
-        cover_and_expand_row(d_elements,
-                             d_vertices.row_duals,
-                             d_vertices.col_duals,
-                             d_vertices.col_slacks,
-                             d_vertices.row_covers,
-                             d_vertices.col_covers,
-                             d_vertices.col_assignments,
-                             d_flag,
-                             d_row_data.parents,
-                             d_col_data.parents,
-                             d_row_data.is_visited,
-                             d_col_data.is_visited,
-                             rowid,
-                             spid,
-                             colid,
-                             N,
-                             epsilon);
+      thrust::seq, d_neighbors + d_ptrs[spid], d_neighbors + d_ptrs[spid + 1],
+      [d_elements, d_vertices, d_flag, d_row_data, d_col_data, spid, colid, N,
+       epsilon] __device__(vertex_t rowid) {
+        cover_and_expand_row(
+          d_elements, d_vertices.row_duals, d_vertices.col_duals,
+          d_vertices.col_slacks, d_vertices.row_covers, d_vertices.col_covers,
+          d_vertices.col_assignments, d_flag, d_row_data.parents,
+          d_col_data.parents, d_row_data.is_visited, d_col_data.is_visited,
+          rowid, spid, colid, N, epsilon);
       });
   }
 }
 
 // Kernel for constructing the predicates for reverse pass or augmentation candidates.
 template <typename vertex_t>
-__global__ void kernel_augmentPredicateConstruction(bool* d_predicates,
-                                                    vertex_t* d_addresses,
-                                                    int* d_visited,
-                                                    int size)
-{
+__global__ void kernel_augmentPredicateConstruction(bool *d_predicates,
+                                                    vertex_t *d_addresses,
+                                                    int *d_visited, int size) {
   int id = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (id < size) {
     int visited = d_visited[id];
     if ((visited == REVERSE) || (visited == AUGMENT)) {
       d_predicates[id] = true;
-      d_addresses[id]  = 1;
+      d_addresses[id] = 1;
     } else {
       d_predicates[id] = false;
-      d_addresses[id]  = 0;
+      d_addresses[id] = 0;
     }
   }
 }
 
 // Kernel for scattering the vertices based on the scatter addresses.
 template <typename vertex_t>
-__global__ void kernel_augmentScatter(vertex_t* d_elements,
-                                      bool const* d_predicates,
-                                      vertex_t const* d_addresses,
-                                      std::size_t size)
-{
+__global__ void kernel_augmentScatter(vertex_t *d_elements,
+                                      bool const *d_predicates,
+                                      vertex_t const *d_addresses,
+                                      std::size_t size) {
   int id = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (id < size) {
-    if (d_predicates[id]) { d_elements[d_addresses[id]] = id; }
+    if (d_predicates[id]) {
+      d_elements[d_addresses[id]] = id;
+    }
   }
 }
 
 // Kernel for executing the reverse pass of the maximum matching algorithm.
 template <typename vertex_t>
-__global__ void kernel_reverseTraversal(vertex_t* d_elements,
+__global__ void kernel_reverseTraversal(vertex_t *d_elements,
                                         VertexData<vertex_t> d_row_data,
                                         VertexData<vertex_t> d_col_data,
-                                        int size)
-{
+                                        int size) {
   int id = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (id < size) {
-    __reverse_traversal(d_row_data.is_visited,
-                        d_row_data.children,
-                        d_col_data.children,
-                        d_row_data.parents,
-                        d_col_data.parents,
-                        d_elements[id]);
+    __reverse_traversal(d_row_data.is_visited, d_row_data.children,
+                        d_col_data.children, d_row_data.parents,
+                        d_col_data.parents, d_elements[id]);
   }
 }
 
 // Kernel for executing the augmentation pass of the maximum matching algorithm.
 template <typename vertex_t>
-__global__ void kernel_augmentation(vertex_t* d_row_assignments,
-                                    vertex_t* d_col_assignments,
-                                    vertex_t const* d_row_elements,
+__global__ void kernel_augmentation(vertex_t *d_row_assignments,
+                                    vertex_t *d_col_assignments,
+                                    vertex_t const *d_row_elements,
                                     VertexData<vertex_t> d_row_data,
-                                    VertexData<vertex_t> d_col_data,
-                                    vertex_t N,
-                                    vertex_t size)
-{
+                                    VertexData<vertex_t> d_col_data, vertex_t N,
+                                    vertex_t size) {
   int id = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (id < size) {
-    __augment(d_row_assignments,
-              d_col_assignments,
-              d_row_data.children,
-              d_col_data.children,
-              d_row_elements[id],
-              N);
+    __augment(d_row_assignments, d_col_assignments, d_row_data.children,
+              d_col_data.children, d_row_elements[id], N);
   }
 }
 
@@ -446,21 +392,18 @@ __global__ void kernel_augmentation(vertex_t* d_row_assignments,
 //  FIXME:  Once cuda 10.2 is the standard should replace passing infinity
 //          here with using cuda::std::numeric_limits<weight_t>::max()
 template <typename vertex_t, typename weight_t>
-__global__ void kernel_dualUpdate_1(weight_t* d_sp_min,
-                                    weight_t const* d_col_slacks,
-                                    int const* d_col_covers,
-                                    int SP,
-                                    vertex_t N,
-                                    weight_t infinity)
-{
+__global__ void kernel_dualUpdate_1(weight_t *d_sp_min,
+                                    weight_t const *d_col_slacks,
+                                    int const *d_col_covers, int SP, vertex_t N,
+                                    weight_t infinity) {
   int spid = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (spid < SP) {
     weight_t min = infinity;
     for (int colid = 0; colid < N; colid++) {
-      int index      = spid * N + colid;
+      int index = spid * N + colid;
       weight_t slack = d_col_slacks[index];
-      int col_cover  = d_col_covers[index];
+      int col_cover = d_col_covers[index];
 
       if (col_cover == 0)
         if (slack < min) min = slack;
@@ -474,29 +417,21 @@ __global__ void kernel_dualUpdate_1(weight_t* d_sp_min,
 //  FIXME:  Once cuda 10.2 is the standard should replace passing infinity
 //          here with using cuda::std::numeric_limits<weight_t>::max()
 template <typename vertex_t, typename weight_t>
-__global__ void kernel_dualUpdate_2(weight_t const* d_sp_min,
-                                    weight_t* d_row_duals,
-                                    weight_t* d_col_duals,
-                                    weight_t* d_col_slacks,
-                                    int const* d_row_covers,
-                                    int const* d_col_covers,
-                                    int* d_row_visited,
-                                    vertex_t* d_col_parents,
-                                    int SP,
-                                    vertex_t N,
-                                    weight_t infinity,
-                                    weight_t epsilon)
-{
+__global__ void kernel_dualUpdate_2(
+  weight_t const *d_sp_min, weight_t *d_row_duals, weight_t *d_col_duals,
+  weight_t *d_col_slacks, int const *d_row_covers, int const *d_col_covers,
+  int *d_row_visited, vertex_t *d_col_parents, int SP, vertex_t N,
+  weight_t infinity, weight_t epsilon) {
   int spid = blockIdx.y * blockDim.y + threadIdx.y;
-  int id   = blockIdx.x * blockDim.x + threadIdx.x;
+  int id = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (spid < SP && id < N) {
     int index = spid * N + id;
 
     if (d_sp_min[spid] < infinity) {
       weight_t theta = d_sp_min[spid];
-      int row_cover  = d_row_covers[index];
-      int col_cover  = d_col_covers[index];
+      int row_cover = d_row_covers[index];
+      int col_cover = d_col_covers[index];
 
       if (row_cover == 0)  // Row vertex is reachable from source.
         d_row_duals[index] += theta;
@@ -518,12 +453,10 @@ __global__ void kernel_dualUpdate_2(weight_t const* d_sp_min,
 
 // Kernel for calculating optimal objective function value using dual variables.
 template <typename vertex_t, typename weight_t>
-__global__ void kernel_calcObjValDual(weight_t* d_obj_val_dual,
-                                      weight_t const* d_row_duals,
-                                      weight_t const* d_col_duals,
-                                      int SP,
-                                      vertex_t N)
-{
+__global__ void kernel_calcObjValDual(weight_t *d_obj_val_dual,
+                                      weight_t const *d_row_duals,
+                                      weight_t const *d_col_duals, int SP,
+                                      vertex_t N) {
   int spid = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (spid < SP) {
@@ -538,12 +471,10 @@ __global__ void kernel_calcObjValDual(weight_t* d_obj_val_dual,
 
 // Kernel for calculating optimal objective function value using dual variables.
 template <typename vertex_t, typename weight_t>
-__global__ void kernel_calcObjValPrimal(weight_t* d_obj_val_primal,
-                                        weight_t const* d_costs,
-                                        vertex_t const* d_row_assignments,
-                                        int SP,
-                                        vertex_t N)
-{
+__global__ void kernel_calcObjValPrimal(weight_t *d_obj_val_primal,
+                                        weight_t const *d_costs,
+                                        vertex_t const *d_row_assignments,
+                                        int SP, vertex_t N) {
   int spid = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (spid < SP) {
diff --git a/cpp/include/raft/linalg/add.cuh b/cpp/include/raft/linalg/add.cuh
index 11d3174951..7a454f64e2 100644
--- a/cpp/include/raft/linalg/add.cuh
+++ b/cpp/include/raft/linalg/add.cuh
@@ -37,8 +37,8 @@ namespace linalg {
  * @param stream cuda stream where to launch work
  */
 template <typename InT, typename OutT = InT, typename IdxType = int>
-void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream)
-{
+void addScalar(OutT *out, const InT *in, InT scalar, IdxType len,
+               cudaStream_t stream) {
   auto op = [scalar] __device__(InT in) { return OutT(in + scalar); };
   unaryOp<InT, decltype(op), IdxType, OutT>(out, in, len, op, stream);
 }
@@ -57,24 +57,23 @@ void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t s
  * @param stream cuda stream where to launch work
  */
 template <typename InT, typename OutT = InT, typename IdxType = int>
-void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream)
-{
+void add(OutT *out, const InT *in1, const InT *in2, IdxType len,
+         cudaStream_t stream) {
   auto op = [] __device__(InT a, InT b) { return OutT(a + b); };
   binaryOp<InT, decltype(op), OutT, IdxType>(out, in1, in2, len, op, stream);
 }
 
 template <class math_t, typename IdxType>
-__global__ void add_dev_scalar_kernel(math_t* outDev,
-                                      const math_t* inDev,
-                                      const math_t* singleScalarDev,
-                                      IdxType len)
-{
+__global__ void add_dev_scalar_kernel(math_t *outDev, const math_t *inDev,
+                                      const math_t *singleScalarDev,
+                                      IdxType len) {
   IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x;
-  if (i < len) { outDev[i] = inDev[i] + *singleScalarDev; }
+  if (i < len) {
+    outDev[i] = inDev[i] + *singleScalarDev;
+  }
 }
 
-/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and
- * write result to outDev[i]
+/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and write result to outDev[i]
  * @tparam math_t data-type upon which the math operation will be performed
  * @tparam IdxType Integer type used to for addressing
  * @param outDev the output buffer
@@ -84,16 +83,14 @@ __global__ void add_dev_scalar_kernel(math_t* outDev,
  * @param stream cuda stream
  */
 template <typename math_t, typename IdxType = int>
-void addDevScalar(math_t* outDev,
-                  const math_t* inDev,
-                  const math_t* singleScalarDev,
-                  IdxType len,
-                  cudaStream_t stream)
-{
+void addDevScalar(math_t *outDev, const math_t *inDev,
+                  const math_t *singleScalarDev, IdxType len,
+                  cudaStream_t stream) {
   // TODO: block dimension has not been tuned
   dim3 block(256);
   dim3 grid(raft::ceildiv(len, (IdxType)block.x));
-  add_dev_scalar_kernel<math_t><<<grid, block, 0, stream>>>(outDev, inDev, singleScalarDev, len);
+  add_dev_scalar_kernel<math_t>
+    <<<grid, block, 0, stream>>>(outDev, inDev, singleScalarDev, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
diff --git a/cpp/include/raft/linalg/binary_op.cuh b/cpp/include/raft/linalg/binary_op.cuh
index a49a433941..940d786e87 100644
--- a/cpp/include/raft/linalg/binary_op.cuh
+++ b/cpp/include/raft/linalg/binary_op.cuh
@@ -22,10 +22,10 @@
 namespace raft {
 namespace linalg {
 
-template <typename InType, int VecLen, typename Lambda, typename IdxType, typename OutType>
-__global__ void binaryOpKernel(
-  OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op)
-{
+template <typename InType, int VecLen, typename Lambda, typename IdxType,
+          typename OutType>
+__global__ void binaryOpKernel(OutType *out, const InType *in1,
+                               const InType *in2, IdxType len, Lambda op) {
   typedef TxN_t<InType, VecLen> InVecType;
   typedef TxN_t<OutType, VecLen> OutVecType;
   InVecType a, b;
@@ -42,11 +42,12 @@ __global__ void binaryOpKernel(
   c.store(out, idx);
 }
 
-template <typename InType, int VecLen, typename Lambda, typename IdxType, typename OutType, int TPB>
-void binaryOpImpl(
-  OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream)
-{
-  const IdxType nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB);
+template <typename InType, int VecLen, typename Lambda, typename IdxType,
+          typename OutType, int TPB>
+void binaryOpImpl(OutType *out, const InType *in1, const InType *in2,
+                  IdxType len, Lambda op, cudaStream_t stream) {
+  const IdxType nblks =
+    raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB);
   binaryOpKernel<InType, VecLen, Lambda, IdxType, OutType>
     <<<nblks, TPB, 0, stream>>>(out, in1, in2, len, op);
   CUDA_CHECK(cudaPeekAtLastError());
@@ -55,8 +56,8 @@ void binaryOpImpl(
 /**
  * @brief Checks if addresses are aligned on N bytes
  */
-inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, uint64_t N)
-{
+inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3,
+                           uint64_t N) {
   return addr1 % N == 0 && addr2 % N == 0 && addr3 % N == 0;
 }
 
@@ -76,36 +77,38 @@ inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, uint6
  * @note Lambda must be a functor with the following signature:
  *       `OutType func(const InType& val1, const InType& val2);`
  */
-template <typename InType,
-          typename Lambda,
-          typename OutType = InType,
-          typename IdxType = int,
-          int TPB          = 256>
-void binaryOp(
-  OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream)
-{
-  constexpr auto maxSize = sizeof(InType) > sizeof(OutType) ? sizeof(InType) : sizeof(OutType);
-  size_t bytes           = len * maxSize;
-  uint64_t in1Addr       = uint64_t(in1);
-  uint64_t in2Addr       = uint64_t(in2);
-  uint64_t outAddr       = uint64_t(out);
-  if (16 / maxSize && bytes % 16 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 16)) {
+template <typename InType, typename Lambda, typename OutType = InType,
+          typename IdxType = int, int TPB = 256>
+void binaryOp(OutType *out, const InType *in1, const InType *in2, IdxType len,
+              Lambda op, cudaStream_t stream) {
+  constexpr auto maxSize =
+    sizeof(InType) > sizeof(OutType) ? sizeof(InType) : sizeof(OutType);
+  size_t bytes = len * maxSize;
+  uint64_t in1Addr = uint64_t(in1);
+  uint64_t in2Addr = uint64_t(in2);
+  uint64_t outAddr = uint64_t(out);
+  if (16 / maxSize && bytes % 16 == 0 &&
+      addressAligned(in1Addr, in2Addr, outAddr, 16)) {
     binaryOpImpl<InType, 16 / maxSize, Lambda, IdxType, OutType, TPB>(
       out, in1, in2, len, op, stream);
-  } else if (8 / maxSize && bytes % 8 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 8)) {
+  } else if (8 / maxSize && bytes % 8 == 0 &&
+             addressAligned(in1Addr, in2Addr, outAddr, 8)) {
     binaryOpImpl<InType, 8 / maxSize, Lambda, IdxType, OutType, TPB>(
       out, in1, in2, len, op, stream);
-  } else if (4 / maxSize && bytes % 4 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 4)) {
+  } else if (4 / maxSize && bytes % 4 == 0 &&
+             addressAligned(in1Addr, in2Addr, outAddr, 4)) {
     binaryOpImpl<InType, 4 / maxSize, Lambda, IdxType, OutType, TPB>(
       out, in1, in2, len, op, stream);
-  } else if (2 / maxSize && bytes % 2 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 2)) {
+  } else if (2 / maxSize && bytes % 2 == 0 &&
+             addressAligned(in1Addr, in2Addr, outAddr, 2)) {
     binaryOpImpl<InType, 2 / maxSize, Lambda, IdxType, OutType, TPB>(
       out, in1, in2, len, op, stream);
   } else if (1 / maxSize) {
     binaryOpImpl<InType, 1 / maxSize, Lambda, IdxType, OutType, TPB>(
       out, in1, in2, len, op, stream);
   } else {
-    binaryOpImpl<InType, 1, Lambda, IdxType, OutType, TPB>(out, in1, in2, len, op, stream);
+    binaryOpImpl<InType, 1, Lambda, IdxType, OutType, TPB>(out, in1, in2, len,
+                                                           op, stream);
   }
 }
 
diff --git a/cpp/include/raft/linalg/cholesky_r1_update.cuh b/cpp/include/raft/linalg/cholesky_r1_update.cuh
index b129fe4758..b5a93c4953 100644
--- a/cpp/include/raft/linalg/cholesky_r1_update.cuh
+++ b/cpp/include/raft/linalg/cholesky_r1_update.cuh
@@ -122,16 +122,9 @@ namespace linalg {
  *    conditioned systems. Negative values mean no regularizaton.
  */
 template <typename math_t>
-void choleskyRank1Update(const raft::handle_t& handle,
-                         math_t* L,
-                         int n,
-                         int ld,
-                         void* workspace,
-                         int* n_bytes,
-                         cublasFillMode_t uplo,
-                         cudaStream_t stream,
-                         math_t eps = -1)
-{
+void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld,
+                         void *workspace, int *n_bytes, cublasFillMode_t uplo,
+                         cudaStream_t stream, math_t eps = -1) {
   // The matrix A' is defined as:
   // A' = [[A_11, A_12]
   //       [A_21, A_22]]
@@ -151,17 +144,18 @@ void choleskyRank1Update(const raft::handle_t& handle,
   // We need a workspace in device memory to store a scalar. Additionally, in
   // CUBLAS_FILL_MODE_LOWER we need space for n-1 floats.
   const int align = 256;
-  int offset =
-    (uplo == CUBLAS_FILL_MODE_LOWER) ? raft::alignTo<int>(sizeof(math_t) * (n - 1), align) : 0;
+  int offset = (uplo == CUBLAS_FILL_MODE_LOWER)
+                 ? raft::alignTo<int>(sizeof(math_t) * (n - 1), align)
+                 : 0;
   if (workspace == nullptr) {
     *n_bytes = offset + 1 * sizeof(math_t);
     return;
   }
-  math_t* s    = reinterpret_cast<math_t*>(((char*)workspace) + offset);
-  math_t* L_22 = L + (n - 1) * ld + n - 1;
+  math_t *s = reinterpret_cast<math_t *>(((char *)workspace) + offset);
+  math_t *L_22 = L + (n - 1) * ld + n - 1;
 
-  math_t* A_new;
-  math_t* A_row;
+  math_t *A_new;
+  math_t *A_row;
   if (uplo == CUBLAS_FILL_MODE_UPPER) {
     // A_new is stored as the n-1 th column of L
     A_new = L + (n - 1) * ld;
@@ -170,36 +164,27 @@ void choleskyRank1Update(const raft::handle_t& handle,
     // as the n-th row of L. Since the matrix is column major, this is non
     // contiguous. We copy elements from A_row to a contiguous workspace A_new.
     A_row = L + n - 1;
-    A_new = reinterpret_cast<math_t*>(workspace);
-    CUBLAS_CHECK(
-      raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_row, ld, A_new, 1, stream));
+    A_new = reinterpret_cast<math_t *>(workspace);
+    CUBLAS_CHECK(raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1,
+                                          A_row, ld, A_new, 1, stream));
   }
-  cublasOperation_t op = (uplo == CUBLAS_FILL_MODE_UPPER) ? CUBLAS_OP_T : CUBLAS_OP_N;
+  cublasOperation_t op =
+    (uplo == CUBLAS_FILL_MODE_UPPER) ? CUBLAS_OP_T : CUBLAS_OP_N;
   if (n > 1) {
     // Calculate L_12 = x by solving equation L_11 x = A_12
     math_t alpha = 1;
-    CUBLAS_CHECK(raft::linalg::cublastrsm(handle.get_cublas_handle(),
-                                          CUBLAS_SIDE_LEFT,
-                                          uplo,
-                                          op,
-                                          CUBLAS_DIAG_NON_UNIT,
-                                          n - 1,
-                                          1,
-                                          &alpha,
-                                          L,
-                                          ld,
-                                          A_new,
-                                          n - 1,
-                                          stream));
+    CUBLAS_CHECK(raft::linalg::cublastrsm(
+      handle.get_cublas_handle(), CUBLAS_SIDE_LEFT, uplo, op,
+      CUBLAS_DIAG_NON_UNIT, n - 1, 1, &alpha, L, ld, A_new, n - 1, stream));
 
     // A_new now stores L_12, we calculate s = L_12 * L_12
-    CUBLAS_CHECK(
-      raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1, A_new, 1, A_new, 1, s, stream));
+    CUBLAS_CHECK(raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1,
+                                         A_new, 1, A_new, 1, s, stream));
 
     if (uplo == CUBLAS_FILL_MODE_LOWER) {
       // Copy back the L_12 elements as the n-th row of L
-      CUBLAS_CHECK(
-        raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_new, 1, A_row, ld, stream));
+      CUBLAS_CHECK(raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1,
+                                            A_new, 1, A_row, ld, stream));
     }
   } else {  // n == 1 case
     CUDA_CHECK(cudaMemsetAsync(s, 0, sizeof(math_t), stream));
@@ -217,7 +202,9 @@ void choleskyRank1Update(const raft::handle_t& handle,
   // the system is very ill conditioned then the A_22 - L_12 * L_12 can be
   // negative, which would result L_22 = NaN. A small positive eps parameter
   // can be used to prevent this.
-  if (eps >= 0 && (std::isnan(L_22_host) || L_22_host < eps)) { L_22_host = eps; }
+  if (eps >= 0 && (std::isnan(L_22_host) || L_22_host < eps)) {
+    L_22_host = eps;
+  }
   ASSERT(!std::isnan(L_22_host), "Error during Cholesky rank one update");
   raft::update_device(L_22, &L_22_host, 1, stream);
 }
diff --git a/cpp/include/raft/linalg/coalesced_reduction.cuh b/cpp/include/raft/linalg/coalesced_reduction.cuh
index 7e0744f98a..ef983ff3d0 100644
--- a/cpp/include/raft/linalg/coalesced_reduction.cuh
+++ b/cpp/include/raft/linalg/coalesced_reduction.cuh
@@ -26,27 +26,18 @@ namespace linalg {
 // of the matrix, i.e. reduce along rows for row major or reduce along columns
 // for column major layout. Kernel does an inplace reduction adding to original
 // values of dots.
-template <typename InType,
-          typename OutType,
-          typename IdxType,
-          int TPB,
-          typename MainLambda,
-          typename ReduceLambda,
-          typename FinalLambda>
-__global__ void coalescedReductionKernel(OutType* dots,
-                                         const InType* data,
-                                         int D,
-                                         int N,
-                                         OutType init,
+template <typename InType, typename OutType, typename IdxType, int TPB,
+          typename MainLambda, typename ReduceLambda, typename FinalLambda>
+__global__ void coalescedReductionKernel(OutType *dots, const InType *data,
+                                         int D, int N, OutType init,
                                          MainLambda main_op,
                                          ReduceLambda reduce_op,
                                          FinalLambda final_op,
-                                         bool inplace = false)
-{
+                                         bool inplace = false) {
   typedef cub::BlockReduce<OutType, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   OutType thread_data = init;
-  IdxType rowStart    = blockIdx.x * D;
+  IdxType rowStart = blockIdx.x * D;
   for (IdxType i = threadIdx.x; i < D; i += TPB) {
     IdxType idx = rowStart + i;
     thread_data = reduce_op(thread_data, main_op(data[idx], i));
@@ -88,37 +79,33 @@ __global__ void coalescedReductionKernel(OutType* dots,
  * @param inplace reduction result added inplace or overwrites old values?
  * @param stream cuda stream where to launch work
  */
-template <typename InType,
-          typename OutType      = InType,
-          typename IdxType      = int,
-          typename MainLambda   = raft::Nop<InType, IdxType>,
+template <typename InType, typename OutType = InType, typename IdxType = int,
+          typename MainLambda = raft::Nop<InType, IdxType>,
           typename ReduceLambda = raft::Sum<OutType>,
-          typename FinalLambda  = raft::Nop<OutType>>
-void coalescedReduction(OutType* dots,
-                        const InType* data,
-                        int D,
-                        int N,
-                        OutType init,
-                        cudaStream_t stream,
-                        bool inplace           = false,
-                        MainLambda main_op     = raft::Nop<InType, IdxType>(),
+          typename FinalLambda = raft::Nop<OutType>>
+void coalescedReduction(OutType *dots, const InType *data, int D, int N,
+                        OutType init, cudaStream_t stream, bool inplace = false,
+                        MainLambda main_op = raft::Nop<InType, IdxType>(),
                         ReduceLambda reduce_op = raft::Sum<OutType>(),
-                        FinalLambda final_op   = raft::Nop<OutType>())
-{
+                        FinalLambda final_op = raft::Nop<OutType>()) {
   // One block per reduction
   // Efficient only for large leading dimensions
   if (D <= 32) {
     coalescedReductionKernel<InType, OutType, IdxType, 32>
-      <<<N, 32, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace);
+      <<<N, 32, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op,
+                             final_op, inplace);
   } else if (D <= 64) {
     coalescedReductionKernel<InType, OutType, IdxType, 64>
-      <<<N, 64, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace);
+      <<<N, 64, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op,
+                             final_op, inplace);
   } else if (D <= 128) {
     coalescedReductionKernel<InType, OutType, IdxType, 128>
-      <<<N, 128, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace);
+      <<<N, 128, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op,
+                              final_op, inplace);
   } else {
     coalescedReductionKernel<InType, OutType, IdxType, 256>
-      <<<N, 256, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace);
+      <<<N, 256, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op,
+                              final_op, inplace);
   }
   CUDA_CHECK(cudaPeekAtLastError());
 }
diff --git a/cpp/include/raft/linalg/contractions.cuh b/cpp/include/raft/linalg/contractions.cuh
index 35d9d96ea4..aa711a9140 100644
--- a/cpp/include/raft/linalg/contractions.cuh
+++ b/cpp/include/raft/linalg/contractions.cuh
@@ -55,7 +55,8 @@ namespace linalg {
  *                 thread block. This also determines the number of threads per
  *                 thread block
  */
-template <typename DataT, int _veclen, int _kblk, int _rpt, int _cpt, int _tr, int _tc>
+template <typename DataT, int _veclen, int _kblk, int _rpt, int _cpt, int _tr,
+          int _tc>
 struct KernelPolicy {
   enum {
     /** number of elements along K worked upon per main loop iteration */
@@ -100,7 +101,8 @@ struct KernelPolicy {
 
 };  // struct KernelPolicy
 
-template <typename DataT, int _veclen, int _kblk, int _rpt, int _cpt, int _tr, int _tc>
+template <typename DataT, int _veclen, int _kblk, int _rpt, int _cpt, int _tr,
+          int _tc>
 struct ColKernelPolicy {
   enum {
     /** number of elements along K worked upon per main loop iteration */
@@ -149,8 +151,7 @@ struct ColKernelPolicy {
  * @{
  */
 template <typename DataT, int _veclen>
-struct Policy4x4 {
-};
+struct Policy4x4 {};
 
 template <int _veclen>
 struct Policy4x4<float, _veclen> {
@@ -179,7 +180,8 @@ struct Policy4x4<double, _veclen> {
  * @tparam Policy policy used to customize memory access behavior.
  *                See documentation for `KernelPolicy` to know more.
  */
-template <typename DataT, typename IdxT, typename Policy, bool isRowMajor = true>
+template <typename DataT, typename IdxT, typename Policy,
+          bool isRowMajor = true>
 struct Contractions_NT {
  protected:
   typedef Policy P;
@@ -245,7 +247,8 @@ struct Contractions_NT {
    * @param[in] _k number of cols of X and Y
    * @param[in] _smem shared memory region used during computations
    */
-  DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, IdxT _k, char* _smem)
+  DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n,
+                     IdxT _k, char* _smem)
     : m(_m),
       n(_n),
       k(_k),
@@ -262,9 +265,7 @@ struct Contractions_NT {
       sx((DataT*)_smem),
       sy(&(sx[P::SmemPageX])),
       pageWr(0),
-      pageRd(0)
-  {
-  }
+      pageRd(0) {}
 
   /**
    * @brief Ctor
@@ -275,15 +276,8 @@ struct Contractions_NT {
    * @param[in] _k number of cols of X and Y
    * @param[in] _smem shared memory region used during computations
    */
-  DI Contractions_NT(const DataT* _x,
-                     const DataT* _y,
-                     IdxT _m,
-                     IdxT _n,
-                     IdxT _k,
-                     IdxT _lda,
-                     IdxT _ldb,
-                     IdxT _ldd,
-                     char* _smem)
+  DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n,
+                     IdxT _k, IdxT _lda, IdxT _ldb, IdxT _ldd, char* _smem)
     : m(_m),
       n(_n),
       k(_k),
@@ -297,18 +291,17 @@ struct Contractions_NT {
       sx((DataT*)_smem),
       sy(&(sx[P::SmemPageX])),
       pageWr(0),
-      pageRd(0)
-  {
+      pageRd(0) {
     if (isRowMajor) {
       xrowid = IdxT(blockIdx.y) * P::Mblk + srowid;
       yrowid = IdxT(blockIdx.x) * P::Nblk + srowid;
-      x      = _x + xrowid * lda;
-      y      = _y + yrowid * ldb;
+      x = _x + xrowid * lda;
+      y = _y + yrowid * ldb;
     } else {
       xrowid = IdxT(blockIdx.y) * P::Mblk;
       yrowid = IdxT(blockIdx.x) * P::Nblk;
-      x      = _x + xrowid + srowid * lda;
-      y      = _y + yrowid + srowid * ldb;
+      x = _x + xrowid + srowid * lda;
+      y = _y + yrowid + srowid * ldb;
     }
   }
 
@@ -317,8 +310,7 @@ struct Contractions_NT {
    * @brief Load current block of X/Y from global memory to registers
    * @param[in] kidx current start index of k to be loaded
    */
-  DI void ldgXY(IdxT kidx)
-  {
+  DI void ldgXY(IdxT kidx) {
     ldgX(kidx);
     ldgY(kidx);
   }
@@ -327,8 +319,7 @@ struct Contractions_NT {
    * @brief Store current block of X/Y from registers to smem
    * @param[in] kidx current start index of k to be loaded
    */
-  DI void stsXY()
-  {
+  DI void stsXY() {
     stsX(sx + pageWr * P::SmemPage);
     stsY(sy + pageWr * P::SmemPage);
   }
@@ -337,15 +328,13 @@ struct Contractions_NT {
    * @brief Load X and Y block from shared memory to registers
    * @param[in] kidx k value from the current k-block to be loaded from smem
    */
-  DI void ldsXY(int kidx)
-  {
+  DI void ldsXY(int kidx) {
     ldsX(kidx, sx + pageRd * P::SmemPage);
     ldsY(kidx, sy + pageRd * P::SmemPage);
   }
 
  private:
-  DI void ldgX(IdxT kidx)
-  {
+  DI void ldgX(IdxT kidx) {
     if (isRowMajor) {
       auto numRows = m;
       auto koffset = kidx + scolid;
@@ -362,10 +351,11 @@ struct Contractions_NT {
       }
     } else {
       const auto numRows = k;
-      auto koffset       = scolid;
+      auto koffset = scolid;
 #pragma unroll
       for (int i = 0; i < P::LdgPerThX; ++i) {
-        if ((koffset + xrowid) < lda && (srowid + kidx + i * P::LdgRowsX) < numRows) {
+        if ((koffset + xrowid) < lda &&
+            (srowid + kidx + i * P::LdgRowsX) < numRows) {
           ldg(ldgDataX[i], x + (kidx + i * P::LdgRowsX) * lda + koffset);
         } else {
 #pragma unroll
@@ -377,8 +367,7 @@ struct Contractions_NT {
     }
   }
 
-  DI void ldgY(IdxT kidx)
-  {
+  DI void ldgY(IdxT kidx) {
     if (isRowMajor) {
       auto numRows = n;
       auto koffset = kidx + scolid;
@@ -398,7 +387,8 @@ struct Contractions_NT {
       auto koffset = scolid;
 #pragma unroll
       for (int i = 0; i < P::LdgPerThY; ++i) {
-        if ((koffset + yrowid) < ldb && (srowid + kidx + i * P::LdgRowsY) < numRows) {
+        if ((koffset + yrowid) < ldb &&
+            (srowid + kidx + i * P::LdgRowsY) < numRows) {
           ldg(ldgDataY[i], y + (kidx + i * P::LdgRowsY) * ldb + koffset);
         } else {
 #pragma unroll
@@ -410,8 +400,7 @@ struct Contractions_NT {
     }
   }
 
-  DI void stsX(DataT* smem)
-  {
+  DI void stsX(DataT* smem) {
     auto* saddr = smem + srowid * P::SmemStride + scolid;
 #pragma unroll
     for (int i = 0; i < P::LdgPerThX; ++i) {
@@ -419,8 +408,7 @@ struct Contractions_NT {
     }
   }
 
-  DI void stsY(DataT* smem)
-  {
+  DI void stsY(DataT* smem) {
     auto* saddr = smem + srowid * P::SmemStride + scolid;
 #pragma unroll
     for (int i = 0; i < P::LdgPerThY; ++i) {
@@ -428,8 +416,7 @@ struct Contractions_NT {
     }
   }
 
-  DI void ldsX(int kidx, DataT* smem)
-  {
+  DI void ldsX(int kidx, DataT* smem) {
     if (isRowMajor) {
       auto* saddr = smem + accrowid * P::SmemStride + kidx;
 #pragma unroll
@@ -448,8 +435,7 @@ struct Contractions_NT {
     }
   }
 
-  DI void ldsY(int kidx, DataT* smem)
-  {
+  DI void ldsY(int kidx, DataT* smem) {
     if (isRowMajor) {
       auto* saddr = smem + acccolid * P::SmemStride + kidx;
 #pragma unroll
diff --git a/cpp/include/raft/linalg/cublas_wrappers.h b/cpp/include/raft/linalg/cublas_wrappers.h
index 2d18691410..7c79e6c91d 100644
--- a/cpp/include/raft/linalg/cublas_wrappers.h
+++ b/cpp/include/raft/linalg/cublas_wrappers.h
@@ -25,7 +25,8 @@
 #include <cstdint>
 
 #define _CUBLAS_ERR_TO_STR(err) \
-  case err: return #err
+  case err:                     \
+    return #err
 
 namespace raft {
 
@@ -33,15 +34,15 @@ namespace raft {
  * @brief Exception thrown when a cuBLAS error is encountered.
  */
 struct cublas_error : public raft::exception {
-  explicit cublas_error(char const* const message) : raft::exception(message) {}
-  explicit cublas_error(std::string const& message) : raft::exception(message) {}
+  explicit cublas_error(char const *const message) : raft::exception(message) {}
+  explicit cublas_error(std::string const &message)
+    : raft::exception(message) {}
 };
 
 namespace linalg {
 namespace detail {
 
-inline const char* cublas_error_to_string(cublasStatus_t err)
-{
+inline const char *cublas_error_to_string(cublasStatus_t err) {
   switch (err) {
     _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_SUCCESS);
     _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED);
@@ -53,7 +54,8 @@ inline const char* cublas_error_to_string(cublasStatus_t err)
     _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR);
     _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED);
     _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_LICENSE_ERROR);
-    default: return "CUBLAS_STATUS_UNKNOWN";
+    default:
+      return "CUBLAS_STATUS_UNKNOWN";
   };
 }
 
@@ -69,19 +71,16 @@ inline const char* cublas_error_to_string(cublasStatus_t err)
  * Invokes a cuBLAS runtime API function call, if the call does not return
  * CUBLAS_STATUS_SUCCESS, throws an exception detailing the cuBLAS error that occurred
  */
-#define CUBLAS_TRY(call)                                                   \
-  do {                                                                     \
-    cublasStatus_t const status = (call);                                  \
-    if (CUBLAS_STATUS_SUCCESS != status) {                                 \
-      std::string msg{};                                                   \
-      SET_ERROR_MSG(msg,                                                   \
-                    "cuBLAS error encountered at: ",                       \
-                    "call='%s', Reason=%d:%s",                             \
-                    #call,                                                 \
-                    status,                                                \
-                    raft::linalg::detail::cublas_error_to_string(status)); \
-      throw raft::cublas_error(msg);                                       \
-    }                                                                      \
+#define CUBLAS_TRY(call)                                                      \
+  do {                                                                        \
+    cublasStatus_t const status = (call);                                     \
+    if (CUBLAS_STATUS_SUCCESS != status) {                                    \
+      std::string msg{};                                                      \
+      SET_ERROR_MSG(                                                          \
+        msg, "cuBLAS error encountered at: ", "call='%s', Reason=%d:%s",      \
+        #call, status, raft::linalg::detail::cublas_error_to_string(status)); \
+      throw raft::cublas_error(msg);                                          \
+    }                                                                         \
   } while (0)
 
 /** FIXME: temporary alias for cuML compatibility */
@@ -108,39 +107,22 @@ namespace linalg {
  * @{
  */
 template <typename T>
-cublasStatus_t cublasaxpy(cublasHandle_t handle,
-                          int n,
-                          const T* alpha,
-                          const T* x,
-                          int incx,
-                          T* y,
-                          int incy,
+cublasStatus_t cublasaxpy(cublasHandle_t handle, int n, const T *alpha,
+                          const T *x, int incx, T *y, int incy,
                           cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublasaxpy(cublasHandle_t handle,
-                                 int n,
-                                 const float* alpha,
-                                 const float* x,
-                                 int incx,
-                                 float* y,
-                                 int incy,
-                                 cudaStream_t stream)
-{
+inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int n,
+                                 const float *alpha, const float *x, int incx,
+                                 float *y, int incy, cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSaxpy(handle, n, alpha, x, incx, y, incy);
 }
 
 template <>
-inline cublasStatus_t cublasaxpy(cublasHandle_t handle,
-                                 int n,
-                                 const double* alpha,
-                                 const double* x,
-                                 int incx,
-                                 double* y,
-                                 int incy,
-                                 cudaStream_t stream)
-{
+inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int n,
+                                 const double *alpha, const double *x, int incx,
+                                 double *y, int incy, cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDaxpy(handle, n, alpha, x, incx, y, incy);
 }
@@ -151,21 +133,21 @@ inline cublasStatus_t cublasaxpy(cublasHandle_t handle,
  * @{
  */
 template <typename T>
-cublasStatus_t cublasSwap(
-  cublasHandle_t handle, int n, T* x, int incx, T* y, int incy, cudaStream_t stream);
+cublasStatus_t cublasSwap(cublasHandle_t handle, int n, T *x, int incx, T *y,
+                          int incy, cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublasSwap(
-  cublasHandle_t handle, int n, float* x, int incx, float* y, int incy, cudaStream_t stream)
-{
+inline cublasStatus_t cublasSwap(cublasHandle_t handle, int n, float *x,
+                                 int incx, float *y, int incy,
+                                 cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSswap(handle, n, x, incx, y, incy);
 }
 
 template <>
-inline cublasStatus_t cublasSwap(
-  cublasHandle_t handle, int n, double* x, int incx, double* y, int incy, cudaStream_t stream)
-{
+inline cublasStatus_t cublasSwap(cublasHandle_t handle, int n, double *x,
+                                 int incx, double *y, int incy,
+                                 cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDswap(handle, n, x, incx, y, incy);
 }
@@ -177,20 +159,20 @@ inline cublasStatus_t cublasSwap(
  * @{
  */
 template <typename T>
-cublasStatus_t cublasCopy(
-  cublasHandle_t handle, int n, const T* x, int incx, T* y, int incy, cudaStream_t stream);
+cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const T *x, int incx,
+                          T *y, int incy, cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublasCopy(
-  cublasHandle_t handle, int n, const float* x, int incx, float* y, int incy, cudaStream_t stream)
-{
+inline cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const float *x,
+                                 int incx, float *y, int incy,
+                                 cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasScopy(handle, n, x, incx, y, incy);
 }
 template <>
-inline cublasStatus_t cublasCopy(
-  cublasHandle_t handle, int n, const double* x, int incx, double* y, int incy, cudaStream_t stream)
-{
+inline cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const double *x,
+                                 int incx, double *y, int incy,
+                                 cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDcopy(handle, n, x, incx, y, incy);
 }
@@ -201,56 +183,31 @@ inline cublasStatus_t cublasCopy(
  * @{
  */
 template <typename T>
-cublasStatus_t cublasgemv(cublasHandle_t handle,
-                          cublasOperation_t transA,
-                          int m,
-                          int n,
-                          const T* alfa,
-                          const T* A,
-                          int lda,
-                          const T* x,
-                          int incx,
-                          const T* beta,
-                          T* y,
-                          int incy,
+cublasStatus_t cublasgemv(cublasHandle_t handle, cublasOperation_t transA,
+                          int m, int n, const T *alfa, const T *A, int lda,
+                          const T *x, int incx, const T *beta, T *y, int incy,
                           cudaStream_t stream);
 
 template <>
 inline cublasStatus_t cublasgemv(cublasHandle_t handle,
-                                 cublasOperation_t transA,
-                                 int m,
-                                 int n,
-                                 const float* alfa,
-                                 const float* A,
-                                 int lda,
-                                 const float* x,
-                                 int incx,
-                                 const float* beta,
-                                 float* y,
-                                 int incy,
-                                 cudaStream_t stream)
-{
+                                 cublasOperation_t transA, int m, int n,
+                                 const float *alfa, const float *A, int lda,
+                                 const float *x, int incx, const float *beta,
+                                 float *y, int incy, cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasSgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, incy);
+  return cublasSgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y,
+                     incy);
 }
 
 template <>
 inline cublasStatus_t cublasgemv(cublasHandle_t handle,
-                                 cublasOperation_t transA,
-                                 int m,
-                                 int n,
-                                 const double* alfa,
-                                 const double* A,
-                                 int lda,
-                                 const double* x,
-                                 int incx,
-                                 const double* beta,
-                                 double* y,
-                                 int incy,
-                                 cudaStream_t stream)
-{
+                                 cublasOperation_t transA, int m, int n,
+                                 const double *alfa, const double *A, int lda,
+                                 const double *x, int incx, const double *beta,
+                                 double *y, int incy, cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasDgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, incy);
+  return cublasDgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y,
+                     incy);
 }
 /** @} */
 
@@ -259,47 +216,23 @@ inline cublasStatus_t cublasgemv(cublasHandle_t handle,
  * @{
  */
 template <typename T>
-cublasStatus_t cublasger(cublasHandle_t handle,
-                         int m,
-                         int n,
-                         const T* alpha,
-                         const T* x,
-                         int incx,
-                         const T* y,
-                         int incy,
-                         T* A,
-                         int lda,
-                         cudaStream_t stream);
+cublasStatus_t cublasger(cublasHandle_t handle, int m, int n, const T *alpha,
+                         const T *x, int incx, const T *y, int incy, T *A,
+                         int lda, cudaStream_t stream);
 template <>
-inline cublasStatus_t cublasger(cublasHandle_t handle,
-                                int m,
-                                int n,
-                                const float* alpha,
-                                const float* x,
-                                int incx,
-                                const float* y,
-                                int incy,
-                                float* A,
-                                int lda,
-                                cudaStream_t stream)
-{
+inline cublasStatus_t cublasger(cublasHandle_t handle, int m, int n,
+                                const float *alpha, const float *x, int incx,
+                                const float *y, int incy, float *A, int lda,
+                                cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSger(handle, m, n, alpha, x, incx, y, incy, A, lda);
 }
 
 template <>
-inline cublasStatus_t cublasger(cublasHandle_t handle,
-                                int m,
-                                int n,
-                                const double* alpha,
-                                const double* x,
-                                int incx,
-                                const double* y,
-                                int incy,
-                                double* A,
-                                int lda,
-                                cudaStream_t stream)
-{
+inline cublasStatus_t cublasger(cublasHandle_t handle, int m, int n,
+                                const double *alpha, const double *x, int incx,
+                                const double *y, int incy, double *A, int lda,
+                                cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDger(handle, m, n, alpha, x, incx, y, incy, A, lda);
 }
@@ -310,62 +243,34 @@ inline cublasStatus_t cublasger(cublasHandle_t handle,
  * @{
  */
 template <typename T>
-cublasStatus_t cublasgemm(cublasHandle_t handle,
-                          cublasOperation_t transA,
-                          cublasOperation_t transB,
-                          int m,
-                          int n,
-                          int k,
-                          const T* alfa,
-                          const T* A,
-                          int lda,
-                          const T* B,
-                          int ldb,
-                          const T* beta,
-                          T* C,
-                          int ldc,
+cublasStatus_t cublasgemm(cublasHandle_t handle, cublasOperation_t transA,
+                          cublasOperation_t transB, int m, int n, int k,
+                          const T *alfa, const T *A, int lda, const T *B,
+                          int ldb, const T *beta, T *C, int ldc,
                           cudaStream_t stream);
 
 template <>
 inline cublasStatus_t cublasgemm(cublasHandle_t handle,
                                  cublasOperation_t transA,
-                                 cublasOperation_t transB,
-                                 int m,
-                                 int n,
-                                 int k,
-                                 const float* alfa,
-                                 const float* A,
-                                 int lda,
-                                 const float* B,
-                                 int ldb,
-                                 const float* beta,
-                                 float* C,
-                                 int ldc,
-                                 cudaStream_t stream)
-{
+                                 cublasOperation_t transB, int m, int n, int k,
+                                 const float *alfa, const float *A, int lda,
+                                 const float *B, int ldb, const float *beta,
+                                 float *C, int ldc, cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasSgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, beta, C, ldc);
+  return cublasSgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb,
+                     beta, C, ldc);
 }
 
 template <>
 inline cublasStatus_t cublasgemm(cublasHandle_t handle,
                                  cublasOperation_t transA,
-                                 cublasOperation_t transB,
-                                 int m,
-                                 int n,
-                                 int k,
-                                 const double* alfa,
-                                 const double* A,
-                                 int lda,
-                                 const double* B,
-                                 int ldb,
-                                 const double* beta,
-                                 double* C,
-                                 int ldc,
-                                 cudaStream_t stream)
-{
+                                 cublasOperation_t transB, int m, int n, int k,
+                                 const double *alfa, const double *A, int lda,
+                                 const double *B, int ldb, const double *beta,
+                                 double *C, int ldc, cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasDgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, beta, C, ldc);
+  return cublasDgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb,
+                     beta, C, ldc);
 }
 /** @} */
 
@@ -376,93 +281,38 @@ inline cublasStatus_t cublasgemm(cublasHandle_t handle,
 template <typename T>
 cublasStatus_t cublasgemmBatched(cublasHandle_t handle,  // NOLINT
                                  cublasOperation_t transa,
-                                 cublasOperation_t transb,
-                                 int m,
-                                 int n,
-                                 int k,
-                                 const T* alpha,
-                                 const T* const Aarray[],  // NOLINT
-                                 int lda,
-                                 const T* const Barray[],  // NOLINT
-                                 int ldb,
-                                 const T* beta,
-                                 T* Carray[],  // NOLINT
-                                 int ldc,
-                                 int batchCount,
-                                 cudaStream_t stream);
+                                 cublasOperation_t transb, int m, int n, int k,
+                                 const T *alpha,
+                                 const T *const Aarray[],           // NOLINT
+                                 int lda, const T *const Barray[],  // NOLINT
+                                 int ldb, const T *beta,
+                                 T *Carray[],  // NOLINT
+                                 int ldc, int batchCount, cudaStream_t stream);
 
 template <>
 inline cublasStatus_t cublasgemmBatched(  // NOLINT
-  cublasHandle_t handle,
-  cublasOperation_t transa,
-  cublasOperation_t transb,
-  int m,
-  int n,
-  int k,
-  const float* alpha,
-  const float* const Aarray[],  // NOLINT
-  int lda,
-  const float* const Barray[],  // NOLINT
-  int ldb,
-  const float* beta,
-  float* Carray[],  // NOLINT
-  int ldc,
-  int batchCount,
-  cudaStream_t stream)
-{
+  cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+  int m, int n, int k, const float *alpha,
+  const float *const Aarray[],                  // NOLINT
+  int lda, const float *const Barray[],         // NOLINT
+  int ldb, const float *beta, float *Carray[],  // NOLINT
+  int ldc, int batchCount, cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasSgemmBatched(handle,
-                            transa,
-                            transb,
-                            m,
-                            n,
-                            k,
-                            alpha,
-                            Aarray,
-                            lda,
-                            Barray,
-                            ldb,
-                            beta,
-                            Carray,
-                            ldc,
-                            batchCount);
+  return cublasSgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda,
+                            Barray, ldb, beta, Carray, ldc, batchCount);
 }
 
 template <>
 inline cublasStatus_t cublasgemmBatched(  // NOLINT
-  cublasHandle_t handle,
-  cublasOperation_t transa,
-  cublasOperation_t transb,
-  int m,
-  int n,
-  int k,
-  const double* alpha,
-  const double* const Aarray[],  // NOLINT
-  int lda,
-  const double* const Barray[],  // NOLINT
-  int ldb,
-  const double* beta,
-  double* Carray[],  // NOLINT
-  int ldc,
-  int batchCount,
-  cudaStream_t stream)
-{
+  cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+  int m, int n, int k, const double *alpha,
+  const double *const Aarray[],                   // NOLINT
+  int lda, const double *const Barray[],          // NOLINT
+  int ldb, const double *beta, double *Carray[],  // NOLINT
+  int ldc, int batchCount, cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasDgemmBatched(handle,
-                            transa,
-                            transb,
-                            m,
-                            n,
-                            k,
-                            alpha,
-                            Aarray,
-                            lda,
-                            Barray,
-                            ldb,
-                            beta,
-                            Carray,
-                            ldc,
-                            batchCount);
+  return cublasDgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda,
+                            Barray, ldb, beta, Carray, ldc, batchCount);
 }
 /** @} */
 
@@ -472,110 +322,36 @@ inline cublasStatus_t cublasgemmBatched(  // NOLINT
  */
 template <typename T>
 cublasStatus_t cublasgemmStridedBatched(  // NOLINT
-  cublasHandle_t handle,
-  cublasOperation_t transa,
-  cublasOperation_t transb,
-  int m,
-  int n,
-  int k,
-  const T* alpha,
-  const T* const Aarray,
-  int lda,
-  int64_t strideA,
-  const T* const Barray,
-  int ldb,
-  int64_t strideB,
-  const T* beta,
-  T* Carray,
-  int ldc,
-  int64_t strideC,
-  int batchCount,
+  cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+  int m, int n, int k, const T *alpha, const T *const Aarray, int lda,
+  int64_t strideA, const T *const Barray, int ldb, int64_t strideB,
+  const T *beta, T *Carray, int ldc, int64_t strideC, int batchCount,
   cudaStream_t stream);
 
 template <>
 inline cublasStatus_t cublasgemmStridedBatched(  // NOLINT
-  cublasHandle_t handle,
-  cublasOperation_t transa,
-  cublasOperation_t transb,
-  int m,
-  int n,
-  int k,
-  const float* alpha,
-  const float* const Aarray,
-  int lda,
-  int64_t strideA,
-  const float* const Barray,
-  int ldb,
-  int64_t strideB,
-  const float* beta,
-  float* Carray,
-  int ldc,
-  int64_t strideC,
-  int batchCount,
-  cudaStream_t stream)
-{
+  cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+  int m, int n, int k, const float *alpha, const float *const Aarray, int lda,
+  int64_t strideA, const float *const Barray, int ldb, int64_t strideB,
+  const float *beta, float *Carray, int ldc, int64_t strideC, int batchCount,
+  cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasSgemmStridedBatched(handle,
-                                   transa,
-                                   transb,
-                                   m,
-                                   n,
-                                   k,
-                                   alpha,
-                                   Aarray,
-                                   lda,
-                                   strideA,
-                                   Barray,
-                                   ldb,
-                                   strideB,
-                                   beta,
-                                   Carray,
-                                   ldc,
-                                   strideC,
-                                   batchCount);
+  return cublasSgemmStridedBatched(handle, transa, transb, m, n, k, alpha,
+                                   Aarray, lda, strideA, Barray, ldb, strideB,
+                                   beta, Carray, ldc, strideC, batchCount);
 }
 
 template <>
 inline cublasStatus_t cublasgemmStridedBatched(  // NOLINT
-  cublasHandle_t handle,
-  cublasOperation_t transa,
-  cublasOperation_t transb,
-  int m,
-  int n,
-  int k,
-  const double* alpha,
-  const double* const Aarray,
-  int lda,
-  int64_t strideA,
-  const double* const Barray,
-  int ldb,
-  int64_t strideB,
-  const double* beta,
-  double* Carray,
-  int ldc,
-  int64_t strideC,
-  int batchCount,
-  cudaStream_t stream)
-{
+  cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
+  int m, int n, int k, const double *alpha, const double *const Aarray, int lda,
+  int64_t strideA, const double *const Barray, int ldb, int64_t strideB,
+  const double *beta, double *Carray, int ldc, int64_t strideC, int batchCount,
+  cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasDgemmStridedBatched(handle,
-                                   transa,
-                                   transb,
-                                   m,
-                                   n,
-                                   k,
-                                   alpha,
-                                   Aarray,
-                                   lda,
-                                   strideA,
-                                   Barray,
-                                   ldb,
-                                   strideB,
-                                   beta,
-                                   Carray,
-                                   ldc,
-                                   strideC,
-                                   batchCount);
+  return cublasDgemmStridedBatched(handle, transa, transb, m, n, k, alpha,
+                                   Aarray, lda, strideA, Barray, ldb, strideB,
+                                   beta, Carray, ldc, strideC, batchCount);
 }
 /** @} */
 
@@ -585,85 +361,51 @@ inline cublasStatus_t cublasgemmStridedBatched(  // NOLINT
  */
 
 template <typename T>
-cublasStatus_t cublasgetrfBatched(cublasHandle_t handle,
-                                  int n,         // NOLINT
-                                  T* const A[],  // NOLINT
-                                  int lda,
-                                  int* P,
-                                  int* info,
-                                  int batchSize,
+cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, int n,  // NOLINT
+                                  T *const A[],                  // NOLINT
+                                  int lda, int *P, int *info, int batchSize,
                                   cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle,  // NOLINT
-                                         int n,
-                                         float* const A[],  // NOLINT
-                                         int lda,
-                                         int* P,
-                                         int* info,
-                                         int batchSize,
-                                         cudaStream_t stream)
-{
+inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle,    // NOLINT
+                                         int n, float *const A[],  // NOLINT
+                                         int lda, int *P, int *info,
+                                         int batchSize, cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSgetrfBatched(handle, n, A, lda, P, info, batchSize);
 }
 
 template <>
-inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle,  // NOLINT
-                                         int n,
-                                         double* const A[],  // NOLINT
-                                         int lda,
-                                         int* P,
-                                         int* info,
-                                         int batchSize,
-                                         cudaStream_t stream)
-{
+inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle,     // NOLINT
+                                         int n, double *const A[],  // NOLINT
+                                         int lda, int *P, int *info,
+                                         int batchSize, cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDgetrfBatched(handle, n, A, lda, P, info, batchSize);
 }
 
 template <typename T>
-cublasStatus_t cublasgetriBatched(cublasHandle_t handle,
-                                  int n,               // NOLINT
-                                  const T* const A[],  // NOLINT
-                                  int lda,
-                                  const int* P,
-                                  T* const C[],  // NOLINT
-                                  int ldc,
-                                  int* info,
-                                  int batchSize,
+cublasStatus_t cublasgetriBatched(cublasHandle_t handle, int n,  // NOLINT
+                                  const T *const A[],            // NOLINT
+                                  int lda, const int *P,
+                                  T *const C[],  // NOLINT
+                                  int ldc, int *info, int batchSize,
                                   cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublasgetriBatched(  // NOLINT
-  cublasHandle_t handle,
-  int n,
-  const float* const A[],  // NOLINT
-  int lda,
-  const int* P,
-  float* const C[],  // NOLINT
-  int ldc,
-  int* info,
-  int batchSize,
-  cudaStream_t stream)
-{
+inline cublasStatus_t cublasgetriBatched(                // NOLINT
+  cublasHandle_t handle, int n, const float *const A[],  // NOLINT
+  int lda, const int *P, float *const C[],               // NOLINT
+  int ldc, int *info, int batchSize, cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize);
 }
 
 template <>
-inline cublasStatus_t cublasgetriBatched(  // NOLINT
-  cublasHandle_t handle,
-  int n,
-  const double* const A[],  // NOLINT
-  int lda,
-  const int* P,
-  double* const C[],  // NOLINT
-  int ldc,
-  int* info,
-  int batchSize,
-  cudaStream_t stream)
-{
+inline cublasStatus_t cublasgetriBatched(                 // NOLINT
+  cublasHandle_t handle, int n, const double *const A[],  // NOLINT
+  int lda, const int *P, double *const C[],               // NOLINT
+  int ldc, int *info, int batchSize, cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize);
 }
@@ -677,57 +419,34 @@ inline cublasStatus_t cublasgetriBatched(  // NOLINT
 
 template <typename T>
 inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle,  // NOLINT
-                                        cublasOperation_t trans,
-                                        int m,
-                                        int n,
-                                        int nrhs,
-                                        T* Aarray[],  // NOLINT
-                                        int lda,
-                                        T* Carray[],  // NOLINT
-                                        int ldc,
-                                        int* info,
-                                        int* devInfoArray,
-                                        int batchSize,
-                                        cudaStream_t stream);
+                                        cublasOperation_t trans, int m, int n,
+                                        int nrhs, T *Aarray[],  // NOLINT
+                                        int lda, T *Carray[],   // NOLINT
+                                        int ldc, int *info, int *devInfoArray,
+                                        int batchSize, cudaStream_t stream);
 
 template <>
 inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle,  // NOLINT
-                                        cublasOperation_t trans,
-                                        int m,
-                                        int n,
-                                        int nrhs,
-                                        float* Aarray[],  // NOLINT
-                                        int lda,
-                                        float* Carray[],  // NOLINT
-                                        int ldc,
-                                        int* info,
-                                        int* devInfoArray,
-                                        int batchSize,
-                                        cudaStream_t stream)
-{
+                                        cublasOperation_t trans, int m, int n,
+                                        int nrhs, float *Aarray[],  // NOLINT
+                                        int lda, float *Carray[],   // NOLINT
+                                        int ldc, int *info, int *devInfoArray,
+                                        int batchSize, cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasSgelsBatched(
-    handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize);
+  return cublasSgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc,
+                            info, devInfoArray, batchSize);
 }
 
 template <>
 inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle,  // NOLINT
-                                        cublasOperation_t trans,
-                                        int m,
-                                        int n,
-                                        int nrhs,
-                                        double* Aarray[],  // NOLINT
-                                        int lda,
-                                        double* Carray[],  // NOLINT
-                                        int ldc,
-                                        int* info,
-                                        int* devInfoArray,
-                                        int batchSize,
-                                        cudaStream_t stream)
-{
+                                        cublasOperation_t trans, int m, int n,
+                                        int nrhs, double *Aarray[],  // NOLINT
+                                        int lda, double *Carray[],   // NOLINT
+                                        int ldc, int *info, int *devInfoArray,
+                                        int batchSize, cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasDgelsBatched(
-    handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize);
+  return cublasDgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc,
+                            info, devInfoArray, batchSize);
 }
 
 /** @} */
@@ -737,59 +456,33 @@ inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle,  // NOLINT
  * @{
  */
 template <typename T>
-cublasStatus_t cublasgeam(cublasHandle_t handle,
-                          cublasOperation_t transA,
-                          cublasOperation_t transB,
-                          int m,
-                          int n,
-                          const T* alfa,
-                          const T* A,
-                          int lda,
-                          const T* beta,
-                          const T* B,
-                          int ldb,
-                          T* C,
-                          int ldc,
-                          cudaStream_t stream);
+cublasStatus_t cublasgeam(cublasHandle_t handle, cublasOperation_t transA,
+                          cublasOperation_t transB, int m, int n, const T *alfa,
+                          const T *A, int lda, const T *beta, const T *B,
+                          int ldb, T *C, int ldc, cudaStream_t stream);
 
 template <>
 inline cublasStatus_t cublasgeam(cublasHandle_t handle,
                                  cublasOperation_t transA,
-                                 cublasOperation_t transB,
-                                 int m,
-                                 int n,
-                                 const float* alfa,
-                                 const float* A,
-                                 int lda,
-                                 const float* beta,
-                                 const float* B,
-                                 int ldb,
-                                 float* C,
-                                 int ldc,
-                                 cudaStream_t stream)
-{
+                                 cublasOperation_t transB, int m, int n,
+                                 const float *alfa, const float *A, int lda,
+                                 const float *beta, const float *B, int ldb,
+                                 float *C, int ldc, cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasSgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, C, ldc);
+  return cublasSgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb,
+                     C, ldc);
 }
 
 template <>
 inline cublasStatus_t cublasgeam(cublasHandle_t handle,
                                  cublasOperation_t transA,
-                                 cublasOperation_t transB,
-                                 int m,
-                                 int n,
-                                 const double* alfa,
-                                 const double* A,
-                                 int lda,
-                                 const double* beta,
-                                 const double* B,
-                                 int ldb,
-                                 double* C,
-                                 int ldc,
-                                 cudaStream_t stream)
-{
+                                 cublasOperation_t transB, int m, int n,
+                                 const double *alfa, const double *A, int lda,
+                                 const double *beta, const double *B, int ldb,
+                                 double *C, int ldc, cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasDgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, C, ldc);
+  return cublasDgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb,
+                     C, ldc);
 }
 /** @} */
 
@@ -798,59 +491,31 @@ inline cublasStatus_t cublasgeam(cublasHandle_t handle,
  * @{
  */
 template <typename T>
-cublasStatus_t cublassymm(cublasHandle_t handle,
-                          cublasSideMode_t side,
-                          cublasFillMode_t uplo,
-                          int m,
-                          int n,
-                          const T* alpha,
-                          const T* A,
-                          int lda,
-                          const T* B,
-                          int ldb,
-                          const T* beta,
-                          T* C,
-                          int ldc,
-                          cudaStream_t stream);
+cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side,
+                          cublasFillMode_t uplo, int m, int n, const T *alpha,
+                          const T *A, int lda, const T *B, int ldb,
+                          const T *beta, T *C, int ldc, cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublassymm(cublasHandle_t handle,
-                                 cublasSideMode_t side,
-                                 cublasFillMode_t uplo,
-                                 int m,
-                                 int n,
-                                 const float* alpha,
-                                 const float* A,
-                                 int lda,
-                                 const float* B,
-                                 int ldb,
-                                 const float* beta,
-                                 float* C,
-                                 int ldc,
-                                 cudaStream_t stream)
-{
+inline cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side,
+                                 cublasFillMode_t uplo, int m, int n,
+                                 const float *alpha, const float *A, int lda,
+                                 const float *B, int ldb, const float *beta,
+                                 float *C, int ldc, cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasSsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
+  return cublasSsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C,
+                     ldc);
 }
 
 template <>
-inline cublasStatus_t cublassymm(cublasHandle_t handle,
-                                 cublasSideMode_t side,
-                                 cublasFillMode_t uplo,
-                                 int m,
-                                 int n,
-                                 const double* alpha,
-                                 const double* A,
-                                 int lda,
-                                 const double* B,
-                                 int ldb,
-                                 const double* beta,
-                                 double* C,
-                                 int ldc,
-                                 cudaStream_t stream)
-{
+inline cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side,
+                                 cublasFillMode_t uplo, int m, int n,
+                                 const double *alpha, const double *A, int lda,
+                                 const double *B, int ldb, const double *beta,
+                                 double *C, int ldc, cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasDsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
+  return cublasDsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C,
+                     ldc);
 }
 /** @} */
 
@@ -859,51 +524,27 @@ inline cublasStatus_t cublassymm(cublasHandle_t handle,
  * @{
  */
 template <typename T>
-cublasStatus_t cublassyrk(cublasHandle_t handle,
-                          cublasFillMode_t uplo,
-                          cublasOperation_t trans,
-                          int n,
-                          int k,
-                          const T* alpha,
-                          const T* A,
-                          int lda,
-                          const T* beta,
-                          T* C,
-                          int ldc,
+cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo,
+                          cublasOperation_t trans, int n, int k, const T *alpha,
+                          const T *A, int lda, const T *beta, T *C, int ldc,
                           cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublassyrk(cublasHandle_t handle,
-                                 cublasFillMode_t uplo,
-                                 cublasOperation_t trans,
-                                 int n,
-                                 int k,
-                                 const float* alpha,
-                                 const float* A,
-                                 int lda,
-                                 const float* beta,
-                                 float* C,
-                                 int ldc,
-                                 cudaStream_t stream)
-{
+inline cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo,
+                                 cublasOperation_t trans, int n, int k,
+                                 const float *alpha, const float *A, int lda,
+                                 const float *beta, float *C, int ldc,
+                                 cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
 }
 
 template <>
-inline cublasStatus_t cublassyrk(cublasHandle_t handle,
-                                 cublasFillMode_t uplo,
-                                 cublasOperation_t trans,
-                                 int n,
-                                 int k,
-                                 const double* alpha,
-                                 const double* A,
-                                 int lda,
-                                 const double* beta,
-                                 double* C,
-                                 int ldc,
-                                 cudaStream_t stream)
-{
+inline cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo,
+                                 cublasOperation_t trans, int n, int k,
+                                 const double *alpha, const double *A, int lda,
+                                 const double *beta, double *C, int ldc,
+                                 cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
 }
@@ -914,77 +555,52 @@ inline cublasStatus_t cublassyrk(cublasHandle_t handle,
  * @{
  */
 template <typename T>
-cublasStatus_t cublasnrm2(
-  cublasHandle_t handle, int n, const T* x, int incx, T* result, cudaStream_t stream);
+cublasStatus_t cublasnrm2(cublasHandle_t handle, int n, const T *x, int incx,
+                          T *result, cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublasnrm2(
-  cublasHandle_t handle, int n, const float* x, int incx, float* result, cudaStream_t stream)
-{
+inline cublasStatus_t cublasnrm2(cublasHandle_t handle, int n, const float *x,
+                                 int incx, float *result, cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSnrm2(handle, n, x, incx, result);
 }
 
 template <>
-inline cublasStatus_t cublasnrm2(
-  cublasHandle_t handle, int n, const double* x, int incx, double* result, cudaStream_t stream)
-{
+inline cublasStatus_t cublasnrm2(cublasHandle_t handle, int n, const double *x,
+                                 int incx, double *result,
+                                 cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDnrm2(handle, n, x, incx, result);
 }
 /** @} */
 
 template <typename T>
-cublasStatus_t cublastrsm(cublasHandle_t handle,
-                          cublasSideMode_t side,
-                          cublasFillMode_t uplo,
-                          cublasOperation_t trans,
-                          cublasDiagType_t diag,
-                          int m,
-                          int n,
-                          const T* alpha,
-                          const T* A,
-                          int lda,
-                          T* B,
-                          int ldb,
+cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side,
+                          cublasFillMode_t uplo, cublasOperation_t trans,
+                          cublasDiagType_t diag, int m, int n, const T *alpha,
+                          const T *A, int lda, T *B, int ldb,
                           cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublastrsm(cublasHandle_t handle,
-                                 cublasSideMode_t side,
-                                 cublasFillMode_t uplo,
-                                 cublasOperation_t trans,
-                                 cublasDiagType_t diag,
-                                 int m,
-                                 int n,
-                                 const float* alpha,
-                                 const float* A,
-                                 int lda,
-                                 float* B,
-                                 int ldb,
-                                 cudaStream_t stream)
-{
+inline cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side,
+                                 cublasFillMode_t uplo, cublasOperation_t trans,
+                                 cublasDiagType_t diag, int m, int n,
+                                 const float *alpha, const float *A, int lda,
+                                 float *B, int ldb, cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasStrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
+  return cublasStrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B,
+                     ldb);
 }
 
 template <>
-inline cublasStatus_t cublastrsm(cublasHandle_t handle,
-                                 cublasSideMode_t side,
-                                 cublasFillMode_t uplo,
-                                 cublasOperation_t trans,
-                                 cublasDiagType_t diag,
-                                 int m,
-                                 int n,
-                                 const double* alpha,
-                                 const double* A,
-                                 int lda,
-                                 double* B,
-                                 int ldb,
-                                 cudaStream_t stream)
-{
+inline cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side,
+                                 cublasFillMode_t uplo, cublasOperation_t trans,
+                                 cublasDiagType_t diag, int m, int n,
+                                 const double *alpha, const double *A, int lda,
+                                 double *B, int ldb, cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasDtrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
+  return cublasDtrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B,
+                     ldb);
 }
 
 /**
@@ -992,39 +608,21 @@ inline cublasStatus_t cublastrsm(cublasHandle_t handle,
  * @{
  */
 template <typename T>
-cublasStatus_t cublasdot(cublasHandle_t handle,
-                         int n,
-                         const T* x,
-                         int incx,
-                         const T* y,
-                         int incy,
-                         T* result,
-                         cudaStream_t stream);
+cublasStatus_t cublasdot(cublasHandle_t handle, int n, const T *x, int incx,
+                         const T *y, int incy, T *result, cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublasdot(cublasHandle_t handle,
-                                int n,
-                                const float* x,
-                                int incx,
-                                const float* y,
-                                int incy,
-                                float* result,
-                                cudaStream_t stream)
-{
+inline cublasStatus_t cublasdot(cublasHandle_t handle, int n, const float *x,
+                                int incx, const float *y, int incy,
+                                float *result, cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSdot(handle, n, x, incx, y, incy, result);
 }
 
 template <>
-inline cublasStatus_t cublasdot(cublasHandle_t handle,
-                                int n,
-                                const double* x,
-                                int incx,
-                                const double* y,
-                                int incy,
-                                double* result,
-                                cudaStream_t stream)
-{
+inline cublasStatus_t cublasdot(cublasHandle_t handle, int n, const double *x,
+                                int incx, const double *y, int incy,
+                                double *result, cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDdot(handle, n, x, incx, y, incy, result);
 }
@@ -1044,8 +642,7 @@ inline cublasStatus_t cublasdot(cublasHandle_t handle,
 // template<>
 inline cublasStatus_t cublassetpointermode(cublasHandle_t handle,
                                            cublasPointerMode_t mode,
-                                           cudaStream_t stream)
-{
+                                           cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSetPointerMode(handle, mode);
 }
@@ -1056,21 +653,21 @@ inline cublasStatus_t cublassetpointermode(cublasHandle_t handle,
  * @{
  */
 template <typename T>
-cublasStatus_t cublasscal(
-  cublasHandle_t handle, int n, const T* alpha, T* x, int incx, cudaStream_t stream);
+cublasStatus_t cublasscal(cublasHandle_t handle, int n, const T *alpha, T *x,
+                          int incx, cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublasscal(
-  cublasHandle_t handle, int n, const float* alpha, float* x, int incx, cudaStream_t stream)
-{
+inline cublasStatus_t cublasscal(cublasHandle_t handle, int n,
+                                 const float *alpha, float *x, int incx,
+                                 cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSscal(handle, n, alpha, x, incx);
 }
 
 template <>
-inline cublasStatus_t cublasscal(
-  cublasHandle_t handle, int n, const double* alpha, double* x, int incx, cudaStream_t stream)
-{
+inline cublasStatus_t cublasscal(cublasHandle_t handle, int n,
+                                 const double *alpha, double *x, int incx,
+                                 cudaStream_t stream) {
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDscal(handle, n, alpha, x, incx);
 }
diff --git a/cpp/include/raft/linalg/cusolver_wrappers.h b/cpp/include/raft/linalg/cusolver_wrappers.h
index 76a9f40f4d..0eadf47fe3 100644
--- a/cpp/include/raft/linalg/cusolver_wrappers.h
+++ b/cpp/include/raft/linalg/cusolver_wrappers.h
@@ -24,7 +24,8 @@
 #include <type_traits>
 
 #define _CUSOLVER_ERR_TO_STR(err) \
-  case err: return #err;
+  case err:                       \
+    return #err;
 
 namespace raft {
 
@@ -32,15 +33,16 @@ namespace raft {
  * @brief Exception thrown when a cuSOLVER error is encountered.
  */
 struct cusolver_error : public raft::exception {
-  explicit cusolver_error(char const* const message) : raft::exception(message) {}
-  explicit cusolver_error(std::string const& message) : raft::exception(message) {}
+  explicit cusolver_error(char const *const message)
+    : raft::exception(message) {}
+  explicit cusolver_error(std::string const &message)
+    : raft::exception(message) {}
 };
 
 namespace linalg {
 namespace detail {
 
-inline const char* cusolver_error_to_string(cusolverStatus_t err)
-{
+inline const char *cusolver_error_to_string(cusolverStatus_t err) {
   switch (err) {
     _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_SUCCESS);
     _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_INITIALIZED);
@@ -52,7 +54,8 @@ inline const char* cusolver_error_to_string(cusolverStatus_t err)
     _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED);
     _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ZERO_PIVOT);
     _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_SUPPORTED);
-    default: return "CUSOLVER_STATUS_UNKNOWN";
+    default:
+      return "CUSOLVER_STATUS_UNKNOWN";
   };
 }
 
@@ -73,11 +76,8 @@ inline const char* cusolver_error_to_string(cusolverStatus_t err)
     cusolverStatus_t const status = (call);                                  \
     if (CUSOLVER_STATUS_SUCCESS != status) {                                 \
       std::string msg{};                                                     \
-      SET_ERROR_MSG(msg,                                                     \
-                    "cuSOLVER error encountered at: ",                       \
-                    "call='%s', Reason=%d:%s",                               \
-                    #call,                                                   \
-                    status,                                                  \
+      SET_ERROR_MSG(msg, "cuSOLVER error encountered at: ",                  \
+                    "call='%s', Reason=%d:%s", #call, status,                \
                     raft::linalg::detail::cusolver_error_to_string(status)); \
       throw raft::cusolver_error(msg);                                       \
     }                                                                        \
@@ -107,76 +107,42 @@ namespace linalg {
  * @{
  */
 template <typename T>
-cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle,
-                                 int m,  // NOLINT
-                                 int n,
-                                 T* A,
-                                 int lda,
-                                 T* Workspace,
-                                 int* devIpiv,
-                                 int* devInfo,
+cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, int m,  // NOLINT
+                                 int n, T *A, int lda, T *Workspace,
+                                 int *devIpiv, int *devInfo,
                                  cudaStream_t stream);
 
 template <>
 inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle,  // NOLINT
-                                        int m,
-                                        int n,
-                                        float* A,
-                                        int lda,
-                                        float* Workspace,
-                                        int* devIpiv,
-                                        int* devInfo,
-                                        cudaStream_t stream)
-{
+                                        int m, int n, float *A, int lda,
+                                        float *Workspace, int *devIpiv,
+                                        int *devInfo, cudaStream_t stream) {
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnSgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo);
 }
 
 template <>
 inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle,  // NOLINT
-                                        int m,
-                                        int n,
-                                        double* A,
-                                        int lda,
-                                        double* Workspace,
-                                        int* devIpiv,
-                                        int* devInfo,
-                                        cudaStream_t stream)
-{
+                                        int m, int n, double *A, int lda,
+                                        double *Workspace, int *devIpiv,
+                                        int *devInfo, cudaStream_t stream) {
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnDgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo);
 }
 
 template <typename T>
 cusolverStatus_t cusolverDngetrf_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle,
-  int m,
-  int n,
-  T* A,
-  int lda,
-  int* Lwork);
+  cusolverDnHandle_t handle, int m, int n, T *A, int lda, int *Lwork);
 
 template <>
 inline cusolverStatus_t cusolverDngetrf_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle,
-  int m,
-  int n,
-  float* A,
-  int lda,
-  int* Lwork)
-{
+  cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *Lwork) {
   return cusolverDnSgetrf_bufferSize(handle, m, n, A, lda, Lwork);
 }
 
 template <>
 inline cusolverStatus_t cusolverDngetrf_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle,
-  int m,
-  int n,
-  double* A,
-  int lda,
-  int* Lwork)
-{
+  cusolverDnHandle_t handle, int m, int n, double *A, int lda, int *Lwork) {
   return cusolverDnDgetrf_bufferSize(handle, m, n, A, lda, Lwork);
 }
 
@@ -186,49 +152,30 @@ inline cusolverStatus_t cusolverDngetrf_bufferSize(  // NOLINT
  */
 template <typename T>
 cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle,  // NOLINT
-                                 cublasOperation_t trans,
-                                 int n,
-                                 int nrhs,
-                                 const T* A,
-                                 int lda,
-                                 const int* devIpiv,
-                                 T* B,
-                                 int ldb,
-                                 int* devInfo,
-                                 cudaStream_t stream);
+                                 cublasOperation_t trans, int n, int nrhs,
+                                 const T *A, int lda, const int *devIpiv, T *B,
+                                 int ldb, int *devInfo, cudaStream_t stream);
 
 template <>
 inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle,  // NOLINT
-                                        cublasOperation_t trans,
-                                        int n,
-                                        int nrhs,
-                                        const float* A,
-                                        int lda,
-                                        const int* devIpiv,
-                                        float* B,
-                                        int ldb,
-                                        int* devInfo,
-                                        cudaStream_t stream)
-{
+                                        cublasOperation_t trans, int n,
+                                        int nrhs, const float *A, int lda,
+                                        const int *devIpiv, float *B, int ldb,
+                                        int *devInfo, cudaStream_t stream) {
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnSgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo);
+  return cusolverDnSgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb,
+                          devInfo);
 }
 
 template <>
 inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle,  // NOLINT
-                                        cublasOperation_t trans,
-                                        int n,
-                                        int nrhs,
-                                        const double* A,
-                                        int lda,
-                                        const int* devIpiv,
-                                        double* B,
-                                        int ldb,
-                                        int* devInfo,
-                                        cudaStream_t stream)
-{
+                                        cublasOperation_t trans, int n,
+                                        int nrhs, const double *A, int lda,
+                                        const int *devIpiv, double *B, int ldb,
+                                        int *devInfo, cudaStream_t stream) {
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnDgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo);
+  return cusolverDnDgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb,
+                          devInfo);
 }
 /** @} */
 
@@ -238,40 +185,20 @@ inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle,  // NOLINT
  */
 template <typename T>
 cusolverStatus_t cusolverDnsyevd_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle,
-  cusolverEigMode_t jobz,
-  cublasFillMode_t uplo,
-  int n,
-  const T* A,
-  int lda,
-  const T* W,
-  int* lwork);
+  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
+  int n, const T *A, int lda, const T *W, int *lwork);
 
 template <>
 inline cusolverStatus_t cusolverDnsyevd_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle,
-  cusolverEigMode_t jobz,
-  cublasFillMode_t uplo,
-  int n,
-  const float* A,
-  int lda,
-  const float* W,
-  int* lwork)
-{
+  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
+  int n, const float *A, int lda, const float *W, int *lwork) {
   return cusolverDnSsyevd_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnsyevd_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle,
-  cusolverEigMode_t jobz,
-  cublasFillMode_t uplo,
-  int n,
-  const double* A,
-  int lda,
-  const double* W,
-  int* lwork)
-{
+  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
+  int n, const double *A, int lda, const double *W, int *lwork) {
   return cusolverDnDsyevd_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork);
 }
 /** @} */
@@ -282,96 +209,52 @@ inline cusolverStatus_t cusolverDnsyevd_bufferSize(  // NOLINT
  */
 template <typename T>
 cusolverStatus_t cusolverDnsyevj(cusolverDnHandle_t handle,  // NOLINT
-                                 cusolverEigMode_t jobz,
-                                 cublasFillMode_t uplo,
-                                 int n,
-                                 T* A,
-                                 int lda,
-                                 T* W,
-                                 T* work,
-                                 int lwork,
-                                 int* info,
-                                 syevjInfo_t params,
+                                 cusolverEigMode_t jobz, cublasFillMode_t uplo,
+                                 int n, T *A, int lda, T *W, T *work, int lwork,
+                                 int *info, syevjInfo_t params,
                                  cudaStream_t stream);
 
 template <>
 inline cusolverStatus_t cusolverDnsyevj(  // NOLINT
-  cusolverDnHandle_t handle,
-  cusolverEigMode_t jobz,
-  cublasFillMode_t uplo,
-  int n,
-  float* A,
-  int lda,
-  float* W,
-  float* work,
-  int lwork,
-  int* info,
-  syevjInfo_t params,
-  cudaStream_t stream)
-{
+  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
+  int n, float *A, int lda, float *W, float *work, int lwork, int *info,
+  syevjInfo_t params, cudaStream_t stream) {
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnSsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params);
+  return cusolverDnSsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info,
+                          params);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnsyevj(  // NOLINT
-  cusolverDnHandle_t handle,
-  cusolverEigMode_t jobz,
-  cublasFillMode_t uplo,
-  int n,
-  double* A,
-  int lda,
-  double* W,
-  double* work,
-  int lwork,
-  int* info,
-  syevjInfo_t params,
-  cudaStream_t stream)
-{
+  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
+  int n, double *A, int lda, double *W, double *work, int lwork, int *info,
+  syevjInfo_t params, cudaStream_t stream) {
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnDsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params);
+  return cusolverDnDsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info,
+                          params);
 }
 
 template <typename T>
 cusolverStatus_t cusolverDnsyevj_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle,
-  cusolverEigMode_t jobz,
-  cublasFillMode_t uplo,
-  int n,
-  const T* A,
-  int lda,
-  const T* W,
-  int* lwork,
-  syevjInfo_t params);
+  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
+  int n, const T *A, int lda, const T *W, int *lwork, syevjInfo_t params);
 
 template <>
 inline cusolverStatus_t cusolverDnsyevj_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle,
-  cusolverEigMode_t jobz,
-  cublasFillMode_t uplo,
-  int n,
-  const float* A,
-  int lda,
-  const float* W,
-  int* lwork,
-  syevjInfo_t params)
-{
-  return cusolverDnSsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, params);
+  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
+  int n, const float *A, int lda, const float *W, int *lwork,
+  syevjInfo_t params) {
+  return cusolverDnSsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork,
+                                     params);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnsyevj_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle,
-  cusolverEigMode_t jobz,
-  cublasFillMode_t uplo,
-  int n,
-  const double* A,
-  int lda,
-  const double* W,
-  int* lwork,
-  syevjInfo_t params)
-{
-  return cusolverDnDsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, params);
+  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
+  int n, const double *A, int lda, const double *W, int *lwork,
+  syevjInfo_t params) {
+  return cusolverDnDsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork,
+                                     params);
 }
 /** @} */
 
@@ -381,49 +264,32 @@ inline cusolverStatus_t cusolverDnsyevj_bufferSize(  // NOLINT
  */
 template <typename T>
 cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle,  // NOLINT
-                                 cusolverEigMode_t jobz,
-                                 cublasFillMode_t uplo,
-                                 int n,
-                                 T* A,
-                                 int lda,
-                                 T* W,
-                                 T* work,
-                                 int lwork,
-                                 int* devInfo,
-                                 cudaStream_t stream);
+                                 cusolverEigMode_t jobz, cublasFillMode_t uplo,
+                                 int n, T *A, int lda, T *W, T *work, int lwork,
+                                 int *devInfo, cudaStream_t stream);
 
 template <>
 inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle,  // NOLINT
                                         cusolverEigMode_t jobz,
-                                        cublasFillMode_t uplo,
-                                        int n,
-                                        float* A,
-                                        int lda,
-                                        float* W,
-                                        float* work,
-                                        int lwork,
-                                        int* devInfo,
-                                        cudaStream_t stream)
-{
+                                        cublasFillMode_t uplo, int n, float *A,
+                                        int lda, float *W, float *work,
+                                        int lwork, int *devInfo,
+                                        cudaStream_t stream) {
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnSsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, devInfo);
+  return cusolverDnSsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork,
+                          devInfo);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle,  // NOLINT
                                         cusolverEigMode_t jobz,
-                                        cublasFillMode_t uplo,
-                                        int n,
-                                        double* A,
-                                        int lda,
-                                        double* W,
-                                        double* work,
-                                        int lwork,
-                                        int* devInfo,
-                                        cudaStream_t stream)
-{
+                                        cublasFillMode_t uplo, int n, double *A,
+                                        int lda, double *W, double *work,
+                                        int lwork, int *devInfo,
+                                        cudaStream_t stream) {
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnDsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, devInfo);
+  return cusolverDnDsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork,
+                          devInfo);
 }
 /** @} */
 
@@ -431,134 +297,57 @@ inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle,  // NOLINT
 /**
  * @defgroup syevdx cusolver syevdx operations
  * @{
- */
+*/
 template <typename T>
 cusolverStatus_t cusolverDnsyevdx_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle,
-  cusolverEigMode_t jobz,
-  cusolverEigRange_t range,
-  cublasFillMode_t uplo,
-  int n,
-  const T* A,
-  int lda,
-  T vl,
-  T vu,
-  int il,
-  int iu,
-  int* h_meig,
-  const T* W,
-  int* lwork);
+  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
+  cublasFillMode_t uplo, int n, const T *A, int lda, T vl, T vu, int il, int iu,
+  int *h_meig, const T *W, int *lwork);
 
 template <>
 inline cusolverStatus_t cusolverDnsyevdx_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle,
-  cusolverEigMode_t jobz,
-  cusolverEigRange_t range,
-  cublasFillMode_t uplo,
-  int n,
-  const float* A,
-  int lda,
-  float vl,
-  float vu,
-  int il,
-  int iu,
-  int* h_meig,
-  const float* W,
-  int* lwork)
-{
-  return cusolverDnSsyevdx_bufferSize(
-    handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, lwork);
+  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
+  cublasFillMode_t uplo, int n, const float *A, int lda, float vl, float vu,
+  int il, int iu, int *h_meig, const float *W, int *lwork) {
+  return cusolverDnSsyevdx_bufferSize(handle, jobz, range, uplo, n, A, lda, vl,
+                                      vu, il, iu, h_meig, W, lwork);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnsyevdx_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle,
-  cusolverEigMode_t jobz,
-  cusolverEigRange_t range,
-  cublasFillMode_t uplo,
-  int n,
-  const double* A,
-  int lda,
-  double vl,
-  double vu,
-  int il,
-  int iu,
-  int* h_meig,
-  const double* W,
-  int* lwork)
-{
-  return cusolverDnDsyevdx_bufferSize(
-    handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, lwork);
+  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
+  cublasFillMode_t uplo, int n, const double *A, int lda, double vl, double vu,
+  int il, int iu, int *h_meig, const double *W, int *lwork) {
+  return cusolverDnDsyevdx_bufferSize(handle, jobz, range, uplo, n, A, lda, vl,
+                                      vu, il, iu, h_meig, W, lwork);
 }
 
 template <typename T>
 cusolverStatus_t cusolverDnsyevdx(  // NOLINT
-  cusolverDnHandle_t handle,
-  cusolverEigMode_t jobz,
-  cusolverEigRange_t range,
-  cublasFillMode_t uplo,
-  int n,
-  T* A,
-  int lda,
-  T vl,
-  T vu,
-  int il,
-  int iu,
-  int* h_meig,
-  T* W,
-  T* work,
-  int lwork,
-  int* devInfo,
-  cudaStream_t stream);
+  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
+  cublasFillMode_t uplo, int n, T *A, int lda, T vl, T vu, int il, int iu,
+  int *h_meig, T *W, T *work, int lwork, int *devInfo, cudaStream_t stream);
 
 template <>
 inline cusolverStatus_t cusolverDnsyevdx(  // NOLINT
-  cusolverDnHandle_t handle,
-  cusolverEigMode_t jobz,
-  cusolverEigRange_t range,
-  cublasFillMode_t uplo,
-  int n,
-  float* A,
-  int lda,
-  float vl,
-  float vu,
-  int il,
-  int iu,
-  int* h_meig,
-  float* W,
-  float* work,
-  int lwork,
-  int* devInfo,
-  cudaStream_t stream)
-{
+  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
+  cublasFillMode_t uplo, int n, float *A, int lda, float vl, float vu, int il,
+  int iu, int *h_meig, float *W, float *work, int lwork, int *devInfo,
+  cudaStream_t stream) {
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnSsyevdx(
-    handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, work, lwork, devInfo);
+  return cusolverDnSsyevdx(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu,
+                           h_meig, W, work, lwork, devInfo);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnsyevdx(  // NOLINT
-  cusolverDnHandle_t handle,
-  cusolverEigMode_t jobz,
-  cusolverEigRange_t range,
-  cublasFillMode_t uplo,
-  int n,
-  double* A,
-  int lda,
-  double vl,
-  double vu,
-  int il,
-  int iu,
-  int* h_meig,
-  double* W,
-  double* work,
-  int lwork,
-  int* devInfo,
-  cudaStream_t stream)
-{
+  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
+  cublasFillMode_t uplo, int n, double *A, int lda, double vl, double vu,
+  int il, int iu, int *h_meig, double *W, double *work, int lwork, int *devInfo,
+  cudaStream_t stream) {
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnDsyevdx(
-    handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, work, lwork, devInfo);
+  return cusolverDnDsyevdx(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu,
+                           h_meig, W, work, lwork, devInfo);
 }
 /** @} */
 #endif
@@ -569,11 +358,7 @@ inline cusolverStatus_t cusolverDnsyevdx(  // NOLINT
  */
 template <typename T>
 cusolverStatus_t cusolverDngesvd_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle,
-  int m,
-  int n,
-  int* lwork)
-{
+  cusolverDnHandle_t handle, int m, int n, int *lwork) {
   if (std::is_same<std::decay_t<T>, float>::value) {
     return cusolverDnSgesvd_bufferSize(handle, m, n, lwork);
   } else {
@@ -582,194 +367,72 @@ cusolverStatus_t cusolverDngesvd_bufferSize(  // NOLINT
 }
 template <typename T>
 cusolverStatus_t cusolverDngesvd(  // NOLINT
-  cusolverDnHandle_t handle,
-  signed char jobu,
-  signed char jobvt,
-  int m,
-  int n,
-  T* A,
-  int lda,
-  T* S,
-  T* U,
-  int ldu,
-  T* VT,
-  int ldvt,
-  T* work,
-  int lwork,
-  T* rwork,
-  int* devInfo,
-  cudaStream_t stream);
+  cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, int n,
+  T *A, int lda, T *S, T *U, int ldu, T *VT, int ldvt, T *work, int lwork,
+  T *rwork, int *devInfo, cudaStream_t stream);
 template <>
 inline cusolverStatus_t cusolverDngesvd(  // NOLINT
-  cusolverDnHandle_t handle,
-  signed char jobu,
-  signed char jobvt,
-  int m,
-  int n,
-  float* A,
-  int lda,
-  float* S,
-  float* U,
-  int ldu,
-  float* VT,
-  int ldvt,
-  float* work,
-  int lwork,
-  float* rwork,
-  int* devInfo,
-  cudaStream_t stream)
-{
+  cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, int n,
+  float *A, int lda, float *S, float *U, int ldu, float *VT, int ldvt,
+  float *work, int lwork, float *rwork, int *devInfo, cudaStream_t stream) {
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnSgesvd(
-    handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, devInfo);
+  return cusolverDnSgesvd(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT,
+                          ldvt, work, lwork, rwork, devInfo);
 }
 template <>
 inline cusolverStatus_t cusolverDngesvd(  // NOLINT
-  cusolverDnHandle_t handle,
-  signed char jobu,
-  signed char jobvt,
-  int m,
-  int n,
-  double* A,
-  int lda,
-  double* S,
-  double* U,
-  int ldu,
-  double* VT,
-  int ldvt,
-  double* work,
-  int lwork,
-  double* rwork,
-  int* devInfo,
-  cudaStream_t stream)
-{
+  cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, int n,
+  double *A, int lda, double *S, double *U, int ldu, double *VT, int ldvt,
+  double *work, int lwork, double *rwork, int *devInfo, cudaStream_t stream) {
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnDgesvd(
-    handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, devInfo);
+  return cusolverDnDgesvd(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT,
+                          ldvt, work, lwork, rwork, devInfo);
 }
 
 template <typename T>
 inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle,
-  cusolverEigMode_t jobz,
-  int econ,
-  int m,
-  int n,
-  const T* A,
-  int lda,
-  const T* S,
-  const T* U,
-  int ldu,
-  const T* V,
-  int ldv,
-  int* lwork,
-  gesvdjInfo_t params);
+  cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
+  const T *A, int lda, const T *S, const T *U, int ldu, const T *V, int ldv,
+  int *lwork, gesvdjInfo_t params);
 template <>
 inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle,
-  cusolverEigMode_t jobz,
-  int econ,
-  int m,
-  int n,
-  const float* A,
-  int lda,
-  const float* S,
-  const float* U,
-  int ldu,
-  const float* V,
-  int ldv,
-  int* lwork,
-  gesvdjInfo_t params)
-{
-  return cusolverDnSgesvdj_bufferSize(
-    handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, params);
+  cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
+  const float *A, int lda, const float *S, const float *U, int ldu,
+  const float *V, int ldv, int *lwork, gesvdjInfo_t params) {
+  return cusolverDnSgesvdj_bufferSize(handle, jobz, econ, m, n, A, lda, S, U,
+                                      ldu, V, ldv, lwork, params);
 }
 template <>
 inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle,
-  cusolverEigMode_t jobz,
-  int econ,
-  int m,
-  int n,
-  const double* A,
-  int lda,
-  const double* S,
-  const double* U,
-  int ldu,
-  const double* V,
-  int ldv,
-  int* lwork,
-  gesvdjInfo_t params)
-{
-  return cusolverDnDgesvdj_bufferSize(
-    handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, params);
+  cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
+  const double *A, int lda, const double *S, const double *U, int ldu,
+  const double *V, int ldv, int *lwork, gesvdjInfo_t params) {
+  return cusolverDnDgesvdj_bufferSize(handle, jobz, econ, m, n, A, lda, S, U,
+                                      ldu, V, ldv, lwork, params);
 }
 template <typename T>
 inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj(  // NOLINT
-  cusolverDnHandle_t handle,
-  cusolverEigMode_t jobz,
-  int econ,
-  int m,
-  int n,
-  T* A,
-  int lda,
-  T* S,
-  T* U,
-  int ldu,
-  T* V,
-  int ldv,
-  T* work,
-  int lwork,
-  int* info,
-  gesvdjInfo_t params,
-  cudaStream_t stream);
+  cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
+  T *A, int lda, T *S, T *U, int ldu, T *V, int ldv, T *work, int lwork,
+  int *info, gesvdjInfo_t params, cudaStream_t stream);
 template <>
 inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj(  // NOLINT
-  cusolverDnHandle_t handle,
-  cusolverEigMode_t jobz,
-  int econ,
-  int m,
-  int n,
-  float* A,
-  int lda,
-  float* S,
-  float* U,
-  int ldu,
-  float* V,
-  int ldv,
-  float* work,
-  int lwork,
-  int* info,
-  gesvdjInfo_t params,
-  cudaStream_t stream)
-{
+  cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
+  float *A, int lda, float *S, float *U, int ldu, float *V, int ldv,
+  float *work, int lwork, int *info, gesvdjInfo_t params, cudaStream_t stream) {
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnSgesvdj(
-    handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params);
+  return cusolverDnSgesvdj(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv,
+                           work, lwork, info, params);
 }
 template <>
 inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj(  // NOLINT
-  cusolverDnHandle_t handle,
-  cusolverEigMode_t jobz,
-  int econ,
-  int m,
-  int n,
-  double* A,
-  int lda,
-  double* S,
-  double* U,
-  int ldu,
-  double* V,
-  int ldv,
-  double* work,
-  int lwork,
-  int* info,
-  gesvdjInfo_t params,
-  cudaStream_t stream)
-{
+  cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
+  double *A, int lda, double *S, double *U, int ldu, double *V, int ldv,
+  double *work, int lwork, int *info, gesvdjInfo_t params,
+  cudaStream_t stream) {
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnDgesvdj(
-    handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params);
+  return cusolverDnDgesvdj(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv,
+                           work, lwork, info, params);
 }
 /** @} */
 
@@ -779,74 +442,43 @@ inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj(  // NOLINT
  */
 template <typename T>
 cusolverStatus_t cusolverDnpotrf_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle,
-  cublasFillMode_t uplo,
-  int n,
-  T* A,
-  int lda,
-  int* Lwork);
+  cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, T *A, int lda,
+  int *Lwork);
 
 template <>
 inline cusolverStatus_t cusolverDnpotrf_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle,
-  cublasFillMode_t uplo,
-  int n,
-  float* A,
-  int lda,
-  int* Lwork)
-{
+  cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, float *A, int lda,
+  int *Lwork) {
   return cusolverDnSpotrf_bufferSize(handle, uplo, n, A, lda, Lwork);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnpotrf_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle,
-  cublasFillMode_t uplo,
-  int n,
-  double* A,
-  int lda,
-  int* Lwork)
-{
+  cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, double *A, int lda,
+  int *Lwork) {
   return cusolverDnDpotrf_bufferSize(handle, uplo, n, A, lda, Lwork);
 }
 
 template <typename T>
 inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle,  // NOLINT
-                                        cublasFillMode_t uplo,
-                                        int n,
-                                        T* A,
-                                        int lda,
-                                        T* Workspace,
-                                        int Lwork,
-                                        int* devInfo,
-                                        cudaStream_t stream);
+                                        cublasFillMode_t uplo, int n, T *A,
+                                        int lda, T *Workspace, int Lwork,
+                                        int *devInfo, cudaStream_t stream);
 
 template <>
 inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle,  // NOLINT
-                                        cublasFillMode_t uplo,
-                                        int n,
-                                        float* A,
-                                        int lda,
-                                        float* Workspace,
-                                        int Lwork,
-                                        int* devInfo,
-                                        cudaStream_t stream)
-{
+                                        cublasFillMode_t uplo, int n, float *A,
+                                        int lda, float *Workspace, int Lwork,
+                                        int *devInfo, cudaStream_t stream) {
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnSpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle,  // NOLINT
-                                        cublasFillMode_t uplo,
-                                        int n,
-                                        double* A,
-                                        int lda,
-                                        double* Workspace,
-                                        int Lwork,
-                                        int* devInfo,
-                                        cudaStream_t stream)
-{
+                                        cublasFillMode_t uplo, int n, double *A,
+                                        int lda, double *Workspace, int Lwork,
+                                        int *devInfo, cudaStream_t stream) {
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnDpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo);
 }
@@ -858,44 +490,26 @@ inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle,  // NOLINT
  */
 template <typename T>
 cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle,  // NOLINT
-                                 cublasFillMode_t uplo,
-                                 int n,
-                                 int nrhs,
-                                 const T* A,
-                                 int lda,
-                                 T* B,
-                                 int ldb,
-                                 int* devInfo,
-                                 cudaStream_t stream);
+                                 cublasFillMode_t uplo, int n, int nrhs,
+                                 const T *A, int lda, T *B, int ldb,
+                                 int *devInfo, cudaStream_t stream);
 
 template <>
 inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle,  // NOLINT
-                                        cublasFillMode_t uplo,
-                                        int n,
-                                        int nrhs,
-                                        const float* A,
-                                        int lda,
-                                        float* B,
-                                        int ldb,
-                                        int* devInfo,
-                                        cudaStream_t stream)
-{
+                                        cublasFillMode_t uplo, int n, int nrhs,
+                                        const float *A, int lda, float *B,
+                                        int ldb, int *devInfo,
+                                        cudaStream_t stream) {
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnSpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle,  // NOLINT
-                                        cublasFillMode_t uplo,
-                                        int n,
-                                        int nrhs,
-                                        const double* A,
-                                        int lda,
-                                        double* B,
-                                        int ldb,
-                                        int* devInfo,
-                                        cudaStream_t stream)
-{
+                                        cublasFillMode_t uplo, int n, int nrhs,
+                                        const double *A, int lda, double *B,
+                                        int ldb, int *devInfo,
+                                        cudaStream_t stream) {
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnDpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo);
 }
@@ -906,75 +520,38 @@ inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle,  // NOLINT
  * @{
  */
 template <typename T>
-cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle,
-                                 int m,  // NOLINT
-                                 int n,
-                                 T* A,
-                                 int lda,
-                                 T* TAU,
-                                 T* Workspace,
-                                 int Lwork,
-                                 int* devInfo,
-                                 cudaStream_t stream);
+cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, int m,  // NOLINT
+                                 int n, T *A, int lda, T *TAU, T *Workspace,
+                                 int Lwork, int *devInfo, cudaStream_t stream);
 template <>
 inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle,  // NOLINT
-                                        int m,
-                                        int n,
-                                        float* A,
-                                        int lda,
-                                        float* TAU,
-                                        float* Workspace,
-                                        int Lwork,
-                                        int* devInfo,
-                                        cudaStream_t stream)
-{
+                                        int m, int n, float *A, int lda,
+                                        float *TAU, float *Workspace, int Lwork,
+                                        int *devInfo, cudaStream_t stream) {
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnSgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
 }
 template <>
 inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle,  // NOLINT
-                                        int m,
-                                        int n,
-                                        double* A,
-                                        int lda,
-                                        double* TAU,
-                                        double* Workspace,
-                                        int Lwork,
-                                        int* devInfo,
-                                        cudaStream_t stream)
-{
+                                        int m, int n, double *A, int lda,
+                                        double *TAU, double *Workspace,
+                                        int Lwork, int *devInfo,
+                                        cudaStream_t stream) {
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnDgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
 }
 
 template <typename T>
 cusolverStatus_t cusolverDngeqrf_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle,
-  int m,
-  int n,
-  T* A,
-  int lda,
-  int* Lwork);
+  cusolverDnHandle_t handle, int m, int n, T *A, int lda, int *Lwork);
 template <>
 inline cusolverStatus_t cusolverDngeqrf_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle,
-  int m,
-  int n,
-  float* A,
-  int lda,
-  int* Lwork)
-{
+  cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *Lwork) {
   return cusolverDnSgeqrf_bufferSize(handle, m, n, A, lda, Lwork);
 }
 template <>
 inline cusolverStatus_t cusolverDngeqrf_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle,
-  int m,
-  int n,
-  double* A,
-  int lda,
-  int* Lwork)
-{
+  cusolverDnHandle_t handle, int m, int n, double *A, int lda, int *Lwork) {
   return cusolverDnDgeqrf_bufferSize(handle, m, n, A, lda, Lwork);
 }
 /** @} */
@@ -985,86 +562,38 @@ inline cusolverStatus_t cusolverDngeqrf_bufferSize(  // NOLINT
  */
 template <typename T>
 cusolverStatus_t cusolverDnorgqr(  // NOLINT
-  cusolverDnHandle_t handle,
-  int m,
-  int n,
-  int k,
-  T* A,
-  int lda,
-  const T* tau,
-  T* work,
-  int lwork,
-  int* devInfo,
-  cudaStream_t stream);
+  cusolverDnHandle_t handle, int m, int n, int k, T *A, int lda, const T *tau,
+  T *work, int lwork, int *devInfo, cudaStream_t stream);
 template <>
 inline cusolverStatus_t cusolverDnorgqr(  // NOLINT
-  cusolverDnHandle_t handle,
-  int m,
-  int n,
-  int k,
-  float* A,
-  int lda,
-  const float* tau,
-  float* work,
-  int lwork,
-  int* devInfo,
-  cudaStream_t stream)
-{
+  cusolverDnHandle_t handle, int m, int n, int k, float *A, int lda,
+  const float *tau, float *work, int lwork, int *devInfo, cudaStream_t stream) {
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnSorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo);
 }
 template <>
 inline cusolverStatus_t cusolverDnorgqr(  // NOLINT
-  cusolverDnHandle_t handle,
-  int m,
-  int n,
-  int k,
-  double* A,
-  int lda,
-  const double* tau,
-  double* work,
-  int lwork,
-  int* devInfo,
-  cudaStream_t stream)
-{
+  cusolverDnHandle_t handle, int m, int n, int k, double *A, int lda,
+  const double *tau, double *work, int lwork, int *devInfo,
+  cudaStream_t stream) {
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnDorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo);
 }
 
 template <typename T>
 cusolverStatus_t cusolverDnorgqr_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle,
-  int m,
-  int n,
-  int k,
-  const T* A,
-  int lda,
-  const T* TAU,
-  int* lwork);
+  cusolverDnHandle_t handle, int m, int n, int k, const T *A, int lda,
+  const T *TAU, int *lwork);
 template <>
 inline cusolverStatus_t cusolverDnorgqr_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle,
-  int m,
-  int n,
-  int k,
-  const float* A,
-  int lda,
-  const float* TAU,
-  int* lwork)
-{
+  cusolverDnHandle_t handle, int m, int n, int k, const float *A, int lda,
+  const float *TAU, int *lwork) {
   return cusolverDnSorgqr_bufferSize(handle, m, n, k, A, lda, TAU, lwork);
 }
 template <>
 inline cusolverStatus_t cusolverDnorgqr_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle,
-  int m,
-  int n,
-  int k,
-  const double* A,
-  int lda,
-  const double* TAU,
-  int* lwork)
-{
+  cusolverDnHandle_t handle, int m, int n, int k, const double *A, int lda,
+  const double *TAU, int *lwork) {
   return cusolverDnDorgqr_bufferSize(handle, m, n, k, A, lda, TAU, lwork);
 }
 /** @} */
@@ -1075,114 +604,53 @@ inline cusolverStatus_t cusolverDnorgqr_bufferSize(  // NOLINT
  */
 template <typename T>
 cusolverStatus_t cusolverDnormqr(cusolverDnHandle_t handle,  // NOLINT
-                                 cublasSideMode_t side,
-                                 cublasOperation_t trans,
-                                 int m,
-                                 int n,
-                                 int k,
-                                 const T* A,
-                                 int lda,
-                                 const T* tau,
-                                 T* C,
-                                 int ldc,
-                                 T* work,
-                                 int lwork,
-                                 int* devInfo,
-                                 cudaStream_t stream);
+                                 cublasSideMode_t side, cublasOperation_t trans,
+                                 int m, int n, int k, const T *A, int lda,
+                                 const T *tau, T *C, int ldc, T *work,
+                                 int lwork, int *devInfo, cudaStream_t stream);
 
 template <>
 inline cusolverStatus_t cusolverDnormqr(  // NOLINT
-  cusolverDnHandle_t handle,
-  cublasSideMode_t side,
-  cublasOperation_t trans,
-  int m,
-  int n,
-  int k,
-  const float* A,
-  int lda,
-  const float* tau,
-  float* C,
-  int ldc,
-  float* work,
-  int lwork,
-  int* devInfo,
-  cudaStream_t stream)
-{
+  cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
+  int m, int n, int k, const float *A, int lda, const float *tau, float *C,
+  int ldc, float *work, int lwork, int *devInfo, cudaStream_t stream) {
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnSormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo);
+  return cusolverDnSormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc,
+                          work, lwork, devInfo);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnormqr(  // NOLINT
-  cusolverDnHandle_t handle,
-  cublasSideMode_t side,
-  cublasOperation_t trans,
-  int m,
-  int n,
-  int k,
-  const double* A,
-  int lda,
-  const double* tau,
-  double* C,
-  int ldc,
-  double* work,
-  int lwork,
-  int* devInfo,
-  cudaStream_t stream)
-{
+  cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
+  int m, int n, int k, const double *A, int lda, const double *tau, double *C,
+  int ldc, double *work, int lwork, int *devInfo, cudaStream_t stream) {
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnDormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo);
+  return cusolverDnDormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc,
+                          work, lwork, devInfo);
 }
 
 template <typename T>
 cusolverStatus_t cusolverDnormqr_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle,
-  cublasSideMode_t side,
-  cublasOperation_t trans,
-  int m,
-  int n,
-  int k,
-  const T* A,
-  int lda,
-  const T* tau,
-  const T* C,
-  int ldc,
-  int* lwork);
+  cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
+  int m, int n, int k, const T *A, int lda, const T *tau, const T *C, int ldc,
+  int *lwork);
 
 template <>
 inline cusolverStatus_t cusolverDnormqr_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle,
-  cublasSideMode_t side,
-  cublasOperation_t trans,
-  int m,
-  int n,
-  int k,
-  const float* A,
-  int lda,
-  const float* tau,
-  const float* C,
-  int ldc,
-  int* lwork)
-{
-  return cusolverDnSormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
+  cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
+  int m, int n, int k, const float *A, int lda, const float *tau,
+  const float *C, int ldc, int *lwork) {
+  return cusolverDnSormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau,
+                                     C, ldc, lwork);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnormqr_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle,
-  cublasSideMode_t side,
-  cublasOperation_t trans,
-  int m,
-  int n,
-  int k,
-  const double* A,
-  int lda,
-  const double* tau,
-  const double* C,
-  int ldc,
-  int* lwork)
-{
-  return cusolverDnDormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
+  cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
+  int m, int n, int k, const double *A, int lda, const double *tau,
+  const double *C, int ldc, int *lwork) {
+  return cusolverDnDormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau,
+                                     C, ldc, lwork);
 }
 /** @} */
 
@@ -1192,136 +660,62 @@ inline cusolverStatus_t cusolverDnormqr_bufferSize(  // NOLINT
  */
 template <typename T>
 cusolverStatus_t cusolverSpcsrqrBufferInfoBatched(  // NOLINT
-  cusolverSpHandle_t handle,
-  int m,
-  int n,
-  int nnzA,
-  const cusparseMatDescr_t descrA,
-  const T* csrValA,
-  const int* csrRowPtrA,
-  const int* csrColIndA,
-  int batchSize,
-  csrqrInfo_t info,
-  size_t* internalDataInBytes,
-  size_t* workspaceInBytes);
+  cusolverSpHandle_t handle, int m, int n, int nnzA,
+  const cusparseMatDescr_t descrA, const T *csrValA, const int *csrRowPtrA,
+  const int *csrColIndA, int batchSize, csrqrInfo_t info,
+  size_t *internalDataInBytes, size_t *workspaceInBytes);
 
 template <>
 inline cusolverStatus_t cusolverSpcsrqrBufferInfoBatched(  // NOLINT
-  cusolverSpHandle_t handle,
-  int m,
-  int n,
-  int nnzA,
-  const cusparseMatDescr_t descrA,
-  const float* csrValA,
-  const int* csrRowPtrA,
-  const int* csrColIndA,
-  int batchSize,
-  csrqrInfo_t info,
-  size_t* internalDataInBytes,
-  size_t* workspaceInBytes)
-{
-  return cusolverSpScsrqrBufferInfoBatched(handle,
-                                           m,
-                                           n,
-                                           nnzA,
-                                           descrA,
-                                           csrValA,
-                                           csrRowPtrA,
-                                           csrColIndA,
-                                           batchSize,
-                                           info,
-                                           internalDataInBytes,
-                                           workspaceInBytes);
+  cusolverSpHandle_t handle, int m, int n, int nnzA,
+  const cusparseMatDescr_t descrA, const float *csrValA, const int *csrRowPtrA,
+  const int *csrColIndA, int batchSize, csrqrInfo_t info,
+  size_t *internalDataInBytes, size_t *workspaceInBytes) {
+  return cusolverSpScsrqrBufferInfoBatched(
+    handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, batchSize,
+    info, internalDataInBytes, workspaceInBytes);
 }
 
 template <>
 inline cusolverStatus_t cusolverSpcsrqrBufferInfoBatched(  // NOLINT
-  cusolverSpHandle_t handle,
-  int m,
-  int n,
-  int nnzA,
-  const cusparseMatDescr_t descrA,
-  const double* csrValA,
-  const int* csrRowPtrA,
-  const int* csrColIndA,
-  int batchSize,
-  csrqrInfo_t info,
-  size_t* internalDataInBytes,
-  size_t* workspaceInBytes)
-{
-  return cusolverSpDcsrqrBufferInfoBatched(handle,
-                                           m,
-                                           n,
-                                           nnzA,
-                                           descrA,
-                                           csrValA,
-                                           csrRowPtrA,
-                                           csrColIndA,
-                                           batchSize,
-                                           info,
-                                           internalDataInBytes,
-                                           workspaceInBytes);
+  cusolverSpHandle_t handle, int m, int n, int nnzA,
+  const cusparseMatDescr_t descrA, const double *csrValA, const int *csrRowPtrA,
+  const int *csrColIndA, int batchSize, csrqrInfo_t info,
+  size_t *internalDataInBytes, size_t *workspaceInBytes) {
+  return cusolverSpDcsrqrBufferInfoBatched(
+    handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, batchSize,
+    info, internalDataInBytes, workspaceInBytes);
 }
 
 template <typename T>
 cusolverStatus_t cusolverSpcsrqrsvBatched(  // NOLINT
-  cusolverSpHandle_t handle,
-  int m,
-  int n,
-  int nnzA,
-  const cusparseMatDescr_t descrA,
-  const T* csrValA,
-  const int* csrRowPtrA,
-  const int* csrColIndA,
-  const T* b,
-  T* x,
-  int batchSize,
-  csrqrInfo_t info,
-  void* pBuffer,
-  cudaStream_t stream);
+  cusolverSpHandle_t handle, int m, int n, int nnzA,
+  const cusparseMatDescr_t descrA, const T *csrValA, const int *csrRowPtrA,
+  const int *csrColIndA, const T *b, T *x, int batchSize, csrqrInfo_t info,
+  void *pBuffer, cudaStream_t stream);
 
 template <>
 inline cusolverStatus_t cusolverSpcsrqrsvBatched(  // NOLINT
-  cusolverSpHandle_t handle,
-  int m,
-  int n,
-  int nnzA,
-  const cusparseMatDescr_t descrA,
-  const float* csrValA,
-  const int* csrRowPtrA,
-  const int* csrColIndA,
-  const float* b,
-  float* x,
-  int batchSize,
-  csrqrInfo_t info,
-  void* pBuffer,
-  cudaStream_t stream)
-{
+  cusolverSpHandle_t handle, int m, int n, int nnzA,
+  const cusparseMatDescr_t descrA, const float *csrValA, const int *csrRowPtrA,
+  const int *csrColIndA, const float *b, float *x, int batchSize,
+  csrqrInfo_t info, void *pBuffer, cudaStream_t stream) {
   CUSOLVER_CHECK(cusolverSpSetStream(handle, stream));
-  return cusolverSpScsrqrsvBatched(
-    handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, b, x, batchSize, info, pBuffer);
+  return cusolverSpScsrqrsvBatched(handle, m, n, nnzA, descrA, csrValA,
+                                   csrRowPtrA, csrColIndA, b, x, batchSize,
+                                   info, pBuffer);
 }
 
 template <>
 inline cusolverStatus_t cusolverSpcsrqrsvBatched(  // NOLINT
-  cusolverSpHandle_t handle,
-  int m,
-  int n,
-  int nnzA,
-  const cusparseMatDescr_t descrA,
-  const double* csrValA,
-  const int* csrRowPtrA,
-  const int* csrColIndA,
-  const double* b,
-  double* x,
-  int batchSize,
-  csrqrInfo_t info,
-  void* pBuffer,
-  cudaStream_t stream)
-{
+  cusolverSpHandle_t handle, int m, int n, int nnzA,
+  const cusparseMatDescr_t descrA, const double *csrValA, const int *csrRowPtrA,
+  const int *csrColIndA, const double *b, double *x, int batchSize,
+  csrqrInfo_t info, void *pBuffer, cudaStream_t stream) {
   CUSOLVER_CHECK(cusolverSpSetStream(handle, stream));
-  return cusolverSpDcsrqrsvBatched(
-    handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, b, x, batchSize, info, pBuffer);
+  return cusolverSpDcsrqrsvBatched(handle, m, n, nnzA, descrA, csrValA,
+                                   csrRowPtrA, csrColIndA, b, x, batchSize,
+                                   info, pBuffer);
 }
 /** @} */
 
diff --git a/cpp/include/raft/linalg/divide.cuh b/cpp/include/raft/linalg/divide.cuh
index 562a3d8991..c848ac1f4b 100644
--- a/cpp/include/raft/linalg/divide.cuh
+++ b/cpp/include/raft/linalg/divide.cuh
@@ -33,10 +33,11 @@ namespace linalg {
  * @{
  */
 template <typename math_t, typename IdxType = int>
-void divideScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream)
-{
+void divideScalar(math_t *out, const math_t *in, math_t scalar, IdxType len,
+                  cudaStream_t stream) {
   unaryOp(
-    out, in, len, [scalar] __device__(math_t in) { return in / scalar; }, stream);
+    out, in, len, [scalar] __device__(math_t in) { return in / scalar; },
+    stream);
 }
 /** @} */
 
diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh
index 75e77ac0ce..6172618380 100644
--- a/cpp/include/raft/linalg/eig.cuh
+++ b/cpp/include/raft/linalg/eig.cuh
@@ -41,43 +41,26 @@ namespace linalg {
  * @{
  */
 template <typename math_t>
-void eigDC(const raft::handle_t& handle,
-           const math_t* in,
-           int n_rows,
-           int n_cols,
-           math_t* eig_vectors,
-           math_t* eig_vals,
-           cudaStream_t stream)
-{
-  auto allocator               = handle.get_device_allocator();
+void eigDC(const raft::handle_t &handle, const math_t *in, int n_rows,
+           int n_cols, math_t *eig_vectors, math_t *eig_vals,
+           cudaStream_t stream) {
+  auto allocator = handle.get_device_allocator();
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
 
   int lwork;
-  CUSOLVER_CHECK(cusolverDnsyevd_bufferSize(cusolverH,
-                                            CUSOLVER_EIG_MODE_VECTOR,
-                                            CUBLAS_FILL_MODE_UPPER,
-                                            n_rows,
-                                            in,
-                                            n_cols,
-                                            eig_vals,
-                                            &lwork));
+  CUSOLVER_CHECK(cusolverDnsyevd_bufferSize(cusolverH, CUSOLVER_EIG_MODE_VECTOR,
+                                            CUBLAS_FILL_MODE_UPPER, n_rows, in,
+                                            n_cols, eig_vals, &lwork));
 
   raft::mr::device::buffer<math_t> d_work(allocator, stream, lwork);
   raft::mr::device::buffer<int> d_dev_info(allocator, stream, 1);
 
   raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream);
 
-  CUSOLVER_CHECK(cusolverDnsyevd(cusolverH,
-                                 CUSOLVER_EIG_MODE_VECTOR,
-                                 CUBLAS_FILL_MODE_UPPER,
-                                 n_rows,
-                                 eig_vectors,
-                                 n_cols,
-                                 eig_vals,
-                                 d_work.data(),
-                                 lwork,
-                                 d_dev_info.data(),
-                                 stream));
+  CUSOLVER_CHECK(cusolverDnsyevd(cusolverH, CUSOLVER_EIG_MODE_VECTOR,
+                                 CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors,
+                                 n_cols, eig_vals, d_work.data(), lwork,
+                                 d_dev_info.data(), stream));
   CUDA_CHECK(cudaGetLastError());
 
   int dev_info;
@@ -107,80 +90,39 @@ enum EigVecMemUsage { OVERWRITE_INPUT, COPY_INPUT };
  * @{
  */
 template <typename math_t>
-void eigSelDC(const raft::handle_t& handle,
-              math_t* in,
-              int n_rows,
-              int n_cols,
-              int n_eig_vals,
-              math_t* eig_vectors,
-              math_t* eig_vals,
-              EigVecMemUsage memUsage,
-              cudaStream_t stream)
-{
-  auto allocator               = handle.get_device_allocator();
+void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols,
+              int n_eig_vals, math_t *eig_vectors, math_t *eig_vals,
+              EigVecMemUsage memUsage, cudaStream_t stream) {
+  auto allocator = handle.get_device_allocator();
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
 
   int lwork;
   int h_meig;
 
-  CUSOLVER_CHECK(cusolverDnsyevdx_bufferSize(cusolverH,
-                                             CUSOLVER_EIG_MODE_VECTOR,
-                                             CUSOLVER_EIG_RANGE_I,
-                                             CUBLAS_FILL_MODE_UPPER,
-                                             n_rows,
-                                             in,
-                                             n_cols,
-                                             math_t(0.0),
-                                             math_t(0.0),
-                                             n_cols - n_eig_vals + 1,
-                                             n_cols,
-                                             &h_meig,
-                                             eig_vals,
-                                             &lwork));
+  CUSOLVER_CHECK(cusolverDnsyevdx_bufferSize(
+    cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I,
+    CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0),
+    n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, &lwork));
 
   raft::mr::device::buffer<math_t> d_work(allocator, stream, lwork);
   raft::mr::device::buffer<int> d_dev_info(allocator, stream, 1);
   raft::mr::device::buffer<math_t> d_eig_vectors(allocator, stream, 0);
 
   if (memUsage == OVERWRITE_INPUT) {
-    CUSOLVER_CHECK(cusolverDnsyevdx(cusolverH,
-                                    CUSOLVER_EIG_MODE_VECTOR,
-                                    CUSOLVER_EIG_RANGE_I,
-                                    CUBLAS_FILL_MODE_UPPER,
-                                    n_rows,
-                                    in,
-                                    n_cols,
-                                    math_t(0.0),
-                                    math_t(0.0),
-                                    n_cols - n_eig_vals + 1,
-                                    n_cols,
-                                    &h_meig,
-                                    eig_vals,
-                                    d_work.data(),
-                                    lwork,
-                                    d_dev_info.data(),
-                                    stream));
+    CUSOLVER_CHECK(cusolverDnsyevdx(
+      cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I,
+      CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0),
+      n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, d_work.data(), lwork,
+      d_dev_info.data(), stream));
   } else if (memUsage == COPY_INPUT) {
     d_eig_vectors.resize(n_rows * n_cols, stream);
     raft::matrix::copy(in, d_eig_vectors.data(), n_rows, n_cols, stream);
 
-    CUSOLVER_CHECK(cusolverDnsyevdx(cusolverH,
-                                    CUSOLVER_EIG_MODE_VECTOR,
-                                    CUSOLVER_EIG_RANGE_I,
-                                    CUBLAS_FILL_MODE_UPPER,
-                                    n_rows,
-                                    eig_vectors,
-                                    n_cols,
-                                    math_t(0.0),
-                                    math_t(0.0),
-                                    n_cols - n_eig_vals + 1,
-                                    n_cols,
-                                    &h_meig,
-                                    eig_vals,
-                                    d_work.data(),
-                                    lwork,
-                                    d_dev_info.data(),
-                                    stream));
+    CUSOLVER_CHECK(cusolverDnsyevdx(
+      cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I,
+      CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, n_cols, math_t(0.0),
+      math_t(0.0), n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals,
+      d_work.data(), lwork, d_dev_info.data(), stream));
   }
 
   CUDA_CHECK(cudaGetLastError());
@@ -193,10 +135,11 @@ void eigSelDC(const raft::handle_t& handle,
          "This usually occurs when some of the features do not vary enough.");
 
   if (memUsage == OVERWRITE_INPUT) {
-    raft::matrix::truncZeroOrigin(in, n_rows, eig_vectors, n_rows, n_eig_vals, stream);
+    raft::matrix::truncZeroOrigin(in, n_rows, eig_vectors, n_rows, n_eig_vals,
+                                  stream);
   } else if (memUsage == COPY_INPUT) {
-    raft::matrix::truncZeroOrigin(
-      d_eig_vectors.data(), n_rows, eig_vectors, n_rows, n_eig_vals, stream);
+    raft::matrix::truncZeroOrigin(d_eig_vectors.data(), n_rows, eig_vectors,
+                                  n_rows, n_eig_vals, stream);
   }
 }
 
@@ -217,17 +160,10 @@ void eigSelDC(const raft::handle_t& handle,
  * @{
  */
 template <typename math_t>
-void eigJacobi(const raft::handle_t& handle,
-               const math_t* in,
-               int n_rows,
-               int n_cols,
-               math_t* eig_vectors,
-               math_t* eig_vals,
-               cudaStream_t stream,
-               math_t tol = 1.e-7,
-               int sweeps = 15)
-{
-  auto allocator               = handle.get_device_allocator();
+void eigJacobi(const raft::handle_t &handle, const math_t *in, int n_rows,
+               int n_cols, math_t *eig_vectors, math_t *eig_vals,
+               cudaStream_t stream, math_t tol = 1.e-7, int sweeps = 15) {
+  auto allocator = handle.get_device_allocator();
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
 
   syevjInfo_t syevj_params = nullptr;
@@ -236,36 +172,23 @@ void eigJacobi(const raft::handle_t& handle,
   CUSOLVER_CHECK(cusolverDnXsyevjSetMaxSweeps(syevj_params, sweeps));
 
   int lwork;
-  CUSOLVER_CHECK(cusolverDnsyevj_bufferSize(cusolverH,
-                                            CUSOLVER_EIG_MODE_VECTOR,
-                                            CUBLAS_FILL_MODE_UPPER,
-                                            n_rows,
-                                            eig_vectors,
-                                            n_cols,
-                                            eig_vals,
-                                            &lwork,
-                                            syevj_params));
+  CUSOLVER_CHECK(cusolverDnsyevj_bufferSize(
+    cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, n_rows,
+    eig_vectors, n_cols, eig_vals, &lwork, syevj_params));
 
   raft::mr::device::buffer<math_t> d_work(allocator, stream, lwork);
   raft::mr::device::buffer<int> dev_info(allocator, stream, 1);
 
   raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream);
 
-  CUSOLVER_CHECK(cusolverDnsyevj(cusolverH,
-                                 CUSOLVER_EIG_MODE_VECTOR,
-                                 CUBLAS_FILL_MODE_UPPER,
-                                 n_rows,
-                                 eig_vectors,
-                                 n_cols,
-                                 eig_vals,
-                                 d_work.data(),
-                                 lwork,
-                                 dev_info.data(),
-                                 syevj_params,
-                                 stream));
+  CUSOLVER_CHECK(cusolverDnsyevj(cusolverH, CUSOLVER_EIG_MODE_VECTOR,
+                                 CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors,
+                                 n_cols, eig_vals, d_work.data(), lwork,
+                                 dev_info.data(), syevj_params, stream));
 
   int executed_sweeps;
-  CUSOLVER_CHECK(cusolverDnXsyevjGetSweeps(cusolverH, syevj_params, &executed_sweeps));
+  CUSOLVER_CHECK(
+    cusolverDnXsyevjGetSweeps(cusolverH, syevj_params, &executed_sweeps));
 
   CUDA_CHECK(cudaGetLastError());
   CUSOLVER_CHECK(cusolverDnDestroySyevjInfo(syevj_params));
diff --git a/cpp/include/raft/linalg/eltwise.cuh b/cpp/include/raft/linalg/eltwise.cuh
index 097c3ac218..1c6dee562d 100644
--- a/cpp/include/raft/linalg/eltwise.cuh
+++ b/cpp/include/raft/linalg/eltwise.cuh
@@ -34,17 +34,19 @@ namespace linalg {
  * @{
  */
 template <typename InType, typename IdxType, typename OutType = InType>
-void scalarAdd(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream)
-{
+void scalarAdd(OutType *out, const InType *in, InType scalar, IdxType len,
+               cudaStream_t stream) {
   raft::linalg::unaryOp(
-    out, in, len, [scalar] __device__(InType in) { return in + scalar; }, stream);
+    out, in, len, [scalar] __device__(InType in) { return in + scalar; },
+    stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
-void scalarMultiply(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream)
-{
+void scalarMultiply(OutType *out, const InType *in, InType scalar, IdxType len,
+                    cudaStream_t stream) {
   raft::linalg::unaryOp(
-    out, in, len, [scalar] __device__(InType in) { return in * scalar; }, stream);
+    out, in, len, [scalar] __device__(InType in) { return in * scalar; },
+    stream);
 }
 /** @} */
 
@@ -60,46 +62,42 @@ void scalarMultiply(OutType* out, const InType* in, InType scalar, IdxType len,
  * @{
  */
 template <typename InType, typename IdxType, typename OutType = InType>
-void eltwiseAdd(
-  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
-{
+void eltwiseAdd(OutType *out, const InType *in1, const InType *in2, IdxType len,
+                cudaStream_t stream) {
   binaryOp(
-    out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, stream);
+    out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; },
+    stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
-void eltwiseSub(
-  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
-{
+void eltwiseSub(OutType *out, const InType *in1, const InType *in2, IdxType len,
+                cudaStream_t stream) {
   binaryOp(
-    out, in1, in2, len, [] __device__(InType a, InType b) { return a - b; }, stream);
+    out, in1, in2, len, [] __device__(InType a, InType b) { return a - b; },
+    stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
-void eltwiseMultiply(
-  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
-{
+void eltwiseMultiply(OutType *out, const InType *in1, const InType *in2,
+                     IdxType len, cudaStream_t stream) {
   binaryOp(
-    out, in1, in2, len, [] __device__(InType a, InType b) { return a * b; }, stream);
+    out, in1, in2, len, [] __device__(InType a, InType b) { return a * b; },
+    stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
-void eltwiseDivide(
-  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
-{
+void eltwiseDivide(OutType *out, const InType *in1, const InType *in2,
+                   IdxType len, cudaStream_t stream) {
   binaryOp(
-    out, in1, in2, len, [] __device__(InType a, InType b) { return a / b; }, stream);
+    out, in1, in2, len, [] __device__(InType a, InType b) { return a / b; },
+    stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
-void eltwiseDivideCheckZero(
-  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
-{
+void eltwiseDivideCheckZero(OutType *out, const InType *in1, const InType *in2,
+                            IdxType len, cudaStream_t stream) {
   binaryOp(
-    out,
-    in1,
-    in2,
-    len,
+    out, in1, in2, len,
     [] __device__(InType a, InType b) {
       if (b == InType(0.0))
         return InType(0.0);
diff --git a/cpp/include/raft/linalg/gemm.cuh b/cpp/include/raft/linalg/gemm.cuh
index d5942b7446..0a4897cc0b 100644
--- a/cpp/include/raft/linalg/gemm.cuh
+++ b/cpp/include/raft/linalg/gemm.cuh
@@ -43,53 +43,35 @@ namespace linalg {
  * @param stream cuda stream
  */
 template <typename math_t>
-void gemm(const raft::handle_t& handle,
-          const math_t* a,
-          int n_rows_a,
-          int n_cols_a,
-          const math_t* b,
-          math_t* c,
-          int n_rows_c,
-          int n_cols_c,
-          cublasOperation_t trans_a,
-          cublasOperation_t trans_b,
-          math_t alpha,
-          math_t beta,
-          cudaStream_t stream)
-{
+void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a,
+          int n_cols_a, const math_t *b, math_t *c, int n_rows_c, int n_cols_c,
+          cublasOperation_t trans_a, cublasOperation_t trans_b, math_t alpha,
+          math_t beta, cudaStream_t stream) {
   cublasHandle_t cublas_h = handle.get_cublas_handle();
 
-  int m   = n_rows_c;
-  int n   = n_cols_c;
-  int k   = trans_a == CUBLAS_OP_T ? n_rows_a : n_cols_a;
+  int m = n_rows_c;
+  int n = n_cols_c;
+  int k = trans_a == CUBLAS_OP_T ? n_rows_a : n_cols_a;
   int lda = trans_a == CUBLAS_OP_T ? k : m;
   int ldb = trans_b == CUBLAS_OP_T ? n : k;
   int ldc = m;
-  CUBLAS_CHECK(
-    cublasgemm(cublas_h, trans_a, trans_b, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc, stream));
+  CUBLAS_CHECK(cublasgemm(cublas_h, trans_a, trans_b, m, n, k, &alpha, a, lda,
+                          b, ldb, &beta, c, ldc, stream));
 }
 
 template <typename math_t>
-void gemm(const raft::handle_t& handle,
-          const math_t* a,
-          int n_rows_a,
-          int n_cols_a,
-          const math_t* b,
-          math_t* c,
-          int n_rows_c,
-          int n_cols_c,
-          cublasOperation_t trans_a,
-          cublasOperation_t trans_b,
-          cudaStream_t stream)
-{
+void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a,
+          int n_cols_a, const math_t *b, math_t *c, int n_rows_c, int n_cols_c,
+          cublasOperation_t trans_a, cublasOperation_t trans_b,
+          cudaStream_t stream) {
   math_t alpha = math_t(1);
-  math_t beta  = math_t(0);
-  gemm(
-    handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream);
+  math_t beta = math_t(0);
+  gemm(handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a,
+       trans_b, alpha, beta, stream);
 }
 
 /**
- * @brief A wrapper for CUBLS GEMM function designed for handling all possible
+ * @brief A wrapper for CUBLS GEMM function designed for handling all possible 
  * combinations of operand layouts.
  * It computes the following equation: Z = alpha . X * Y + beta . Z
  * @tparam T Data type of input/output matrices (float/double)
@@ -108,20 +90,9 @@ void gemm(const raft::handle_t& handle,
  * @param beta scalar
  */
 template <typename T>
-void gemm(const raft::handle_t& handle,
-          T* z,
-          T* x,
-          T* y,
-          int _M,
-          int _N,
-          int _K,
-          bool isZColMajor,
-          bool isXColMajor,
-          bool isYColMajor,
-          cudaStream_t stream,
-          T alpha = T(1.0),
-          T beta  = T(0.0))
-{
+void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N,
+          int _K, bool isZColMajor, bool isXColMajor, bool isYColMajor,
+          cudaStream_t stream, T alpha = T(1.0), T beta = T(0.0)) {
   cublasHandle_t cublas_h = handle.get_cublas_handle();
 
   cublasOperation_t trans_a, trans_b;
@@ -148,13 +119,13 @@ void gemm(const raft::handle_t& handle,
     // therefore trans_x needs to be CUBLAS_OP_T. If x is in column major
     // layout, trans_b needs to be CUBLAS_OP_N.
     trans_b = isYColMajor == true ? CUBLAS_OP_N : CUBLAS_OP_T;
-    ldb     = isYColMajor == true ? _K : _N;
+    ldb = isYColMajor == true ? _K : _N;
 
-    c   = z;
+    c = z;
     ldc = _M;
-    M   = _M;
-    N   = _N;
-    K   = _K;
+    M = _M;
+    N = _N;
+    K = _K;
   } else {
     // Result c is required in row major layout Thus we pick
     // a = y, b = x and c = a * b = y * x
@@ -183,7 +154,7 @@ void gemm(const raft::handle_t& handle,
     // Set leading dimension appropriately
     ldb = isXColMajor == true ? _M : _K;
 
-    c   = z;
+    c = z;
     ldc = _N;
 
     M = _N;
@@ -191,8 +162,8 @@ void gemm(const raft::handle_t& handle,
     K = _K;
   }
   // Actual cuBLAS call
-  CUBLAS_CHECK(
-    cublasgemm(cublas_h, trans_a, trans_b, M, N, K, &alpha, a, lda, b, ldb, &beta, c, ldc, stream));
+  CUBLAS_CHECK(cublasgemm(cublas_h, trans_a, trans_b, M, N, K, &alpha, a, lda,
+                          b, ldb, &beta, c, ldc, stream));
 }
 
 }  // end namespace linalg
diff --git a/cpp/include/raft/linalg/gemv.h b/cpp/include/raft/linalg/gemv.h
index a78480bb21..edd18b3bee 100644
--- a/cpp/include/raft/linalg/gemv.h
+++ b/cpp/include/raft/linalg/gemv.h
@@ -26,19 +26,9 @@ namespace raft {
 namespace linalg {
 
 template <typename math_t>
-void gemv(const raft::handle_t& handle,
-          const math_t* a,
-          int n_rows,
-          int n_cols,
-          const math_t* x,
-          int incx,
-          math_t* y,
-          int incy,
-          bool trans_a,
-          math_t alpha,
-          math_t beta,
-          cudaStream_t stream)
-{
+void gemv(const raft::handle_t& handle, const math_t* a, int n_rows, int n_cols,
+          const math_t* x, int incx, math_t* y, int incy, bool trans_a,
+          math_t alpha, math_t beta, cudaStream_t stream) {
   cublasHandle_t cublas_h = handle.get_cublas_handle();
 
   cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
@@ -50,47 +40,33 @@ void gemv(const raft::handle_t& handle,
   //  n - number of columns in input matrix
   //  lda - purpose of it  to have ability to operate on submatrices of matrix without copying.
   //        If you're not think about it it's always should be equal to m
-  //  lda has deal with memory layout, but has nothing with the requirement for cuBLAS perform
-  //  transpose
+  //  lda has deal with memory layout, but has nothing with the requirement for cuBLAS perform transpose
 
   // In Machine Learning:
   //  m - nunmber of columns in design matrix(number of features)
   //  n - number of rows in designed matrix (number of train examples)
 
-  int m   = n_rows;
-  int n   = n_cols;
+  int m = n_rows;
+  int n = n_cols;
   int lda = trans_a ? m : n;
 
-  CUBLAS_CHECK(cublasgemv(cublas_h, op_a, m, n, &alpha, a, lda, x, incx, &beta, y, incy, stream));
+  CUBLAS_CHECK(cublasgemv(cublas_h, op_a, m, n, &alpha, a, lda, x, incx, &beta,
+                          y, incy, stream));
 }
 
 template <typename math_t>
-void gemv(const raft::handle_t& handle,
-          const math_t* a,
-          int n_rows_a,
-          int n_cols_a,
-          const math_t* x,
-          math_t* y,
-          bool trans_a,
-          math_t alpha,
-          math_t beta,
-          cudaStream_t stream)
-{
+void gemv(const raft::handle_t& handle, const math_t* a, int n_rows_a,
+          int n_cols_a, const math_t* x, math_t* y, bool trans_a, math_t alpha,
+          math_t beta, cudaStream_t stream) {
   gemv(handle, a, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream);
 }
 
 template <typename math_t>
-void gemv(const raft::handle_t& handle,
-          const math_t* a,
-          int n_rows_a,
-          int n_cols_a,
-          const math_t* x,
-          math_t* y,
-          bool trans_a,
-          cudaStream_t stream)
-{
+void gemv(const raft::handle_t& handle, const math_t* a, int n_rows_a,
+          int n_cols_a, const math_t* x, math_t* y, bool trans_a,
+          cudaStream_t stream) {
   math_t alpha = math_t(1);
-  math_t beta  = math_t(0);
+  math_t beta = math_t(0);
 
   gemv(handle, a, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream);
 }
diff --git a/cpp/include/raft/linalg/init.h b/cpp/include/raft/linalg/init.h
index 2086172f5d..cb2e8ed1ab 100644
--- a/cpp/include/raft/linalg/init.h
+++ b/cpp/include/raft/linalg/init.h
@@ -36,8 +36,7 @@ namespace {
  * \param [in] stream cuda stream
  */
 template <typename T>
-void range(T* out, int start, int end, cudaStream_t stream)
-{
+void range(T *out, int start, int end, cudaStream_t stream) {
   thrust::counting_iterator<int> first(start);
   thrust::counting_iterator<int> last = first + (end - start);
   thrust::device_ptr<T> ptr(out);
@@ -54,8 +53,7 @@ void range(T* out, int start, int end, cudaStream_t stream)
  * \param [in] stream cuda stream
  */
 template <typename T, int TPB = 256>
-void range(T* out, int n, cudaStream_t stream)
-{
+void range(T *out, int n, cudaStream_t stream) {
   range(out, 0, n, stream);
 }
 }  // unnamed namespace
diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp
index 39089473e3..b775a1f696 100644
--- a/cpp/include/raft/linalg/lanczos.hpp
+++ b/cpp/include/raft/linalg/lanczos.hpp
@@ -16,7 +16,7 @@
 
 #pragma once
 
-// for cmath:
+//for cmath:
 #define _USE_MATH_DEFINES
 
 #include <cmath>
@@ -40,14 +40,14 @@ using namespace linalg;
 namespace spectral {
 
 // curandGeneratorNormalX
-inline curandStatus_t curandGenerateNormalX(
-  curandGenerator_t generator, float* outputPtr, size_t n, float mean, float stddev)
-{
+inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator,
+                                            float *outputPtr, size_t n,
+                                            float mean, float stddev) {
   return curandGenerateNormal(generator, outputPtr, n, mean, stddev);
 }
-inline curandStatus_t curandGenerateNormalX(
-  curandGenerator_t generator, double* outputPtr, size_t n, double mean, double stddev)
-{
+inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator,
+                                            double *outputPtr, size_t n,
+                                            double mean, double stddev) {
   return curandGenerateNormalDouble(generator, outputPtr, n, mean, stddev);
 }
 
@@ -55,7 +55,7 @@ inline curandStatus_t curandGenerateNormalX(
 // Helper functions
 // =========================================================
 
-/**
+/**  
  *  @brief  Perform Lanczos iteration
  *    Lanczos iteration is performed on a shifted matrix A+shift*I.
  *  @tparam index_type_t the type of data used for indexing.
@@ -85,30 +85,25 @@ inline curandStatus_t curandGenerateNormalX(
  *  @return Zero if successful. Otherwise non-zero.
  */
 template <typename index_type_t, typename value_type_t>
-int performLanczosIteration(handle_t const& handle,
-                            sparse_matrix_t<index_type_t, value_type_t> const* A,
-                            index_type_t* iter,
-                            index_type_t maxIter,
-                            value_type_t shift,
-                            value_type_t tol,
-                            bool reorthogonalize,
-                            value_type_t* __restrict__ alpha_host,
-                            value_type_t* __restrict__ beta_host,
-                            value_type_t* __restrict__ lanczosVecs_dev,
-                            value_type_t* __restrict__ work_dev)
-{
+int performLanczosIteration(
+  handle_t const &handle, sparse_matrix_t<index_type_t, value_type_t> const *A,
+  index_type_t *iter, index_type_t maxIter, value_type_t shift,
+  value_type_t tol, bool reorthogonalize, value_type_t *__restrict__ alpha_host,
+  value_type_t *__restrict__ beta_host,
+  value_type_t *__restrict__ lanczosVecs_dev,
+  value_type_t *__restrict__ work_dev) {
   // -------------------------------------------------------
   // Variable declaration
   // -------------------------------------------------------
 
   // Useful variables
-  constexpr value_type_t one    = 1;
+  constexpr value_type_t one = 1;
   constexpr value_type_t negOne = -1;
-  constexpr value_type_t zero   = 0;
+  constexpr value_type_t zero = 0;
   value_type_t alpha;
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream   = handle.get_stream();
+  auto stream = handle.get_stream();
 
   RAFT_EXPECTS(A != nullptr, "Null matrix pointer.");
 
@@ -122,28 +117,29 @@ int performLanczosIteration(handle_t const& handle,
 
     // Apply matrix
     if (shift != 0)
-      CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n,
-                               lanczosVecs_dev,
+      CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, lanczosVecs_dev,
                                n * sizeof(value_type_t),
-                               cudaMemcpyDeviceToDevice,
-                               stream));
+                               cudaMemcpyDeviceToDevice, stream));
     A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev + n);
 
     // Orthogonalize Lanczos vector
-    CUBLAS_CHECK(cublasdot(
-      cublas_h, n, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, stream));
+    CUBLAS_CHECK(cublasdot(cublas_h, n, lanczosVecs_dev, 1,
+                           lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host,
+                           stream));
 
     alpha = -alpha_host[0];
-    CUBLAS_CHECK(cublasaxpy(
-      cublas_h, n, &alpha, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, stream));
-    CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, beta_host, stream));
+    CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, lanczosVecs_dev, 1,
+                            lanczosVecs_dev + IDX(0, 1, n), 1, stream));
+    CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1,
+                            beta_host, stream));
 
     // Check if Lanczos has converged
     if (beta_host[0] <= tol) return 0;
 
     // Normalize Lanczos vector
     alpha = 1 / beta_host[0];
-    CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), 1, stream));
+    CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n),
+                            1, stream));
   }
 
   // -------------------------------------------------------
@@ -155,121 +151,65 @@ int performLanczosIteration(handle_t const& handle,
 
     // Apply matrix
     if (shift != 0)
-      CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + (*iter) * n,
-                               lanczosVecs_dev + (*iter - 1) * n,
-                               n * sizeof(value_type_t),
-                               cudaMemcpyDeviceToDevice,
-                               stream));
-    A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, lanczosVecs_dev + IDX(0, *iter, n));
+      CUDA_TRY(cudaMemcpyAsync(
+        lanczosVecs_dev + (*iter) * n, lanczosVecs_dev + (*iter - 1) * n,
+        n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream));
+    A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift,
+          lanczosVecs_dev + IDX(0, *iter, n));
 
     // Full reorthogonalization
     //   "Twice is enough" algorithm per Kahan and Parlett
     if (reorthogonalize) {
-      CUBLAS_CHECK(cublasgemv(cublas_h,
-                              CUBLAS_OP_T,
-                              n,
-                              *iter,
-                              &one,
-                              lanczosVecs_dev,
-                              n,
-                              lanczosVecs_dev + IDX(0, *iter, n),
-                              1,
-                              &zero,
-                              work_dev,
-                              1,
-                              stream));
-
-      CUBLAS_CHECK(cublasgemv(cublas_h,
-                              CUBLAS_OP_N,
-                              n,
-                              *iter,
-                              &negOne,
-                              lanczosVecs_dev,
-                              n,
-                              work_dev,
-                              1,
-                              &one,
-                              lanczosVecs_dev + IDX(0, *iter, n),
-                              1,
-                              stream));
-
-      CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1),
-                               work_dev + (*iter - 1),
-                               sizeof(value_type_t),
-                               cudaMemcpyDeviceToHost,
+      CUBLAS_CHECK(cublasgemv(
+        cublas_h, CUBLAS_OP_T, n, *iter, &one, lanczosVecs_dev, n,
+        lanczosVecs_dev + IDX(0, *iter, n), 1, &zero, work_dev, 1, stream));
+
+      CUBLAS_CHECK(cublasgemv(cublas_h, CUBLAS_OP_N, n, *iter, &negOne,
+                              lanczosVecs_dev, n, work_dev, 1, &one,
+                              lanczosVecs_dev + IDX(0, *iter, n), 1, stream));
+
+      CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), work_dev + (*iter - 1),
+                               sizeof(value_type_t), cudaMemcpyDeviceToHost,
                                stream));
 
-      CUBLAS_CHECK(cublasgemv(cublas_h,
-                              CUBLAS_OP_T,
-                              n,
-                              *iter,
-                              &one,
-                              lanczosVecs_dev,
-                              n,
-                              lanczosVecs_dev + IDX(0, *iter, n),
-                              1,
-                              &zero,
-                              work_dev,
-                              1,
-                              stream));
-
-      CUBLAS_CHECK(cublasgemv(cublas_h,
-                              CUBLAS_OP_N,
-                              n,
-                              *iter,
-                              &negOne,
-                              lanczosVecs_dev,
-                              n,
-                              work_dev,
-                              1,
-                              &one,
-                              lanczosVecs_dev + IDX(0, *iter, n),
-                              1,
-                              stream));
+      CUBLAS_CHECK(cublasgemv(
+        cublas_h, CUBLAS_OP_T, n, *iter, &one, lanczosVecs_dev, n,
+        lanczosVecs_dev + IDX(0, *iter, n), 1, &zero, work_dev, 1, stream));
+
+      CUBLAS_CHECK(cublasgemv(cublas_h, CUBLAS_OP_N, n, *iter, &negOne,
+                              lanczosVecs_dev, n, work_dev, 1, &one,
+                              lanczosVecs_dev + IDX(0, *iter, n), 1, stream));
     }
 
     // Orthogonalization with 3-term recurrence relation
     else {
-      CUBLAS_CHECK(cublasdot(cublas_h,
-                             n,
-                             lanczosVecs_dev + IDX(0, *iter - 1, n),
-                             1,
-                             lanczosVecs_dev + IDX(0, *iter, n),
-                             1,
-                             alpha_host + (*iter - 1),
-                             stream));
+      CUBLAS_CHECK(cublasdot(cublas_h, n,
+                             lanczosVecs_dev + IDX(0, *iter - 1, n), 1,
+                             lanczosVecs_dev + IDX(0, *iter, n), 1,
+                             alpha_host + (*iter - 1), stream));
 
       auto alpha = -alpha_host[*iter - 1];
-      CUBLAS_CHECK(cublasaxpy(cublas_h,
-                              n,
-                              &alpha,
-                              lanczosVecs_dev + IDX(0, *iter - 1, n),
-                              1,
-                              lanczosVecs_dev + IDX(0, *iter, n),
-                              1,
-                              stream));
+      CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha,
+                              lanczosVecs_dev + IDX(0, *iter - 1, n), 1,
+                              lanczosVecs_dev + IDX(0, *iter, n), 1, stream));
 
       alpha = -beta_host[*iter - 2];
-      CUBLAS_CHECK(cublasaxpy(cublas_h,
-                              n,
-                              &alpha,
-                              lanczosVecs_dev + IDX(0, *iter - 2, n),
-                              1,
-                              lanczosVecs_dev + IDX(0, *iter, n),
-                              1,
-                              stream));
+      CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha,
+                              lanczosVecs_dev + IDX(0, *iter - 2, n), 1,
+                              lanczosVecs_dev + IDX(0, *iter, n), 1, stream));
     }
 
     // Compute residual
-    CUBLAS_CHECK(cublasnrm2(
-      cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, beta_host + *iter - 1, stream));
+    CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1,
+                            beta_host + *iter - 1, stream));
 
     // Check if Lanczos has converged
     if (beta_host[*iter - 1] <= tol) break;
 
     // Normalize Lanczos vector
     alpha = 1 / beta_host[*iter - 1];
-    CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, *iter, n), 1, stream));
+    CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha,
+                            lanczosVecs_dev + IDX(0, *iter, n), 1, stream));
   }
 
   CUDA_TRY(cudaStreamSynchronize(stream));
@@ -277,7 +217,7 @@ int performLanczosIteration(handle_t const& handle,
   return 0;
 }
 
-/**
+/** 
  *  @brief  Find Householder transform for 3-dimensional system
  *    Given an input vector v=[x,y,z]', this function finds a
  *    Householder transform P such that P*v is a multiple of
@@ -295,8 +235,8 @@ int performLanczosIteration(handle_t const& handle,
  *    matrix. Matrix dimensions are 3 x 3.
  */
 template <typename index_type_t, typename value_type_t>
-static void findHouseholder3(value_type_t* v, value_type_t* Pv, value_type_t* P)
-{
+static void findHouseholder3(value_type_t *v, value_type_t *Pv,
+                             value_type_t *P) {
   // Compute norm of vector
   *Pv = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
 
@@ -306,7 +246,8 @@ static void findHouseholder3(value_type_t* v, value_type_t* Pv, value_type_t* P)
   v[0] -= *Pv;
 
   // Normalize Householder vector
-  value_type_t normHouseholder = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
+  value_type_t normHouseholder =
+    std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
   if (normHouseholder != 0) {
     v[0] /= normHouseholder;
     v[1] /= normHouseholder;
@@ -320,13 +261,11 @@ static void findHouseholder3(value_type_t* v, value_type_t* Pv, value_type_t* P)
   // Construct Householder matrix
   index_type_t i, j;
   for (j = 0; j < 3; ++j)
-    for (i = 0; i < 3; ++i)
-      P[IDX(i, j, 3)] = -2 * v[i] * v[j];
-  for (i = 0; i < 3; ++i)
-    P[IDX(i, i, 3)] += 1;
+    for (i = 0; i < 3; ++i) P[IDX(i, j, 3)] = -2 * v[i] * v[j];
+  for (i = 0; i < 3; ++i) P[IDX(i, i, 3)] += 1;
 }
 
-/**
+/**  
  *  @brief  Apply 3-dimensional Householder transform to 4 x 4 matrix
  *    The Householder transform is pre-applied to the top three rows
  *  of the matrix and post-applied to the left three columns. The
@@ -338,8 +277,7 @@ static void findHouseholder3(value_type_t* v, value_type_t* Pv, value_type_t* P)
  *  @param A (Input/output, host memory, 16 entries) 4 x 4 matrix.
  */
 template <typename index_type_t, typename value_type_t>
-static void applyHouseholder3(const value_type_t* v, value_type_t* A)
-{
+static void applyHouseholder3(const value_type_t *v, value_type_t *A) {
   // Loop indices
   index_type_t i, j;
   // Dot product between Householder vector and matrix row/column
@@ -348,23 +286,19 @@ static void applyHouseholder3(const value_type_t* v, value_type_t* A)
   // Pre-apply Householder transform
   for (j = 0; j < 4; ++j) {
     vDotA = 0;
-    for (i = 0; i < 3; ++i)
-      vDotA += v[i] * A[IDX(i, j, 4)];
-    for (i = 0; i < 3; ++i)
-      A[IDX(i, j, 4)] -= 2 * v[i] * vDotA;
+    for (i = 0; i < 3; ++i) vDotA += v[i] * A[IDX(i, j, 4)];
+    for (i = 0; i < 3; ++i) A[IDX(i, j, 4)] -= 2 * v[i] * vDotA;
   }
 
   // Post-apply Householder transform
   for (i = 0; i < 4; ++i) {
     vDotA = 0;
-    for (j = 0; j < 3; ++j)
-      vDotA += A[IDX(i, j, 4)] * v[j];
-    for (j = 0; j < 3; ++j)
-      A[IDX(i, j, 4)] -= 2 * vDotA * v[j];
+    for (j = 0; j < 3; ++j) vDotA += A[IDX(i, j, 4)] * v[j];
+    for (j = 0; j < 3; ++j) A[IDX(i, j, 4)] -= 2 * vDotA * v[j];
   }
 }
 
-/**
+/**  
  *  @brief  Perform one step of Francis QR algorithm
  *    Equivalent to two steps of the classical QR algorithm on a
  *    tridiagonal matrix.
@@ -385,14 +319,10 @@ static void applyHouseholder3(const value_type_t* v, value_type_t* A)
  *  @return Zero if successful. Otherwise non-zero.
  */
 template <typename index_type_t, typename value_type_t>
-static int francisQRIteration(index_type_t n,
-                              value_type_t shift1,
-                              value_type_t shift2,
-                              value_type_t* alpha,
-                              value_type_t* beta,
-                              value_type_t* V,
-                              value_type_t* work)
-{
+static int francisQRIteration(index_type_t n, value_type_t shift1,
+                              value_type_t shift2, value_type_t *alpha,
+                              value_type_t *beta, value_type_t *V,
+                              value_type_t *work) {
   // -------------------------------------------------------
   // Variable declaration
   // -------------------------------------------------------
@@ -422,30 +352,30 @@ static int francisQRIteration(index_type_t n,
   householder[0] = alpha[0] * alpha[0] + beta[0] * beta[0] + b * alpha[0] + c;
   householder[1] = beta[0] * (alpha[0] + alpha[1] + b);
   householder[2] = beta[0] * beta[1];
-  findHouseholder3<index_type_t, value_type_t>(householder, &temp, householderMatrix);
+  findHouseholder3<index_type_t, value_type_t>(householder, &temp,
+                                               householderMatrix);
 
   // Apply initial Householder transform to create bulge
   memset(bulge, 0, 16 * sizeof(value_type_t));
-  for (i = 0; i < 4; ++i)
-    bulge[IDX(i, i, 4)] = alpha[i];
+  for (i = 0; i < 4; ++i) bulge[IDX(i, i, 4)] = alpha[i];
   for (i = 0; i < 3; ++i) {
     bulge[IDX(i + 1, i, 4)] = beta[i];
     bulge[IDX(i, i + 1, 4)] = beta[i];
   }
   applyHouseholder3<index_type_t, value_type_t>(householder, bulge);
-  Lapack<value_type_t>::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, 3, 0, work, n);
+  Lapack<value_type_t>::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix,
+                             3, 0, work, n);
   memcpy(V, work, 3 * n * sizeof(value_type_t));
 
   // Chase bulge to bottom-right of matrix with Householder transforms
   for (pos = 0; pos < n - 4; ++pos) {
     // Move to next position
-    alpha[pos]     = bulge[IDX(0, 0, 4)];
+    alpha[pos] = bulge[IDX(0, 0, 4)];
     householder[0] = bulge[IDX(1, 0, 4)];
     householder[1] = bulge[IDX(2, 0, 4)];
     householder[2] = bulge[IDX(3, 0, 4)];
     for (j = 0; j < 3; ++j)
-      for (i = 0; i < 3; ++i)
-        bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)];
+      for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)];
     bulge[IDX(3, 0, 4)] = 0;
     bulge[IDX(3, 1, 4)] = 0;
     bulge[IDX(3, 2, 4)] = beta[pos + 3];
@@ -455,22 +385,22 @@ static int francisQRIteration(index_type_t n,
     bulge[IDX(3, 3, 4)] = alpha[pos + 4];
 
     // Apply Householder transform
-    findHouseholder3<index_type_t, value_type_t>(householder, beta + pos, householderMatrix);
+    findHouseholder3<index_type_t, value_type_t>(householder, beta + pos,
+                                                 householderMatrix);
     applyHouseholder3<index_type_t, value_type_t>(householder, bulge);
-    Lapack<value_type_t>::gemm(
-      false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), n, householderMatrix, 3, 0, work, n);
+    Lapack<value_type_t>::gemm(false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n),
+                               n, householderMatrix, 3, 0, work, n);
     memcpy(V + IDX(0, pos + 1, n), work, 3 * n * sizeof(value_type_t));
   }
 
   // Apply penultimate Householder transform
   //   Values in the last row and column are zero
-  alpha[n - 4]   = bulge[IDX(0, 0, 4)];
+  alpha[n - 4] = bulge[IDX(0, 0, 4)];
   householder[0] = bulge[IDX(1, 0, 4)];
   householder[1] = bulge[IDX(2, 0, 4)];
   householder[2] = bulge[IDX(3, 0, 4)];
   for (j = 0; j < 3; ++j)
-    for (i = 0; i < 3; ++i)
-      bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)];
+    for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)];
   bulge[IDX(3, 0, 4)] = 0;
   bulge[IDX(3, 1, 4)] = 0;
   bulge[IDX(3, 2, 4)] = 0;
@@ -478,36 +408,37 @@ static int francisQRIteration(index_type_t n,
   bulge[IDX(1, 3, 4)] = 0;
   bulge[IDX(2, 3, 4)] = 0;
   bulge[IDX(3, 3, 4)] = 0;
-  findHouseholder3<index_type_t, value_type_t>(householder, beta + n - 4, householderMatrix);
+  findHouseholder3<index_type_t, value_type_t>(householder, beta + n - 4,
+                                               householderMatrix);
   applyHouseholder3<index_type_t, value_type_t>(householder, bulge);
-  Lapack<value_type_t>::gemm(
-    false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, householderMatrix, 3, 0, work, n);
+  Lapack<value_type_t>::gemm(false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n,
+                             householderMatrix, 3, 0, work, n);
   memcpy(V + IDX(0, n - 3, n), work, 3 * n * sizeof(value_type_t));
 
   // Apply final Householder transform
   //   Values in the last two rows and columns are zero
-  alpha[n - 3]   = bulge[IDX(0, 0, 4)];
+  alpha[n - 3] = bulge[IDX(0, 0, 4)];
   householder[0] = bulge[IDX(1, 0, 4)];
   householder[1] = bulge[IDX(2, 0, 4)];
   householder[2] = 0;
   for (j = 0; j < 3; ++j)
-    for (i = 0; i < 3; ++i)
-      bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)];
-  findHouseholder3<index_type_t, value_type_t>(householder, beta + n - 3, householderMatrix);
+    for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)];
+  findHouseholder3<index_type_t, value_type_t>(householder, beta + n - 3,
+                                               householderMatrix);
   applyHouseholder3<index_type_t, value_type_t>(householder, bulge);
-  Lapack<value_type_t>::gemm(
-    false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, householderMatrix, 3, 0, work, n);
+  Lapack<value_type_t>::gemm(false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n,
+                             householderMatrix, 3, 0, work, n);
   memcpy(V + IDX(0, n - 2, n), work, 2 * n * sizeof(value_type_t));
 
   // Bulge has been eliminated
   alpha[n - 2] = bulge[IDX(0, 0, 4)];
   alpha[n - 1] = bulge[IDX(1, 1, 4)];
-  beta[n - 2]  = bulge[IDX(1, 0, 4)];
+  beta[n - 2] = bulge[IDX(1, 0, 4)];
 
   return 0;
 }
 
-/**
+/**  
  *  @brief  Perform implicit restart of Lanczos algorithm
  *    Shifts are Chebyshev nodes of unwanted region of matrix spectrum.
  *  @tparam index_type_t the type of data used for indexing.
@@ -543,30 +474,23 @@ static int francisQRIteration(index_type_t n,
  *  @return error flag.
  */
 template <typename index_type_t, typename value_type_t>
-static int lanczosRestart(handle_t const& handle,
-                          index_type_t n,
-                          index_type_t iter,
-                          index_type_t iter_new,
-                          value_type_t* shiftUpper,
-                          value_type_t* shiftLower,
-                          value_type_t* __restrict__ alpha_host,
-                          value_type_t* __restrict__ beta_host,
-                          value_type_t* __restrict__ V_host,
-                          value_type_t* __restrict__ work_host,
-                          value_type_t* __restrict__ lanczosVecs_dev,
-                          value_type_t* __restrict__ work_dev,
-                          bool smallest_eig)
-{
+static int lanczosRestart(
+  handle_t const &handle, index_type_t n, index_type_t iter,
+  index_type_t iter_new, value_type_t *shiftUpper, value_type_t *shiftLower,
+  value_type_t *__restrict__ alpha_host, value_type_t *__restrict__ beta_host,
+  value_type_t *__restrict__ V_host, value_type_t *__restrict__ work_host,
+  value_type_t *__restrict__ lanczosVecs_dev,
+  value_type_t *__restrict__ work_dev, bool smallest_eig) {
   // -------------------------------------------------------
   // Variable declaration
   // -------------------------------------------------------
 
   // Useful constants
   constexpr value_type_t zero = 0;
-  constexpr value_type_t one  = 1;
+  constexpr value_type_t one = 1;
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream   = handle.get_stream();
+  auto stream = handle.get_stream();
 
   // Loop index
   index_type_t i;
@@ -577,12 +501,12 @@ static int lanczosRestart(handle_t const& handle,
   index_type_t restartSteps = iter - iter_new;
 
   // Ritz values from Lanczos method
-  value_type_t* ritzVals_host = work_host + 3 * iter;
+  value_type_t *ritzVals_host = work_host + 3 * iter;
   // Shifts for implicit restart
-  value_type_t* shifts_host;
+  value_type_t *shifts_host;
 
   // Orthonormal matrix for similarity transform
-  value_type_t* V_dev = work_dev + n * iter;
+  value_type_t *V_dev = work_dev + n * iter;
 
   // -------------------------------------------------------
   // Implementation
@@ -600,8 +524,7 @@ static int lanczosRestart(handle_t const& handle,
 
   // Initialize similarity transform with identity matrix
   memset(V_host, 0, iter * iter * sizeof(value_type_t));
-  for (i = 0; i < iter; ++i)
-    V_host[IDX(i, i, iter)] = 1;
+  for (i = 0; i < iter; ++i) V_host[IDX(i, i, iter)] = 1;
 
   // Determine interval to suppress eigenvalues
   if (smallest_eig) {
@@ -625,71 +548,49 @@ static int lanczosRestart(handle_t const& handle,
   // Calculate Chebyshev nodes as shifts
   shifts_host = ritzVals_host;
   for (i = 0; i < restartSteps; ++i) {
-    shifts_host[i] = cos((i + 0.5) * static_cast<value_type_t>(M_PI) / restartSteps);
+    shifts_host[i] =
+      cos((i + 0.5) * static_cast<value_type_t>(M_PI) / restartSteps);
     shifts_host[i] *= 0.5 * ((*shiftUpper) - (*shiftLower));
     shifts_host[i] += 0.5 * ((*shiftUpper) + (*shiftLower));
   }
 
   // Apply Francis QR algorithm to implicitly restart Lanczos
   for (i = 0; i < restartSteps; i += 2)
-    if (francisQRIteration(
-          iter, shifts_host[i], shifts_host[i + 1], alpha_host, beta_host, V_host, work_host))
+    if (francisQRIteration(iter, shifts_host[i], shifts_host[i + 1], alpha_host,
+                           beta_host, V_host, work_host))
       WARNING("error in implicitly shifted QR algorithm");
 
   // Obtain new residual
-  CUDA_TRY(cudaMemcpyAsync(
-    V_dev, V_host, iter * iter * sizeof(value_type_t), cudaMemcpyHostToDevice, stream));
-
-  beta_host[iter - 1] = beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)];
-  CUBLAS_CHECK(cublasgemv(cublas_h,
-                          CUBLAS_OP_N,
-                          n,
-                          iter,
-                          beta_host + iter_new - 1,
-                          lanczosVecs_dev,
-                          n,
-                          V_dev + IDX(0, iter_new, iter),
-                          1,
-                          beta_host + iter - 1,
-                          lanczosVecs_dev + IDX(0, iter, n),
-                          1,
-                          stream));
+  CUDA_TRY(cudaMemcpyAsync(V_dev, V_host, iter * iter * sizeof(value_type_t),
+                           cudaMemcpyHostToDevice, stream));
+
+  beta_host[iter - 1] =
+    beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)];
+  CUBLAS_CHECK(cublasgemv(
+    cublas_h, CUBLAS_OP_N, n, iter, beta_host + iter_new - 1, lanczosVecs_dev,
+    n, V_dev + IDX(0, iter_new, iter), 1, beta_host + iter - 1,
+    lanczosVecs_dev + IDX(0, iter, n), 1, stream));
 
   // Obtain new Lanczos vectors
-  CUBLAS_CHECK(cublasgemm(cublas_h,
-                          CUBLAS_OP_N,
-                          CUBLAS_OP_N,
-                          n,
-                          iter_new,
-                          iter,
-                          &one,
-                          lanczosVecs_dev,
-                          n,
-                          V_dev,
-                          iter,
-                          &zero,
-                          work_dev,
-                          n,
-                          stream));
-
-  CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev,
-                           work_dev,
+  CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, iter_new, iter,
+                          &one, lanczosVecs_dev, n, V_dev, iter, &zero,
+                          work_dev, n, stream));
+
+  CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, work_dev,
                            n * iter_new * sizeof(value_type_t),
-                           cudaMemcpyDeviceToDevice,
-                           stream));
+                           cudaMemcpyDeviceToDevice, stream));
 
   // Normalize residual to obtain new Lanczos vector
-  CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + IDX(0, iter_new, n),
-                           lanczosVecs_dev + IDX(0, iter, n),
-                           n * sizeof(value_type_t),
-                           cudaMemcpyDeviceToDevice,
-                           stream));
+  CUDA_TRY(cudaMemcpyAsync(
+    lanczosVecs_dev + IDX(0, iter_new, n), lanczosVecs_dev + IDX(0, iter, n),
+    n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream));
 
-  CUBLAS_CHECK(cublasnrm2(
-    cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1, beta_host + iter_new - 1, stream));
+  CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1,
+                          beta_host + iter_new - 1, stream));
 
   auto h_beta = 1 / beta_host[iter_new - 1];
-  CUBLAS_CHECK(cublasscal(cublas_h, n, &h_beta, lanczosVecs_dev + IDX(0, iter_new, n), 1, stream));
+  CUBLAS_CHECK(cublasscal(cublas_h, n, &h_beta,
+                          lanczosVecs_dev + IDX(0, iter_new, n), 1, stream));
 
   return 0;
 }
@@ -700,7 +601,7 @@ static int lanczosRestart(handle_t const& handle,
 // Eigensolver
 // =========================================================
 
-/**
+/**  
  * @brief  Compute smallest eigenvectors of symmetric matrix
  *    Computes eigenvalues and eigenvectors that are least
  *    positive. If matrix is positive definite or positive
@@ -750,28 +651,19 @@ static int lanczosRestart(handle_t const& handle,
  *  @return error flag.
  */
 template <typename index_type_t, typename value_type_t>
-int computeSmallestEigenvectors(handle_t const& handle,
-                                sparse_matrix_t<index_type_t, value_type_t> const* A,
-                                index_type_t nEigVecs,
-                                index_type_t maxIter,
-                                index_type_t restartIter,
-                                value_type_t tol,
-                                bool reorthogonalize,
-                                index_type_t* effIter,
-                                index_type_t* totalIter,
-                                value_type_t* shift,
-                                value_type_t* __restrict__ alpha_host,
-                                value_type_t* __restrict__ beta_host,
-                                value_type_t* __restrict__ lanczosVecs_dev,
-                                value_type_t* __restrict__ work_dev,
-                                value_type_t* __restrict__ eigVals_dev,
-                                value_type_t* __restrict__ eigVecs_dev,
-                                unsigned long long seed)
-{
+int computeSmallestEigenvectors(
+  handle_t const &handle, sparse_matrix_t<index_type_t, value_type_t> const *A,
+  index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter,
+  value_type_t tol, bool reorthogonalize, index_type_t *effIter,
+  index_type_t *totalIter, value_type_t *shift,
+  value_type_t *__restrict__ alpha_host, value_type_t *__restrict__ beta_host,
+  value_type_t *__restrict__ lanczosVecs_dev,
+  value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev,
+  value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) {
   using namespace spectral;
 
   // Useful constants
-  constexpr value_type_t one  = 1;
+  constexpr value_type_t one = 1;
   constexpr value_type_t zero = 0;
 
   // Matrix dimension
@@ -791,20 +683,21 @@ int computeSmallestEigenvectors(handle_t const& handle,
   index_type_t i;
 
   // Host memory
-  value_type_t* Z_host;     // Eigenvectors in Lanczos basis
-  value_type_t* work_host;  // Workspace
+  value_type_t *Z_host;     // Eigenvectors in Lanczos basis
+  value_type_t *work_host;  // Workspace
 
   // -------------------------------------------------------
   // Check that parameters are valid
   // -------------------------------------------------------
-  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors.");
+  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n,
+               "Invalid number of eigenvectors.");
   RAFT_EXPECTS(restartIter > 0, "Invalid restartIter.");
   RAFT_EXPECTS(tol > 0, "Invalid tolerance.");
   RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter.");
   RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter.");
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream   = handle.get_stream();
+  auto stream = handle.get_stream();
 
   // -------------------------------------------------------
   // Variable initialization
@@ -817,11 +710,12 @@ int computeSmallestEigenvectors(handle_t const& handle,
   std::vector<value_type_t> Z_host_v(restartIter * restartIter);
   std::vector<value_type_t> work_host_v(4 * restartIter);
 
-  Z_host    = Z_host_v.data();
+  Z_host = Z_host_v.data();
   work_host = work_host_v.data();
 
   // Initialize cuBLAS
-  CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+  CUBLAS_CHECK(
+    cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
 
   // -------------------------------------------------------
   // Compute largest eigenvalue to determine shift
@@ -844,18 +738,10 @@ int computeSmallestEigenvectors(handle_t const& handle,
 
   // Obtain tridiagonal matrix with Lanczos
   *effIter = 0;
-  *shift   = 0;
-  status   = performLanczosIteration<index_type_t, value_type_t>(handle,
-                                                               A,
-                                                               effIter,
-                                                               maxIter_curr,
-                                                               *shift,
-                                                               0.0,
-                                                               reorthogonalize,
-                                                               alpha_host,
-                                                               beta_host,
-                                                               lanczosVecs_dev,
-                                                               work_dev);
+  *shift = 0;
+  status = performLanczosIteration<index_type_t, value_type_t>(
+    handle, A, effIter, maxIter_curr, *shift, 0.0, reorthogonalize, alpha_host,
+    beta_host, lanczosVecs_dev, work_dev);
   if (status) WARNING("error in Lanczos iteration");
 
   // Determine largest eigenvalue
@@ -870,17 +756,9 @@ int computeSmallestEigenvectors(handle_t const& handle,
   // Obtain tridiagonal matrix with Lanczos
   *effIter = 0;
 
-  status = performLanczosIteration<index_type_t, value_type_t>(handle,
-                                                               A,
-                                                               effIter,
-                                                               maxIter_curr,
-                                                               *shift,
-                                                               0,
-                                                               reorthogonalize,
-                                                               alpha_host,
-                                                               beta_host,
-                                                               lanczosVecs_dev,
-                                                               work_dev);
+  status = performLanczosIteration<index_type_t, value_type_t>(
+    handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host,
+    beta_host, lanczosVecs_dev, work_dev);
   if (status) WARNING("error in Lanczos iteration");
   *totalIter += *effIter;
 
@@ -897,19 +775,9 @@ int computeSmallestEigenvectors(handle_t const& handle,
     if (iter_new == *effIter) break;
 
     // Implicit restart of Lanczos method
-    status = lanczosRestart<index_type_t, value_type_t>(handle,
-                                                        n,
-                                                        *effIter,
-                                                        iter_new,
-                                                        &shiftUpper,
-                                                        &shiftLower,
-                                                        alpha_host,
-                                                        beta_host,
-                                                        Z_host,
-                                                        work_host,
-                                                        lanczosVecs_dev,
-                                                        work_dev,
-                                                        true);
+    status = lanczosRestart<index_type_t, value_type_t>(
+      handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host,
+      beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, true);
     if (status) WARNING("error in Lanczos implicit restart");
     *effIter = iter_new;
 
@@ -918,17 +786,9 @@ int computeSmallestEigenvectors(handle_t const& handle,
 
     // Proceed with Lanczos method
 
-    status = performLanczosIteration<index_type_t, value_type_t>(handle,
-                                                                 A,
-                                                                 effIter,
-                                                                 maxIter_curr,
-                                                                 *shift,
-                                                                 tol * fabs(shiftLower),
-                                                                 reorthogonalize,
-                                                                 alpha_host,
-                                                                 beta_host,
-                                                                 lanczosVecs_dev,
-                                                                 work_dev);
+    status = performLanczosIteration<index_type_t, value_type_t>(
+      handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower),
+      reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev);
     if (status) WARNING("error in Lanczos iteration");
     *totalIter += *effIter - iter_new;
   }
@@ -939,59 +799,39 @@ int computeSmallestEigenvectors(handle_t const& handle,
   }
 
   // Solve tridiagonal system
-  memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(value_type_t));
-  memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(value_type_t));
-  Lapack<value_type_t>::steqr('I',
-                              *effIter,
-                              work_host + 2 * (*effIter),
-                              work_host + 3 * (*effIter),
-                              Z_host,
-                              *effIter,
+  memcpy(work_host + 2 * (*effIter), alpha_host,
+         (*effIter) * sizeof(value_type_t));
+  memcpy(work_host + 3 * (*effIter), beta_host,
+         (*effIter - 1) * sizeof(value_type_t));
+  Lapack<value_type_t>::steqr('I', *effIter, work_host + 2 * (*effIter),
+                              work_host + 3 * (*effIter), Z_host, *effIter,
                               work_host);
 
   // Obtain desired eigenvalues by applying shift
-  for (i = 0; i < *effIter; ++i)
-    work_host[i + 2 * (*effIter)] -= *shift;
-  for (i = *effIter; i < nEigVecs; ++i)
-    work_host[i + 2 * (*effIter)] = 0;
+  for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift;
+  for (i = *effIter; i < nEigVecs; ++i) work_host[i + 2 * (*effIter)] = 0;
 
   // Copy results to device memory
-  CUDA_TRY(cudaMemcpyAsync(eigVals_dev,
-                           work_host + 2 * (*effIter),
+  CUDA_TRY(cudaMemcpyAsync(eigVals_dev, work_host + 2 * (*effIter),
                            nEigVecs * sizeof(value_type_t),
-                           cudaMemcpyHostToDevice,
-                           stream));
+                           cudaMemcpyHostToDevice, stream));
 
-  CUDA_TRY(cudaMemcpyAsync(work_dev,
-                           Z_host,
+  CUDA_TRY(cudaMemcpyAsync(work_dev, Z_host,
                            (*effIter) * nEigVecs * sizeof(value_type_t),
-                           cudaMemcpyHostToDevice,
-                           stream));
+                           cudaMemcpyHostToDevice, stream));
   CHECK_CUDA(stream);
 
   // Convert eigenvectors from Lanczos basis to standard basis
-  CUBLAS_CHECK(cublasgemm(cublas_h,
-                          CUBLAS_OP_N,
-                          CUBLAS_OP_N,
-                          n,
-                          nEigVecs,
-                          *effIter,
-                          &one,
-                          lanczosVecs_dev,
-                          n,
-                          work_dev,
-                          *effIter,
-                          &zero,
-                          eigVecs_dev,
-                          n,
-                          stream));
+  CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs,
+                          *effIter, &one, lanczosVecs_dev, n, work_dev,
+                          *effIter, &zero, eigVecs_dev, n, stream));
 
   // Clean up and exit
   curandDestroyGenerator(randGen);
   return 0;
 }
 
-/**
+/**  
  *  @brief  Compute smallest eigenvectors of symmetric matrix
  *    Computes eigenvalues and eigenvectors that are least
  *    positive. If matrix is positive definite or positive
@@ -1029,25 +869,20 @@ int computeSmallestEigenvectors(handle_t const& handle,
  *  @return error flag.
  */
 template <typename index_type_t, typename value_type_t>
-int computeSmallestEigenvectors(handle_t const& handle,
-                                sparse_matrix_t<index_type_t, value_type_t> const& A,
-                                index_type_t nEigVecs,
-                                index_type_t maxIter,
-                                index_type_t restartIter,
-                                value_type_t tol,
-                                bool reorthogonalize,
-                                index_type_t& iter,
-                                value_type_t* __restrict__ eigVals_dev,
-                                value_type_t* __restrict__ eigVecs_dev,
-                                unsigned long long seed = 1234567)
-{
+int computeSmallestEigenvectors(
+  handle_t const &handle, sparse_matrix_t<index_type_t, value_type_t> const &A,
+  index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter,
+  value_type_t tol, bool reorthogonalize, index_type_t &iter,
+  value_type_t *__restrict__ eigVals_dev,
+  value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 1234567) {
   using namespace spectral;
 
   // Matrix dimension
   index_type_t n = A.nrows_;
 
   // Check that parameters are valid
-  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors.");
+  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n,
+               "Invalid number of eigenvectors.");
   RAFT_EXPECTS(restartIter > 0, "Invalid restartIter.");
   RAFT_EXPECTS(tol > 0, "Invalid tolerance.");
   RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter.");
@@ -1057,8 +892,8 @@ int computeSmallestEigenvectors(handle_t const& handle,
   std::vector<value_type_t> alpha_host_v(restartIter);
   std::vector<value_type_t> beta_host_v(restartIter);
 
-  value_type_t* alpha_host = alpha_host_v.data();
-  value_type_t* beta_host  = beta_host_v.data();
+  value_type_t *alpha_host = alpha_host_v.data();
+  value_type_t *beta_host = beta_host_v.data();
 
   vector_t<value_type_t> lanczosVecs_dev(handle, n * (restartIter + 1));
   vector_t<value_type_t> work_dev(handle, (n + restartIter) * restartIter);
@@ -1066,23 +901,10 @@ int computeSmallestEigenvectors(handle_t const& handle,
   // Perform Lanczos method
   index_type_t effIter;
   value_type_t shift;
-  int status = computeSmallestEigenvectors(handle,
-                                           &A,
-                                           nEigVecs,
-                                           maxIter,
-                                           restartIter,
-                                           tol,
-                                           reorthogonalize,
-                                           &effIter,
-                                           &iter,
-                                           &shift,
-                                           alpha_host,
-                                           beta_host,
-                                           lanczosVecs_dev.raw(),
-                                           work_dev.raw(),
-                                           eigVals_dev,
-                                           eigVecs_dev,
-                                           seed);
+  int status = computeSmallestEigenvectors(
+    handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter,
+    &iter, &shift, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(),
+    eigVals_dev, eigVecs_dev, seed);
 
   // Clean up and return
   return status;
@@ -1092,7 +914,7 @@ int computeSmallestEigenvectors(handle_t const& handle,
 // Eigensolver
 // =========================================================
 
-/**
+/**  
  *  @brief Compute largest eigenvectors of symmetric matrix
  *    Computes eigenvalues and eigenvectors that are least
  *    positive. If matrix is positive definite or positive
@@ -1137,27 +959,19 @@ int computeSmallestEigenvectors(handle_t const& handle,
  *  @return error flag.
  */
 template <typename index_type_t, typename value_type_t>
-int computeLargestEigenvectors(handle_t const& handle,
-                               sparse_matrix_t<index_type_t, value_type_t> const* A,
-                               index_type_t nEigVecs,
-                               index_type_t maxIter,
-                               index_type_t restartIter,
-                               value_type_t tol,
-                               bool reorthogonalize,
-                               index_type_t* effIter,
-                               index_type_t* totalIter,
-                               value_type_t* __restrict__ alpha_host,
-                               value_type_t* __restrict__ beta_host,
-                               value_type_t* __restrict__ lanczosVecs_dev,
-                               value_type_t* __restrict__ work_dev,
-                               value_type_t* __restrict__ eigVals_dev,
-                               value_type_t* __restrict__ eigVecs_dev,
-                               unsigned long long seed)
-{
+int computeLargestEigenvectors(
+  handle_t const &handle, sparse_matrix_t<index_type_t, value_type_t> const *A,
+  index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter,
+  value_type_t tol, bool reorthogonalize, index_type_t *effIter,
+  index_type_t *totalIter, value_type_t *__restrict__ alpha_host,
+  value_type_t *__restrict__ beta_host,
+  value_type_t *__restrict__ lanczosVecs_dev,
+  value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev,
+  value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) {
   using namespace spectral;
 
   // Useful constants
-  constexpr value_type_t one  = 1;
+  constexpr value_type_t one = 1;
   constexpr value_type_t zero = 0;
 
   // Matrix dimension
@@ -1173,8 +987,8 @@ int computeLargestEigenvectors(handle_t const& handle,
   index_type_t i;
 
   // Host memory
-  value_type_t* Z_host;     // Eigenvectors in Lanczos basis
-  value_type_t* work_host;  // Workspace
+  value_type_t *Z_host;     // Eigenvectors in Lanczos basis
+  value_type_t *work_host;  // Workspace
 
   // -------------------------------------------------------
   // Check that LAPACK is enabled
@@ -1184,14 +998,15 @@ int computeLargestEigenvectors(handle_t const& handle,
   // -------------------------------------------------------
   // Check that parameters are valid
   // -------------------------------------------------------
-  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors.");
+  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n,
+               "Invalid number of eigenvectors.");
   RAFT_EXPECTS(restartIter > 0, "Invalid restartIter.");
   RAFT_EXPECTS(tol > 0, "Invalid tolerance.");
   RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter.");
   RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter.");
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream   = handle.get_stream();
+  auto stream = handle.get_stream();
 
   // -------------------------------------------------------
   // Variable initialization
@@ -1204,11 +1019,12 @@ int computeLargestEigenvectors(handle_t const& handle,
   std::vector<value_type_t> Z_host_v(restartIter * restartIter);
   std::vector<value_type_t> work_host_v(4 * restartIter);
 
-  Z_host    = Z_host_v.data();
+  Z_host = Z_host_v.data();
   work_host = work_host_v.data();
 
   // Initialize cuBLAS
-  CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+  CUBLAS_CHECK(
+    cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
 
   // -------------------------------------------------------
   // Compute largest eigenvalue
@@ -1228,21 +1044,13 @@ int computeLargestEigenvectors(handle_t const& handle,
   CUBLAS_CHECK(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream));
 
   // Obtain tridiagonal matrix with Lanczos
-  *effIter               = 0;
+  *effIter = 0;
   value_type_t shift_val = 0.0;
-  value_type_t* shift    = &shift_val;
-
-  status = performLanczosIteration<index_type_t, value_type_t>(handle,
-                                                               A,
-                                                               effIter,
-                                                               maxIter_curr,
-                                                               *shift,
-                                                               0,
-                                                               reorthogonalize,
-                                                               alpha_host,
-                                                               beta_host,
-                                                               lanczosVecs_dev,
-                                                               work_dev);
+  value_type_t *shift = &shift_val;
+
+  status = performLanczosIteration<index_type_t, value_type_t>(
+    handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host,
+    beta_host, lanczosVecs_dev, work_dev);
   if (status) WARNING("error in Lanczos iteration");
   *totalIter += *effIter;
 
@@ -1259,19 +1067,9 @@ int computeLargestEigenvectors(handle_t const& handle,
     if (iter_new == *effIter) break;
 
     // Implicit restart of Lanczos method
-    status = lanczosRestart<index_type_t, value_type_t>(handle,
-                                                        n,
-                                                        *effIter,
-                                                        iter_new,
-                                                        &shiftUpper,
-                                                        &shiftLower,
-                                                        alpha_host,
-                                                        beta_host,
-                                                        Z_host,
-                                                        work_host,
-                                                        lanczosVecs_dev,
-                                                        work_dev,
-                                                        false);
+    status = lanczosRestart<index_type_t, value_type_t>(
+      handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host,
+      beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, false);
     if (status) WARNING("error in Lanczos implicit restart");
     *effIter = iter_new;
 
@@ -1280,17 +1078,9 @@ int computeLargestEigenvectors(handle_t const& handle,
 
     // Proceed with Lanczos method
 
-    status = performLanczosIteration<index_type_t, value_type_t>(handle,
-                                                                 A,
-                                                                 effIter,
-                                                                 maxIter_curr,
-                                                                 *shift,
-                                                                 tol * fabs(shiftLower),
-                                                                 reorthogonalize,
-                                                                 alpha_host,
-                                                                 beta_host,
-                                                                 lanczosVecs_dev,
-                                                                 work_dev);
+    status = performLanczosIteration<index_type_t, value_type_t>(
+      handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower),
+      reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev);
     if (status) WARNING("error in Lanczos iteration");
     *totalIter += *effIter - iter_new;
   }
@@ -1300,18 +1090,15 @@ int computeLargestEigenvectors(handle_t const& handle,
     WARNING("implicitly restarted Lanczos failed to converge");
   }
   for (int i = 0; i < restartIter; ++i) {
-    for (int j = 0; j < restartIter; ++j)
-      Z_host[i * restartIter + j] = 0;
+    for (int j = 0; j < restartIter; ++j) Z_host[i * restartIter + j] = 0;
   }
   // Solve tridiagonal system
-  memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(value_type_t));
-  memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(value_type_t));
-  Lapack<value_type_t>::steqr('I',
-                              *effIter,
-                              work_host + 2 * (*effIter),
-                              work_host + 3 * (*effIter),
-                              Z_host,
-                              *effIter,
+  memcpy(work_host + 2 * (*effIter), alpha_host,
+         (*effIter) * sizeof(value_type_t));
+  memcpy(work_host + 3 * (*effIter), beta_host,
+         (*effIter - 1) * sizeof(value_type_t));
+  Lapack<value_type_t>::steqr('I', *effIter, work_host + 2 * (*effIter),
+                              work_host + 3 * (*effIter), Z_host, *effIter,
                               work_host);
 
   // note: We need to pick the top nEigVecs eigenvalues
@@ -1336,52 +1123,36 @@ int computeLargestEigenvectors(handle_t const& handle,
   //}
 
   // Obtain desired eigenvalues by applying shift
-  for (i = 0; i < *effIter; ++i)
-    work_host[i + 2 * (*effIter)] -= *shift;
+  for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift;
 
   for (i = 0; i < top_eigenparis_idx_offset; ++i)
     work_host[i + 2 * (*effIter)] = 0;
 
   // Copy results to device memory
   // skip smallest eigenvalue if needed
-  CUDA_TRY(cudaMemcpyAsync(eigVals_dev,
-                           work_host + 2 * (*effIter) + top_eigenparis_idx_offset,
-                           nEigVecs * sizeof(value_type_t),
-                           cudaMemcpyHostToDevice,
-                           stream));
+  CUDA_TRY(cudaMemcpyAsync(
+    eigVals_dev, work_host + 2 * (*effIter) + top_eigenparis_idx_offset,
+    nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice, stream));
 
   // skip smallest eigenvector if needed
   CUDA_TRY(cudaMemcpyAsync(work_dev,
                            Z_host + (top_eigenparis_idx_offset * (*effIter)),
                            (*effIter) * nEigVecs * sizeof(value_type_t),
-                           cudaMemcpyHostToDevice,
-                           stream));
+                           cudaMemcpyHostToDevice, stream));
 
   CHECK_CUDA(stream);
 
   // Convert eigenvectors from Lanczos basis to standard basis
-  CUBLAS_CHECK(cublasgemm(cublas_h,
-                          CUBLAS_OP_N,
-                          CUBLAS_OP_N,
-                          n,
-                          nEigVecs,
-                          *effIter,
-                          &one,
-                          lanczosVecs_dev,
-                          n,
-                          work_dev,
-                          *effIter,
-                          &zero,
-                          eigVecs_dev,
-                          n,
-                          stream));
+  CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs,
+                          *effIter, &one, lanczosVecs_dev, n, work_dev,
+                          *effIter, &zero, eigVecs_dev, n, stream));
 
   // Clean up and exit
   curandDestroyGenerator(randGen);
   return 0;
 }
 
-/**
+/**  
  *  @brief  Compute largest eigenvectors of symmetric matrix
  *    Computes eigenvalues and eigenvectors that are least
  *    positive. If matrix is positive definite or positive
@@ -1419,23 +1190,18 @@ int computeLargestEigenvectors(handle_t const& handle,
  *  @return error flag.
  */
 template <typename index_type_t, typename value_type_t>
-int computeLargestEigenvectors(handle_t const& handle,
-                               sparse_matrix_t<index_type_t, value_type_t> const& A,
-                               index_type_t nEigVecs,
-                               index_type_t maxIter,
-                               index_type_t restartIter,
-                               value_type_t tol,
-                               bool reorthogonalize,
-                               index_type_t& iter,
-                               value_type_t* __restrict__ eigVals_dev,
-                               value_type_t* __restrict__ eigVecs_dev,
-                               unsigned long long seed = 123456)
-{
+int computeLargestEigenvectors(
+  handle_t const &handle, sparse_matrix_t<index_type_t, value_type_t> const &A,
+  index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter,
+  value_type_t tol, bool reorthogonalize, index_type_t &iter,
+  value_type_t *__restrict__ eigVals_dev,
+  value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 123456) {
   // Matrix dimension
   index_type_t n = A.nrows_;
 
   // Check that parameters are valid
-  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors.");
+  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n,
+               "Invalid number of eigenvectors.");
   RAFT_EXPECTS(restartIter > 0, "Invalid restartIter.");
   RAFT_EXPECTS(tol > 0, "Invalid tolerance.");
   RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter.");
@@ -1445,30 +1211,18 @@ int computeLargestEigenvectors(handle_t const& handle,
   std::vector<value_type_t> alpha_host_v(restartIter);
   std::vector<value_type_t> beta_host_v(restartIter);
 
-  value_type_t* alpha_host = alpha_host_v.data();
-  value_type_t* beta_host  = beta_host_v.data();
+  value_type_t *alpha_host = alpha_host_v.data();
+  value_type_t *beta_host = beta_host_v.data();
 
   vector_t<value_type_t> lanczosVecs_dev(handle, n * (restartIter + 1));
   vector_t<value_type_t> work_dev(handle, (n + restartIter) * restartIter);
 
   // Perform Lanczos method
   index_type_t effIter;
-  int status = computeLargestEigenvectors(handle,
-                                          &A,
-                                          nEigVecs,
-                                          maxIter,
-                                          restartIter,
-                                          tol,
-                                          reorthogonalize,
-                                          &effIter,
-                                          &iter,
-                                          alpha_host,
-                                          beta_host,
-                                          lanczosVecs_dev.raw(),
-                                          work_dev.raw(),
-                                          eigVals_dev,
-                                          eigVecs_dev,
-                                          seed);
+  int status = computeLargestEigenvectors(
+    handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter,
+    &iter, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(),
+    eigVals_dev, eigVecs_dev, seed);
 
   // Clean up and return
   return status;
diff --git a/cpp/include/raft/linalg/map.cuh b/cpp/include/raft/linalg/map.cuh
index 200818fdc3..aff08da2d3 100644
--- a/cpp/include/raft/linalg/map.cuh
+++ b/cpp/include/raft/linalg/map.cuh
@@ -24,18 +24,21 @@
 namespace raft {
 namespace linalg {
 
-template <typename InType, typename OutType, typename MapOp, int TPB, typename... Args>
-__global__ void mapKernel(OutType* out, size_t len, MapOp map, const InType* in, Args... args)
-{
+template <typename InType, typename OutType, typename MapOp, int TPB,
+          typename... Args>
+__global__ void mapKernel(OutType *out, size_t len, MapOp map, const InType *in,
+                          Args... args) {
   auto idx = (threadIdx.x + (blockIdx.x * blockDim.x));
 
-  if (idx < len) { out[idx] = map(in[idx], args[idx]...); }
+  if (idx < len) {
+    out[idx] = map(in[idx], args[idx]...);
+  }
 }
 
-template <typename InType, typename OutType, typename MapOp, int TPB, typename... Args>
-void mapImpl(
-  OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args)
-{
+template <typename InType, typename OutType, typename MapOp, int TPB,
+          typename... Args>
+void mapImpl(OutType *out, size_t len, MapOp map, cudaStream_t stream,
+             const InType *in, Args... args) {
   const int nblks = raft::ceildiv(len, (size_t)TPB);
   mapKernel<InType, OutType, MapOp, TPB, Args...>
     <<<nblks, TPB, 0, stream>>>(out, len, map, in, args...);
@@ -57,14 +60,12 @@ void mapImpl(
  * @param args additional input arrays
  */
 
-template <typename InType,
-          typename MapOp,
-          int TPB = 256,
-          typename... Args,
+template <typename InType, typename MapOp, int TPB = 256, typename... Args,
           typename OutType = InType>
-void map(OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args)
-{
-  mapImpl<InType, OutType, MapOp, TPB, Args...>(out, len, map, stream, in, args...);
+void map(OutType *out, size_t len, MapOp map, cudaStream_t stream,
+         const InType *in, Args... args) {
+  mapImpl<InType, OutType, MapOp, TPB, Args...>(out, len, map, stream, in,
+                                                args...);
 }
 
 }  // namespace linalg
diff --git a/cpp/include/raft/linalg/map_then_reduce.cuh b/cpp/include/raft/linalg/map_then_reduce.cuh
index 78a7017c5c..f2f198670a 100644
--- a/cpp/include/raft/linalg/map_then_reduce.cuh
+++ b/cpp/include/raft/linalg/map_then_reduce.cuh
@@ -24,66 +24,50 @@
 namespace raft {
 namespace linalg {
 
-struct sum_tag {
-};
+struct sum_tag {};
 
 template <typename InType, typename OutType, int TPB>
-__device__ void reduce(OutType* out, const InType acc, sum_tag)
-{
+__device__ void reduce(OutType *out, const InType acc, sum_tag) {
   typedef cub::BlockReduce<InType, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   OutType tmp = BlockReduce(temp_storage).Sum(acc);
-  if (threadIdx.x == 0) { raft::myAtomicAdd(out, tmp); }
+  if (threadIdx.x == 0) {
+    raft::myAtomicAdd(out, tmp);
+  }
 }
 
 template <typename InType, typename OutType, int TPB, typename ReduceLambda>
-__device__ void reduce(OutType* out, const InType acc, ReduceLambda op)
-{
+__device__ void reduce(OutType *out, const InType acc, ReduceLambda op) {
   typedef cub::BlockReduce<InType, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   OutType tmp = BlockReduce(temp_storage).Reduce(acc, op);
-  if (threadIdx.x == 0) { raft::myAtomicReduce(out, tmp, op); }
+  if (threadIdx.x == 0) {
+    raft::myAtomicReduce(out, tmp, op);
+  }
 }
 
-template <typename InType,
-          typename OutType,
-          typename MapOp,
-          typename ReduceLambda,
-          int TPB,
-          typename... Args>
-__global__ void mapThenReduceKernel(OutType* out,
-                                    size_t len,
-                                    OutType neutral,
-                                    MapOp map,
-                                    ReduceLambda op,
-                                    const InType* in,
-                                    Args... args)
-{
+template <typename InType, typename OutType, typename MapOp,
+          typename ReduceLambda, int TPB, typename... Args>
+__global__ void mapThenReduceKernel(OutType *out, size_t len, OutType neutral,
+                                    MapOp map, ReduceLambda op,
+                                    const InType *in, Args... args) {
   OutType acc = neutral;
-  auto idx    = (threadIdx.x + (blockIdx.x * blockDim.x));
+  auto idx = (threadIdx.x + (blockIdx.x * blockDim.x));
 
-  if (idx < len) { acc = map(in[idx], args[idx]...); }
+  if (idx < len) {
+    acc = map(in[idx], args[idx]...);
+  }
 
   __syncthreads();
 
   reduce<InType, OutType, TPB>(out, acc, op);
 }
 
-template <typename InType,
-          typename OutType,
-          typename MapOp,
-          typename ReduceLambda,
-          int TPB,
-          typename... Args>
-void mapThenReduceImpl(OutType* out,
-                       size_t len,
-                       OutType neutral,
-                       MapOp map,
-                       ReduceLambda op,
-                       cudaStream_t stream,
-                       const InType* in,
-                       Args... args)
-{
+template <typename InType, typename OutType, typename MapOp,
+          typename ReduceLambda, int TPB, typename... Args>
+void mapThenReduceImpl(OutType *out, size_t len, OutType neutral, MapOp map,
+                       ReduceLambda op, cudaStream_t stream, const InType *in,
+                       Args... args) {
   raft::update_device(out, &neutral, 1, stream);
   const int nblks = raft::ceildiv(len, (size_t)TPB);
   mapThenReduceKernel<InType, OutType, MapOp, ReduceLambda, TPB, Args...>
@@ -105,14 +89,10 @@ void mapThenReduceImpl(OutType* out,
  * @param args additional input arrays
  */
 
-template <typename InType,
-          typename MapOp,
-          int TPB = 256,
-          typename... Args,
+template <typename InType, typename MapOp, int TPB = 256, typename... Args,
           typename OutType = InType>
-void mapThenSumReduce(
-  OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args)
-{
+void mapThenSumReduce(OutType *out, size_t len, MapOp map, cudaStream_t stream,
+                      const InType *in, Args... args) {
   mapThenReduceImpl<InType, OutType, MapOp, sum_tag, TPB, Args...>(
     out, len, (OutType)0, map, sum_tag(), stream, in, args...);
 }
@@ -135,21 +115,11 @@ void mapThenSumReduce(
  * @param args additional input arrays
  */
 
-template <typename InType,
-          typename MapOp,
-          typename ReduceLambda,
-          int TPB          = 256,
-          typename OutType = InType,
-          typename... Args>
-void mapThenReduce(OutType* out,
-                   size_t len,
-                   OutType neutral,
-                   MapOp map,
-                   ReduceLambda op,
-                   cudaStream_t stream,
-                   const InType* in,
-                   Args... args)
-{
+template <typename InType, typename MapOp, typename ReduceLambda, int TPB = 256,
+          typename OutType = InType, typename... Args>
+void mapThenReduce(OutType *out, size_t len, OutType neutral, MapOp map,
+                   ReduceLambda op, cudaStream_t stream, const InType *in,
+                   Args... args) {
   mapThenReduceImpl<InType, OutType, MapOp, ReduceLambda, TPB, Args...>(
     out, len, neutral, map, op, stream, in, args...);
 }
diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh
index 98b5eaa809..902816418f 100644
--- a/cpp/include/raft/linalg/matrix_vector_op.cuh
+++ b/cpp/include/raft/linalg/matrix_vector_op.cuh
@@ -23,15 +23,10 @@ namespace raft {
 namespace linalg {
 
 template <typename Type, int veclen_, typename Lambda, typename IdxType>
-__global__ void matrixVectorOpKernel(Type* out,
-                                     const Type* matrix,
-                                     const Type* vector,
-                                     IdxType D,
-                                     IdxType N,
-                                     bool rowMajor,
-                                     bool bcastAlongRows,
-                                     Lambda op)
-{
+__global__ void matrixVectorOpKernel(Type *out, const Type *matrix,
+                                     const Type *vector, IdxType D, IdxType N,
+                                     bool rowMajor, bool bcastAlongRows,
+                                     Lambda op) {
   typedef TxN_t<Type, veclen_> VecType;
   IdxType len = N * D;
   IdxType idx = threadIdx.x;
@@ -62,21 +57,17 @@ __global__ void matrixVectorOpKernel(Type* out,
   mat.store(out, idx);
 }
 
-template <typename Type, int veclen_, typename Lambda, typename IdxType, int TPB>
-void matrixVectorOpImpl(Type* out,
-                        const Type* matrix,
-                        const Type* vec,
-                        IdxType D,
-                        IdxType N,
-                        bool rowMajor,
-                        bool bcastAlongRows,
-                        Lambda op,
-                        cudaStream_t stream)
-{
-  IdxType len   = N * D;
-  IdxType nblks = raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB);
+template <typename Type, int veclen_, typename Lambda, typename IdxType,
+          int TPB>
+void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec,
+                        IdxType D, IdxType N, bool rowMajor,
+                        bool bcastAlongRows, Lambda op, cudaStream_t stream) {
+  IdxType len = N * D;
+  IdxType nblks =
+    raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB);
   matrixVectorOpKernel<Type, veclen_, Lambda, IdxType>
-    <<<nblks, TPB, 0, stream>>>(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op);
+    <<<nblks, TPB, 0, stream>>>(out, matrix, vec, D, N, rowMajor,
+                                bcastAlongRows, op);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -98,18 +89,11 @@ void matrixVectorOpImpl(Type* out,
  * @param stream cuda stream where to launch work
  */
 template <typename Type, typename Lambda, typename IdxType = int, int TPB = 256>
-void matrixVectorOp(Type* out,
-                    const Type* matrix,
-                    const Type* vec,
-                    IdxType D,
-                    IdxType N,
-                    bool rowMajor,
-                    bool bcastAlongRows,
-                    Lambda op,
-                    cudaStream_t stream)
-{
+void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D,
+                    IdxType N, bool rowMajor, bool bcastAlongRows, Lambda op,
+                    cudaStream_t stream) {
   IdxType stride = rowMajor ? D : N;
-  size_t bytes   = stride * sizeof(Type);
+  size_t bytes = stride * sizeof(Type);
   if (16 / sizeof(Type) && bytes % 16 == 0) {
     matrixVectorOpImpl<Type, 16 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream);
@@ -134,16 +118,10 @@ void matrixVectorOp(Type* out,
 ///@todo: come up with a cleaner interface to support these cases in future!
 
 template <typename Type, int veclen_, typename Lambda, typename IdxType>
-__global__ void matrixVectorOpKernel(Type* out,
-                                     const Type* matrix,
-                                     const Type* vector1,
-                                     const Type* vector2,
-                                     IdxType D,
-                                     IdxType N,
-                                     bool rowMajor,
-                                     bool bcastAlongRows,
-                                     Lambda op)
-{
+__global__ void matrixVectorOpKernel(Type *out, const Type *matrix,
+                                     const Type *vector1, const Type *vector2,
+                                     IdxType D, IdxType N, bool rowMajor,
+                                     bool bcastAlongRows, Lambda op) {
   typedef TxN_t<Type, veclen_> VecType;
   IdxType len = N * D;
   IdxType idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * VecType::Ratio;
@@ -176,21 +154,15 @@ __global__ void matrixVectorOpKernel(Type* out,
   mat.store(out, idx);
 }
 
-template <typename Type, int veclen_, typename Lambda, typename IdxType, int TPB>
-void matrixVectorOpImpl(Type* out,
-                        const Type* matrix,
-                        const Type* vec1,
-                        const Type* vec2,
-                        IdxType D,
-                        IdxType N,
-                        bool rowMajor,
-                        bool bcastAlongRows,
-                        Lambda op,
-                        cudaStream_t stream)
-{
+template <typename Type, int veclen_, typename Lambda, typename IdxType,
+          int TPB>
+void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec1,
+                        const Type *vec2, IdxType D, IdxType N, bool rowMajor,
+                        bool bcastAlongRows, Lambda op, cudaStream_t stream) {
   IdxType nblks = raft::ceildiv(N * D, (IdxType)TPB);
   matrixVectorOpKernel<Type, veclen_, Lambda, IdxType>
-    <<<nblks, TPB, 0, stream>>>(out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op);
+    <<<nblks, TPB, 0, stream>>>(out, matrix, vec1, vec2, D, N, rowMajor,
+                                bcastAlongRows, op);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -213,19 +185,11 @@ void matrixVectorOpImpl(Type* out,
  * @param stream cuda stream where to launch work
  */
 template <typename Type, typename Lambda, typename IdxType = int, int TPB = 256>
-void matrixVectorOp(Type* out,
-                    const Type* matrix,
-                    const Type* vec1,
-                    const Type* vec2,
-                    IdxType D,
-                    IdxType N,
-                    bool rowMajor,
-                    bool bcastAlongRows,
-                    Lambda op,
-                    cudaStream_t stream)
-{
+void matrixVectorOp(Type *out, const Type *matrix, const Type *vec1,
+                    const Type *vec2, IdxType D, IdxType N, bool rowMajor,
+                    bool bcastAlongRows, Lambda op, cudaStream_t stream) {
   IdxType stride = rowMajor ? D : N;
-  size_t bytes   = stride * sizeof(Type);
+  size_t bytes = stride * sizeof(Type);
   if (16 / sizeof(Type) && bytes % 16 == 0) {
     matrixVectorOpImpl<Type, 16 / sizeof(Type), Lambda, IdxType, TPB>(
       out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream);
diff --git a/cpp/include/raft/linalg/mean_squared_error.cuh b/cpp/include/raft/linalg/mean_squared_error.cuh
index a3fcc5bac6..9d1538c172 100644
--- a/cpp/include/raft/linalg/mean_squared_error.cuh
+++ b/cpp/include/raft/linalg/mean_squared_error.cuh
@@ -24,7 +24,7 @@ namespace linalg {
 /**
  * @brief CUDA version mean squared error function mean((A-B)**2)
  * @tparam math_t data-type upon which the math operation will be performed
- * @tparam TPB threads-per-block
+ * @tparam TPB threads-per-block 
  * @param out the output mean squared error value (assumed to be a device pointer)
  * @param A input array (assumed to be a device pointer)
  * @param B input array (assumed to be a device pointer)
@@ -33,14 +33,14 @@ namespace linalg {
  * @param stream cuda-stream where to launch this kernel
  */
 template <typename math_t, int TPB = 256>
-void meanSquaredError(
-  math_t* out, const math_t* A, const math_t* B, size_t len, math_t weight, cudaStream_t stream)
-{
+void meanSquaredError(math_t* out, const math_t* A, const math_t* B, size_t len,
+                      math_t weight, cudaStream_t stream) {
   auto sq_diff = [len, weight] __device__(const math_t a, const math_t b) {
     math_t diff = a - b;
     return diff * diff * weight / len;
   };
-  mapThenSumReduce<math_t, decltype(sq_diff), TPB>(out, len, sq_diff, stream, A, B);
+  mapThenSumReduce<math_t, decltype(sq_diff), TPB>(out, len, sq_diff, stream, A,
+                                                   B);
 }
 
 };  // end namespace linalg
diff --git a/cpp/include/raft/linalg/multiply.cuh b/cpp/include/raft/linalg/multiply.cuh
index 53d57ecd00..ce948c927d 100644
--- a/cpp/include/raft/linalg/multiply.cuh
+++ b/cpp/include/raft/linalg/multiply.cuh
@@ -33,10 +33,11 @@ namespace linalg {
  * @{
  */
 template <typename math_t, typename IdxType = int>
-void multiplyScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream)
-{
+void multiplyScalar(math_t *out, const math_t *in, math_t scalar, IdxType len,
+                    cudaStream_t stream) {
   unaryOp(
-    out, in, len, [scalar] __device__(math_t in) { return in * scalar; }, stream);
+    out, in, len, [scalar] __device__(math_t in) { return in * scalar; },
+    stream);
 }
 /** @} */
 
diff --git a/cpp/include/raft/linalg/norm.cuh b/cpp/include/raft/linalg/norm.cuh
index 82558c8023..64930a7123 100644
--- a/cpp/include/raft/linalg/norm.cuh
+++ b/cpp/include/raft/linalg/norm.cuh
@@ -44,46 +44,22 @@ enum NormType { L1Norm = 0, L2Norm };
  * @param stream cuda stream where to launch work
  * @param fin_op the final lambda op
  */
-template <typename Type, typename IdxType = int, typename Lambda = raft::Nop<Type, IdxType>>
-void rowNorm(Type* dots,
-             const Type* data,
-             IdxType D,
-             IdxType N,
-             NormType type,
-             bool rowMajor,
-             cudaStream_t stream,
-             Lambda fin_op = raft::Nop<Type, IdxType>())
-{
+template <typename Type, typename IdxType = int,
+          typename Lambda = raft::Nop<Type, IdxType>>
+void rowNorm(Type *dots, const Type *data, IdxType D, IdxType N, NormType type,
+             bool rowMajor, cudaStream_t stream,
+             Lambda fin_op = raft::Nop<Type, IdxType>()) {
   switch (type) {
     case L1Norm:
-      reduce(dots,
-             data,
-             D,
-             N,
-             (Type)0,
-             rowMajor,
-             true,
-             stream,
-             false,
-             raft::L1Op<Type, IdxType>(),
-             raft::Sum<Type>(),
-             fin_op);
+      reduce(dots, data, D, N, (Type)0, rowMajor, true, stream, false,
+             raft::L1Op<Type, IdxType>(), raft::Sum<Type>(), fin_op);
       break;
     case L2Norm:
-      reduce(dots,
-             data,
-             D,
-             N,
-             (Type)0,
-             rowMajor,
-             true,
-             stream,
-             false,
-             raft::L2Op<Type>(),
-             raft::Sum<Type>(),
-             fin_op);
+      reduce(dots, data, D, N, (Type)0, rowMajor, true, stream, false,
+             raft::L2Op<Type>(), raft::Sum<Type>(), fin_op);
       break;
-    default: ASSERT(false, "Invalid norm type passed! [%d]", type);
+    default:
+      ASSERT(false, "Invalid norm type passed! [%d]", type);
   };
 }
 
@@ -101,46 +77,22 @@ void rowNorm(Type* dots,
  * @param stream cuda stream where to launch work
  * @param fin_op the final lambda op
  */
-template <typename Type, typename IdxType = int, typename Lambda = raft::Nop<Type, IdxType>>
-void colNorm(Type* dots,
-             const Type* data,
-             IdxType D,
-             IdxType N,
-             NormType type,
-             bool rowMajor,
-             cudaStream_t stream,
-             Lambda fin_op = raft::Nop<Type, IdxType>())
-{
+template <typename Type, typename IdxType = int,
+          typename Lambda = raft::Nop<Type, IdxType>>
+void colNorm(Type *dots, const Type *data, IdxType D, IdxType N, NormType type,
+             bool rowMajor, cudaStream_t stream,
+             Lambda fin_op = raft::Nop<Type, IdxType>()) {
   switch (type) {
     case L1Norm:
-      reduce(dots,
-             data,
-             D,
-             N,
-             (Type)0,
-             rowMajor,
-             false,
-             stream,
-             false,
-             raft::L1Op<Type, IdxType>(),
-             raft::Sum<Type>(),
-             fin_op);
+      reduce(dots, data, D, N, (Type)0, rowMajor, false, stream, false,
+             raft::L1Op<Type, IdxType>(), raft::Sum<Type>(), fin_op);
       break;
     case L2Norm:
-      reduce(dots,
-             data,
-             D,
-             N,
-             (Type)0,
-             rowMajor,
-             false,
-             stream,
-             false,
-             raft::L2Op<Type, IdxType>(),
-             raft::Sum<Type>(),
-             fin_op);
+      reduce(dots, data, D, N, (Type)0, rowMajor, false, stream, false,
+             raft::L2Op<Type, IdxType>(), raft::Sum<Type>(), fin_op);
       break;
-    default: ASSERT(false, "Invalid norm type passed! [%d]", type);
+    default:
+      ASSERT(false, "Invalid norm type passed! [%d]", type);
   };
 }
 
diff --git a/cpp/include/raft/linalg/qr.cuh b/cpp/include/raft/linalg/qr.cuh
index c2455ac3a8..cafa8d54f1 100644
--- a/cpp/include/raft/linalg/qr.cuh
+++ b/cpp/include/raft/linalg/qr.cuh
@@ -40,19 +40,15 @@ namespace linalg {
  * @{
  */
 template <typename math_t>
-void qrGetQ(const raft::handle_t& handle,
-            const math_t* M,
-            math_t* Q,
-            int n_rows,
-            int n_cols,
-            cudaStream_t stream)
-{
-  auto allocator               = handle.get_device_allocator();
+void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q,
+            int n_rows, int n_cols, cudaStream_t stream) {
+  auto allocator = handle.get_device_allocator();
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
 
   int m = n_rows, n = n_cols;
   int k = min(m, n);
-  CUDA_CHECK(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream));
+  CUDA_CHECK(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n,
+                             cudaMemcpyDeviceToDevice, stream));
 
   raft::mr::device::buffer<math_t> tau(allocator, stream, k);
   CUDA_CHECK(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * k, stream));
@@ -62,16 +58,19 @@ void qrGetQ(const raft::handle_t& handle,
 
   CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, m, n, Q, m, &Lwork));
   raft::mr::device::buffer<math_t> workspace(allocator, stream, Lwork);
-  CUSOLVER_CHECK(cusolverDngeqrf(
-    cusolverH, m, n, Q, m, tau.data(), workspace.data(), Lwork, devInfo.data(), stream));
+  CUSOLVER_CHECK(cusolverDngeqrf(cusolverH, m, n, Q, m, tau.data(),
+                                 workspace.data(), Lwork, devInfo.data(),
+                                 stream));
   /// @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail.
 #if defined(CUDART_VERSION) && CUDART_VERSION <= 9020
   CUDA_CHECK(cudaDeviceSynchronize());
 #endif
-  CUSOLVER_CHECK(cusolverDnorgqr_bufferSize(cusolverH, m, n, k, Q, m, tau.data(), &Lwork));
+  CUSOLVER_CHECK(
+    cusolverDnorgqr_bufferSize(cusolverH, m, n, k, Q, m, tau.data(), &Lwork));
   workspace.resize(Lwork, stream);
-  CUSOLVER_CHECK(cusolverDnorgqr(
-    cusolverH, m, n, k, Q, m, tau.data(), workspace.data(), Lwork, devInfo.data(), stream));
+  CUSOLVER_CHECK(cusolverDnorgqr(cusolverH, m, n, k, Q, m, tau.data(),
+                                 workspace.data(), Lwork, devInfo.data(),
+                                 stream));
 }
 
 /**
@@ -85,41 +84,30 @@ void qrGetQ(const raft::handle_t& handle,
  * @param stream cuda stream
  */
 template <typename math_t>
-void qrGetQR(const raft::handle_t& handle,
-             math_t* M,
-             math_t* Q,
-             math_t* R,
-             int n_rows,
-             int n_cols,
-             cudaStream_t stream)
-{
-  auto allocator               = handle.get_device_allocator();
+void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R,
+             int n_rows, int n_cols, cudaStream_t stream) {
+  auto allocator = handle.get_device_allocator();
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
 
   int m = n_rows, n = n_cols;
   raft::mr::device::buffer<math_t> R_full(allocator, stream, m * n);
   raft::mr::device::buffer<math_t> tau(allocator, stream, min(m, n));
-  CUDA_CHECK(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * min(m, n), stream));
-  int R_full_nrows = m, R_full_ncols = n;
   CUDA_CHECK(
-    cudaMemcpyAsync(R_full.data(), M, sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream));
+    cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * min(m, n), stream));
+  int R_full_nrows = m, R_full_ncols = n;
+  CUDA_CHECK(cudaMemcpyAsync(R_full.data(), M, sizeof(math_t) * m * n,
+                             cudaMemcpyDeviceToDevice, stream));
 
   int Lwork;
   raft::mr::device::buffer<int> devInfo(allocator, stream, 1);
 
-  CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(
-    cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows, &Lwork));
+  CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, R_full_nrows,
+                                            R_full_ncols, R_full.data(),
+                                            R_full_nrows, &Lwork));
   raft::mr::device::buffer<math_t> workspace(allocator, stream, Lwork);
-  CUSOLVER_CHECK(cusolverDngeqrf(cusolverH,
-                                 R_full_nrows,
-                                 R_full_ncols,
-                                 R_full.data(),
-                                 R_full_nrows,
-                                 tau.data(),
-                                 workspace.data(),
-                                 Lwork,
-                                 devInfo.data(),
-                                 stream));
+  CUSOLVER_CHECK(cusolverDngeqrf(
+    cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows,
+    tau.data(), workspace.data(), Lwork, devInfo.data(), stream));
   // @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail.
 #if defined(CUDART_VERSION) && CUDART_VERSION <= 9020
   CUDA_CHECK(cudaDeviceSynchronize());
@@ -127,24 +115,17 @@ void qrGetQR(const raft::handle_t& handle,
 
   raft::matrix::copyUpperTriangular(R_full.data(), R, m, n, stream);
 
-  CUDA_CHECK(
-    cudaMemcpyAsync(Q, R_full.data(), sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream));
+  CUDA_CHECK(cudaMemcpyAsync(Q, R_full.data(), sizeof(math_t) * m * n,
+                             cudaMemcpyDeviceToDevice, stream));
   int Q_nrows = m, Q_ncols = n;
 
-  CUSOLVER_CHECK(cusolverDnorgqr_bufferSize(
-    cusolverH, Q_nrows, Q_ncols, min(Q_ncols, Q_nrows), Q, Q_nrows, tau.data(), &Lwork));
+  CUSOLVER_CHECK(cusolverDnorgqr_bufferSize(cusolverH, Q_nrows, Q_ncols,
+                                            min(Q_ncols, Q_nrows), Q, Q_nrows,
+                                            tau.data(), &Lwork));
   workspace.resize(Lwork, stream);
-  CUSOLVER_CHECK(cusolverDnorgqr(cusolverH,
-                                 Q_nrows,
-                                 Q_ncols,
-                                 min(Q_ncols, Q_nrows),
-                                 Q,
-                                 Q_nrows,
-                                 tau.data(),
-                                 workspace.data(),
-                                 Lwork,
-                                 devInfo.data(),
-                                 stream));
+  CUSOLVER_CHECK(cusolverDnorgqr(
+    cusolverH, Q_nrows, Q_ncols, min(Q_ncols, Q_nrows), Q, Q_nrows, tau.data(),
+    workspace.data(), Lwork, devInfo.data(), stream));
 }
 /** @} */
 
diff --git a/cpp/include/raft/linalg/reduce.cuh b/cpp/include/raft/linalg/reduce.cuh
index 693a797db9..d39577bbdd 100644
--- a/cpp/include/raft/linalg/reduce.cuh
+++ b/cpp/include/raft/linalg/reduce.cuh
@@ -52,33 +52,28 @@ namespace linalg {
  * @param reduce_op binary reduction operation
  * @param final_op elementwise operation to apply before storing results
  */
-template <typename InType,
-          typename OutType      = InType,
-          typename IdxType      = int,
-          typename MainLambda   = raft::Nop<InType, IdxType>,
+template <typename InType, typename OutType = InType, typename IdxType = int,
+          typename MainLambda = raft::Nop<InType, IdxType>,
           typename ReduceLambda = raft::Sum<OutType>,
-          typename FinalLambda  = raft::Nop<OutType>>
-void reduce(OutType* dots,
-            const InType* data,
-            int D,
-            int N,
-            OutType init,
-            bool rowMajor,
-            bool alongRows,
-            cudaStream_t stream,
-            bool inplace           = false,
-            MainLambda main_op     = raft::Nop<InType, IdxType>(),
+          typename FinalLambda = raft::Nop<OutType>>
+void reduce(OutType *dots, const InType *data, int D, int N, OutType init,
+            bool rowMajor, bool alongRows, cudaStream_t stream,
+            bool inplace = false,
+            MainLambda main_op = raft::Nop<InType, IdxType>(),
             ReduceLambda reduce_op = raft::Sum<OutType>(),
-            FinalLambda final_op   = raft::Nop<OutType>())
-{
+            FinalLambda final_op = raft::Nop<OutType>()) {
   if (rowMajor && alongRows) {
-    coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
+    coalescedReduction(dots, data, D, N, init, stream, inplace, main_op,
+                       reduce_op, final_op);
   } else if (rowMajor && !alongRows) {
-    stridedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
+    stridedReduction(dots, data, D, N, init, stream, inplace, main_op,
+                     reduce_op, final_op);
   } else if (!rowMajor && alongRows) {
-    stridedReduction(dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op);
+    stridedReduction(dots, data, N, D, init, stream, inplace, main_op,
+                     reduce_op, final_op);
   } else {
-    coalescedReduction(dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op);
+    coalescedReduction(dots, data, N, D, init, stream, inplace, main_op,
+                       reduce_op, final_op);
   }
 }
 
diff --git a/cpp/include/raft/linalg/strided_reduction.cuh b/cpp/include/raft/linalg/strided_reduction.cuh
index f931c976fd..bba652e137 100644
--- a/cpp/include/raft/linalg/strided_reduction.cuh
+++ b/cpp/include/raft/linalg/strided_reduction.cuh
@@ -28,15 +28,14 @@ namespace linalg {
 // of the matrix, i.e. reduce along columns for row major or reduce along rows
 // for column major layout
 template <typename Type, typename MainLambda>
-__global__ void stridedSummationKernel(
-  Type* dots, const Type* data, int D, int N, Type init, MainLambda main_op)
-{
+__global__ void stridedSummationKernel(Type *dots, const Type *data, int D,
+                                       int N, Type init, MainLambda main_op) {
   // Thread reduction
   Type thread_data = Type(init);
-  int colStart     = blockIdx.x * blockDim.x + threadIdx.x;
+  int colStart = blockIdx.x * blockDim.x + threadIdx.x;
   if (colStart < D) {
     int rowStart = blockIdx.y * blockDim.y + threadIdx.y;
-    int stride   = blockDim.y * gridDim.y;
+    int stride = blockDim.y * gridDim.y;
     for (int j = rowStart; j < N; j += stride) {
       int idx = colStart + j * D;
       thread_data += main_op(data[idx], j);
@@ -45,8 +44,8 @@ __global__ void stridedSummationKernel(
 
   // Block reduction
   extern __shared__ char tmp[];  // One element per thread in block
-  Type* temp  = (Type*)tmp;      // Cast to desired type
-  int myidx   = threadIdx.x + blockDim.x * threadIdx.y;
+  Type *temp = (Type *)tmp;      // Cast to desired type
+  int myidx = threadIdx.x + blockDim.x * threadIdx.y;
   temp[myidx] = thread_data;
   __syncthreads();
   for (int j = blockDim.y / 2; j > 0; j /= 2) {
@@ -55,31 +54,24 @@ __global__ void stridedSummationKernel(
   }
 
   // Grid reduction
-  if ((colStart < D) && (threadIdx.y == 0)) raft::myAtomicAdd(dots + colStart, temp[myidx]);
+  if ((colStart < D) && (threadIdx.y == 0))
+    raft::myAtomicAdd(dots + colStart, temp[myidx]);
 }
 
 // Kernel to perform reductions along the strided dimension
 // of the matrix, i.e. reduce along columns for row major or reduce along rows
 // for column major layout
-template <typename InType,
-          typename OutType,
-          typename IdxType,
-          typename MainLambda,
-          typename ReduceLambda>
-__global__ void stridedReductionKernel(OutType* dots,
-                                       const InType* data,
-                                       int D,
-                                       int N,
-                                       OutType init,
-                                       MainLambda main_op,
-                                       ReduceLambda reduce_op)
-{
+template <typename InType, typename OutType, typename IdxType,
+          typename MainLambda, typename ReduceLambda>
+__global__ void stridedReductionKernel(OutType *dots, const InType *data, int D,
+                                       int N, OutType init, MainLambda main_op,
+                                       ReduceLambda reduce_op) {
   // Thread reduction
   OutType thread_data = init;
-  IdxType colStart    = blockIdx.x * blockDim.x + threadIdx.x;
+  IdxType colStart = blockIdx.x * blockDim.x + threadIdx.x;
   if (colStart < D) {
     IdxType rowStart = blockIdx.y * blockDim.y + threadIdx.y;
-    IdxType stride   = blockDim.y * gridDim.y;
+    IdxType stride = blockDim.y * gridDim.y;
     for (IdxType j = rowStart; j < N; j += stride) {
       IdxType idx = colStart + j * D;
       thread_data = reduce_op(thread_data, main_op(data[idx], j));
@@ -87,13 +79,14 @@ __global__ void stridedReductionKernel(OutType* dots,
   }
 
   // Block reduction
-  extern __shared__ char tmp[];   // One element per thread in block
-  auto* temp    = (OutType*)tmp;  // Cast to desired type
+  extern __shared__ char tmp[];  // One element per thread in block
+  auto *temp = (OutType *)tmp;   // Cast to desired type
   IdxType myidx = threadIdx.x + ((IdxType)blockDim.x * (IdxType)threadIdx.y);
-  temp[myidx]   = thread_data;
+  temp[myidx] = thread_data;
   __syncthreads();
   for (int j = blockDim.y / 2; j > 0; j /= 2) {
-    if (threadIdx.y < j) temp[myidx] = reduce_op(temp[myidx], temp[myidx + j * blockDim.x]);
+    if (threadIdx.y < j)
+      temp[myidx] = reduce_op(temp[myidx], temp[myidx + j * blockDim.x]);
     __syncthreads();
   }
 
@@ -129,23 +122,15 @@ __global__ void stridedReductionKernel(OutType* dots,
  * @param inplace reduction result added inplace or overwrites old values?
  * @param stream cuda stream where to launch work
  */
-template <typename InType,
-          typename OutType      = InType,
-          typename IdxType      = int,
-          typename MainLambda   = raft::Nop<InType, IdxType>,
+template <typename InType, typename OutType = InType, typename IdxType = int,
+          typename MainLambda = raft::Nop<InType, IdxType>,
           typename ReduceLambda = raft::Sum<OutType>,
-          typename FinalLambda  = raft::Nop<OutType>>
-void stridedReduction(OutType* dots,
-                      const InType* data,
-                      IdxType D,
-                      IdxType N,
-                      OutType init,
-                      cudaStream_t stream,
-                      bool inplace           = false,
-                      MainLambda main_op     = raft::Nop<InType, IdxType>(),
+          typename FinalLambda = raft::Nop<OutType>>
+void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N,
+                      OutType init, cudaStream_t stream, bool inplace = false,
+                      MainLambda main_op = raft::Nop<InType, IdxType>(),
                       ReduceLambda reduce_op = raft::Sum<OutType>(),
-                      FinalLambda final_op   = raft::Nop<OutType>())
-{
+                      FinalLambda final_op = raft::Nop<OutType>()) {
   ///@todo: this extra should go away once we have eliminated the need
   /// for atomics in stridedKernel (redesign for this is already underway)
   if (!inplace)
@@ -155,7 +140,7 @@ void stridedReduction(OutType* dots,
   // Arbitrary numbers for now, probably need to tune
   const dim3 thrds(32, 16);
   IdxType elemsPerThread = raft::ceildiv(N, (IdxType)thrds.y);
-  elemsPerThread         = (elemsPerThread > 8) ? 8 : elemsPerThread;
+  elemsPerThread = (elemsPerThread > 8) ? 8 : elemsPerThread;
   const dim3 nblks(raft::ceildiv(D, (IdxType)thrds.x),
                    raft::ceildiv(N, (IdxType)thrds.y * elemsPerThread));
   const size_t shmemSize = sizeof(OutType) * thrds.x * thrds.y;
@@ -168,7 +153,8 @@ void stridedReduction(OutType* dots,
       <<<nblks, thrds, shmemSize, stream>>>(dots, data, D, N, init, main_op);
   else
     stridedReductionKernel<InType, OutType, IdxType>
-      <<<nblks, thrds, shmemSize, stream>>>(dots, data, D, N, init, main_op, reduce_op);
+      <<<nblks, thrds, shmemSize, stream>>>(dots, data, D, N, init, main_op,
+                                            reduce_op);
 
   ///@todo: this complication should go away once we have eliminated the need
   /// for atomics in stridedKernel (redesign for this is already underway)
diff --git a/cpp/include/raft/linalg/subtract.cuh b/cpp/include/raft/linalg/subtract.cuh
index 43060d0818..882c105689 100644
--- a/cpp/include/raft/linalg/subtract.cuh
+++ b/cpp/include/raft/linalg/subtract.cuh
@@ -38,8 +38,8 @@ namespace linalg {
  * @param stream cuda stream where to launch work
  */
 template <typename InT, typename OutT = InT, typename IdxType = int>
-void subtractScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream)
-{
+void subtractScalar(OutT *out, const InT *in, InT scalar, IdxType len,
+                    cudaStream_t stream) {
   auto op = [scalar] __device__(InT in) { return OutT(in - scalar); };
   unaryOp<InT, decltype(op), IdxType, OutT>(out, in, len, op, stream);
 }
@@ -58,25 +58,24 @@ void subtractScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStrea
  * @param stream cuda stream where to launch work
  */
 template <typename InT, typename OutT = InT, typename IdxType = int>
-void subtract(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream)
-{
+void subtract(OutT *out, const InT *in1, const InT *in2, IdxType len,
+              cudaStream_t stream) {
   auto op = [] __device__(InT a, InT b) { return OutT(a - b); };
   binaryOp<InT, decltype(op), OutT, IdxType>(out, in1, in2, len, op, stream);
 }
 
 template <class math_t, typename IdxType>
-__global__ void subtract_dev_scalar_kernel(math_t* outDev,
-                                           const math_t* inDev,
-                                           const math_t* singleScalarDev,
-                                           IdxType len)
-{
-  // TODO: kernel do not use shared memory in current implementation
+__global__ void subtract_dev_scalar_kernel(math_t *outDev, const math_t *inDev,
+                                           const math_t *singleScalarDev,
+                                           IdxType len) {
+  // TODO: the kernel does not use shared memory in the current implementation
   int i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x;
-  if (i < len) { outDev[i] = inDev[i] - *singleScalarDev; }
+  if (i < len) {
+    outDev[i] = inDev[i] - *singleScalarDev;
+  }
 }
 
-/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and
- * write result to outDev[i]
+/** Subtract the single value pointed to by singleScalarDev (in device memory) from inDev[i] and write the result to outDev[i]
  * @tparam math_t data-type upon which the math operation will be performed
  * @tparam IdxType Integer type used to for addressing
  * @param outDev the output buffer
@@ -87,12 +86,9 @@ __global__ void subtract_dev_scalar_kernel(math_t* outDev,
  * @remark block size has not been tuned
  */
 template <typename math_t, typename IdxType = int, int TPB = 256>
-void subtractDevScalar(math_t* outDev,
-                       const math_t* inDev,
-                       const math_t* singleScalarDev,
-                       IdxType len,
-                       cudaStream_t stream)
-{
+void subtractDevScalar(math_t *outDev, const math_t *inDev,
+                       const math_t *singleScalarDev, IdxType len,
+                       cudaStream_t stream) {
   // Just for the note - there is no way to express such operation with cuBLAS in effective way
   // https://stackoverflow.com/questions/14051064/add-scalar-to-vector-in-blas-cublas-cuda
   const IdxType nblks = raft::ceildiv(len, (IdxType)TPB);
diff --git a/cpp/include/raft/linalg/svd.cuh b/cpp/include/raft/linalg/svd.cuh
index 1cb8b7592f..7357a68a4c 100644
--- a/cpp/include/raft/linalg/svd.cuh
+++ b/cpp/include/raft/linalg/svd.cuh
@@ -50,21 +50,14 @@ namespace linalg {
 // TODO: couldn't template this function due to cusolverDnSgesvd and
 // cusolverSnSgesvd. Check if there is any other way.
 template <typename T>
-void svdQR(const raft::handle_t& handle,
-           T* in,
-           int n_rows,
-           int n_cols,
-           T* sing_vals,
-           T* left_sing_vecs,
-           T* right_sing_vecs,
-           bool trans_right,
-           bool gen_left_vec,
-           bool gen_right_vec,
-           cudaStream_t stream)
-{
-  std::shared_ptr<raft::mr::device::allocator> allocator = handle.get_device_allocator();
-  cusolverDnHandle_t cusolverH                           = handle.get_cusolver_dn_handle();
-  cublasHandle_t cublasH                                 = handle.get_cublas_handle();
+void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols,
+           T *sing_vals, T *left_sing_vecs, T *right_sing_vecs,
+           bool trans_right, bool gen_left_vec, bool gen_right_vec,
+           cudaStream_t stream) {
+  std::shared_ptr<raft::mr::device::allocator> allocator =
+    handle.get_device_allocator();
+  cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
+  cublasHandle_t cublasH = handle.get_cublas_handle();
 
 #if CUDART_VERSION >= 10010 && CUDART_VERSION < 11000
   // 46340: sqrt of max int value
@@ -79,13 +72,14 @@ void svdQR(const raft::handle_t& handle,
   const int n = n_cols;
 
   raft::mr::device::buffer<int> devInfo(allocator, stream, 1);
-  T* d_rwork = nullptr;
+  T *d_rwork = nullptr;
 
   int lwork = 0;
-  CUSOLVER_CHECK(cusolverDngesvd_bufferSize<T>(cusolverH, n_rows, n_cols, &lwork));
+  CUSOLVER_CHECK(
+    cusolverDngesvd_bufferSize<T>(cusolverH, n_rows, n_cols, &lwork));
   raft::mr::device::buffer<T> d_work(allocator, stream, lwork);
 
-  char jobu  = 'S';
+  char jobu = 'S';
   char jobvt = 'A';
 
   if (!gen_left_vec) {
@@ -98,23 +92,9 @@ void svdQR(const raft::handle_t& handle,
     strcpy(&jobvt, &new_vt);
   }
 
-  CUSOLVER_CHECK(cusolverDngesvd(cusolverH,
-                                 jobu,
-                                 jobvt,
-                                 m,
-                                 n,
-                                 in,
-                                 m,
-                                 sing_vals,
-                                 left_sing_vecs,
-                                 m,
-                                 right_sing_vecs,
-                                 n,
-                                 d_work.data(),
-                                 lwork,
-                                 d_rwork,
-                                 devInfo.data(),
-                                 stream));
+  CUSOLVER_CHECK(cusolverDngesvd(
+    cusolverH, jobu, jobvt, m, n, in, m, sing_vals, left_sing_vecs, m,
+    right_sing_vecs, n, d_work.data(), lwork, d_rwork, devInfo.data(), stream));
 
   // Transpose the right singular vector back
   if (trans_right) raft::linalg::transpose(right_sing_vecs, n_cols, stream);
@@ -130,37 +110,19 @@ void svdQR(const raft::handle_t& handle,
 }
 
 template <typename T>
-void svdEig(const raft::handle_t& handle,
-            T* in,
-            int n_rows,
-            int n_cols,
-            T* S,
-            T* U,
-            T* V,
-            bool gen_left_vec,
-            cudaStream_t stream)
-{
-  auto allocator               = handle.get_device_allocator();
+void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S,
+            T *U, T *V, bool gen_left_vec, cudaStream_t stream) {
+  auto allocator = handle.get_device_allocator();
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
-  cublasHandle_t cublasH       = handle.get_cublas_handle();
+  cublasHandle_t cublasH = handle.get_cublas_handle();
 
   int len = n_cols * n_cols;
   raft::mr::device::buffer<T> in_cross_mult(allocator, stream, len);
 
   T alpha = T(1);
-  T beta  = T(0);
-  raft::linalg::gemm(handle,
-                     in,
-                     n_rows,
-                     n_cols,
-                     in,
-                     in_cross_mult.data(),
-                     n_cols,
-                     n_cols,
-                     CUBLAS_OP_T,
-                     CUBLAS_OP_N,
-                     alpha,
-                     beta,
+  T beta = T(0);
+  raft::linalg::gemm(handle, in, n_rows, n_cols, in, in_cross_mult.data(),
+                     n_cols, n_cols, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta,
                      stream);
 
   eigDC(handle, in_cross_mult.data(), n_cols, n_cols, V, S, stream);
@@ -171,20 +133,10 @@ void svdEig(const raft::handle_t& handle,
   raft::matrix::seqRoot(S, S, alpha, n_cols, stream, true);
 
   if (gen_left_vec) {
-    raft::linalg::gemm(handle,
-                       in,
-                       n_rows,
-                       n_cols,
-                       V,
-                       U,
-                       n_rows,
-                       n_cols,
-                       CUBLAS_OP_N,
-                       CUBLAS_OP_N,
-                       alpha,
-                       beta,
-                       stream);
-    raft::matrix::matrixVectorBinaryDivSkipZero(U, S, n_rows, n_cols, false, true, stream);
+    raft::linalg::gemm(handle, in, n_rows, n_cols, V, U, n_rows, n_cols,
+                       CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream);
+    raft::matrix::matrixVectorBinaryDivSkipZero(U, S, n_rows, n_cols, false,
+                                                true, stream);
   }
 }
 
@@ -206,20 +158,11 @@ void svdEig(const raft::handle_t& handle,
  * @param stream cuda stream
  */
 template <typename math_t>
-void svdJacobi(const raft::handle_t& handle,
-               math_t* in,
-               int n_rows,
-               int n_cols,
-               math_t* sing_vals,
-               math_t* left_sing_vecs,
-               math_t* right_sing_vecs,
-               bool gen_left_vec,
-               bool gen_right_vec,
-               math_t tol,
-               int max_sweeps,
-               cudaStream_t stream)
-{
-  auto allocator               = handle.get_device_allocator();
+void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols,
+               math_t *sing_vals, math_t *left_sing_vecs,
+               math_t *right_sing_vecs, bool gen_left_vec, bool gen_right_vec,
+               math_t tol, int max_sweeps, cudaStream_t stream) {
+  auto allocator = handle.get_device_allocator();
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
 
   gesvdjInfo_t gesvdj_params = NULL;
@@ -234,42 +177,18 @@ void svdJacobi(const raft::handle_t& handle,
   raft::mr::device::buffer<int> devInfo(allocator, stream, 1);
 
   int lwork = 0;
-  int econ  = 1;
-
-  CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj_bufferSize(cusolverH,
-                                                           CUSOLVER_EIG_MODE_VECTOR,
-                                                           econ,
-                                                           m,
-                                                           n,
-                                                           in,
-                                                           m,
-                                                           sing_vals,
-                                                           left_sing_vecs,
-                                                           m,
-                                                           right_sing_vecs,
-                                                           n,
-                                                           &lwork,
-                                                           gesvdj_params));
+  int econ = 1;
+
+  CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj_bufferSize(
+    cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals,
+    left_sing_vecs, m, right_sing_vecs, n, &lwork, gesvdj_params));
 
   raft::mr::device::buffer<math_t> d_work(allocator, stream, lwork);
 
-  CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj(cusolverH,
-                                                CUSOLVER_EIG_MODE_VECTOR,
-                                                econ,
-                                                m,
-                                                n,
-                                                in,
-                                                m,
-                                                sing_vals,
-                                                left_sing_vecs,
-                                                m,
-                                                right_sing_vecs,
-                                                n,
-                                                d_work.data(),
-                                                lwork,
-                                                devInfo.data(),
-                                                gesvdj_params,
-                                                stream));
+  CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj(
+    cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals,
+    left_sing_vecs, m, right_sing_vecs, n, d_work.data(), lwork, devInfo.data(),
+    gesvdj_params, stream));
 
   CUSOLVER_CHECK(cusolverDnDestroyGesvdjInfo(gesvdj_params));
 }
@@ -288,36 +207,18 @@ void svdJacobi(const raft::handle_t& handle,
  * @param stream cuda stream
  */
 template <typename math_t>
-void svdReconstruction(const raft::handle_t& handle,
-                       math_t* U,
-                       math_t* S,
-                       math_t* V,
-                       math_t* out,
-                       int n_rows,
-                       int n_cols,
-                       int k,
-                       cudaStream_t stream)
-{
+void svdReconstruction(const raft::handle_t &handle, math_t *U, math_t *S,
+                       math_t *V, math_t *out, int n_rows, int n_cols, int k,
+                       cudaStream_t stream) {
   auto allocator = handle.get_device_allocator();
 
   const math_t alpha = 1.0, beta = 0.0;
   raft::mr::device::buffer<math_t> SVT(allocator, stream, k * n_cols);
 
-  raft::linalg::gemm(
-    handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N, CUBLAS_OP_T, alpha, beta, stream);
-  raft::linalg::gemm(handle,
-                     U,
-                     n_rows,
-                     k,
-                     SVT.data(),
-                     out,
-                     n_rows,
-                     n_cols,
-                     CUBLAS_OP_N,
-                     CUBLAS_OP_N,
-                     alpha,
-                     beta,
-                     stream);
+  raft::linalg::gemm(handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N,
+                     CUBLAS_OP_T, alpha, beta, stream);
+  raft::linalg::gemm(handle, U, n_rows, k, SVT.data(), out, n_rows, n_cols,
+                     CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream);
 }
 
 /**
@@ -335,18 +236,10 @@ void svdReconstruction(const raft::handle_t& handle,
  * @param stream cuda stream
  */
 template <typename math_t>
-bool evaluateSVDByL2Norm(const raft::handle_t& handle,
-                         math_t* A_d,
-                         math_t* U,
-                         math_t* S_vec,
-                         math_t* V,
-                         int n_rows,
-                         int n_cols,
-                         int k,
-                         math_t tol,
-                         cudaStream_t stream)
-{
-  auto allocator         = handle.get_device_allocator();
+bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U,
+                         math_t *S_vec, math_t *V, int n_rows, int n_cols,
+                         int k, math_t tol, cudaStream_t stream) {
+  auto allocator = handle.get_device_allocator();
   cublasHandle_t cublasH = handle.get_cublas_handle();
 
   int m = n_rows, n = n_cols;
@@ -370,25 +263,16 @@ bool evaluateSVDByL2Norm(const raft::handle_t& handle,
   // calculate percent error
   const math_t alpha = 1.0, beta = -1.0;
   raft::mr::device::buffer<math_t> A_minus_P(allocator, stream, m * n);
-  CUDA_CHECK(cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream));
-
-  CUBLAS_CHECK(raft::linalg::cublasgeam(cublasH,
-                                        CUBLAS_OP_N,
-                                        CUBLAS_OP_N,
-                                        m,
-                                        n,
-                                        &alpha,
-                                        A_d,
-                                        m,
-                                        &beta,
-                                        P_d.data(),
-                                        m,
-                                        A_minus_P.data(),
-                                        m,
-                                        stream));
-
-  math_t norm_A_minus_P = raft::matrix::getL2Norm(handle, A_minus_P.data(), m * n, stream);
-  math_t percent_error  = 100.0 * norm_A_minus_P / normA;
+  CUDA_CHECK(
+    cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream));
+
+  CUBLAS_CHECK(raft::linalg::cublasgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, m, n,
+                                        &alpha, A_d, m, &beta, P_d.data(), m,
+                                        A_minus_P.data(), m, stream));
+
+  math_t norm_A_minus_P =
+    raft::matrix::getL2Norm(handle, A_minus_P.data(), m * n, stream);
+  math_t percent_error = 100.0 * norm_A_minus_P / normA;
   return (percent_error / 100.0 < tol);
 }
 
diff --git a/cpp/include/raft/linalg/transpose.h b/cpp/include/raft/linalg/transpose.h
index 9b954c29c1..d90f6271fa 100644
--- a/cpp/include/raft/linalg/transpose.h
+++ b/cpp/include/raft/linalg/transpose.h
@@ -33,34 +33,18 @@ namespace linalg {
  * @param stream: cuda stream
  */
 template <typename math_t>
-void transpose(const raft::handle_t& handle,
-               math_t* in,
-               math_t* out,
-               int n_rows,
-               int n_cols,
-               cudaStream_t stream)
-{
+void transpose(const raft::handle_t &handle, math_t *in, math_t *out,
+               int n_rows, int n_cols, cudaStream_t stream) {
   cublasHandle_t cublas_h = handle.get_cublas_handle();
 
   int out_n_rows = n_cols;
   int out_n_cols = n_rows;
 
   const math_t alpha = 1.0;
-  const math_t beta  = 0.0;
-  CUBLAS_CHECK(raft::linalg::cublasgeam(cublas_h,
-                                        CUBLAS_OP_T,
-                                        CUBLAS_OP_N,
-                                        out_n_rows,
-                                        out_n_cols,
-                                        &alpha,
-                                        in,
-                                        n_rows,
-                                        &beta,
-                                        out,
-                                        out_n_rows,
-                                        out,
-                                        out_n_rows,
-                                        stream));
+  const math_t beta = 0.0;
+  CUBLAS_CHECK(raft::linalg::cublasgeam(
+    cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, out_n_rows, out_n_cols, &alpha, in,
+    n_rows, &beta, out, out_n_rows, out, out_n_rows, stream));
 }
 
 /**
@@ -70,25 +54,24 @@ void transpose(const raft::handle_t& handle,
  * @param stream: cuda stream
  */
 template <typename math_t>
-void transpose(math_t* inout, int n, cudaStream_t stream)
-{
-  auto m        = n;
-  auto size     = n * n;
-  auto d_inout  = inout;
+void transpose(math_t *inout, int n, cudaStream_t stream) {
+  auto m = n;
+  auto size = n * n;
+  auto d_inout = inout;
   auto counting = thrust::make_counting_iterator<int>(0);
 
-  thrust::for_each(
-    thrust::cuda::par.on(stream), counting, counting + size, [=] __device__(int idx) {
-      int s_row = idx % m;
-      int s_col = idx / m;
-      int d_row = s_col;
-      int d_col = s_row;
-      if (s_row < s_col) {
-        auto temp                  = d_inout[d_col * m + d_row];
-        d_inout[d_col * m + d_row] = d_inout[s_col * m + s_row];
-        d_inout[s_col * m + s_row] = temp;
-      }
-    });
+  thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size,
+                   [=] __device__(int idx) {
+                     int s_row = idx % m;
+                     int s_col = idx / m;
+                     int d_row = s_col;
+                     int d_col = s_row;
+                     if (s_row < s_col) {
+                       auto temp = d_inout[d_col * m + d_row];
+                       d_inout[d_col * m + d_row] = d_inout[s_col * m + s_row];
+                       d_inout[s_col * m + s_row] = temp;
+                     }
+                   });
 }
 
 };  // end namespace linalg
diff --git a/cpp/include/raft/linalg/unary_op.cuh b/cpp/include/raft/linalg/unary_op.cuh
index 198b9b2b10..46b4d296cb 100644
--- a/cpp/include/raft/linalg/unary_op.cuh
+++ b/cpp/include/raft/linalg/unary_op.cuh
@@ -23,9 +23,10 @@
 namespace raft {
 namespace linalg {
 
-template <typename InType, int VecLen, typename Lambda, typename OutType, typename IdxType>
-__global__ void unaryOpKernel(OutType* out, const InType* in, IdxType len, Lambda op)
-{
+template <typename InType, int VecLen, typename Lambda, typename OutType,
+          typename IdxType>
+__global__ void unaryOpKernel(OutType *out, const InType *in, IdxType len,
+                              Lambda op) {
   typedef TxN_t<InType, VecLen> InVecType;
   typedef TxN_t<OutType, VecLen> OutVecType;
   InVecType a;
@@ -41,10 +42,12 @@ __global__ void unaryOpKernel(OutType* out, const InType* in, IdxType len, Lambd
   b.store(out, idx);
 }
 
-template <typename InType, int VecLen, typename Lambda, typename OutType, typename IdxType, int TPB>
-void unaryOpImpl(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream)
-{
-  const IdxType nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB);
+template <typename InType, int VecLen, typename Lambda, typename OutType,
+          typename IdxType, int TPB>
+void unaryOpImpl(OutType *out, const InType *in, IdxType len, Lambda op,
+                 cudaStream_t stream) {
+  const IdxType nblks =
+    raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB);
   unaryOpKernel<InType, VecLen, Lambda, OutType, IdxType>
     <<<nblks, TPB, 0, stream>>>(out, in, len, op);
   CUDA_CHECK(cudaPeekAtLastError());
@@ -65,38 +68,47 @@ void unaryOpImpl(OutType* out, const InType* in, IdxType len, Lambda op, cudaStr
  * @note Lambda must be a functor with the following signature:
  *       `OutType func(const InType& val);`
  */
-template <typename InType,
-          typename Lambda,
-          typename IdxType = int,
-          typename OutType = InType,
-          int TPB          = 256>
-void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream)
-{
-  if (len <= 0) return;  // silently skip in case of 0 length input
-  constexpr auto maxSize = sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType);
-  size_t bytes           = len * maxSize;
-  uint64_t inAddr        = uint64_t(in);
-  uint64_t outAddr       = uint64_t(out);
-  if (16 / maxSize && bytes % 16 == 0 && inAddr % 16 == 0 && outAddr % 16 == 0) {
-    unaryOpImpl<InType, 16 / maxSize, Lambda, OutType, IdxType, TPB>(out, in, len, op, stream);
-  } else if (8 / maxSize && bytes % 8 == 0 && inAddr % 8 == 0 && outAddr % 8 == 0) {
-    unaryOpImpl<InType, 8 / maxSize, Lambda, OutType, IdxType, TPB>(out, in, len, op, stream);
-  } else if (4 / maxSize && bytes % 4 == 0 && inAddr % 4 == 0 && outAddr % 4 == 0) {
-    unaryOpImpl<InType, 4 / maxSize, Lambda, OutType, IdxType, TPB>(out, in, len, op, stream);
-  } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 && outAddr % 2 == 0) {
-    unaryOpImpl<InType, 2 / maxSize, Lambda, OutType, IdxType, TPB>(out, in, len, op, stream);
+template <typename InType, typename Lambda, typename IdxType = int,
+          typename OutType = InType, int TPB = 256>
+void unaryOp(OutType *out, const InType *in, IdxType len, Lambda op,
+             cudaStream_t stream) {
+  if (len <= 0) return;  //silently skip in case of 0 length input
+  constexpr auto maxSize =
+    sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType);
+  size_t bytes = len * maxSize;
+  uint64_t inAddr = uint64_t(in);
+  uint64_t outAddr = uint64_t(out);
+  if (16 / maxSize && bytes % 16 == 0 && inAddr % 16 == 0 &&
+      outAddr % 16 == 0) {
+    unaryOpImpl<InType, 16 / maxSize, Lambda, OutType, IdxType, TPB>(
+      out, in, len, op, stream);
+  } else if (8 / maxSize && bytes % 8 == 0 && inAddr % 8 == 0 &&
+             outAddr % 8 == 0) {
+    unaryOpImpl<InType, 8 / maxSize, Lambda, OutType, IdxType, TPB>(
+      out, in, len, op, stream);
+  } else if (4 / maxSize && bytes % 4 == 0 && inAddr % 4 == 0 &&
+             outAddr % 4 == 0) {
+    unaryOpImpl<InType, 4 / maxSize, Lambda, OutType, IdxType, TPB>(
+      out, in, len, op, stream);
+  } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 &&
+             outAddr % 2 == 0) {
+    unaryOpImpl<InType, 2 / maxSize, Lambda, OutType, IdxType, TPB>(
+      out, in, len, op, stream);
   } else if (1 / maxSize) {
-    unaryOpImpl<InType, 1 / maxSize, Lambda, OutType, IdxType, TPB>(out, in, len, op, stream);
+    unaryOpImpl<InType, 1 / maxSize, Lambda, OutType, IdxType, TPB>(
+      out, in, len, op, stream);
   } else {
-    unaryOpImpl<InType, 1, Lambda, OutType, IdxType, TPB>(out, in, len, op, stream);
+    unaryOpImpl<InType, 1, Lambda, OutType, IdxType, TPB>(out, in, len, op,
+                                                          stream);
   }
 }
 
 template <typename OutType, typename Lambda, typename IdxType>
-__global__ void writeOnlyUnaryOpKernel(OutType* out, IdxType len, Lambda op)
-{
+__global__ void writeOnlyUnaryOpKernel(OutType *out, IdxType len, Lambda op) {
   IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x);
-  if (idx < len) { op(out + idx, idx); }
+  if (idx < len) {
+    op(out + idx, idx);
+  }
 }
 
 /**
@@ -116,12 +128,14 @@ __global__ void writeOnlyUnaryOpKernel(OutType* out, IdxType len, Lambda op)
  *                    where outLocationOffset will be out + idx.
  * @param[in]  stream cuda stream where to launch work
  */
-template <typename OutType, typename Lambda, typename IdxType = int, int TPB = 256>
-void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream)
-{
+template <typename OutType, typename Lambda, typename IdxType = int,
+          int TPB = 256>
+void writeOnlyUnaryOp(OutType *out, IdxType len, Lambda op,
+                      cudaStream_t stream) {
   if (len <= 0) return;  // silently skip in case of 0 length input
   auto nblks = raft::ceildiv<IdxType>(len, TPB);
-  writeOnlyUnaryOpKernel<OutType, Lambda, IdxType><<<nblks, TPB, 0, stream>>>(out, len, op);
+  writeOnlyUnaryOpKernel<OutType, Lambda, IdxType>
+    <<<nblks, TPB, 0, stream>>>(out, len, op);
   CUDA_CHECK(cudaGetLastError());
 }
 
diff --git a/cpp/include/raft/matrix/math.cuh b/cpp/include/raft/matrix/math.cuh
index 579491b5cc..0a72117140 100644
--- a/cpp/include/raft/matrix/math.cuh
+++ b/cpp/include/raft/matrix/math.cuh
@@ -41,18 +41,14 @@ namespace matrix {
  * @param stream cuda stream
  */
 template <typename math_t>
-void power(math_t* in, math_t* out, math_t scalar, int len, cudaStream_t stream)
-{
-  auto d_src  = in;
+void power(math_t *in, math_t *out, math_t scalar, int len,
+           cudaStream_t stream) {
+  auto d_src = in;
   auto d_dest = out;
 
   raft::linalg::binaryOp(
-    d_dest,
-    d_src,
-    d_src,
-    len,
-    [=] __device__(math_t a, math_t b) { return scalar * a * b; },
-    stream);
+    d_dest, d_src, d_src, len,
+    [=] __device__(math_t a, math_t b) { return scalar * a * b; }, stream);
 }
 
 /**
@@ -63,8 +59,7 @@ void power(math_t* in, math_t* out, math_t scalar, int len, cudaStream_t stream)
  * @param stream cuda stream
  */
 template <typename math_t>
-void power(math_t* inout, math_t scalar, int len, cudaStream_t stream)
-{
+void power(math_t *inout, math_t scalar, int len, cudaStream_t stream) {
   power(inout, inout, scalar, len, stream);
 }
 
@@ -75,8 +70,7 @@ void power(math_t* inout, math_t scalar, int len, cudaStream_t stream)
  * @param stream cuda stream
  */
 template <typename math_t>
-void power(math_t* inout, int len, cudaStream_t stream)
-{
+void power(math_t *inout, int len, cudaStream_t stream) {
   math_t scalar = 1.0;
   power(inout, scalar, len, stream);
 }
@@ -90,8 +84,7 @@ void power(math_t* inout, int len, cudaStream_t stream)
  * @{
  */
 template <typename math_t>
-void power(math_t* in, math_t* out, int len, cudaStream_t stream)
-{
+void power(math_t *in, math_t *out, int len, cudaStream_t stream) {
   math_t scalar = 1.0;
   power(in, out, scalar, len, stream);
 }
@@ -108,20 +101,13 @@ void power(math_t* in, math_t* out, int len, cudaStream_t stream)
  * @param set_neg_zero whether to set negative numbers to zero
  */
 template <typename math_t, typename IdxType = int>
-void seqRoot(math_t* in,
-             math_t* out,
-             math_t scalar,
-             IdxType len,
-             cudaStream_t stream,
-             bool set_neg_zero = false)
-{
-  auto d_src  = in;
+void seqRoot(math_t *in, math_t *out, math_t scalar, IdxType len,
+             cudaStream_t stream, bool set_neg_zero = false) {
+  auto d_src = in;
   auto d_dest = out;
 
   raft::linalg::unaryOp(
-    d_dest,
-    d_src,
-    len,
+    d_dest, d_src, len,
     [=] __device__(math_t a) {
       if (set_neg_zero) {
         if (a < math_t(0)) {
@@ -147,9 +133,8 @@ void seqRoot(math_t* in,
  * @param set_neg_zero whether to set negative numbers to zero
  */
 template <typename math_t, typename IdxType = int>
-void seqRoot(
-  math_t* inout, math_t scalar, IdxType len, cudaStream_t stream, bool set_neg_zero = false)
-{
+void seqRoot(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream,
+             bool set_neg_zero = false) {
   seqRoot(inout, inout, scalar, len, stream, set_neg_zero);
 }
 
@@ -163,27 +148,22 @@ void seqRoot(
  * @param stream cuda stream
  */
 template <typename math_t, typename IdxType = int>
-void seqRoot(math_t* in, math_t* out, IdxType len, cudaStream_t stream)
-{
+void seqRoot(math_t *in, math_t *out, IdxType len, cudaStream_t stream) {
   math_t scalar = 1.0;
   seqRoot(in, out, scalar, len, stream);
 }
 
 template <typename math_t, typename IdxType = int>
-void seqRoot(math_t* inout, IdxType len, cudaStream_t stream)
-{
+void seqRoot(math_t *inout, IdxType len, cudaStream_t stream) {
   math_t scalar = 1.0;
   seqRoot(inout, inout, scalar, len, stream);
 }
 
 template <typename math_t, typename IdxType = int>
-void setSmallValuesZero(
-  math_t* out, const math_t* in, IdxType len, cudaStream_t stream, math_t thres = 1e-15)
-{
+void setSmallValuesZero(math_t *out, const math_t *in, IdxType len,
+                        cudaStream_t stream, math_t thres = 1e-15) {
   raft::linalg::unaryOp(
-    out,
-    in,
-    len,
+    out, in, len,
     [=] __device__(math_t a) {
       if (a <= thres && -a <= thres) {
         return math_t(0);
@@ -204,8 +184,8 @@ void setSmallValuesZero(
  * @param thres: threshold
  */
 template <typename math_t, typename IdxType = int>
-void setSmallValuesZero(math_t* inout, IdxType len, cudaStream_t stream, math_t thres = 1e-15)
-{
+void setSmallValuesZero(math_t *inout, IdxType len, cudaStream_t stream,
+                        math_t thres = 1e-15) {
   setSmallValuesZero(inout, inout, len, stream, thres);
 }
 
@@ -223,21 +203,14 @@ void setSmallValuesZero(math_t* inout, IdxType len, cudaStream_t stream, math_t
  * @{
  */
 template <typename math_t, typename IdxType = int>
-void reciprocal(math_t* in,
-                math_t* out,
-                math_t scalar,
-                int len,
-                cudaStream_t stream,
-                bool setzero = false,
-                math_t thres = 1e-15)
-{
-  auto d_src  = in;
+void reciprocal(math_t *in, math_t *out, math_t scalar, int len,
+                cudaStream_t stream, bool setzero = false,
+                math_t thres = 1e-15) {
+  auto d_src = in;
   auto d_dest = out;
 
   raft::linalg::unaryOp(
-    d_dest,
-    d_src,
-    len,
+    d_dest, d_src, len,
     [=] __device__(math_t a) {
       if (setzero) {
         if (abs(a) <= thres) {
@@ -264,13 +237,8 @@ void reciprocal(math_t* in,
  * @param thres: Threshold to avoid dividing by zero (|value| < thres -> result = 0)
  */
 template <typename math_t, typename IdxType = int>
-void reciprocal(math_t* inout,
-                math_t scalar,
-                IdxType len,
-                cudaStream_t stream,
-                bool setzero = false,
-                math_t thres = 1e-15)
-{
+void reciprocal(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream,
+                bool setzero = false, math_t thres = 1e-15) {
   reciprocal(inout, inout, scalar, len, stream, setzero, thres);
 }
 
@@ -283,8 +251,7 @@ void reciprocal(math_t* inout,
  * @param stream cuda stream
  */
 template <typename math_t, typename IdxType = int>
-void reciprocal(math_t* inout, IdxType len, cudaStream_t stream)
-{
+void reciprocal(math_t *inout, IdxType len, cudaStream_t stream) {
   math_t scalar = 1.0;
   reciprocal(inout, scalar, len, stream);
 }
@@ -299,15 +266,14 @@ void reciprocal(math_t* inout, IdxType len, cudaStream_t stream)
  * @param stream cuda stream
  */
 template <typename math_t, typename IdxType = int>
-void reciprocal(math_t* in, math_t* out, IdxType len, cudaStream_t stream)
-{
+void reciprocal(math_t *in, math_t *out, IdxType len, cudaStream_t stream) {
   math_t scalar = 1.0;
   reciprocal(in, out, scalar, len, stream);
 }
 
 template <typename math_t>
-void setValue(math_t* out, const math_t* in, math_t scalar, int len, cudaStream_t stream = 0)
-{
+void setValue(math_t *out, const math_t *in, math_t scalar, int len,
+              cudaStream_t stream = 0) {
   raft::linalg::unaryOp(
     out, in, len, [scalar] __device__(math_t in) { return scalar; }, stream);
 }
@@ -323,44 +289,46 @@ void setValue(math_t* out, const math_t* in, math_t scalar, int len, cudaStream_
  * @param stream cuda stream
  */
 template <typename math_t, typename IdxType = int>
-void ratio(
-  const raft::handle_t& handle, math_t* src, math_t* dest, IdxType len, cudaStream_t stream)
-{
-  auto d_src  = src;
+void ratio(const raft::handle_t &handle, math_t *src, math_t *dest, IdxType len,
+           cudaStream_t stream) {
+  auto d_src = src;
   auto d_dest = dest;
 
-  std::shared_ptr<raft::mr::device::allocator> allocator = handle.get_device_allocator();
+  std::shared_ptr<raft::mr::device::allocator> allocator =
+    handle.get_device_allocator();
 
   raft::mr::device::buffer<math_t> d_sum(allocator, stream, 1);
-  auto* d_sum_ptr = d_sum.data();
-  auto no_op      = [] __device__(math_t in) { return in; };
+  auto *d_sum_ptr = d_sum.data();
+  auto no_op = [] __device__(math_t in) { return in; };
   raft::linalg::mapThenSumReduce(d_sum_ptr, len, no_op, stream, src);
   raft::linalg::unaryOp(
-    d_dest, d_src, len, [=] __device__(math_t a) { return a / (*d_sum_ptr); }, stream);
+    d_dest, d_src, len, [=] __device__(math_t a) { return a / (*d_sum_ptr); },
+    stream);
 }
 
 /** @} */
 
 // Computes the argmax(d_in) column-wise in a DxN matrix
 template <typename T, int TPB>
-__global__ void argmaxKernel(const T* d_in, int D, int N, T* argmax)
-{
+__global__ void argmaxKernel(const T *d_in, int D, int N, T *argmax) {
   typedef cub::BlockReduce<cub::KeyValuePair<int, T>, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
 
   // compute maxIndex=argMax  index for column
-  using KVP    = cub::KeyValuePair<int, T>;
+  using KVP = cub::KeyValuePair<int, T>;
   int rowStart = blockIdx.x * D;
   KVP thread_data(-1, -raft::myInf<T>());
 
   for (int i = threadIdx.x; i < D; i += TPB) {
-    int idx     = rowStart + i;
+    int idx = rowStart + i;
     thread_data = cub::ArgMax()(thread_data, KVP(i, d_in[idx]));
   }
 
   auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, cub::ArgMax());
 
-  if (threadIdx.x == 0) { argmax[blockIdx.x] = maxKV.key; }
+  if (threadIdx.x == 0) {
+    argmax[blockIdx.x] = maxKV.key;
+  }
 }
 
 /**
@@ -372,8 +340,8 @@ __global__ void argmaxKernel(const T* d_in, int D, int N, T* argmax)
  * @param stream: cuda stream
  */
 template <typename math_t>
-void argmax(const math_t* in, int n_rows, int n_cols, math_t* out, cudaStream_t stream)
-{
+void argmax(const math_t *in, int n_rows, int n_cols, math_t *out,
+            cudaStream_t stream) {
   int D = n_rows;
   int N = n_cols;
   if (D <= 32) {
@@ -392,29 +360,30 @@ void argmax(const math_t* in, int n_rows, int n_cols, math_t* out, cudaStream_t
 // Computes the argmax(abs(d_in)) column-wise in a DxN matrix followed by
 // flipping the sign if the |max| value for each column is negative.
 template <typename T, int TPB>
-__global__ void signFlipKernel(T* d_in, int D, int N)
-{
+__global__ void signFlipKernel(T *d_in, int D, int N) {
   typedef cub::BlockReduce<cub::KeyValuePair<int, T>, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
 
   // compute maxIndex=argMax (with abs()) index for column
-  using KVP    = cub::KeyValuePair<int, T>;
+  using KVP = cub::KeyValuePair<int, T>;
   int rowStart = blockIdx.x * D;
   KVP thread_data(0, 0);
   for (int i = threadIdx.x; i < D; i += TPB) {
-    int idx     = rowStart + i;
+    int idx = rowStart + i;
     thread_data = cub::ArgMax()(thread_data, KVP(idx, abs(d_in[idx])));
   }
   auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, cub::ArgMax());
 
   // flip column sign if d_in[maxIndex] < 0
   __shared__ bool need_sign_flip;
-  if (threadIdx.x == 0) { need_sign_flip = d_in[maxKV.key] < T(0); }
+  if (threadIdx.x == 0) {
+    need_sign_flip = d_in[maxKV.key] < T(0);
+  }
   __syncthreads();
 
   if (need_sign_flip) {
     for (int i = threadIdx.x; i < D; i += TPB) {
-      int idx   = rowStart + i;
+      int idx = rowStart + i;
       d_in[idx] = -d_in[idx];
     }
   }
@@ -429,10 +398,9 @@ __global__ void signFlipKernel(T* d_in, int D, int N)
  * @param stream cuda stream
  */
 template <typename math_t>
-void signFlip(math_t* inout, int n_rows, int n_cols, cudaStream_t stream)
-{
-  int D     = n_rows;
-  int N     = n_cols;
+void signFlip(math_t *inout, int n_rows, int n_cols, cudaStream_t stream) {
+  int D = n_rows;
+  int N = n_cols;
   auto data = inout;
   if (D <= 32) {
     signFlipKernel<math_t, 32><<<N, 32, 0, stream>>>(data, D, N);
@@ -447,43 +415,20 @@ void signFlip(math_t* inout, int n_rows, int n_cols, cudaStream_t stream)
 }
 
 template <typename Type, typename IdxType = int, int TPB = 256>
-void matrixVectorBinaryMult(Type* data,
-                            const Type* vec,
-                            IdxType n_row,
-                            IdxType n_col,
-                            bool rowMajor,
-                            bool bcastAlongRows,
-                            cudaStream_t stream)
-{
+void matrixVectorBinaryMult(Type *data, const Type *vec, IdxType n_row,
+                            IdxType n_col, bool rowMajor, bool bcastAlongRows,
+                            cudaStream_t stream) {
   raft::linalg::matrixVectorOp(
-    data,
-    data,
-    vec,
-    n_col,
-    n_row,
-    rowMajor,
-    bcastAlongRows,
-    [] __device__(Type a, Type b) { return a * b; },
-    stream);
+    data, data, vec, n_col, n_row, rowMajor, bcastAlongRows,
+    [] __device__(Type a, Type b) { return a * b; }, stream);
 }
 
 template <typename Type, typename IdxType = int, int TPB = 256>
-void matrixVectorBinaryMultSkipZero(Type* data,
-                                    const Type* vec,
-                                    IdxType n_row,
-                                    IdxType n_col,
-                                    bool rowMajor,
-                                    bool bcastAlongRows,
-                                    cudaStream_t stream)
-{
+void matrixVectorBinaryMultSkipZero(Type *data, const Type *vec, IdxType n_row,
+                                    IdxType n_col, bool rowMajor,
+                                    bool bcastAlongRows, cudaStream_t stream) {
   raft::linalg::matrixVectorOp(
-    data,
-    data,
-    vec,
-    n_col,
-    n_row,
-    rowMajor,
-    bcastAlongRows,
+    data, data, vec, n_col, n_row, rowMajor, bcastAlongRows,
     [] __device__(Type a, Type b) {
       if (b == Type(0))
         return a;
@@ -494,45 +439,22 @@ void matrixVectorBinaryMultSkipZero(Type* data,
 }
 
 template <typename Type, typename IdxType = int, int TPB = 256>
-void matrixVectorBinaryDiv(Type* data,
-                           const Type* vec,
-                           IdxType n_row,
-                           IdxType n_col,
-                           bool rowMajor,
-                           bool bcastAlongRows,
-                           cudaStream_t stream)
-{
+void matrixVectorBinaryDiv(Type *data, const Type *vec, IdxType n_row,
+                           IdxType n_col, bool rowMajor, bool bcastAlongRows,
+                           cudaStream_t stream) {
   raft::linalg::matrixVectorOp(
-    data,
-    data,
-    vec,
-    n_col,
-    n_row,
-    rowMajor,
-    bcastAlongRows,
-    [] __device__(Type a, Type b) { return a / b; },
-    stream);
+    data, data, vec, n_col, n_row, rowMajor, bcastAlongRows,
+    [] __device__(Type a, Type b) { return a / b; }, stream);
 }
 
 template <typename Type, typename IdxType = int, int TPB = 256>
-void matrixVectorBinaryDivSkipZero(Type* data,
-                                   const Type* vec,
-                                   IdxType n_row,
-                                   IdxType n_col,
-                                   bool rowMajor,
-                                   bool bcastAlongRows,
-                                   cudaStream_t stream,
-                                   bool return_zero = false)
-{
+void matrixVectorBinaryDivSkipZero(Type *data, const Type *vec, IdxType n_row,
+                                   IdxType n_col, bool rowMajor,
+                                   bool bcastAlongRows, cudaStream_t stream,
+                                   bool return_zero = false) {
   if (return_zero) {
     raft::linalg::matrixVectorOp(
-      data,
-      data,
-      vec,
-      n_col,
-      n_row,
-      rowMajor,
-      bcastAlongRows,
+      data, data, vec, n_col, n_row, rowMajor, bcastAlongRows,
       [] __device__(Type a, Type b) {
         if (raft::myAbs(b) < Type(1e-10))
           return Type(0);
@@ -542,13 +464,7 @@ void matrixVectorBinaryDivSkipZero(Type* data,
       stream);
   } else {
     raft::linalg::matrixVectorOp(
-      data,
-      data,
-      vec,
-      n_col,
-      n_row,
-      rowMajor,
-      bcastAlongRows,
+      data, data, vec, n_col, n_row, rowMajor, bcastAlongRows,
       [] __device__(Type a, Type b) {
         if (raft::myAbs(b) < Type(1e-10))
           return a;
@@ -560,45 +476,21 @@ void matrixVectorBinaryDivSkipZero(Type* data,
 }
 
 template <typename Type, typename IdxType = int, int TPB = 256>
-void matrixVectorBinaryAdd(Type* data,
-                           const Type* vec,
-                           IdxType n_row,
-                           IdxType n_col,
-                           bool rowMajor,
-                           bool bcastAlongRows,
-                           cudaStream_t stream)
-{
+void matrixVectorBinaryAdd(Type *data, const Type *vec, IdxType n_row,
+                           IdxType n_col, bool rowMajor, bool bcastAlongRows,
+                           cudaStream_t stream) {
   raft::linalg::matrixVectorOp(
-    data,
-    data,
-    vec,
-    n_col,
-    n_row,
-    rowMajor,
-    bcastAlongRows,
-    [] __device__(Type a, Type b) { return a + b; },
-    stream);
+    data, data, vec, n_col, n_row, rowMajor, bcastAlongRows,
+    [] __device__(Type a, Type b) { return a + b; }, stream);
 }
 
 template <typename Type, typename IdxType = int, int TPB = 256>
-void matrixVectorBinarySub(Type* data,
-                           const Type* vec,
-                           IdxType n_row,
-                           IdxType n_col,
-                           bool rowMajor,
-                           bool bcastAlongRows,
-                           cudaStream_t stream)
-{
+void matrixVectorBinarySub(Type *data, const Type *vec, IdxType n_row,
+                           IdxType n_col, bool rowMajor, bool bcastAlongRows,
+                           cudaStream_t stream) {
   raft::linalg::matrixVectorOp(
-    data,
-    data,
-    vec,
-    n_col,
-    n_row,
-    rowMajor,
-    bcastAlongRows,
-    [] __device__(Type a, Type b) { return a - b; },
-    stream);
+    data, data, vec, n_col, n_row, rowMajor, bcastAlongRows,
+    [] __device__(Type a, Type b) { return a - b; }, stream);
 }
 
 };  // end namespace matrix
diff --git a/cpp/include/raft/matrix/matrix.cuh b/cpp/include/raft/matrix/matrix.cuh
index 71a2888545..5f5755e24e 100644
--- a/cpp/include/raft/matrix/matrix.cuh
+++ b/cpp/include/raft/matrix/matrix.cuh
@@ -49,33 +49,29 @@ using namespace std;
  * @param rowMajor whether the matrix has row major layout
  */
 template <typename m_t, typename idx_array_t = int, typename idx_t = size_t>
-void copyRows(const m_t* in,
-              idx_t n_rows,
-              idx_t n_cols,
-              m_t* out,
-              const idx_array_t* indices,
-              idx_t n_rows_indices,
-              cudaStream_t stream,
-              bool rowMajor = false)
-{
+void copyRows(const m_t *in, idx_t n_rows, idx_t n_cols, m_t *out,
+              const idx_array_t *indices, idx_t n_rows_indices,
+              cudaStream_t stream, bool rowMajor = false) {
   if (rowMajor) {
     const idx_t TPB = 256;
-    cache::get_vecs<<<raft::ceildiv(n_rows_indices * n_cols, TPB), TPB, 0, stream>>>(
-      in, n_cols, indices, n_rows_indices, out);
+    cache::
+      get_vecs<<<raft::ceildiv(n_rows_indices * n_cols, TPB), TPB, 0, stream>>>(
+        in, n_cols, indices, n_rows_indices, out);
     CUDA_CHECK(cudaPeekAtLastError());
     return;
   }
 
-  idx_t size    = n_rows_indices * n_cols;
+  idx_t size = n_rows_indices * n_cols;
   auto counting = thrust::make_counting_iterator<idx_t>(0);
 
-  thrust::for_each(
-    thrust::cuda::par.on(stream), counting, counting + size, [=] __device__(idx_t idx) {
-      idx_t row = idx % n_rows_indices;
-      idx_t col = idx / n_rows_indices;
+  thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size,
+                   [=] __device__(idx_t idx) {
+                     idx_t row = idx % n_rows_indices;
+                     idx_t col = idx / n_rows_indices;
 
-      out[col * n_rows_indices + row] = in[col * n_rows + indices[row]];
-    });
+                     out[col * n_rows_indices + row] =
+                       in[col * n_rows + indices[row]];
+                   });
 }
 
 /**
@@ -87,8 +83,8 @@ void copyRows(const m_t* in,
  * @param stream: cuda stream
  */
 template <typename m_t, typename idx_t = int>
-void copy(const m_t* in, m_t* out, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
-{
+void copy(const m_t *in, m_t *out, idx_t n_rows, idx_t n_cols,
+          cudaStream_t stream) {
   raft::copy_async(out, in, n_rows * n_cols, stream);
 }
 
@@ -103,22 +99,21 @@ void copy(const m_t* in, m_t* out, idx_t n_rows, idx_t n_cols, cudaStream_t stre
  * @param stream: cuda stream
  */
 template <typename m_t, typename idx_t = int>
-void truncZeroOrigin(
-  m_t* in, idx_t in_n_rows, m_t* out, idx_t out_n_rows, idx_t out_n_cols, cudaStream_t stream)
-{
-  auto m         = out_n_rows;
-  auto k         = in_n_rows;
-  idx_t size     = out_n_rows * out_n_cols;
-  auto d_q       = in;
+void truncZeroOrigin(m_t *in, idx_t in_n_rows, m_t *out, idx_t out_n_rows,
+                     idx_t out_n_cols, cudaStream_t stream) {
+  auto m = out_n_rows;
+  auto k = in_n_rows;
+  idx_t size = out_n_rows * out_n_cols;
+  auto d_q = in;
   auto d_q_trunc = out;
-  auto counting  = thrust::make_counting_iterator<idx_t>(0);
+  auto counting = thrust::make_counting_iterator<idx_t>(0);
 
-  thrust::for_each(
-    thrust::cuda::par.on(stream), counting, counting + size, [=] __device__(idx_t idx) {
-      idx_t row                = idx % m;
-      idx_t col                = idx / m;
-      d_q_trunc[col * m + row] = d_q[col * k + row];
-    });
+  thrust::for_each(thrust::cuda::par.on(stream), counting, counting + size,
+                   [=] __device__(idx_t idx) {
+                     idx_t row = idx % m;
+                     idx_t col = idx / m;
+                     d_q_trunc[col * m + row] = d_q[col * k + row];
+                   });
 }
 
 /**
@@ -130,25 +125,24 @@ void truncZeroOrigin(
  * @param stream: cuda stream
  */
 template <typename m_t, typename idx_t = int>
-void colReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
-{
-  auto n            = n_cols;
-  auto m            = n_rows;
-  idx_t size        = n_rows * n_cols;
-  auto d_q          = inout;
+void colReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) {
+  auto n = n_cols;
+  auto m = n_rows;
+  idx_t size = n_rows * n_cols;
+  auto d_q = inout;
   auto d_q_reversed = inout;
-  auto counting     = thrust::make_counting_iterator<idx_t>(0);
+  auto counting = thrust::make_counting_iterator<idx_t>(0);
 
-  thrust::for_each(
-    thrust::cuda::par.on(stream), counting, counting + (size / 2), [=] __device__(idx_t idx) {
-      idx_t dest_row             = idx % m;
-      idx_t dest_col             = idx / m;
-      idx_t src_row              = dest_row;
-      idx_t src_col              = (n - dest_col) - 1;
-      m_t temp                   = (m_t)d_q_reversed[idx];
-      d_q_reversed[idx]          = d_q[src_col * m + src_row];
-      d_q[src_col * m + src_row] = temp;
-    });
+  thrust::for_each(thrust::cuda::par.on(stream), counting,
+                   counting + (size / 2), [=] __device__(idx_t idx) {
+                     idx_t dest_row = idx % m;
+                     idx_t dest_col = idx / m;
+                     idx_t src_row = dest_row;
+                     idx_t src_col = (n - dest_col) - 1;
+                     m_t temp = (m_t)d_q_reversed[idx];
+                     d_q_reversed[idx] = d_q[src_col * m + src_row];
+                     d_q[src_col * m + src_row] = temp;
+                   });
 }
 
 /**
@@ -160,26 +154,25 @@ void colReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
  * @param stream: cuda stream
  */
 template <typename m_t, typename idx_t = int>
-void rowReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
-{
-  auto m            = n_rows;
-  idx_t size        = n_rows * n_cols;
-  auto d_q          = inout;
+void rowReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) {
+  auto m = n_rows;
+  idx_t size = n_rows * n_cols;
+  auto d_q = inout;
   auto d_q_reversed = inout;
-  auto counting     = thrust::make_counting_iterator<idx_t>(0);
+  auto counting = thrust::make_counting_iterator<idx_t>(0);
 
-  thrust::for_each(
-    thrust::cuda::par.on(stream), counting, counting + (size / 2), [=] __device__(idx_t idx) {
-      idx_t dest_row = idx % m;
-      idx_t dest_col = idx / m;
-      idx_t src_row  = (m - dest_row) - 1;
-      ;
-      idx_t src_col = dest_col;
+  thrust::for_each(thrust::cuda::par.on(stream), counting,
+                   counting + (size / 2), [=] __device__(idx_t idx) {
+                     idx_t dest_row = idx % m;
+                     idx_t dest_col = idx / m;
+                     idx_t src_row = (m - dest_row) - 1;
+                     ;
+                     idx_t src_col = dest_col;
 
-      m_t temp                   = (m_t)d_q_reversed[idx];
-      d_q_reversed[idx]          = d_q[src_col * m + src_row];
-      d_q[src_col * m + src_row] = temp;
-    });
+                     m_t temp = (m_t)d_q_reversed[idx];
+                     d_q_reversed[idx] = d_q[src_col * m + src_row];
+                     d_q[src_col * m + src_row] = temp;
+                   });
 }
 
 /**
@@ -191,16 +184,16 @@ void rowReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
  * @param v_separator: vertical separator character
  */
 template <typename m_t, typename idx_t = int>
-void print(
-  const m_t* in, idx_t n_rows, idx_t n_cols, char h_separator = ' ', char v_separator = '\n')
-{
+void print(const m_t *in, idx_t n_rows, idx_t n_cols, char h_separator = ' ',
+           char v_separator = '\n') {
   std::vector<m_t> h_matrix = std::vector<m_t>(n_cols * n_rows);
-  CUDA_CHECK(
-    cudaMemcpy(h_matrix.data(), in, n_cols * n_rows * sizeof(m_t), cudaMemcpyDeviceToHost));
+  CUDA_CHECK(cudaMemcpy(h_matrix.data(), in, n_cols * n_rows * sizeof(m_t),
+                        cudaMemcpyDeviceToHost));
 
   for (idx_t i = 0; i < n_rows; i++) {
     for (idx_t j = 0; j < n_cols; j++) {
-      printf("%1.4f%c", h_matrix[j * n_rows + i], j < n_cols - 1 ? h_separator : v_separator);
+      printf("%1.4f%c", h_matrix[j * n_rows + i],
+             j < n_cols - 1 ? h_separator : v_separator);
     }
   }
 }
@@ -212,8 +205,7 @@ void print(
  * @param n_cols: number of columns of input matrix
  */
 template <typename m_t, typename idx_t = int>
-void printHost(const m_t* in, idx_t n_rows, idx_t n_cols)
-{
+void printHost(const m_t *in, idx_t n_rows, idx_t n_cols) {
   for (idx_t i = 0; i < n_rows; i++) {
     for (idx_t j = 0; j < n_cols; j++) {
       printf("%1.4f ", in[j * n_rows + i]);
@@ -234,9 +226,8 @@ void printHost(const m_t* in, idx_t n_rows, idx_t n_cols)
  * (1-based)
  */
 template <typename m_t, typename idx_t = int>
-__global__ void slice(
-  m_t* src_d, idx_t m, idx_t n, m_t* dst_d, idx_t x1, idx_t y1, idx_t x2, idx_t y2)
-{
+__global__ void slice(m_t *src_d, idx_t m, idx_t n, m_t *dst_d, idx_t x1,
+                      idx_t y1, idx_t x2, idx_t y2) {
   idx_t idx = threadIdx.x + blockDim.x * blockIdx.x;
   idx_t dm = x2 - x1, dn = y2 - y1;
   if (idx < dm * dn) {
@@ -260,16 +251,8 @@ __global__ void slice(
  * @param stream: cuda stream
  */
 template <typename m_t, typename idx_t = int>
-void sliceMatrix(m_t* in,
-                 idx_t n_rows,
-                 idx_t n_cols,
-                 m_t* out,
-                 idx_t x1,
-                 idx_t y1,
-                 idx_t x2,
-                 idx_t y2,
-                 cudaStream_t stream)
-{
+void sliceMatrix(m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, idx_t x1,
+                 idx_t y1, idx_t x2, idx_t y2, cudaStream_t stream) {
   // Slicing
   dim3 block(64);
   dim3 grid(((x2 - x1) * (y2 - y1) + block.x - 1) / block.x);
@@ -285,13 +268,15 @@ void sliceMatrix(m_t* in,
  * @param k: min(n_rows, n_cols)
  */
 template <typename m_t, typename idx_t = int>
-__global__ void getUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, idx_t k)
-{
+__global__ void getUpperTriangular(m_t *src, m_t *dst, idx_t n_rows,
+                                   idx_t n_cols, idx_t k) {
   idx_t idx = threadIdx.x + blockDim.x * blockIdx.x;
   idx_t m = n_rows, n = n_cols;
   if (idx < m * n) {
     idx_t i = idx % m, j = idx / m;
-    if (i < k && j < k && j >= i) { dst[i + j * k] = src[idx]; }
+    if (i < k && j < k && j >= i) {
+      dst[i + j * k] = src[idx];
+    }
   }
 }
 
@@ -304,8 +289,8 @@ __global__ void getUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_col
  * @param stream: cuda stream
  */
 template <typename m_t, typename idx_t = int>
-void copyUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
-{
+void copyUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, idx_t n_cols,
+                         cudaStream_t stream) {
   idx_t m = n_rows, n = n_cols;
   idx_t k = min(m, n);
   dim3 block(64);
@@ -322,11 +307,13 @@ void copyUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, cudaStr
  * @param k: dimensionality
  */
 template <typename m_t, typename idx_t = int>
-__global__ void copyVectorToMatrixDiagonal(m_t* vec, m_t* matrix, idx_t m, idx_t n, idx_t k)
-{
+__global__ void copyVectorToMatrixDiagonal(m_t *vec, m_t *matrix, idx_t m,
+                                           idx_t n, idx_t k) {
   idx_t idx = threadIdx.x + blockDim.x * blockIdx.x;
 
-  if (idx < k) { matrix[idx + idx * m] = vec[idx]; }
+  if (idx < k) {
+    matrix[idx + idx * m] = vec[idx];
+  }
 }
 
 /**
@@ -338,13 +325,13 @@ __global__ void copyVectorToMatrixDiagonal(m_t* vec, m_t* matrix, idx_t m, idx_t
  * @param stream: cuda stream
  */
 template <typename m_t, typename idx_t = int>
-void initializeDiagonalMatrix(
-  m_t* vec, m_t* matrix, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
-{
+void initializeDiagonalMatrix(m_t *vec, m_t *matrix, idx_t n_rows, idx_t n_cols,
+                              cudaStream_t stream) {
   idx_t k = min(n_rows, n_cols);
   dim3 block(64);
   dim3 grid((k + block.x - 1) / block.x);
-  copyVectorToMatrixDiagonal<<<grid, block, 0, stream>>>(vec, matrix, n_rows, n_cols, k);
+  copyVectorToMatrixDiagonal<<<grid, block, 0, stream>>>(vec, matrix, n_rows,
+                                                         n_cols, k);
 }
 
 /**
@@ -354,10 +341,11 @@ void initializeDiagonalMatrix(
  * @param len: size of one side of the matrix
  */
 template <typename m_t, typename idx_t = int>
-__global__ void matrixDiagonalInverse(m_t* in, idx_t len)
-{
+__global__ void matrixDiagonalInverse(m_t *in, idx_t len) {
   idx_t idx = threadIdx.x + blockDim.x * blockIdx.x;
-  if (idx < len) { in[idx + idx * len] = 1.0 / in[idx + idx * len]; }
+  if (idx < len) {
+    in[idx + idx * len] = 1.0 / in[idx + idx * len];
+  }
 }
 
 /**
@@ -367,8 +355,7 @@ __global__ void matrixDiagonalInverse(m_t* in, idx_t len)
  * @param stream: cuda stream
  */
 template <typename m_t, typename idx_t = int>
-void getDiagonalInverseMatrix(m_t* in, idx_t len, cudaStream_t stream)
-{
+void getDiagonalInverseMatrix(m_t *in, idx_t len, cudaStream_t stream) {
   dim3 block(64);
   dim3 grid((len + block.x - 1) / block.x);
   matrixDiagonalInverse<m_t><<<grid, block, 0, stream>>>(in, len);
@@ -382,11 +369,12 @@ void getDiagonalInverseMatrix(m_t* in, idx_t len, cudaStream_t stream)
  * @param stream: cuda stream
  */
 template <typename m_t, typename idx_t = int>
-m_t getL2Norm(const raft::handle_t& handle, m_t* in, idx_t size, cudaStream_t stream)
-{
+m_t getL2Norm(const raft::handle_t &handle, m_t *in, idx_t size,
+              cudaStream_t stream) {
   cublasHandle_t cublasH = handle.get_cublas_handle();
-  m_t normval            = 0;
-  CUBLAS_CHECK(raft::linalg::cublasnrm2(cublasH, size, in, 1, &normval, stream));
+  m_t normval = 0;
+  CUBLAS_CHECK(
+    raft::linalg::cublasnrm2(cublasH, size, in, 1, &normval, stream));
   return normval;
 }
 
diff --git a/cpp/include/raft/mr/buffer_base.hpp b/cpp/include/raft/mr/buffer_base.hpp
index 18c8be5f45..29e0d7cfcd 100644
--- a/cpp/include/raft/mr/buffer_base.hpp
+++ b/cpp/include/raft/mr/buffer_base.hpp
@@ -35,11 +35,11 @@ namespace mr {
 template <typename T, typename AllocatorT>
 class buffer_base {
  public:
-  using size_type       = std::size_t;
-  using value_type      = T;
-  using iterator        = value_type*;
-  using const_iterator  = const value_type*;
-  using reference       = T&;
+  using size_type = std::size_t;
+  using value_type = T;
+  using iterator = value_type*;
+  using const_iterator = const value_type*;
+  using reference = T&;
   using const_reference = const T&;
 
   buffer_base() = delete;
@@ -55,12 +55,16 @@ class buffer_base {
    * @param[in] stream    cuda stream where this allocation operations are async
    * @param[in] n         size of the buffer (in number of elements)
    */
-  buffer_base(std::shared_ptr<AllocatorT> allocator, cudaStream_t stream, size_type n = 0)
-    : data_(nullptr), size_(n), capacity_(n), stream_(stream), allocator_(std::move(allocator))
-  {
+  buffer_base(std::shared_ptr<AllocatorT> allocator, cudaStream_t stream,
+              size_type n = 0)
+    : data_(nullptr),
+      size_(n),
+      capacity_(n),
+      stream_(stream),
+      allocator_(std::move(allocator)) {
     if (capacity_ > 0) {
-      data_ =
-        static_cast<value_type*>(allocator_->allocate(capacity_ * sizeof(value_type), stream_));
+      data_ = static_cast<value_type*>(
+        allocator_->allocate(capacity_ * sizeof(value_type), stream_));
       CUDA_CHECK(cudaStreamSynchronize(stream_));
     }
   }
@@ -94,23 +98,23 @@ class buffer_base {
    * @param[in] stream       cuda stream where allocation operations are queued
    * @{
    */
-  void reserve(size_type new_capacity)
-  {
+  void reserve(size_type new_capacity) {
     if (new_capacity > capacity_) {
-      auto* new_data =
-        static_cast<value_type*>(allocator_->allocate(new_capacity * sizeof(value_type), stream_));
-      if (size_ > 0) { raft::copy(new_data, data_, size_, stream_); }
+      auto* new_data = static_cast<value_type*>(
+        allocator_->allocate(new_capacity * sizeof(value_type), stream_));
+      if (size_ > 0) {
+        raft::copy(new_data, data_, size_, stream_);
+      }
       // Only deallocate if we have allocated a pointer
       if (nullptr != data_) {
         allocator_->deallocate(data_, capacity_ * sizeof(value_type), stream_);
       }
-      data_     = new_data;
+      data_ = new_data;
       capacity_ = new_capacity;
     }
   }
 
-  void reserve(size_type new_capacity, cudaStream_t stream)
-  {
+  void reserve(size_type new_capacity, cudaStream_t stream) {
     set_stream(stream);
     reserve(new_capacity);
   }
@@ -123,14 +127,12 @@ class buffer_base {
    * @param[in] stream   cuda stream where the work will be queued
    * @{
    */
-  void resize(const size_type new_size)
-  {
+  void resize(const size_type new_size) {
     reserve(new_size);
     size_ = new_size;
   }
 
-  void resize(const size_type new_size, cudaStream_t stream)
-  {
+  void resize(const size_type new_size, cudaStream_t stream) {
     set_stream(stream);
     resize(new_size);
   }
@@ -144,18 +146,16 @@ class buffer_base {
    * @param[in] stream   cuda stream where the work will be queued
    * @{
    */
-  void release()
-  {
+  void release() {
     if (nullptr != data_) {
       allocator_->deallocate(data_, capacity_ * sizeof(value_type), stream_);
     }
-    data_     = nullptr;
+    data_ = nullptr;
     capacity_ = 0;
-    size_     = 0;
+    size_ = 0;
   }
 
-  void release(cudaStream_t stream)
-  {
+  void release(cudaStream_t stream) {
     set_stream(stream);
     release();
   }
@@ -195,8 +195,7 @@ class buffer_base {
    * @param[in] stream new cuda stream to be set. If it is the same as the
    *                   current one, then this method will be a no-op.
    */
-  void set_stream(cudaStream_t stream)
-  {
+  void set_stream(cudaStream_t stream) {
     if (stream_ != stream) {
       cudaEvent_t event;
       CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
diff --git a/cpp/include/raft/mr/device/allocator.hpp b/cpp/include/raft/mr/device/allocator.hpp
index e930b617e0..889e1640db 100644
--- a/cpp/include/raft/mr/device/allocator.hpp
+++ b/cpp/include/raft/mr/device/allocator.hpp
@@ -32,20 +32,17 @@ namespace device {
  * further to the ones listed in `Allocator`:
  * - Allocations may be always on the device that was specified on construction.
  */
-class allocator : public base_allocator {
-};
+class allocator : public base_allocator {};
 
 /** Default device allocator based on the one provided by RMM */
 class default_allocator : public allocator {
  public:
-  void* allocate(std::size_t n, cudaStream_t stream) override
-  {
+  void* allocate(std::size_t n, cudaStream_t stream) override {
     void* ptr = rmm::mr::get_current_device_resource()->allocate(n, stream);
     return ptr;
   }
 
-  void deallocate(void* p, std::size_t n, cudaStream_t stream) override
-  {
+  void deallocate(void* p, std::size_t n, cudaStream_t stream) override {
     rmm::mr::get_current_device_resource()->deallocate(p, n, stream);
   }
 };  // class default_allocator
diff --git a/cpp/include/raft/mr/device/buffer.hpp b/cpp/include/raft/mr/device/buffer.hpp
index 2b9d84368f..39b5674ce4 100644
--- a/cpp/include/raft/mr/device/buffer.hpp
+++ b/cpp/include/raft/mr/device/buffer.hpp
@@ -46,11 +46,11 @@ namespace device {
 template <typename T>
 class buffer : public buffer_base<T, allocator> {
  public:
-  using size_type       = typename buffer_base<T, allocator>::size_type;
-  using value_type      = typename buffer_base<T, allocator>::value_type;
-  using iterator        = typename buffer_base<T, allocator>::iterator;
-  using const_iterator  = typename buffer_base<T, allocator>::const_iterator;
-  using reference       = typename buffer_base<T, allocator>::reference;
+  using size_type = typename buffer_base<T, allocator>::size_type;
+  using value_type = typename buffer_base<T, allocator>::value_type;
+  using iterator = typename buffer_base<T, allocator>::iterator;
+  using const_iterator = typename buffer_base<T, allocator>::const_iterator;
+  using reference = typename buffer_base<T, allocator>::reference;
   using const_reference = typename buffer_base<T, allocator>::const_reference;
 
   buffer() = delete;
@@ -60,9 +60,7 @@ class buffer : public buffer_base<T, allocator> {
   buffer& operator=(const buffer& other) = delete;
 
   buffer(std::shared_ptr<allocator> alloc, cudaStream_t stream, size_type n = 0)
-    : buffer_base<T, device::allocator>(alloc, stream, n)
-  {
-  }
+    : buffer_base<T, device::allocator>(alloc, stream, n) {}
 };  // class buffer
 
 };  // namespace device
diff --git a/cpp/include/raft/mr/host/allocator.hpp b/cpp/include/raft/mr/host/allocator.hpp
index 62b6826211..8af266d4f0 100644
--- a/cpp/include/raft/mr/host/allocator.hpp
+++ b/cpp/include/raft/mr/host/allocator.hpp
@@ -34,23 +34,20 @@ namespace host {
  * further to the ones listed in `Allocator`:
  * - Allocations don't need to be zero copy accessible form a device.
  */
-class allocator : public base_allocator {
-};
+class allocator : public base_allocator {};
 
 /** Default cudaMallocHost/cudaFreeHost based host allocator */
 class default_allocator : public allocator {
  public:
-  void* allocate(std::size_t n, cudaStream_t stream) override
-  {
+  void* allocate(std::size_t n, cudaStream_t stream) override {
     void* ptr = nullptr;
     CUDA_CHECK(cudaMallocHost(&ptr, n));
     return ptr;
   }
 
-  void deallocate(void* p, std::size_t n, cudaStream_t stream) override
-  {
-    // Must call _NO_THROW here since this is called frequently from object
-    // destructors which are "nothrow" by default
+  void deallocate(void* p, std::size_t n, cudaStream_t stream) override {
+    //Must call _NO_THROW here since this is called frequently from object
+    //destructors which are "nothrow" by default
     CUDA_CHECK_NO_THROW(cudaFreeHost(p));
   }
 };  // class default_allocator
diff --git a/cpp/include/raft/mr/host/buffer.hpp b/cpp/include/raft/mr/host/buffer.hpp
index 52475ad6ec..3c505bf2ed 100644
--- a/cpp/include/raft/mr/host/buffer.hpp
+++ b/cpp/include/raft/mr/host/buffer.hpp
@@ -48,11 +48,11 @@ namespace host {
 template <typename T>
 class buffer : public buffer_base<T, allocator> {
  public:
-  using size_type       = typename buffer_base<T, allocator>::size_type;
-  using value_type      = typename buffer_base<T, allocator>::value_type;
-  using iterator        = typename buffer_base<T, allocator>::iterator;
-  using const_iterator  = typename buffer_base<T, allocator>::const_iterator;
-  using reference       = typename buffer_base<T, allocator>::reference;
+  using size_type = typename buffer_base<T, allocator>::size_type;
+  using value_type = typename buffer_base<T, allocator>::value_type;
+  using iterator = typename buffer_base<T, allocator>::iterator;
+  using const_iterator = typename buffer_base<T, allocator>::const_iterator;
+  using reference = typename buffer_base<T, allocator>::reference;
   using const_reference = typename buffer_base<T, allocator>::const_reference;
 
   buffer() = delete;
@@ -62,15 +62,14 @@ class buffer : public buffer_base<T, allocator> {
   buffer& operator=(const buffer& other) = delete;
 
   buffer(std::shared_ptr<allocator> alloc, const device::buffer<T>& other)
-    : buffer_base<T, allocator>(alloc, other.get_stream(), other.size())
-  {
-    if (other.size() > 0) { raft::copy(data_, other.data(), other.size(), other.get_stream()); }
+    : buffer_base<T, allocator>(alloc, other.get_stream(), other.size()) {
+    if (other.size() > 0) {
+      raft::copy(data_, other.data(), other.size(), other.get_stream());
+    }
   }
 
   buffer(std::shared_ptr<allocator> alloc, cudaStream_t stream, size_type n = 0)
-    : buffer_base<T, allocator>(alloc, stream, n)
-  {
-  }
+    : buffer_base<T, allocator>(alloc, stream, n) {}
 
   reference operator[](size_type pos) { return data_[pos]; }
 
diff --git a/cpp/include/raft/random/rng.cuh b/cpp/include/raft/random/rng.cuh
index 5267770e8a..56710ea81f 100644
--- a/cpp/include/raft/random/rng.cuh
+++ b/cpp/include/raft/random/rng.cuh
@@ -43,9 +43,10 @@ enum GeneratorType {
   GenKiss99
 };
 
-template <typename OutType, typename MathType, typename GenType, typename LenType, typename Lambda>
-__global__ void randKernel(uint64_t seed, uint64_t offset, OutType* ptr, LenType len, Lambda randOp)
-{
+template <typename OutType, typename MathType, typename GenType,
+          typename LenType, typename Lambda>
+__global__ void randKernel(uint64_t seed, uint64_t offset, OutType *ptr,
+                           LenType len, Lambda randOp) {
   LenType tid = (blockIdx.x * blockDim.x) + threadIdx.x;
   detail::Generator<GenType> gen(seed, (uint64_t)tid, offset);
   const LenType stride = gridDim.x * blockDim.x;
@@ -57,10 +58,10 @@ __global__ void randKernel(uint64_t seed, uint64_t offset, OutType* ptr, LenType
 }
 
 // used for Box-Muller type transformations
-template <typename OutType, typename MathType, typename GenType, typename LenType, typename Lambda2>
-__global__ void rand2Kernel(
-  uint64_t seed, uint64_t offset, OutType* ptr, LenType len, Lambda2 rand2Op)
-{
+template <typename OutType, typename MathType, typename GenType,
+          typename LenType, typename Lambda2>
+__global__ void rand2Kernel(uint64_t seed, uint64_t offset, OutType *ptr,
+                            LenType len, Lambda2 rand2Op) {
   LenType tid = (blockIdx.x * blockDim.x) + threadIdx.x;
   detail::Generator<GenType> gen(seed, (uint64_t)tid, offset);
   const LenType stride = gridDim.x * blockDim.x;
@@ -76,9 +77,8 @@ __global__ void rand2Kernel(
 }
 
 template <typename Type>
-__global__ void constFillKernel(Type* ptr, int len, Type val)
-{
-  unsigned tid          = (blockIdx.x * blockDim.x) + threadIdx.x;
+__global__ void constFillKernel(Type *ptr, int len, Type val) {
+  unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
   const unsigned stride = gridDim.x * blockDim.x;
   for (unsigned idx = tid; idx < len; idx += stride) {
     ptr[idx] = val;
@@ -99,20 +99,19 @@ __global__ void constFillKernel(Type* ptr, int len, Type val)
  * @{
  */
 template <typename Type>
-DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1, Type sigma2, Type mu2)
-{
-  constexpr Type twoPi  = Type(2.0) * Type(3.141592654);
+DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1,
+                             Type sigma2, Type mu2) {
+  constexpr Type twoPi = Type(2.0) * Type(3.141592654);
   constexpr Type minus2 = -Type(2.0);
-  Type R                = raft::mySqrt(minus2 * raft::myLog(val1));
-  Type theta            = twoPi * val2;
+  Type R = raft::mySqrt(minus2 * raft::myLog(val1));
+  Type theta = twoPi * val2;
   Type s, c;
   raft::mySinCos(theta, s, c);
   val1 = R * c * sigma1 + mu1;
   val2 = R * s * sigma2 + mu2;
 }
 template <typename Type>
-DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1)
-{
+DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1) {
   box_muller_transform<Type>(val1, val2, sigma1, mu1, sigma1, mu1);
 }
 /** @} */
@@ -132,8 +131,7 @@ class Rng {
       // simple heuristic to make sure all SMs will be occupied properly
       // and also not too many initialization calls will be made by each thread
       nBlocks(4 * getMultiProcessorCount()),
-      gen()
-  {
+      gen() {
     seed(_s);
   }
 
@@ -144,8 +142,7 @@ class Rng {
    *       function of timestamp. Another example is to use the c++11's
    *       `std::random_device` for setting seed.
    */
-  void seed(uint64_t _s)
-  {
+  void seed(uint64_t _s) {
     gen.seed(_s);
     offset = 0;
   }
@@ -161,8 +158,7 @@ class Rng {
    * @param[out] b intercept parameter
    */
   template <typename IdxT>
-  void affine_transform_params(IdxT n, IdxT& a, IdxT& b)
-  {
+  void affine_transform_params(IdxT n, IdxT &a, IdxT &b) {
     // always keep 'a' to be coprime to 'n'
     a = gen() % n;
     while (gcd(a, n) != 1) {
@@ -185,24 +181,27 @@ class Rng {
    * @{
    */
   template <typename Type, typename LenType = int>
-  void uniform(Type* ptr, LenType len, Type start, Type end, cudaStream_t stream)
-  {
+  void uniform(Type *ptr, LenType len, Type start, Type end,
+               cudaStream_t stream) {
     static_assert(std::is_floating_point<Type>::value,
                   "Type for 'uniform' can only be floating point!");
     custom_distribution(
-      ptr,
-      len,
-      [=] __device__(Type val, LenType idx) { return (val * (end - start)) + start; },
+      ptr, len,
+      [=] __device__(Type val, LenType idx) {
+        return (val * (end - start)) + start;
+      },
       stream);
   }
   template <typename IntType, typename LenType = int>
-  void uniformInt(IntType* ptr, LenType len, IntType start, IntType end, cudaStream_t stream)
-  {
-    static_assert(std::is_integral<IntType>::value, "Type for 'uniformInt' can only be integer!");
+  void uniformInt(IntType *ptr, LenType len, IntType start, IntType end,
+                  cudaStream_t stream) {
+    static_assert(std::is_integral<IntType>::value,
+                  "Type for 'uniformInt' can only be integer!");
     custom_distribution(
-      ptr,
-      len,
-      [=] __device__(IntType val, LenType idx) { return (val % (end - start)) + start; },
+      ptr, len,
+      [=] __device__(IntType val, LenType idx) {
+        return (val % (end - start)) + start;
+      },
       stream);
   }
   /** @} */
@@ -219,37 +218,28 @@ class Rng {
    * @{
    */
   template <typename Type, typename LenType = int>
-  void normal(Type* ptr, LenType len, Type mu, Type sigma, cudaStream_t stream)
-  {
+  void normal(Type *ptr, LenType len, Type mu, Type sigma,
+              cudaStream_t stream) {
     static_assert(std::is_floating_point<Type>::value,
                   "Type for 'normal' can only be floating point!");
     rand2Impl(
-      offset,
-      ptr,
-      len,
+      offset, ptr, len,
       [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) {
         box_muller_transform<Type>(val1, val2, sigma, mu);
       },
-      NumThreads,
-      nBlocks,
-      type,
-      stream);
+      NumThreads, nBlocks, type, stream);
   }
   template <typename IntType, typename LenType = int>
-  void normalInt(IntType* ptr, LenType len, IntType mu, IntType sigma, cudaStream_t stream)
-  {
-    static_assert(std::is_integral<IntType>::value, "Type for 'normalInt' can only be integer!");
+  void normalInt(IntType *ptr, LenType len, IntType mu, IntType sigma,
+                 cudaStream_t stream) {
+    static_assert(std::is_integral<IntType>::value,
+                  "Type for 'normalInt' can only be integer!");
     rand2Impl<IntType, double>(
-      offset,
-      ptr,
-      len,
-      [=] __device__(double& val1, double& val2, LenType idx1, LenType idx2) {
+      offset, ptr, len,
+      [=] __device__(double &val1, double &val2, LenType idx1, LenType idx2) {
         box_muller_transform<double>(val1, val2, sigma, mu);
       },
-      NumThreads,
-      nBlocks,
-      type,
-      stream);
+      NumThreads, nBlocks, type, stream);
   }
   /** @} */
 
@@ -274,32 +264,21 @@ class Rng {
    * @param stream stream where to launch the kernel
    */
   template <typename Type, typename LenType = int>
-  void normalTable(Type* ptr,
-                   LenType n_rows,
-                   LenType n_cols,
-                   const Type* mu,
-                   const Type* sigma_vec,
-                   Type sigma,
-                   cudaStream_t stream)
-  {
+  void normalTable(Type *ptr, LenType n_rows, LenType n_cols, const Type *mu,
+                   const Type *sigma_vec, Type sigma, cudaStream_t stream) {
     rand2Impl(
-      offset,
-      ptr,
-      n_rows * n_cols,
+      offset, ptr, n_rows * n_cols,
       [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) {
         // yikes! use fast-int-div
-        auto col1  = idx1 % n_cols;
-        auto col2  = idx2 % n_cols;
+        auto col1 = idx1 % n_cols;
+        auto col2 = idx2 % n_cols;
         auto mean1 = mu[col1];
         auto mean2 = mu[col2];
-        auto sig1  = sigma_vec == nullptr ? sigma : sigma_vec[col1];
-        auto sig2  = sigma_vec == nullptr ? sigma : sigma_vec[col2];
+        auto sig1 = sigma_vec == nullptr ? sigma : sigma_vec[col1];
+        auto sig2 = sigma_vec == nullptr ? sigma : sigma_vec[col2];
         box_muller_transform<Type>(val1, val2, sig1, mean1, sig2, mean2);
       },
-      NumThreads,
-      nBlocks,
-      type,
-      stream);
+      NumThreads, nBlocks, type, stream);
   }
 
   /**
@@ -312,8 +291,7 @@ class Rng {
    * @param stream stream where to launch the kernel
    */
   template <typename Type, typename LenType = int>
-  void fill(Type* ptr, LenType len, Type val, cudaStream_t stream)
-  {
+  void fill(Type *ptr, LenType len, Type val, cudaStream_t stream) {
     constFillKernel<Type><<<nBlocks, NumThreads, 0, stream>>>(ptr, len, val);
     CUDA_CHECK(cudaPeekAtLastError());
   }
@@ -331,10 +309,10 @@ class Rng {
    * @param[in]  stream stream where to launch the kernel
    */
   template <typename Type, typename OutType = bool, typename LenType = int>
-  void bernoulli(OutType* ptr, LenType len, Type prob, cudaStream_t stream)
-  {
+  void bernoulli(OutType *ptr, LenType len, Type prob, cudaStream_t stream) {
     custom_distribution<OutType, Type>(
-      ptr, len, [=] __device__(Type val, LenType idx) { return val > prob; }, stream);
+      ptr, len, [=] __device__(Type val, LenType idx) { return val > prob; },
+      stream);
   }
 
   /**
@@ -348,14 +326,15 @@ class Rng {
    * @param stream stream where to launch the kernel
    */
   template <typename Type, typename LenType = int>
-  void scaled_bernoulli(Type* ptr, LenType len, Type prob, Type scale, cudaStream_t stream)
-  {
+  void scaled_bernoulli(Type *ptr, LenType len, Type prob, Type scale,
+                        cudaStream_t stream) {
     static_assert(std::is_floating_point<Type>::value,
                   "Type for 'scaled_bernoulli' can only be floating point!");
     custom_distribution(
-      ptr,
-      len,
-      [=] __device__(Type val, LenType idx) { return val > prob ? -scale : scale; },
+      ptr, len,
+      [=] __device__(Type val, LenType idx) {
+        return val > prob ? -scale : scale;
+      },
       stream);
   }
 
@@ -371,12 +350,12 @@ class Rng {
    * @note https://en.wikipedia.org/wiki/Gumbel_distribution
    */
   template <typename Type, typename LenType = int>
-  void gumbel(Type* ptr, LenType len, Type mu, Type beta, cudaStream_t stream)
-  {
+  void gumbel(Type *ptr, LenType len, Type mu, Type beta, cudaStream_t stream) {
     custom_distribution(
-      ptr,
-      len,
-      [=] __device__(Type val, LenType idx) { return mu - beta * raft::myLog(-raft::myLog(val)); },
+      ptr, len,
+      [=] __device__(Type val, LenType idx) {
+        return mu - beta * raft::myLog(-raft::myLog(val));
+      },
       stream);
   }
 
@@ -391,21 +370,16 @@ class Rng {
    * @param stream stream where to launch the kernel
    */
   template <typename Type, typename LenType = int>
-  void lognormal(Type* ptr, LenType len, Type mu, Type sigma, cudaStream_t stream)
-  {
+  void lognormal(Type *ptr, LenType len, Type mu, Type sigma,
+                 cudaStream_t stream) {
     rand2Impl(
-      offset,
-      ptr,
-      len,
+      offset, ptr, len,
       [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) {
         box_muller_transform<Type>(val1, val2, sigma, mu);
         val1 = raft::myExp(val1);
         val2 = raft::myExp(val2);
       },
-      NumThreads,
-      nBlocks,
-      type,
-      stream);
+      NumThreads, nBlocks, type, stream);
   }
 
   /**
@@ -419,11 +393,10 @@ class Rng {
    * @param stream stream where to launch the kernel
    */
   template <typename Type, typename LenType = int>
-  void logistic(Type* ptr, LenType len, Type mu, Type scale, cudaStream_t stream)
-  {
+  void logistic(Type *ptr, LenType len, Type mu, Type scale,
+                cudaStream_t stream) {
     custom_distribution(
-      ptr,
-      len,
+      ptr, len,
       [=] __device__(Type val, LenType idx) {
         constexpr Type one = (Type)1.0;
         return mu - scale * raft::myLog(one / val - one);
@@ -441,11 +414,9 @@ class Rng {
    * @param stream stream where to launch the kernel
    */
   template <typename Type, typename LenType = int>
-  void exponential(Type* ptr, LenType len, Type lambda, cudaStream_t stream)
-  {
+  void exponential(Type *ptr, LenType len, Type lambda, cudaStream_t stream) {
     custom_distribution(
-      ptr,
-      len,
+      ptr, len,
       [=] __device__(Type val, LenType idx) {
         constexpr Type one = (Type)1.0;
         return -raft::myLog(one - val) / lambda;
@@ -463,11 +434,9 @@ class Rng {
    * @param stream stream where to launch the kernel
    */
   template <typename Type, typename LenType = int>
-  void rayleigh(Type* ptr, LenType len, Type sigma, cudaStream_t stream)
-  {
+  void rayleigh(Type *ptr, LenType len, Type sigma, cudaStream_t stream) {
     custom_distribution(
-      ptr,
-      len,
+      ptr, len,
       [=] __device__(Type val, LenType idx) {
         constexpr Type one = (Type)1.0;
         constexpr Type two = (Type)2.0;
@@ -487,14 +456,13 @@ class Rng {
    * @param stream stream where to launch the kernel
    */
   template <typename Type, typename LenType = int>
-  void laplace(Type* ptr, LenType len, Type mu, Type scale, cudaStream_t stream)
-  {
+  void laplace(Type *ptr, LenType len, Type mu, Type scale,
+               cudaStream_t stream) {
     custom_distribution(
-      ptr,
-      len,
+      ptr, len,
       [=] __device__(Type val, LenType idx) {
-        constexpr Type one     = (Type)1.0;
-        constexpr Type two     = (Type)2.0;
+        constexpr Type one = (Type)1.0;
+        constexpr Type two = (Type)2.0;
         constexpr Type oneHalf = (Type)0.5;
         Type out;
         if (val <= oneHalf) {
@@ -534,44 +502,43 @@ class Rng {
    * @param stream cuda stream
    */
   template <typename DataT, typename WeightsT, typename IdxT = int>
-  void sampleWithoutReplacement(const raft::handle_t& handle,
-                                DataT* out,
-                                IdxT* outIdx,
-                                const DataT* in,
-                                const WeightsT* wts,
-                                IdxT sampledLen,
-                                IdxT len,
-                                cudaStream_t stream)
-  {
-    ASSERT(sampledLen <= len, "sampleWithoutReplacement: 'sampledLen' cant be more than 'len'.");
-
-    std::shared_ptr<raft::mr::device::allocator> allocator = handle.get_device_allocator();
+  void sampleWithoutReplacement(const raft::handle_t &handle, DataT *out,
+                                IdxT *outIdx, const DataT *in,
+                                const WeightsT *wts, IdxT sampledLen, IdxT len,
+                                cudaStream_t stream) {
+    ASSERT(sampledLen <= len,
+           "sampleWithoutReplacement: 'sampledLen' cant be more than 'len'.");
+
+    std::shared_ptr<raft::mr::device::allocator> allocator =
+      handle.get_device_allocator();
 
     raft::mr::device::buffer<WeightsT> expWts(allocator, stream, len);
     raft::mr::device::buffer<WeightsT> sortedWts(allocator, stream, len);
     raft::mr::device::buffer<IdxT> inIdx(allocator, stream, len);
     raft::mr::device::buffer<IdxT> outIdxBuff(allocator, stream, len);
-    auto* inIdxPtr = inIdx.data();
+    auto *inIdxPtr = inIdx.data();
     // generate modified weights
     custom_distribution(
-      expWts.data(),
-      len,
+      expWts.data(), len,
       [wts, inIdxPtr] __device__(WeightsT val, IdxT idx) {
-        inIdxPtr[idx]          = idx;
+        inIdxPtr[idx] = idx;
         constexpr WeightsT one = (WeightsT)1.0;
-        auto exp               = -raft::myLog(one - val);
-        if (wts != nullptr) { return exp / wts[idx]; }
+        auto exp = -raft::myLog(one - val);
+        if (wts != nullptr) {
+          return exp / wts[idx];
+        }
         return exp;
       },
       stream);
     ///@todo: use a more efficient partitioning scheme instead of full sort
     // sort the array and pick the top sampledLen items
-    IdxT* outIdxPtr = outIdxBuff.data();
+    IdxT *outIdxPtr = outIdxBuff.data();
     raft::mr::device::buffer<char> workspace(allocator, stream);
-    sortPairs(workspace, expWts.data(), sortedWts.data(), inIdxPtr, outIdxPtr, (int)len, stream);
+    sortPairs(workspace, expWts.data(), sortedWts.data(), inIdxPtr, outIdxPtr,
+              (int)len, stream);
     if (outIdx != nullptr) {
-      CUDA_CHECK(cudaMemcpyAsync(
-        outIdx, outIdxPtr, sizeof(IdxT) * sampledLen, cudaMemcpyDeviceToDevice, stream));
+      CUDA_CHECK(cudaMemcpyAsync(outIdx, outIdxPtr, sizeof(IdxT) * sampledLen,
+                                 cudaMemcpyDeviceToDevice, stream));
     }
     scatter<DataT, IdxT>(out, in, outIdxPtr, sampledLen, stream);
   }
@@ -591,15 +558,17 @@ class Rng {
    * @param[in]  stream cuda stream
    * @{
    */
-  template <typename OutType, typename MathType = OutType, typename LenType = int, typename Lambda>
-  void custom_distribution(OutType* ptr, LenType len, Lambda randOp, cudaStream_t stream)
-  {
+  template <typename OutType, typename MathType = OutType,
+            typename LenType = int, typename Lambda>
+  void custom_distribution(OutType *ptr, LenType len, Lambda randOp,
+                           cudaStream_t stream) {
     randImpl<OutType, MathType, LenType, Lambda>(
       offset, ptr, len, randOp, NumThreads, nBlocks, type, stream);
   }
-  template <typename OutType, typename MathType = OutType, typename LenType = int, typename Lambda>
-  void custom_distribution2(OutType* ptr, LenType len, Lambda randOp, cudaStream_t stream)
-  {
+  template <typename OutType, typename MathType = OutType,
+            typename LenType = int, typename Lambda>
+  void custom_distribution2(OutType *ptr, LenType len, Lambda randOp,
+                            cudaStream_t stream) {
     rand2Impl<OutType, MathType, LenType, Lambda>(
       offset, ptr, len, randOp, NumThreads, nBlocks, type, stream);
   }
@@ -622,10 +591,12 @@ class Rng {
   static const int NumThreads = 256;
 
   template <bool IsNormal, typename Type, typename LenType>
-  uint64_t _setupSeeds(uint64_t& seed, uint64_t& offset, LenType len, int nThreads, int nBlocks)
-  {
+  uint64_t _setupSeeds(uint64_t &seed, uint64_t &offset, LenType len,
+                       int nThreads, int nBlocks) {
     LenType itemsPerThread = raft::ceildiv(len, LenType(nBlocks * nThreads));
-    if (IsNormal && itemsPerThread % 2 == 1) { ++itemsPerThread; }
+    if (IsNormal && itemsPerThread % 2 == 1) {
+      ++itemsPerThread;
+    }
     // curand uses 2 32b uint's to generate one double
     uint64_t factor = sizeof(Type) / sizeof(float);
     if (factor == 0) ++factor;
@@ -633,26 +604,22 @@ class Rng {
     // If not, then generate new seed and start from zero offset
     uint64_t newOffset = offset + LenType(itemsPerThread) * factor;
     if (newOffset < offset) {
-      offset    = 0;
-      seed      = gen();
+      offset = 0;
+      seed = gen();
       newOffset = itemsPerThread * factor;
     }
     return newOffset;
   }
 
-  template <typename OutType, typename MathType = OutType, typename LenType = int, typename Lambda>
-  void randImpl(uint64_t& offset,
-                OutType* ptr,
-                LenType len,
-                Lambda randOp,
-                int nThreads,
-                int nBlocks,
-                GeneratorType type,
-                cudaStream_t stream)
-  {
+  template <typename OutType, typename MathType = OutType,
+            typename LenType = int, typename Lambda>
+  void randImpl(uint64_t &offset, OutType *ptr, LenType len, Lambda randOp,
+                int nThreads, int nBlocks, GeneratorType type,
+                cudaStream_t stream) {
     if (len <= 0) return;
-    uint64_t seed  = gen();
-    auto newOffset = _setupSeeds<false, MathType, LenType>(seed, offset, len, nThreads, nBlocks);
+    uint64_t seed = gen();
+    auto newOffset = _setupSeeds<false, MathType, LenType>(seed, offset, len,
+                                                           nThreads, nBlocks);
     switch (type) {
       case GenPhilox:
         randKernel<OutType, MathType, detail::PhiloxGenerator, LenType, Lambda>
@@ -666,28 +633,26 @@ class Rng {
         randKernel<OutType, MathType, detail::Kiss99Generator, LenType, Lambda>
           <<<nBlocks, nThreads, 0, stream>>>(seed, offset, ptr, len, randOp);
         break;
-      default: ASSERT(false, "randImpl: Incorrect generator type! %d", type);
+      default:
+        ASSERT(false, "randImpl: Incorrect generator type! %d", type);
     };
     CUDA_CHECK(cudaGetLastError());
     offset = newOffset;
   }
 
-  template <typename OutType, typename MathType = OutType, typename LenType = int, typename Lambda2>
-  void rand2Impl(uint64_t& offset,
-                 OutType* ptr,
-                 LenType len,
-                 Lambda2 rand2Op,
-                 int nThreads,
-                 int nBlocks,
-                 GeneratorType type,
-                 cudaStream_t stream)
-  {
+  template <typename OutType, typename MathType = OutType,
+            typename LenType = int, typename Lambda2>
+  void rand2Impl(uint64_t &offset, OutType *ptr, LenType len, Lambda2 rand2Op,
+                 int nThreads, int nBlocks, GeneratorType type,
+                 cudaStream_t stream) {
     if (len <= 0) return;
-    auto seed      = gen();
-    auto newOffset = _setupSeeds<true, MathType, LenType>(seed, offset, len, nThreads, nBlocks);
+    auto seed = gen();
+    auto newOffset = _setupSeeds<true, MathType, LenType>(seed, offset, len,
+                                                          nThreads, nBlocks);
     switch (type) {
       case GenPhilox:
-        rand2Kernel<OutType, MathType, detail::PhiloxGenerator, LenType, Lambda2>
+        rand2Kernel<OutType, MathType, detail::PhiloxGenerator, LenType,
+                    Lambda2>
           <<<nBlocks, nThreads, 0, stream>>>(seed, offset, ptr, len, rand2Op);
         break;
       case GenTaps:
@@ -695,10 +660,12 @@ class Rng {
           <<<nBlocks, nThreads, 0, stream>>>(seed, offset, ptr, len, rand2Op);
         break;
       case GenKiss99:
-        rand2Kernel<OutType, MathType, detail::Kiss99Generator, LenType, Lambda2>
+        rand2Kernel<OutType, MathType, detail::Kiss99Generator, LenType,
+                    Lambda2>
           <<<nBlocks, nThreads, 0, stream>>>(seed, offset, ptr, len, rand2Op);
         break;
-      default: ASSERT(false, "rand2Impl: Incorrect generator type! %d", type);
+      default:
+        ASSERT(false, "rand2Impl: Incorrect generator type! %d", type);
     };
     CUDA_CHECK(cudaGetLastError());
     offset = newOffset;
diff --git a/cpp/include/raft/random/rng_impl.cuh b/cpp/include/raft/random/rng_impl.cuh
index 485f4ddd68..d44c6f018b 100644
--- a/cpp/include/raft/random/rng_impl.cuh
+++ b/cpp/include/raft/random/rng_impl.cuh
@@ -33,8 +33,7 @@ struct PhiloxGenerator {
    * @param subsequence as found in curand docs
    * @param offset as found in curand docs
    */
-  DI PhiloxGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset)
-  {
+  DI PhiloxGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) {
     curand_init(seed, subsequence, offset, &state);
   }
 
@@ -45,21 +44,18 @@ struct PhiloxGenerator {
   DI void next(float& ret) { ret = curand_uniform(&(this->state)); }
   DI void next(double& ret) { ret = curand_uniform_double(&(this->state)); }
   DI void next(uint32_t& ret) { ret = curand(&(this->state)); }
-  DI void next(uint64_t& ret)
-  {
+  DI void next(uint64_t& ret) {
     uint32_t a, b;
     next(a);
     next(b);
     ret = (uint64_t)a | ((uint64_t)b << 32);
   }
-  DI void next(int32_t& ret)
-  {
+  DI void next(int32_t& ret) {
     uint32_t val;
     next(val);
     ret = int32_t(val & 0x7fffffff);
   }
-  DI void next(int64_t& ret)
-  {
+  DI void next(int64_t& ret) {
     uint64_t val;
     next(val);
     ret = int64_t(val & 0x7fffffffffffffff);
@@ -80,9 +76,8 @@ struct TapsGenerator {
    * @param subsequence unused
    * @param offset unused
    */
-  DI TapsGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset)
-  {
-    uint64_t delta  = (blockIdx.x * blockDim.x) + threadIdx.x;
+  DI TapsGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) {
+    uint64_t delta = (blockIdx.x * blockDim.x) + threadIdx.x;
     uint64_t stride = blockDim.x * gridDim.x;
     delta += ((blockIdx.y * blockDim.y) + threadIdx.y) * stride;
     stride *= blockDim.y * gridDim.y;
@@ -95,36 +90,31 @@ struct TapsGenerator {
    * @{
    */
   template <typename Type>
-  DI void next(Type& ret)
-  {
+  DI void next(Type& ret) {
     constexpr double ULL_LARGE = 1.8446744073709551614e19;
     uint64_t val;
     next(val);
     ret = static_cast<Type>(val);
     ret /= static_cast<Type>(ULL_LARGE);
   }
-  DI void next(uint64_t& ret)
-  {
+  DI void next(uint64_t& ret) {
     constexpr uint64_t TAPS = 0x8000100040002000ULL;
-    constexpr int ROUNDS    = 128;
+    constexpr int ROUNDS = 128;
     for (int i = 0; i < ROUNDS; i++)
       state = (state >> 1) ^ (-(state & 1ULL) & TAPS);
     ret = state;
   }
-  DI void next(uint32_t& ret)
-  {
+  DI void next(uint32_t& ret) {
     uint64_t val;
     next(val);
     ret = (uint32_t)val;
   }
-  DI void next(int32_t& ret)
-  {
+  DI void next(int32_t& ret) {
     uint32_t val;
     next(val);
     ret = int32_t(val & 0x7fffffff);
   }
-  DI void next(int64_t& ret)
-  {
+  DI void next(int64_t& ret) {
     uint64_t val;
     next(val);
     ret = int64_t(val & 0x7fffffffffffffff);
@@ -145,49 +135,46 @@ struct Kiss99Generator {
    * @param subsequence unused
    * @param offset unused
    */
-  DI Kiss99Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) { initKiss99(seed); }
+  DI Kiss99Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) {
+    initKiss99(seed);
+  }
 
   /**
    * @defgroup NextRand Generate the next random number
    * @{
    */
   template <typename Type>
-  DI void next(Type& ret)
-  {
+  DI void next(Type& ret) {
     constexpr double U_LARGE = 4.294967295e9;
     uint32_t val;
     next(val);
     ret = static_cast<Type>(val);
     ret /= static_cast<Type>(U_LARGE);
   }
-  DI void next(uint32_t& ret)
-  {
+  DI void next(uint32_t& ret) {
     uint32_t MWC;
-    z   = 36969 * (z & 65535) + (z >> 16);
-    w   = 18000 * (w & 65535) + (w >> 16);
+    z = 36969 * (z & 65535) + (z >> 16);
+    w = 18000 * (w & 65535) + (w >> 16);
     MWC = ((z << 16) + w);
     jsr ^= (jsr << 17);
     jsr ^= (jsr >> 13);
     jsr ^= (jsr << 5);
     jcong = 69069 * jcong + 1234567;
-    MWC   = ((MWC ^ jcong) + jsr);
-    ret   = MWC;
+    MWC = ((MWC ^ jcong) + jsr);
+    ret = MWC;
   }
-  DI void next(uint64_t& ret)
-  {
+  DI void next(uint64_t& ret) {
     uint32_t a, b;
     next(a);
     next(b);
     ret = (uint64_t)a | ((uint64_t)b << 32);
   }
-  DI void next(int32_t& ret)
-  {
+  DI void next(int32_t& ret) {
     uint32_t val;
     next(val);
     ret = int32_t(val & 0x7fffffff);
   }
-  DI void next(int64_t& ret)
-  {
+  DI void next(int64_t& ret) {
     uint64_t val;
     next(val);
     ret = int64_t(val & 0x7fffffffffffffff);
@@ -206,8 +193,7 @@ struct Kiss99Generator {
 
   // This function multiplies 128-bit hash by 128-bit FNV prime and returns lower
   // 128 bits. It uses 32-bit wide multiply only.
-  DI void mulByFnv1a128Prime(uint32_t* h)
-  {
+  DI void mulByFnv1a128Prime(uint32_t* h) {
     typedef union {
       uint32_t u32[2];
       uint64_t u64[1];
@@ -231,12 +217,12 @@ struct Kiss99Generator {
     // h_n[2] = HI(h[1]*p[0]) + LO(h[2]*p[0]) + LO(h[0]*p[2]);
     // h_n[3] = HI(h[2]*p[0]) + HI(h[0]*p[2]) + LO(h[3]*p[0]) + LO(h[1]*p[2]);
     uint32_t carry = 0;
-    h[0]           = h0p0.u32[0];
+    h[0] = h0p0.u32[0];
 
-    h[1]  = h0p0.u32[1] + h1p0.u32[0];
+    h[1] = h0p0.u32[1] + h1p0.u32[0];
     carry = h[1] < h0p0.u32[1] ? 1 : 0;
 
-    h[2]  = h1p0.u32[1] + carry;
+    h[2] = h1p0.u32[1] + carry;
     carry = h[2] < h1p0.u32[1] ? 1 : 0;
     h[2] += h2p0.u32[0];
     carry = h[2] < h2p0.u32[0] ? carry + 1 : carry;
@@ -247,8 +233,7 @@ struct Kiss99Generator {
     return;
   }
 
-  DI void fnv1a128(uint32_t* hash, uint32_t txt)
-  {
+  DI void fnv1a128(uint32_t* hash, uint32_t txt) {
     hash[0] ^= (txt >> 0) & 0xFF;
     mulByFnv1a128Prime(hash);
     hash[0] ^= (txt >> 8) & 0xFF;
@@ -259,8 +244,7 @@ struct Kiss99Generator {
     mulByFnv1a128Prime(hash);
   }
 
-  DI void initKiss99(uint64_t seed)
-  {
+  DI void initKiss99(uint64_t seed) {
     // Initialize hash to 128-bit FNV1a basis
     uint32_t hash[4] = {1653982605UL, 1656234357UL, 129696066UL, 1818371886UL};
 
@@ -275,9 +259,9 @@ struct Kiss99Generator {
     fnv1a128(hash, uint32_t(seed >> 32));
 
     // Initialize KISS99 state with hash
-    z     = hash[0];
-    w     = hash[1];
-    jsr   = hash[2];
+    z = hash[0];
+    w = hash[1];
+    jsr = hash[2];
     jcong = hash[3];
   }
 };
@@ -289,13 +273,10 @@ struct Kiss99Generator {
 template <typename GenType>
 struct Generator {
   DI Generator(uint64_t seed, uint64_t subsequence, uint64_t offset)
-    : gen(seed, subsequence, offset)
-  {
-  }
+    : gen(seed, subsequence, offset) {}
 
   template <typename Type>
-  DI void next(Type& ret)
-  {
+  DI void next(Type& ret) {
     gen.next(ret);
   }
 
diff --git a/cpp/include/raft/sparse/convert/coo.cuh b/cpp/include/raft/sparse/convert/coo.cuh
index 5d38bdf4a8..e367550060 100644
--- a/cpp/include/raft/sparse/convert/coo.cuh
+++ b/cpp/include/raft/sparse/convert/coo.cuh
@@ -37,18 +37,14 @@ namespace sparse {
 namespace convert {
 
 template <typename value_idx = int, int TPB_X = 32>
-__global__ void csr_to_coo_kernel(const value_idx* row_ind,
-                                  value_idx m,
-                                  value_idx* coo_rows,
-                                  value_idx nnz)
-{
+__global__ void csr_to_coo_kernel(const value_idx *row_ind, value_idx m,
+                                  value_idx *coo_rows, value_idx nnz) {
   // row-based matrix 1 thread per row
   value_idx row = (blockIdx.x * TPB_X) + threadIdx.x;
   if (row < m) {
     value_idx start_idx = row_ind[row];
-    value_idx stop_idx  = get_stop_idx(row, m, nnz, row_ind);
-    for (value_idx i = start_idx; i < stop_idx; i++)
-      coo_rows[i] = row;
+    value_idx stop_idx = get_stop_idx(row, m, nnz, row_ind);
+    for (value_idx i = start_idx; i < stop_idx; i++) coo_rows[i] = row;
   }
 }
 
@@ -61,14 +57,14 @@ __global__ void csr_to_coo_kernel(const value_idx* row_ind,
  * @param stream: cuda stream to use
  */
 template <typename value_idx = int, int TPB_X = 32>
-void csr_to_coo(
-  const value_idx* row_ind, value_idx m, value_idx* coo_rows, value_idx nnz, cudaStream_t stream)
-{
+void csr_to_coo(const value_idx *row_ind, value_idx m, value_idx *coo_rows,
+                value_idx nnz, cudaStream_t stream) {
   // @TODO: Use cusparse for this.
   dim3 grid(raft::ceildiv(m, (value_idx)TPB_X), 1, 1);
   dim3 blk(TPB_X, 1, 1);
 
-  csr_to_coo_kernel<value_idx, TPB_X><<<grid, blk, 0, stream>>>(row_ind, m, coo_rows, nnz);
+  csr_to_coo_kernel<value_idx, TPB_X>
+    <<<grid, blk, 0, stream>>>(row_ind, m, coo_rows, nnz);
 
   CUDA_CHECK(cudaGetLastError());
 }
diff --git a/cpp/include/raft/sparse/convert/csr.cuh b/cpp/include/raft/sparse/convert/csr.cuh
index 2191f5edd1..a034bdbda8 100644
--- a/cpp/include/raft/sparse/convert/csr.cuh
+++ b/cpp/include/raft/sparse/convert/csr.cuh
@@ -44,33 +44,29 @@ namespace sparse {
 namespace convert {
 
 template <typename value_t>
-void coo_to_csr(const raft::handle_t& handle,
-                const int* srcRows,
-                const int* srcCols,
-                const value_t* srcVals,
-                int nnz,
-                int m,
-                int* dst_offsets,
-                int* dstCols,
-                value_t* dstVals)
-{
-  auto stream         = handle.get_stream();
+void coo_to_csr(const raft::handle_t &handle, const int *srcRows,
+                const int *srcCols, const value_t *srcVals, int nnz, int m,
+                int *dst_offsets, int *dstCols, value_t *dstVals) {
+  auto stream = handle.get_stream();
   auto cusparseHandle = handle.get_cusparse_handle();
-  auto d_alloc        = handle.get_device_allocator();
+  auto d_alloc = handle.get_device_allocator();
   raft::mr::device::buffer<int> dstRows(d_alloc, stream, nnz);
-  CUDA_CHECK(
-    cudaMemcpyAsync(dstRows.data(), srcRows, sizeof(int) * nnz, cudaMemcpyDeviceToDevice, stream));
-  CUDA_CHECK(
-    cudaMemcpyAsync(dstCols, srcCols, sizeof(int) * nnz, cudaMemcpyDeviceToDevice, stream));
+  CUDA_CHECK(cudaMemcpyAsync(dstRows.data(), srcRows, sizeof(int) * nnz,
+                             cudaMemcpyDeviceToDevice, stream));
+  CUDA_CHECK(cudaMemcpyAsync(dstCols, srcCols, sizeof(int) * nnz,
+                             cudaMemcpyDeviceToDevice, stream));
   auto buffSize = raft::sparse::cusparsecoosort_bufferSizeExt(
     cusparseHandle, m, m, nnz, srcRows, srcCols, stream);
   raft::mr::device::buffer<char> pBuffer(d_alloc, stream, buffSize);
   raft::mr::device::buffer<int> P(d_alloc, stream, nnz);
-  CUSPARSE_CHECK(cusparseCreateIdentityPermutation(cusparseHandle, nnz, P.data()));
-  raft::sparse::cusparsecoosortByRow(
-    cusparseHandle, m, m, nnz, dstRows.data(), dstCols, P.data(), pBuffer.data(), stream);
-  raft::sparse::cusparsegthr(cusparseHandle, nnz, srcVals, dstVals, P.data(), stream);
-  raft::sparse::cusparsecoo2csr(cusparseHandle, dstRows.data(), nnz, m, dst_offsets, stream);
+  CUSPARSE_CHECK(
+    cusparseCreateIdentityPermutation(cusparseHandle, nnz, P.data()));
+  raft::sparse::cusparsecoosortByRow(cusparseHandle, m, m, nnz, dstRows.data(),
+                                     dstCols, P.data(), pBuffer.data(), stream);
+  raft::sparse::cusparsegthr(cusparseHandle, nnz, srcVals, dstVals, P.data(),
+                             stream);
+  raft::sparse::cusparsecoo2csr(cusparseHandle, dstRows.data(), nnz, m,
+                                dst_offsets, stream);
   CUDA_CHECK(cudaDeviceSynchronize());
 }
 
@@ -89,20 +85,14 @@ void coo_to_csr(const raft::handle_t& handle,
  * @param stream cuda stream to use
  * @param fused_op: the fused operation
  */
-template <typename Index_, int TPB_X = 32, typename Lambda = auto(Index_, Index_, Index_)->void>
-void csr_adj_graph_batched(const Index_* row_ind,
-                           Index_ total_rows,
-                           Index_ nnz,
-                           Index_ batchSize,
-                           const bool* adj,
-                           Index_* row_ind_ptr,
-                           cudaStream_t stream,
-                           Lambda fused_op)
-{
+template <typename Index_, int TPB_X = 32,
+          typename Lambda = auto(Index_, Index_, Index_)->void>
+void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz,
+                           Index_ batchSize, const bool *adj,
+                           Index_ *row_ind_ptr, cudaStream_t stream,
+                           Lambda fused_op) {
   op::csr_row_op<Index_, TPB_X>(
-    row_ind,
-    batchSize,
-    nnz,
+    row_ind, batchSize, nnz,
     [fused_op, adj, total_rows, row_ind_ptr, batchSize, nnz] __device__(
       Index_ row, Index_ start_idx, Index_ stop_idx) {
       fused_op(row, start_idx, stop_idx);
@@ -118,23 +108,14 @@ void csr_adj_graph_batched(const Index_* row_ind,
     stream);
 }
 
-template <typename Index_, int TPB_X = 32, typename Lambda = auto(Index_, Index_, Index_)->void>
-void csr_adj_graph_batched(const Index_* row_ind,
-                           Index_ total_rows,
-                           Index_ nnz,
-                           Index_ batchSize,
-                           const bool* adj,
-                           Index_* row_ind_ptr,
-                           cudaStream_t stream)
-{
-  csr_adj_graph_batched(row_ind,
-                        total_rows,
-                        nnz,
-                        batchSize,
-                        adj,
-                        row_ind_ptr,
-                        stream,
-                        [] __device__(Index_ row, Index_ start_idx, Index_ stop_idx) {});
+template <typename Index_, int TPB_X = 32,
+          typename Lambda = auto(Index_, Index_, Index_)->void>
+void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz,
+                           Index_ batchSize, const bool *adj,
+                           Index_ *row_ind_ptr, cudaStream_t stream) {
+  csr_adj_graph_batched(
+    row_ind, total_rows, nnz, batchSize, adj, row_ind_ptr, stream,
+    [] __device__(Index_ row, Index_ start_idx, Index_ stop_idx) {});
 }
 
 /**
@@ -150,17 +131,13 @@ void csr_adj_graph_batched(const Index_* row_ind,
  * @param stream cuda stream to use
  * @param fused_op the fused operation
  */
-template <typename Index_, int TPB_X = 32, typename Lambda = auto(Index_, Index_, Index_)->void>
-void csr_adj_graph(const Index_* row_ind,
-                   Index_ total_rows,
-                   Index_ nnz,
-                   const bool* adj,
-                   Index_* row_ind_ptr,
-                   cudaStream_t stream,
-                   Lambda fused_op)
-{
-  csr_adj_graph_batched<Index_, TPB_X>(
-    row_ind, total_rows, nnz, total_rows, adj, row_ind_ptr, stream, fused_op);
+template <typename Index_, int TPB_X = 32,
+          typename Lambda = auto(Index_, Index_, Index_)->void>
+void csr_adj_graph(const Index_ *row_ind, Index_ total_rows, Index_ nnz,
+                   const bool *adj, Index_ *row_ind_ptr, cudaStream_t stream,
+                   Lambda fused_op) {
+  csr_adj_graph_batched<Index_, TPB_X>(row_ind, total_rows, nnz, total_rows,
+                                       adj, row_ind_ptr, stream, fused_op);
 }
 
 /**
@@ -174,13 +151,9 @@ void csr_adj_graph(const Index_* row_ind,
  * @param stream: cuda stream to use
  */
 template <typename T>
-void sorted_coo_to_csr(const T* rows,
-                       int nnz,
-                       T* row_ind,
-                       int m,
+void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m,
                        std::shared_ptr<raft::mr::device::allocator> d_alloc,
-                       cudaStream_t stream)
-{
+                       cudaStream_t stream) {
   raft::mr::device::buffer<T> row_counts(d_alloc, stream, m);
 
   CUDA_CHECK(cudaMemsetAsync(row_counts.data(), 0, m * sizeof(T), stream));
@@ -188,9 +161,11 @@ void sorted_coo_to_csr(const T* rows,
   linalg::coo_degree<32>(rows, nnz, row_counts.data(), stream);
 
   // create csr compressed row index from row counts
-  thrust::device_ptr<T> row_counts_d = thrust::device_pointer_cast(row_counts.data());
-  thrust::device_ptr<T> c_ind_d      = thrust::device_pointer_cast(row_ind);
-  exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m, c_ind_d);
+  thrust::device_ptr<T> row_counts_d =
+    thrust::device_pointer_cast(row_counts.data());
+  thrust::device_ptr<T> c_ind_d = thrust::device_pointer_cast(row_ind);
+  exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m,
+                 c_ind_d);
 }
 
 /**
@@ -202,12 +177,11 @@ void sorted_coo_to_csr(const T* rows,
  * @param stream: cuda stream to use
  */
 template <typename T>
-void sorted_coo_to_csr(COO<T>* coo,
-                       int* row_ind,
+void sorted_coo_to_csr(COO<T> *coo, int *row_ind,
                        std::shared_ptr<raft::mr::device::allocator> d_alloc,
-                       cudaStream_t stream)
-{
-  sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, d_alloc, stream);
+                       cudaStream_t stream) {
+  sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, d_alloc,
+                    stream);
 }
 
 };  // end NAMESPACE convert
diff --git a/cpp/include/raft/sparse/convert/dense.cuh b/cpp/include/raft/sparse/convert/dense.cuh
index e90882b501..299f9d36d4 100644
--- a/cpp/include/raft/sparse/convert/dense.cuh
+++ b/cpp/include/raft/sparse/convert/dense.cuh
@@ -37,20 +37,22 @@ namespace sparse {
 namespace convert {
 
 template <typename value_t>
-__global__ void csr_to_dense_warp_per_row_kernel(
-  int n_cols, const value_t* csrVal, const int* csrRowPtr, const int* csrColInd, value_t* a)
-{
+__global__ void csr_to_dense_warp_per_row_kernel(int n_cols,
+                                                 const value_t *csrVal,
+                                                 const int *csrRowPtr,
+                                                 const int *csrColInd,
+                                                 value_t *a) {
   int row = blockIdx.x;
   int tid = threadIdx.x;
 
   int colStart = csrRowPtr[row];
-  int colEnd   = csrRowPtr[row + 1];
-  int rowNnz   = colEnd - colStart;
+  int colEnd = csrRowPtr[row + 1];
+  int rowNnz = colEnd - colStart;
 
   for (int i = tid; i < rowNnz; i += blockDim.x) {
     int colIdx = colStart + i;
     if (colIdx < colEnd) {
-      int col               = csrColInd[colIdx];
+      int col = csrColInd[colIdx];
       a[row * n_cols + col] = csrVal[colIdx];
     }
   }
@@ -75,17 +77,10 @@ __global__ void csr_to_dense_warp_per_row_kernel(
  * @param[in] row_major : Is row-major output desired?
  */
 template <typename value_idx, typename value_t>
-void csr_to_dense(cusparseHandle_t handle,
-                  value_idx nrows,
-                  value_idx ncols,
-                  const value_idx* csr_indptr,
-                  const value_idx* csr_indices,
-                  const value_t* csr_data,
-                  value_idx lda,
-                  value_t* out,
-                  cudaStream_t stream,
-                  bool row_major = true)
-{
+void csr_to_dense(cusparseHandle_t handle, value_idx nrows, value_idx ncols,
+                  const value_idx *csr_indptr, const value_idx *csr_indices,
+                  const value_t *csr_data, value_idx lda, value_t *out,
+                  cudaStream_t stream, bool row_major = true) {
   if (!row_major) {
     /**
      * If we need col-major, use cusparse.
@@ -96,13 +91,15 @@ void csr_to_dense(cusparseHandle_t handle,
     CUSPARSE_CHECK(cusparseSetMatType(out_mat, CUSPARSE_MATRIX_TYPE_GENERAL));
 
     CUSPARSE_CHECK(raft::sparse::cusparsecsr2dense(
-      handle, nrows, ncols, out_mat, csr_data, csr_indptr, csr_indices, out, lda, stream));
+      handle, nrows, ncols, out_mat, csr_data, csr_indptr, csr_indices, out,
+      lda, stream));
 
     CUSPARSE_CHECK_NO_THROW(cusparseDestroyMatDescr(out_mat));
 
   } else {
     int blockdim = block_dim(ncols);
-    CUDA_CHECK(cudaMemsetAsync(out, 0, nrows * ncols * sizeof(value_t), stream));
+    CUDA_CHECK(
+      cudaMemsetAsync(out, 0, nrows * ncols * sizeof(value_t), stream));
     csr_to_dense_warp_per_row_kernel<<<nrows, blockdim, 0, stream>>>(
       ncols, csr_data, csr_indptr, csr_indices, out);
   }
diff --git a/cpp/include/raft/sparse/coo.cuh b/cpp/include/raft/sparse/coo.cuh
index 348ed5eab2..73120fea8c 100644
--- a/cpp/include/raft/sparse/coo.cuh
+++ b/cpp/include/raft/sparse/coo.cuh
@@ -68,87 +68,83 @@ class COO {
   Index_Type n_cols;
 
   /**
-   * @param d_alloc: the device allocator to use for the underlying buffers
-   * @param stream: CUDA stream to use
-   */
+    * @param d_alloc: the device allocator to use for the underlying buffers
+    * @param stream: CUDA stream to use
+    */
   COO(std::shared_ptr<raft::mr::device::allocator> d_alloc, cudaStream_t stream)
     : rows_arr(d_alloc, stream, 0),
       cols_arr(d_alloc, stream, 0),
       vals_arr(d_alloc, stream, 0),
       nnz(0),
       n_rows(0),
-      n_cols(0)
-  {
-  }
+      n_cols(0) {}
 
   /**
-   * @param rows: coo rows array
-   * @param cols: coo cols array
-   * @param vals: coo vals array
-   * @param nnz: size of the rows/cols/vals arrays
-   * @param n_rows: number of rows in the dense matrix
-   * @param n_cols: number of cols in the dense matrix
-   */
-  COO(raft::mr::device::buffer<Index_Type>& rows,
-      raft::mr::device::buffer<Index_Type>& cols,
-      raft::mr::device::buffer<T>& vals,
-      Index_Type nnz,
-      Index_Type n_rows = 0,
+    * @param rows: coo rows array
+    * @param cols: coo cols array
+    * @param vals: coo vals array
+    * @param nnz: size of the rows/cols/vals arrays
+    * @param n_rows: number of rows in the dense matrix
+    * @param n_cols: number of cols in the dense matrix
+    */
+  COO(raft::mr::device::buffer<Index_Type> &rows,
+      raft::mr::device::buffer<Index_Type> &cols,
+      raft::mr::device::buffer<T> &vals, Index_Type nnz, Index_Type n_rows = 0,
       Index_Type n_cols = 0)
-    : rows_arr(rows), cols_arr(cols), vals_arr(vals), nnz(nnz), n_rows(n_rows), n_cols(n_cols)
-  {
-  }
+    : rows_arr(rows),
+      cols_arr(cols),
+      vals_arr(vals),
+      nnz(nnz),
+      n_rows(n_rows),
+      n_cols(n_cols) {}
 
   /**
-   * @param d_alloc: the device allocator use
-   * @param stream: CUDA stream to use
-   * @param nnz: size of the rows/cols/vals arrays
-   * @param n_rows: number of rows in the dense matrix
-   * @param n_cols: number of cols in the dense matrix
-   * @param init: initialize arrays with zeros
-   */
-  COO(std::shared_ptr<raft::mr::device::allocator> d_alloc,
-      cudaStream_t stream,
-      Index_Type nnz,
-      Index_Type n_rows = 0,
-      Index_Type n_cols = 0,
-      bool init         = true)
+    * @param d_alloc: the device allocator use
+    * @param stream: CUDA stream to use
+    * @param nnz: size of the rows/cols/vals arrays
+    * @param n_rows: number of rows in the dense matrix
+    * @param n_cols: number of cols in the dense matrix
+    * @param init: initialize arrays with zeros
+    */
+  COO(std::shared_ptr<raft::mr::device::allocator> d_alloc, cudaStream_t stream,
+      Index_Type nnz, Index_Type n_rows = 0, Index_Type n_cols = 0,
+      bool init = true)
     : rows_arr(d_alloc, stream, nnz),
       cols_arr(d_alloc, stream, nnz),
       vals_arr(d_alloc, stream, nnz),
       nnz(nnz),
       n_rows(n_rows),
-      n_cols(n_cols)
-  {
+      n_cols(n_cols) {
     if (init) init_arrays(stream);
   }
 
-  void init_arrays(cudaStream_t stream)
-  {
-    CUDA_CHECK(cudaMemsetAsync(this->rows_arr.data(), 0, this->nnz * sizeof(Index_Type), stream));
-    CUDA_CHECK(cudaMemsetAsync(this->cols_arr.data(), 0, this->nnz * sizeof(Index_Type), stream));
-    CUDA_CHECK(cudaMemsetAsync(this->vals_arr.data(), 0, this->nnz * sizeof(T), stream));
+  void init_arrays(cudaStream_t stream) {
+    CUDA_CHECK(cudaMemsetAsync(this->rows_arr.data(), 0,
+                               this->nnz * sizeof(Index_Type), stream));
+    CUDA_CHECK(cudaMemsetAsync(this->cols_arr.data(), 0,
+                               this->nnz * sizeof(Index_Type), stream));
+    CUDA_CHECK(
+      cudaMemsetAsync(this->vals_arr.data(), 0, this->nnz * sizeof(T), stream));
   }
 
   ~COO() {}
 
   /**
-   * @brief Size should be > 0, with the number of rows
-   * and cols in the dense matrix being > 0.
-   */
-  bool validate_size() const
-  {
+    * @brief Size should be > 0, with the number of rows
+    * and cols in the dense matrix being > 0.
+    */
+  bool validate_size() const {
     if (this->nnz < 0 || n_rows < 0 || n_cols < 0) return false;
     return true;
   }
 
   /**
-   * @brief If the underlying arrays have not been set,
-   * return false. Otherwise true.
-   */
-  bool validate_mem() const
-  {
-    if (this->rows_arr.size() == 0 || this->cols_arr.size() == 0 || this->vals_arr.size() == 0) {
+    * @brief If the underlying arrays have not been set,
+    * return false. Otherwise true.
+    */
+  bool validate_mem() const {
+    if (this->rows_arr.size() == 0 || this->cols_arr.size() == 0 ||
+        this->vals_arr.size() == 0) {
       return false;
     }
 
@@ -158,30 +154,33 @@ class COO {
   /*
    * @brief Returns the rows array
    */
-  Index_Type* rows() { return this->rows_arr.data(); }
+  Index_Type *rows() { return this->rows_arr.data(); }
 
   /**
    * @brief Returns the cols array
    */
-  Index_Type* cols() { return this->cols_arr.data(); }
+  Index_Type *cols() { return this->cols_arr.data(); }
 
   /**
    * @brief Returns the vals array
    */
-  T* vals() { return this->vals_arr.data(); }
+  T *vals() { return this->vals_arr.data(); }
 
   /**
-   * @brief Send human-readable state information to output stream
-   */
-  friend std::ostream& operator<<(std::ostream& out, const COO<T, Index_Type>& c)
-  {
+    * @brief Send human-readable state information to output stream
+    */
+  friend std::ostream &operator<<(std::ostream &out,
+                                  const COO<T, Index_Type> &c) {
     if (c.validate_size() && c.validate_mem()) {
       cudaStream_t stream;
       CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
 
-      out << raft::arr2Str(c.rows_arr.data(), c.nnz, "rows", stream) << std::endl;
-      out << raft::arr2Str(c.cols_arr.data(), c.nnz, "cols", stream) << std::endl;
-      out << raft::arr2Str(c.vals_arr.data(), c.nnz, "vals", stream) << std::endl;
+      out << raft::arr2Str(c.rows_arr.data(), c.nnz, "rows", stream)
+          << std::endl;
+      out << raft::arr2Str(c.cols_arr.data(), c.nnz, "cols", stream)
+          << std::endl;
+      out << raft::arr2Str(c.vals_arr.data(), c.nnz, "vals", stream)
+          << std::endl;
       out << "nnz=" << c.nnz << std::endl;
       out << "n_rows=" << c.n_rows << std::endl;
       out << "n_cols=" << c.n_cols << std::endl;
@@ -195,59 +194,58 @@ class COO {
   }
 
   /**
-   * @brief Set the number of rows and cols
-   * @param n_rows: number of rows in the dense matrix
-   * @param n_cols: number of columns in the dense matrix
-   */
-  void setSize(int n_rows, int n_cols)
-  {
+    * @brief Set the number of rows and cols
+    * @param n_rows: number of rows in the dense matrix
+    * @param n_cols: number of columns in the dense matrix
+    */
+  void setSize(int n_rows, int n_cols) {
     this->n_rows = n_rows;
     this->n_cols = n_cols;
   }
 
   /**
-   * @brief Set the number of rows and cols for a square dense matrix
-   * @param n: number of rows and cols
-   */
-  void setSize(int n)
-  {
+    * @brief Set the number of rows and cols for a square dense matrix
+    * @param n: number of rows and cols
+    */
+  void setSize(int n) {
     this->n_rows = n;
     this->n_cols = n;
   }
 
   /**
-   * @brief Allocate the underlying arrays
-   * @param nnz: size of underlying row/col/val arrays
-   * @param init: should values be initialized to 0?
-   * @param stream: CUDA stream to use
-   */
-  void allocate(int nnz, bool init, cudaStream_t stream) { this->allocate(nnz, 0, init, stream); }
+    * @brief Allocate the underlying arrays
+    * @param nnz: size of underlying row/col/val arrays
+    * @param init: should values be initialized to 0?
+    * @param stream: CUDA stream to use
+    */
+  void allocate(int nnz, bool init, cudaStream_t stream) {
+    this->allocate(nnz, 0, init, stream);
+  }
 
   /**
-   * @brief Allocate the underlying arrays
-   * @param nnz: size of the underlying row/col/val arrays
-   * @param size: the number of rows/cols in a square dense matrix
-   * @param init: should values be initialized to 0?
-   * @param stream: CUDA stream to use
-   */
-  void allocate(int nnz, int size, bool init, cudaStream_t stream)
-  {
+    * @brief Allocate the underlying arrays
+    * @param nnz: size of the underlying row/col/val arrays
+    * @param size: the number of rows/cols in a square dense matrix
+    * @param init: should values be initialized to 0?
+    * @param stream: CUDA stream to use
+    */
+  void allocate(int nnz, int size, bool init, cudaStream_t stream) {
     this->allocate(nnz, size, size, init, stream);
   }
 
   /**
-   * @brief Allocate the underlying arrays
-   * @param nnz: size of the underlying row/col/val arrays
-   * @param n_rows: number of rows in the dense matrix
-   * @param n_cols: number of columns in the dense matrix
-   * @param init: should values be initialized to 0?
-   * @param stream: stream to use for init
-   */
-  void allocate(int nnz, int n_rows, int n_cols, bool init, cudaStream_t stream)
-  {
+    * @brief Allocate the underlying arrays
+    * @param nnz: size of the underlying row/col/val arrays
+    * @param n_rows: number of rows in the dense matrix
+    * @param n_cols: number of columns in the dense matrix
+    * @param init: should values be initialized to 0?
+    * @param stream: stream to use for init
+    */
+  void allocate(int nnz, int n_rows, int n_cols, bool init,
+                cudaStream_t stream) {
     this->n_rows = n_rows;
     this->n_cols = n_cols;
-    this->nnz    = nnz;
+    this->nnz = nnz;
 
     this->rows_arr.resize(this->nnz, stream);
     this->cols_arr.resize(this->nnz, stream);
diff --git a/cpp/include/raft/sparse/csr.cuh b/cpp/include/raft/sparse/csr.cuh
index 17f3c735af..bc4a68d296 100644
--- a/cpp/include/raft/sparse/csr.cuh
+++ b/cpp/include/raft/sparse/csr.cuh
@@ -41,64 +41,57 @@ namespace sparse {
 
 struct WeakCCState {
  public:
-  bool* m;
-  WeakCCState(bool* m) : m(m) {}
+  bool *m;
+  WeakCCState(bool *m) : m(m) {}
 };
 
 template <typename Index_, int TPB_X = 256, typename Lambda>
-__global__ void weak_cc_label_device(Index_* __restrict__ labels,
-                                     const Index_* __restrict__ row_ind,
-                                     const Index_* __restrict__ row_ind_ptr,
-                                     Index_ nnz,
-                                     bool* __restrict__ m,
-                                     Index_ start_vertex_id,
-                                     Index_ batch_size,
-                                     Index_ N,
-                                     Lambda filter_op)
-{
-  Index_ tid       = threadIdx.x + blockIdx.x * TPB_X;
+__global__ void weak_cc_label_device(Index_ *__restrict__ labels,
+                                     const Index_ *__restrict__ row_ind,
+                                     const Index_ *__restrict__ row_ind_ptr,
+                                     Index_ nnz, bool *__restrict__ m,
+                                     Index_ start_vertex_id, Index_ batch_size,
+                                     Index_ N, Lambda filter_op) {
+  Index_ tid = threadIdx.x + blockIdx.x * TPB_X;
   Index_ global_id = tid + start_vertex_id;
   if (tid < batch_size && global_id < N) {
     Index_ start = __ldg(row_ind + tid);
 
     Index_ ci, cj;
-    bool ci_mod        = false;
-    ci                 = labels[global_id];
+    bool ci_mod = false;
+    ci = labels[global_id];
     bool ci_allow_prop = filter_op(global_id);
 
     Index_ end = get_stop_idx(tid, batch_size, nnz, row_ind);
     /// TODO: add one element to row_ind and avoid get_stop_idx
     for (Index_ j = start; j < end; j++) {
-      Index_ j_ind       = __ldg(row_ind_ptr + j);
-      cj                 = labels[j_ind];
+      Index_ j_ind = __ldg(row_ind_ptr + j);
+      cj = labels[j_ind];
       bool cj_allow_prop = filter_op(j_ind);
       if (ci < cj && ci_allow_prop) {
         if (sizeof(Index_) == 4)
-          atomicMin((int*)(labels + j_ind), ci);
+          atomicMin((int *)(labels + j_ind), ci);
         else if (sizeof(Index_) == 8)
-          atomicMin((long long int*)(labels + j_ind), ci);
+          atomicMin((long long int *)(labels + j_ind), ci);
         if (cj_allow_prop) *m = true;
       } else if (ci > cj && cj_allow_prop) {
-        ci     = cj;
+        ci = cj;
         ci_mod = true;
       }
     }
     if (ci_mod) {
       if (sizeof(Index_) == 4)
-        atomicMin((int*)(labels + global_id), ci);
+        atomicMin((int *)(labels + global_id), ci);
       else if (sizeof(Index_) == 8)
-        atomicMin((long long int*)(labels + global_id), ci);
+        atomicMin((long long int *)(labels + global_id), ci);
       if (ci_allow_prop) *m = true;
     }
   }
 }
 
 template <typename Index_, int TPB_X = 256, typename Lambda>
-__global__ void weak_cc_init_all_kernel(Index_* labels,
-                                        Index_ N,
-                                        Index_ MAX_LABEL,
-                                        Lambda filter_op)
-{
+__global__ void weak_cc_init_all_kernel(Index_ *labels, Index_ N,
+                                        Index_ MAX_LABEL, Lambda filter_op) {
   Index_ tid = threadIdx.x + blockIdx.x * TPB_X;
   if (tid < N) {
     if (filter_op(tid))
@@ -130,25 +123,22 @@ __global__ void weak_cc_init_all_kernel(Index_* labels,
  * @param filter_op an optional filtering function to determine which points
  * should get considered for labeling. It gets global indexes (not batch-wide!)
  */
-template <typename Index_, int TPB_X = 256, typename Lambda = auto(Index_)->bool>
-void weak_cc_batched(Index_* labels,
-                     const Index_* row_ind,
-                     const Index_* row_ind_ptr,
-                     Index_ nnz,
-                     Index_ N,
-                     Index_ start_vertex_id,
-                     Index_ batch_size,
-                     WeakCCState* state,
-                     cudaStream_t stream,
-                     Lambda filter_op)
-{
-  ASSERT(sizeof(Index_) == 4 || sizeof(Index_) == 8, "Index_ should be 4 or 8 bytes");
+template <typename Index_, int TPB_X = 256,
+          typename Lambda = auto(Index_)->bool>
+void weak_cc_batched(Index_ *labels, const Index_ *row_ind,
+                     const Index_ *row_ind_ptr, Index_ nnz, Index_ N,
+                     Index_ start_vertex_id, Index_ batch_size,
+                     WeakCCState *state, cudaStream_t stream,
+                     Lambda filter_op) {
+  ASSERT(sizeof(Index_) == 4 || sizeof(Index_) == 8,
+         "Index_ should be 4 or 8 bytes");
 
   bool host_m;
 
   Index_ MAX_LABEL = std::numeric_limits<Index_>::max();
   weak_cc_init_all_kernel<Index_, TPB_X>
-    <<<raft::ceildiv(N, Index_(TPB_X)), TPB_X, 0, stream>>>(labels, N, MAX_LABEL, filter_op);
+    <<<raft::ceildiv(N, Index_(TPB_X)), TPB_X, 0, stream>>>(
+      labels, N, MAX_LABEL, filter_op);
   CUDA_CHECK(cudaPeekAtLastError());
 
   int n_iters = 0;
@@ -157,7 +147,8 @@ void weak_cc_batched(Index_* labels,
 
     weak_cc_label_device<Index_, TPB_X>
       <<<raft::ceildiv(batch_size, Index_(TPB_X)), TPB_X, 0, stream>>>(
-        labels, row_ind, row_ind_ptr, nnz, state->m, start_vertex_id, batch_size, N, filter_op);
+        labels, row_ind, row_ind_ptr, nnz, state->m, start_vertex_id,
+        batch_size, N, filter_op);
     CUDA_CHECK(cudaPeekAtLastError());
 
     //** Updating m *
@@ -189,25 +180,12 @@ void weak_cc_batched(Index_* labels,
  * @param stream the cuda stream to use
  */
 template <typename Index_, int TPB_X = 256>
-void weak_cc_batched(Index_* labels,
-                     const Index_* row_ind,
-                     const Index_* row_ind_ptr,
-                     Index_ nnz,
-                     Index_ N,
-                     Index_ start_vertex_id,
-                     Index_ batch_size,
-                     WeakCCState* state,
-                     cudaStream_t stream)
-{
-  weak_cc_batched(labels,
-                  row_ind,
-                  row_ind_ptr,
-                  nnz,
-                  N,
-                  start_vertex_id,
-                  batch_size,
-                  state,
-                  stream,
+void weak_cc_batched(Index_ *labels, const Index_ *row_ind,
+                     const Index_ *row_ind_ptr, Index_ nnz, Index_ N,
+                     Index_ start_vertex_id, Index_ batch_size,
+                     WeakCCState *state, cudaStream_t stream) {
+  weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, start_vertex_id,
+                  batch_size, state, stream,
                   [] __device__(Index_ tid) { return true; });
 }
 
@@ -235,20 +213,17 @@ void weak_cc_batched(Index_* labels,
  * @param filter_op an optional filtering function to determine which points
  * should get considered for labeling. It gets global indexes (not batch-wide!)
  */
-template <typename Index_ = int, int TPB_X = 256, typename Lambda = auto(Index_)->bool>
-void weak_cc(Index_* labels,
-             const Index_* row_ind,
-             const Index_* row_ind_ptr,
-             Index_ nnz,
-             Index_ N,
+template <typename Index_ = int, int TPB_X = 256,
+          typename Lambda = auto(Index_)->bool>
+void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr,
+             Index_ nnz, Index_ N,
              std::shared_ptr<raft::mr::device::allocator> d_alloc,
-             cudaStream_t stream,
-             Lambda filter_op)
-{
+             cudaStream_t stream, Lambda filter_op) {
   raft::mr::device::buffer<bool> m(d_alloc, stream, 1);
 
   WeakCCState state(m.data());
-  weak_cc_batched<Index_, TPB_X>(labels, row_ind, row_ind_ptr, nnz, N, 0, N, stream, filter_op);
+  weak_cc_batched<Index_, TPB_X>(labels, row_ind, row_ind_ptr, nnz, N, 0, N,
+                                 stream, filter_op);
 }
 
 /**
@@ -274,18 +249,14 @@ void weak_cc(Index_* labels,
  * @param stream the cuda stream to use
  */
 template <typename Index_, int TPB_X = 256>
-void weak_cc(Index_* labels,
-             const Index_* row_ind,
-             const Index_* row_ind_ptr,
-             Index_ nnz,
-             Index_ N,
+void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr,
+             Index_ nnz, Index_ N,
              std::shared_ptr<raft::mr::device::allocator> d_alloc,
-             cudaStream_t stream)
-{
+             cudaStream_t stream) {
   raft::mr::device::buffer<bool> m(d_alloc, stream, 1);
   WeakCCState state(m.data());
-  weak_cc_batched<Index_, TPB_X>(
-    labels, row_ind, row_ind_ptr, nnz, N, 0, N, stream, [](Index_) { return true; });
+  weak_cc_batched<Index_, TPB_X>(labels, row_ind, row_ind_ptr, nnz, N, 0, N,
+                                 stream, [](Index_) { return true; });
 }
 
 };  // namespace sparse
diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h
index 9d42ec34cb..360832f557 100644
--- a/cpp/include/raft/sparse/cusparse_wrappers.h
+++ b/cpp/include/raft/sparse/cusparse_wrappers.h
@@ -23,9 +23,10 @@
 //#include <cuml/common/logger.hpp>
 
 #define _CUSPARSE_ERR_TO_STR(err) \
-  case err: return #err;
+  case err:                       \
+    return #err;
 
-// Notes:
+//Notes:
 //(1.) CUDA_VER_10_1_UP aggregates all the CUDA version selection logic;
 //(2.) to enforce a lower version,
 //
@@ -42,15 +43,16 @@ namespace raft {
  * @brief Exception thrown when a cuSparse error is encountered.
  */
 struct cusparse_error : public raft::exception {
-  explicit cusparse_error(char const* const message) : raft::exception(message) {}
-  explicit cusparse_error(std::string const& message) : raft::exception(message) {}
+  explicit cusparse_error(char const* const message)
+    : raft::exception(message) {}
+  explicit cusparse_error(std::string const& message)
+    : raft::exception(message) {}
 };
 
 namespace sparse {
 namespace detail {
 
-inline const char* cusparse_error_to_string(cusparseStatus_t err)
-{
+inline const char* cusparse_error_to_string(cusparseStatus_t err) {
 #if defined(CUDART_VERSION) && CUDART_VERSION >= 10100
   return cusparseGetErrorString(err);
 #else  // CUDART_VERSION
@@ -63,7 +65,8 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err)
     _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_EXECUTION_FAILED);
     _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INTERNAL_ERROR);
     _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED);
-    default: return "CUSPARSE_STATUS_UNKNOWN";
+    default:
+      return "CUSPARSE_STATUS_UNKNOWN";
   };
 #endif  // CUDART_VERSION
 }
@@ -85,11 +88,8 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err)
     cusparseStatus_t const status = (call);                                  \
     if (CUSPARSE_STATUS_SUCCESS != status) {                                 \
       std::string msg{};                                                     \
-      SET_ERROR_MSG(msg,                                                     \
-                    "cuSparse error encountered at: ",                       \
-                    "call='%s', Reason=%d:%s",                               \
-                    #call,                                                   \
-                    status,                                                  \
+      SET_ERROR_MSG(msg, "cuSparse error encountered at: ",                  \
+                    "call='%s', Reason=%d:%s", #call, status,                \
                     raft::sparse::detail::cusparse_error_to_string(status)); \
       throw raft::cusparse_error(msg);                                       \
     }                                                                        \
@@ -100,15 +100,13 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err)
 
 //@todo: use logger here once logging is enabled
 /** check for cusparse runtime API errors but do not assert */
-#define CUSPARSE_CHECK_NO_THROW(call)                              \
-  do {                                                             \
-    cusparseStatus_t err = call;                                   \
-    if (err != CUSPARSE_STATUS_SUCCESS) {                          \
-      printf("CUSPARSE call='%s' got errorcode=%d err=%s",         \
-             #call,                                                \
-             err,                                                  \
-             raft::sparse::detail::cusparse_error_to_string(err)); \
-    }                                                              \
+#define CUSPARSE_CHECK_NO_THROW(call)                                  \
+  do {                                                                 \
+    cusparseStatus_t err = call;                                       \
+    if (err != CUSPARSE_STATUS_SUCCESS) {                              \
+      printf("CUSPARSE call='%s' got errorcode=%d err=%s", #call, err, \
+             raft::sparse::detail::cusparse_error_to_string(err));     \
+    }                                                                  \
   } while (0)
 
 namespace raft {
@@ -119,34 +117,28 @@ namespace sparse {
  * @{
  */
 template <typename T>
-cusparseStatus_t cusparsegthr(
-  cusparseHandle_t handle, int nnz, const T* vals, T* vals_sorted, int* d_P, cudaStream_t stream);
+cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz, const T* vals,
+                              T* vals_sorted, int* d_P, cudaStream_t stream);
 template <>
-inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle,
-                                     int nnz,
-                                     const double* vals,
-                                     double* vals_sorted,
-                                     int* d_P,
-                                     cudaStream_t stream)
-{
+inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz,
+                                     const double* vals, double* vals_sorted,
+                                     int* d_P, cudaStream_t stream) {
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-  return cusparseDgthr(handle, nnz, vals, vals_sorted, d_P, CUSPARSE_INDEX_BASE_ZERO);
+  return cusparseDgthr(handle, nnz, vals, vals_sorted, d_P,
+                       CUSPARSE_INDEX_BASE_ZERO);
 #pragma GCC diagnostic pop
 }
 template <>
-inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle,
-                                     int nnz,
-                                     const float* vals,
-                                     float* vals_sorted,
-                                     int* d_P,
-                                     cudaStream_t stream)
-{
+inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz,
+                                     const float* vals, float* vals_sorted,
+                                     int* d_P, cudaStream_t stream) {
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-  return cusparseSgthr(handle, nnz, vals, vals_sorted, d_P, CUSPARSE_INDEX_BASE_ZERO);
+  return cusparseSgthr(handle, nnz, vals, vals_sorted, d_P,
+                       CUSPARSE_INDEX_BASE_ZERO);
 #pragma GCC diagnostic pop
 }
 /** @} */
@@ -156,18 +148,15 @@ inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle,
  * @{
  */
 template <typename T>
-void cusparsecoo2csr(
-  cusparseHandle_t handle, const T* cooRowInd, int nnz, int m, T* csrRowPtr, cudaStream_t stream);
+void cusparsecoo2csr(cusparseHandle_t handle, const T* cooRowInd, int nnz,
+                     int m, T* csrRowPtr, cudaStream_t stream);
 template <>
-inline void cusparsecoo2csr(cusparseHandle_t handle,
-                            const int* cooRowInd,
-                            int nnz,
-                            int m,
-                            int* csrRowPtr,
-                            cudaStream_t stream)
-{
+inline void cusparsecoo2csr(cusparseHandle_t handle, const int* cooRowInd,
+                            int nnz, int m, int* csrRowPtr,
+                            cudaStream_t stream) {
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  CUSPARSE_CHECK(cusparseXcoo2csr(handle, cooRowInd, nnz, m, csrRowPtr, CUSPARSE_INDEX_BASE_ZERO));
+  CUSPARSE_CHECK(cusparseXcoo2csr(handle, cooRowInd, nnz, m, csrRowPtr,
+                                  CUSPARSE_INDEX_BASE_ZERO));
 }
 /** @} */
 
@@ -177,54 +166,30 @@ inline void cusparsecoo2csr(cusparseHandle_t handle,
  */
 template <typename T>
 size_t cusparsecoosort_bufferSizeExt(  // NOLINT
-  cusparseHandle_t handle,
-  int m,
-  int n,
-  int nnz,
-  const T* cooRows,
-  const T* cooCols,
-  cudaStream_t stream);
+  cusparseHandle_t handle, int m, int n, int nnz, const T* cooRows,
+  const T* cooCols, cudaStream_t stream);
 template <>
 inline size_t cusparsecoosort_bufferSizeExt(  // NOLINT
-  cusparseHandle_t handle,
-  int m,
-  int n,
-  int nnz,
-  const int* cooRows,
-  const int* cooCols,
-  cudaStream_t stream)
-{
+  cusparseHandle_t handle, int m, int n, int nnz, const int* cooRows,
+  const int* cooCols, cudaStream_t stream) {
   size_t val;
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  CUSPARSE_CHECK(cusparseXcoosort_bufferSizeExt(handle, m, n, nnz, cooRows, cooCols, &val));
+  CUSPARSE_CHECK(
+    cusparseXcoosort_bufferSizeExt(handle, m, n, nnz, cooRows, cooCols, &val));
   return val;
 }
 
 template <typename T>
 void cusparsecoosortByRow(  // NOLINT
-  cusparseHandle_t handle,
-  int m,
-  int n,
-  int nnz,
-  T* cooRows,
-  T* cooCols,
-  T* P,
-  void* pBuffer,
-  cudaStream_t stream);
+  cusparseHandle_t handle, int m, int n, int nnz, T* cooRows, T* cooCols, T* P,
+  void* pBuffer, cudaStream_t stream);
 template <>
 inline void cusparsecoosortByRow(  // NOLINT
-  cusparseHandle_t handle,
-  int m,
-  int n,
-  int nnz,
-  int* cooRows,
-  int* cooCols,
-  int* P,
-  void* pBuffer,
-  cudaStream_t stream)
-{
+  cusparseHandle_t handle, int m, int n, int nnz, int* cooRows, int* cooCols,
+  int* P, void* pBuffer, cudaStream_t stream) {
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  CUSPARSE_CHECK(cusparseXcoosortByRow(handle, m, n, nnz, cooRows, cooCols, P, pBuffer));
+  CUSPARSE_CHECK(
+    cusparseXcoosortByRow(handle, m, n, nnz, cooRows, cooCols, P, pBuffer));
 }
 /** @} */
 
@@ -234,67 +199,37 @@ inline void cusparsecoosortByRow(  // NOLINT
  */
 template <typename T>
 cusparseStatus_t cusparsegemmi(  // NOLINT
-  cusparseHandle_t handle,
-  int m,
-  int n,
-  int k,
-  int nnz,
-  const T* alpha,
-  const T* A,
-  int lda,
-  const T* cscValB,
-  const int* cscColPtrB,
-  const int* cscRowIndB,
-  const T* beta,
-  T* C,
-  int ldc,
-  cudaStream_t stream);
+  cusparseHandle_t handle, int m, int n, int k, int nnz, const T* alpha,
+  const T* A, int lda, const T* cscValB, const int* cscColPtrB,
+  const int* cscRowIndB, const T* beta, T* C, int ldc, cudaStream_t stream);
 template <>
-inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle,
-                                      int m,
-                                      int n,
-                                      int k,
-                                      int nnz,
-                                      const float* alpha,
-                                      const float* A,
-                                      int lda,
+inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n,
+                                      int k, int nnz, const float* alpha,
+                                      const float* A, int lda,
                                       const float* cscValB,
                                       const int* cscColPtrB,
-                                      const int* cscRowIndB,
-                                      const float* beta,
-                                      float* C,
-                                      int ldc,
-                                      cudaStream_t stream)
-{
+                                      const int* cscRowIndB, const float* beta,
+                                      float* C, int ldc, cudaStream_t stream) {
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-  return cusparseSgemmi(
-    handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc);
+  return cusparseSgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB,
+                        cscColPtrB, cscRowIndB, beta, C, ldc);
 #pragma GCC diagnostic pop
 }
 template <>
-inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle,
-                                      int m,
-                                      int n,
-                                      int k,
-                                      int nnz,
-                                      const double* alpha,
-                                      const double* A,
-                                      int lda,
+inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n,
+                                      int k, int nnz, const double* alpha,
+                                      const double* A, int lda,
                                       const double* cscValB,
                                       const int* cscColPtrB,
-                                      const int* cscRowIndB,
-                                      const double* beta,
-                                      double* C,
-                                      int ldc,
-                                      cudaStream_t stream)
-{
+                                      const int* cscRowIndB, const double* beta,
+                                      double* C, int ldc, cudaStream_t stream) {
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-  return cusparseDgemmi(
-    handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc);
+  return cusparseDgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB,
+                        cscColPtrB, cscRowIndB, beta, C, ldc);
 #pragma GCC diagnostic pop
 }
 /** @} */
@@ -306,94 +241,49 @@ inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle,
  */
 template <typename IndexT, typename ValueT>
 cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr,
-                                   int64_t rows,
-                                   int64_t cols,
-                                   int64_t nnz,
-                                   IndexT* csrRowOffsets,
-                                   IndexT* csrColInd,
+                                   int64_t rows, int64_t cols, int64_t nnz,
+                                   IndexT* csrRowOffsets, IndexT* csrColInd,
                                    ValueT* csrValues);
 template <>
 inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr,
-                                          int64_t rows,
-                                          int64_t cols,
-                                          int64_t nnz,
-                                          int* csrRowOffsets,
-                                          int* csrColInd,
-                                          float* csrValues)
-{
-  return cusparseCreateCsr(spMatDescr,
-                           rows,
-                           cols,
-                           nnz,
-                           csrRowOffsets,
-                           csrColInd,
-                           csrValues,
-                           CUSPARSE_INDEX_32I,
-                           CUSPARSE_INDEX_32I,
-                           CUSPARSE_INDEX_BASE_ZERO,
+                                          int64_t rows, int64_t cols,
+                                          int64_t nnz, int* csrRowOffsets,
+                                          int* csrColInd, float* csrValues) {
+  return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets,
+                           csrColInd, csrValues, CUSPARSE_INDEX_32I,
+                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO,
                            CUDA_R_32F);
 }
 template <>
 inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr,
-                                          int64_t rows,
-                                          int64_t cols,
-                                          int64_t nnz,
-                                          int* csrRowOffsets,
-                                          int* csrColInd,
-                                          double* csrValues)
-{
-  return cusparseCreateCsr(spMatDescr,
-                           rows,
-                           cols,
-                           nnz,
-                           csrRowOffsets,
-                           csrColInd,
-                           csrValues,
-                           CUSPARSE_INDEX_32I,
-                           CUSPARSE_INDEX_32I,
-                           CUSPARSE_INDEX_BASE_ZERO,
+                                          int64_t rows, int64_t cols,
+                                          int64_t nnz, int* csrRowOffsets,
+                                          int* csrColInd, double* csrValues) {
+  return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets,
+                           csrColInd, csrValues, CUSPARSE_INDEX_32I,
+                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO,
                            CUDA_R_64F);
 }
 template <>
 inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr,
-                                          int64_t rows,
-                                          int64_t cols,
-                                          int64_t nnz,
-                                          int64_t* csrRowOffsets,
+                                          int64_t rows, int64_t cols,
+                                          int64_t nnz, int64_t* csrRowOffsets,
                                           int64_t* csrColInd,
-                                          float* csrValues)
-{
-  return cusparseCreateCsr(spMatDescr,
-                           rows,
-                           cols,
-                           nnz,
-                           csrRowOffsets,
-                           csrColInd,
-                           csrValues,
-                           CUSPARSE_INDEX_64I,
-                           CUSPARSE_INDEX_64I,
-                           CUSPARSE_INDEX_BASE_ZERO,
+                                          float* csrValues) {
+  return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets,
+                           csrColInd, csrValues, CUSPARSE_INDEX_64I,
+                           CUSPARSE_INDEX_64I, CUSPARSE_INDEX_BASE_ZERO,
                            CUDA_R_32F);
 }
 template <>
 inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr,
-                                          int64_t rows,
-                                          int64_t cols,
-                                          int64_t nnz,
-                                          int64_t* csrRowOffsets,
+                                          int64_t rows, int64_t cols,
+                                          int64_t nnz, int64_t* csrRowOffsets,
                                           int64_t* csrColInd,
-                                          double* csrValues)
-{
-  return cusparseCreateCsr(spMatDescr,
-                           rows,
-                           cols,
-                           nnz,
-                           csrRowOffsets,
-                           csrColInd,
-                           csrValues,
-                           CUSPARSE_INDEX_64I,
-                           CUSPARSE_INDEX_64I,
-                           CUSPARSE_INDEX_BASE_ZERO,
+                                          double* csrValues) {
+  return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets,
+                           csrColInd, csrValues, CUSPARSE_INDEX_64I,
+                           CUSPARSE_INDEX_64I, CUSPARSE_INDEX_BASE_ZERO,
                            CUDA_R_64F);
 }
 /** @} */
@@ -402,19 +292,16 @@ inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr,
  * @{
  */
 template <typename T>
-cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, int64_t size, T* values);
+cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr,
+                                     int64_t size, T* values);
 template <>
 inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr,
-                                            int64_t size,
-                                            float* values)
-{
+                                            int64_t size, float* values) {
   return cusparseCreateDnVec(dnVecDescr, size, values, CUDA_R_32F);
 }
 template <>
 inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr,
-                                            int64_t size,
-                                            double* values)
-{
+                                            int64_t size, double* values) {
   return cusparseCreateDnVec(dnVecDescr, size, values, CUDA_R_64F);
 }
 /** @} */
@@ -425,30 +312,23 @@ inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr,
  */
 template <typename T>
 cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr,
-                                     int64_t rows,
-                                     int64_t cols,
-                                     int64_t ld,
-                                     T* values,
-                                     cusparseOrder_t order);
+                                     int64_t rows, int64_t cols, int64_t ld,
+                                     T* values, cusparseOrder_t order);
 template <>
 inline cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr,
-                                            int64_t rows,
-                                            int64_t cols,
-                                            int64_t ld,
-                                            float* values,
-                                            cusparseOrder_t order)
-{
-  return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_32F, order);
+                                            int64_t rows, int64_t cols,
+                                            int64_t ld, float* values,
+                                            cusparseOrder_t order) {
+  return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_32F,
+                             order);
 }
 template <>
 inline cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr,
-                                            int64_t rows,
-                                            int64_t cols,
-                                            int64_t ld,
-                                            double* values,
-                                            cusparseOrder_t order)
-{
-  return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_64F, order);
+                                            int64_t rows, int64_t cols,
+                                            int64_t ld, double* values,
+                                            cusparseOrder_t order) {
+  return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_64F,
+                             order);
 }
 /** @} */
 
@@ -457,89 +337,58 @@ inline cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr,
  * @{
  */
 template <typename T>
-cusparseStatus_t cusparsespmv_buffersize(cusparseHandle_t handle,
-                                         cusparseOperation_t opA,
-                                         const T* alpha,
-                                         const cusparseSpMatDescr_t matA,
-                                         const cusparseDnVecDescr_t vecX,
-                                         const T* beta,
-                                         const cusparseDnVecDescr_t vecY,
-                                         cusparseSpMVAlg_t alg,
-                                         size_t* bufferSize,
-                                         cudaStream_t stream);
+cusparseStatus_t cusparsespmv_buffersize(
+  cusparseHandle_t handle, cusparseOperation_t opA, const T* alpha,
+  const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX,
+  const T* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg,
+  size_t* bufferSize, cudaStream_t stream);
 template <>
-inline cusparseStatus_t cusparsespmv_buffersize(cusparseHandle_t handle,
-                                                cusparseOperation_t opA,
-                                                const float* alpha,
-                                                const cusparseSpMatDescr_t matA,
-                                                const cusparseDnVecDescr_t vecX,
-                                                const float* beta,
-                                                const cusparseDnVecDescr_t vecY,
-                                                cusparseSpMVAlg_t alg,
-                                                size_t* bufferSize,
-                                                cudaStream_t stream)
-{
+inline cusparseStatus_t cusparsespmv_buffersize(
+  cusparseHandle_t handle, cusparseOperation_t opA, const float* alpha,
+  const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX,
+  const float* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg,
+  size_t* bufferSize, cudaStream_t stream) {
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseSpMV_bufferSize(
-    handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_32F, alg, bufferSize);
+  return cusparseSpMV_bufferSize(handle, opA, alpha, matA, vecX, beta, vecY,
+                                 CUDA_R_32F, alg, bufferSize);
 }
 template <>
-inline cusparseStatus_t cusparsespmv_buffersize(cusparseHandle_t handle,
-                                                cusparseOperation_t opA,
-                                                const double* alpha,
-                                                const cusparseSpMatDescr_t matA,
-                                                const cusparseDnVecDescr_t vecX,
-                                                const double* beta,
-                                                const cusparseDnVecDescr_t vecY,
-                                                cusparseSpMVAlg_t alg,
-                                                size_t* bufferSize,
-                                                cudaStream_t stream)
-{
+inline cusparseStatus_t cusparsespmv_buffersize(
+  cusparseHandle_t handle, cusparseOperation_t opA, const double* alpha,
+  const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX,
+  const double* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg,
+  size_t* bufferSize, cudaStream_t stream) {
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseSpMV_bufferSize(
-    handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_64F, alg, bufferSize);
+  return cusparseSpMV_bufferSize(handle, opA, alpha, matA, vecX, beta, vecY,
+                                 CUDA_R_64F, alg, bufferSize);
 }
 
 template <typename T>
-cusparseStatus_t cusparsespmv(cusparseHandle_t handle,
-                              cusparseOperation_t opA,
-                              const T* alpha,
-                              const cusparseSpMatDescr_t matA,
-                              const cusparseDnVecDescr_t vecX,
-                              const T* beta,
+cusparseStatus_t cusparsespmv(cusparseHandle_t handle, cusparseOperation_t opA,
+                              const T* alpha, const cusparseSpMatDescr_t matA,
+                              const cusparseDnVecDescr_t vecX, const T* beta,
                               const cusparseDnVecDescr_t vecY,
-                              cusparseSpMVAlg_t alg,
-                              T* externalBuffer,
+                              cusparseSpMVAlg_t alg, T* externalBuffer,
                               cudaStream_t stream);
 template <>
-inline cusparseStatus_t cusparsespmv(cusparseHandle_t handle,
-                                     cusparseOperation_t opA,
-                                     const float* alpha,
-                                     const cusparseSpMatDescr_t matA,
-                                     const cusparseDnVecDescr_t vecX,
-                                     const float* beta,
-                                     const cusparseDnVecDescr_t vecY,
-                                     cusparseSpMVAlg_t alg,
-                                     float* externalBuffer,
-                                     cudaStream_t stream)
-{
+inline cusparseStatus_t cusparsespmv(
+  cusparseHandle_t handle, cusparseOperation_t opA, const float* alpha,
+  const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX,
+  const float* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg,
+  float* externalBuffer, cudaStream_t stream) {
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_32F, alg, externalBuffer);
+  return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_32F,
+                      alg, externalBuffer);
 }
 template <>
-inline cusparseStatus_t cusparsespmv(cusparseHandle_t handle,
-                                     cusparseOperation_t opA,
-                                     const double* alpha,
-                                     const cusparseSpMatDescr_t matA,
-                                     const cusparseDnVecDescr_t vecX,
-                                     const double* beta,
-                                     const cusparseDnVecDescr_t vecY,
-                                     cusparseSpMVAlg_t alg,
-                                     double* externalBuffer,
-                                     cudaStream_t stream)
-{
+inline cusparseStatus_t cusparsespmv(
+  cusparseHandle_t handle, cusparseOperation_t opA, const double* alpha,
+  const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX,
+  const double* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg,
+  double* externalBuffer, cudaStream_t stream) {
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_64F, alg, externalBuffer);
+  return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_64F,
+                      alg, externalBuffer);
 }
 /** @} */
 #else
@@ -549,59 +398,29 @@ inline cusparseStatus_t cusparsespmv(cusparseHandle_t handle,
  */
 template <typename T>
 cusparseStatus_t cusparsecsrmv(  // NOLINT
-  cusparseHandle_t handle,
-  cusparseOperation_t trans,
-  int m,
-  int n,
-  int nnz,
-  const T* alpha,
-  const cusparseMatDescr_t descr,
-  const T* csrVal,
-  const int* csrRowPtr,
-  const int* csrColInd,
-  const T* x,
-  const T* beta,
-  T* y,
+  cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz,
+  const T* alpha, const cusparseMatDescr_t descr, const T* csrVal,
+  const int* csrRowPtr, const int* csrColInd, const T* x, const T* beta, T* y,
   cudaStream_t stream);
 template <>
-inline cusparseStatus_t cusparsecsrmv(cusparseHandle_t handle,
-                                      cusparseOperation_t trans,
-                                      int m,
-                                      int n,
-                                      int nnz,
-                                      const float* alpha,
-                                      const cusparseMatDescr_t descr,
-                                      const float* csrVal,
-                                      const int* csrRowPtr,
-                                      const int* csrColInd,
-                                      const float* x,
-                                      const float* beta,
-                                      float* y,
-                                      cudaStream_t stream)
-{
+inline cusparseStatus_t cusparsecsrmv(
+  cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz,
+  const float* alpha, const cusparseMatDescr_t descr, const float* csrVal,
+  const int* csrRowPtr, const int* csrColInd, const float* x, const float* beta,
+  float* y, cudaStream_t stream) {
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseScsrmv(
-    handle, trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y);
+  return cusparseScsrmv(handle, trans, m, n, nnz, alpha, descr, csrVal,
+                        csrRowPtr, csrColInd, x, beta, y);
 }
 template <>
-inline cusparseStatus_t cusparsecsrmv(cusparseHandle_t handle,
-                                      cusparseOperation_t trans,
-                                      int m,
-                                      int n,
-                                      int nnz,
-                                      const double* alpha,
-                                      const cusparseMatDescr_t descr,
-                                      const double* csrVal,
-                                      const int* csrRowPtr,
-                                      const int* csrColInd,
-                                      const double* x,
-                                      const double* beta,
-                                      double* y,
-                                      cudaStream_t stream)
-{
+inline cusparseStatus_t cusparsecsrmv(
+  cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz,
+  const double* alpha, const cusparseMatDescr_t descr, const double* csrVal,
+  const int* csrRowPtr, const int* csrColInd, const double* x,
+  const double* beta, double* y, cudaStream_t stream) {
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseDcsrmv(
-    handle, trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y);
+  return cusparseDcsrmv(handle, trans, m, n, nnz, alpha, descr, csrVal,
+                        csrRowPtr, csrColInd, x, beta, y);
 }
 /** @} */
 #endif
@@ -612,96 +431,58 @@ inline cusparseStatus_t cusparsecsrmv(cusparseHandle_t handle,
  * @{
  */
 template <typename T>
-cusparseStatus_t cusparsespmm_bufferSize(cusparseHandle_t handle,
-                                         cusparseOperation_t opA,
-                                         cusparseOperation_t opB,
-                                         const T* alpha,
-                                         const cusparseSpMatDescr_t matA,
-                                         const cusparseDnMatDescr_t matB,
-                                         const T* beta,
-                                         cusparseDnMatDescr_t matC,
-                                         cusparseSpMMAlg_t alg,
-                                         size_t* bufferSize,
-                                         cudaStream_t stream);
+cusparseStatus_t cusparsespmm_bufferSize(
+  cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
+  const T* alpha, const cusparseSpMatDescr_t matA,
+  const cusparseDnMatDescr_t matB, const T* beta, cusparseDnMatDescr_t matC,
+  cusparseSpMMAlg_t alg, size_t* bufferSize, cudaStream_t stream);
 template <>
-inline cusparseStatus_t cusparsespmm_bufferSize(cusparseHandle_t handle,
-                                                cusparseOperation_t opA,
-                                                cusparseOperation_t opB,
-                                                const float* alpha,
-                                                const cusparseSpMatDescr_t matA,
-                                                const cusparseDnMatDescr_t matB,
-                                                const float* beta,
-                                                cusparseDnMatDescr_t matC,
-                                                cusparseSpMMAlg_t alg,
-                                                size_t* bufferSize,
-                                                cudaStream_t stream)
-{
+inline cusparseStatus_t cusparsespmm_bufferSize(
+  cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
+  const float* alpha, const cusparseSpMatDescr_t matA,
+  const cusparseDnMatDescr_t matB, const float* beta, cusparseDnMatDescr_t matC,
+  cusparseSpMMAlg_t alg, size_t* bufferSize, cudaStream_t stream) {
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseSpMM_bufferSize(
-    handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_32F, alg, bufferSize);
+  return cusparseSpMM_bufferSize(handle, opA, opB, alpha, matA, matB, beta,
+                                 matC, CUDA_R_32F, alg, bufferSize);
 }
 template <>
-inline cusparseStatus_t cusparsespmm_bufferSize(cusparseHandle_t handle,
-                                                cusparseOperation_t opA,
-                                                cusparseOperation_t opB,
-                                                const double* alpha,
-                                                const cusparseSpMatDescr_t matA,
-                                                const cusparseDnMatDescr_t matB,
-                                                const double* beta,
-                                                cusparseDnMatDescr_t matC,
-                                                cusparseSpMMAlg_t alg,
-                                                size_t* bufferSize,
-                                                cudaStream_t stream)
-{
+inline cusparseStatus_t cusparsespmm_bufferSize(
+  cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
+  const double* alpha, const cusparseSpMatDescr_t matA,
+  const cusparseDnMatDescr_t matB, const double* beta,
+  cusparseDnMatDescr_t matC, cusparseSpMMAlg_t alg, size_t* bufferSize,
+  cudaStream_t stream) {
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseSpMM_bufferSize(
-    handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_64F, alg, bufferSize);
+  return cusparseSpMM_bufferSize(handle, opA, opB, alpha, matA, matB, beta,
+                                 matC, CUDA_R_64F, alg, bufferSize);
 }
 template <typename T>
-inline cusparseStatus_t cusparsespmm(cusparseHandle_t handle,
-                                     cusparseOperation_t opA,
-                                     cusparseOperation_t opB,
-                                     const T* alpha,
-                                     const cusparseSpMatDescr_t matA,
-                                     const cusparseDnMatDescr_t matB,
-                                     const T* beta,
-                                     cusparseDnMatDescr_t matC,
-                                     cusparseSpMMAlg_t alg,
-                                     T* externalBuffer,
-                                     cudaStream_t stream);
+inline cusparseStatus_t cusparsespmm(
+  cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
+  const T* alpha, const cusparseSpMatDescr_t matA,
+  const cusparseDnMatDescr_t matB, const T* beta, cusparseDnMatDescr_t matC,
+  cusparseSpMMAlg_t alg, T* externalBuffer, cudaStream_t stream);
 template <>
-inline cusparseStatus_t cusparsespmm(cusparseHandle_t handle,
-                                     cusparseOperation_t opA,
-                                     cusparseOperation_t opB,
-                                     const float* alpha,
-                                     const cusparseSpMatDescr_t matA,
-                                     const cusparseDnMatDescr_t matB,
-                                     const float* beta,
-                                     cusparseDnMatDescr_t matC,
-                                     cusparseSpMMAlg_t alg,
-                                     float* externalBuffer,
-                                     cudaStream_t stream)
-{
+inline cusparseStatus_t cusparsespmm(
+  cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
+  const float* alpha, const cusparseSpMatDescr_t matA,
+  const cusparseDnMatDescr_t matB, const float* beta, cusparseDnMatDescr_t matC,
+  cusparseSpMMAlg_t alg, float* externalBuffer, cudaStream_t stream) {
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseSpMM(
-    handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_32F, alg, externalBuffer);
+  return cusparseSpMM(handle, opA, opB, alpha, matA, matB, beta, matC,
+                      CUDA_R_32F, alg, externalBuffer);
 }
 template <>
-inline cusparseStatus_t cusparsespmm(cusparseHandle_t handle,
-                                     cusparseOperation_t opA,
-                                     cusparseOperation_t opB,
-                                     const double* alpha,
-                                     const cusparseSpMatDescr_t matA,
-                                     const cusparseDnMatDescr_t matB,
-                                     const double* beta,
-                                     cusparseDnMatDescr_t matC,
-                                     cusparseSpMMAlg_t alg,
-                                     double* externalBuffer,
-                                     cudaStream_t stream)
-{
+inline cusparseStatus_t cusparsespmm(
+  cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
+  const double* alpha, const cusparseSpMatDescr_t matA,
+  const cusparseDnMatDescr_t matB, const double* beta,
+  cusparseDnMatDescr_t matC, cusparseSpMMAlg_t alg, double* externalBuffer,
+  cudaStream_t stream) {
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseSpMM(
-    handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_64F, alg, externalBuffer);
+  return cusparseSpMM(handle, opA, opB, alpha, matA, matB, beta, matC,
+                      CUDA_R_64F, alg, externalBuffer);
 }
 /** @} */
 #else
@@ -711,68 +492,31 @@ inline cusparseStatus_t cusparsespmm(cusparseHandle_t handle,
  */
 template <typename T>
 cusparseStatus_t cusparsecsrmm(  // NOLINT
-  cusparseHandle_t handle,
-  cusparseOperation_t trans,
-  int m,
-  int n,
-  int k,
-  int nnz,
-  const T* alpha,
-  const cusparseMatDescr_t descr,
-  const T* csrVal,
-  const int* csrRowPtr,
-  const int* csrColInd,
-  const T* x,
-  const int ldx,
-  const T* beta,
-  T* y,
-  const int ldy,
-  cudaStream_t stream);
+  cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k,
+  int nnz, const T* alpha, const cusparseMatDescr_t descr, const T* csrVal,
+  const int* csrRowPtr, const int* csrColInd, const T* x, const int ldx,
+  const T* beta, T* y, const int ldy, cudaStream_t stream);
 template <>
-inline cusparseStatus_t cusparsecsrmm(cusparseHandle_t handle,
-                                      cusparseOperation_t trans,
-                                      int m,
-                                      int n,
-                                      int k,
-                                      int nnz,
-                                      const float* alpha,
-                                      const cusparseMatDescr_t descr,
-                                      const float* csrVal,
-                                      const int* csrRowPtr,
-                                      const int* csrColInd,
-                                      const float* x,
-                                      const int ldx,
-                                      const float* beta,
-                                      float* y,
-                                      const int ldy,
-                                      cudaStream_t stream)
-{
+inline cusparseStatus_t cusparsecsrmm(
+  cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k,
+  int nnz, const float* alpha, const cusparseMatDescr_t descr,
+  const float* csrVal, const int* csrRowPtr, const int* csrColInd,
+  const float* x, const int ldx, const float* beta, float* y, const int ldy,
+  cudaStream_t stream) {
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseScsrmm(
-    handle, trans, m, n, k, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, ldx, beta, y, ldy);
+  return cusparseScsrmm(handle, trans, m, n, k, nnz, alpha, descr, csrVal,
+                        csrRowPtr, csrColInd, x, ldx, beta, y, ldy);
 }
 template <>
-inline cusparseStatus_t cusparsecsrmm(cusparseHandle_t handle,
-                                      cusparseOperation_t trans,
-                                      int m,
-                                      int n,
-                                      int k,
-                                      int nnz,
-                                      const double* alpha,
-                                      const cusparseMatDescr_t descr,
-                                      const double* csrVal,
-                                      const int* csrRowPtr,
-                                      const int* csrColInd,
-                                      const double* x,
-                                      const int ldx,
-                                      const double* beta,
-                                      double* y,
-                                      const int ldy,
-                                      cudaStream_t stream)
-{
+inline cusparseStatus_t cusparsecsrmm(
+  cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k,
+  int nnz, const double* alpha, const cusparseMatDescr_t descr,
+  const double* csrVal, const int* csrRowPtr, const int* csrColInd,
+  const double* x, const int ldx, const double* beta, double* y, const int ldy,
+  cudaStream_t stream) {
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseDcsrmm(
-    handle, trans, m, n, k, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, ldx, beta, y, ldy);
+  return cusparseDcsrmm(handle, trans, m, n, k, nnz, alpha, descr, csrVal,
+                        csrRowPtr, csrColInd, x, ldx, beta, y, ldy);
 }
 /** @} */
 #endif
@@ -783,22 +527,15 @@ inline cusparseStatus_t cusparsecsrmm(cusparseHandle_t handle,
  */
 template <typename T>
 void cusparsecsr2coo(  // NOLINT
-  cusparseHandle_t handle,
-  const int n,
-  const int nnz,
-  const T* csrRowPtr,
-  T* cooRowInd,
-  cudaStream_t stream);
+  cusparseHandle_t handle, const int n, const int nnz, const T* csrRowPtr,
+  T* cooRowInd, cudaStream_t stream);
 template <>
-inline void cusparsecsr2coo(cusparseHandle_t handle,
-                            const int n,
-                            const int nnz,
-                            const int* csrRowPtr,
-                            int* cooRowInd,
-                            cudaStream_t stream)
-{
+inline void cusparsecsr2coo(cusparseHandle_t handle, const int n, const int nnz,
+                            const int* csrRowPtr, int* cooRowInd,
+                            cudaStream_t stream) {
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  CUSPARSE_CHECK(cusparseXcsr2coo(handle, csrRowPtr, nnz, n, cooRowInd, CUSPARSE_INDEX_BASE_ZERO));
+  CUSPARSE_CHECK(cusparseXcsr2coo(handle, csrRowPtr, nnz, n, cooRowInd,
+                                  CUSPARSE_INDEX_BASE_ZERO));
 }
 /** @} */
 
@@ -816,8 +553,7 @@ inline void cusparsecsr2coo(cusparseHandle_t handle,
 // template<>
 inline cusparseStatus_t cusparsesetpointermode(cusparseHandle_t handle,
                                                cusparsePointerMode_t mode,
-                                               cudaStream_t stream)
-{
+                                               cudaStream_t stream) {
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
   return cusparseSetPointerMode(handle, mode);
 }
@@ -828,203 +564,69 @@ inline cusparseStatus_t cusparsesetpointermode(cusparseHandle_t handle,
  * @{
  */
 template <typename T>
-cusparseStatus_t cusparsecsrmvex_bufferSize(cusparseHandle_t handle,
-                                            cusparseAlgMode_t alg,
-                                            cusparseOperation_t transA,
-                                            int m,
-                                            int n,
-                                            int nnz,
-                                            const T* alpha,
-                                            const cusparseMatDescr_t descrA,
-                                            const T* csrValA,
-                                            const int* csrRowPtrA,
-                                            const int* csrColIndA,
-                                            const T* x,
-                                            const T* beta,
-                                            T* y,
-                                            size_t* bufferSizeInBytes,
-                                            cudaStream_t stream);
-template <>
-inline cusparseStatus_t cusparsecsrmvex_bufferSize(cusparseHandle_t handle,
-                                                   cusparseAlgMode_t alg,
-                                                   cusparseOperation_t transA,
-                                                   int m,
-                                                   int n,
-                                                   int nnz,
-                                                   const float* alpha,
-                                                   const cusparseMatDescr_t descrA,
-                                                   const float* csrValA,
-                                                   const int* csrRowPtrA,
-                                                   const int* csrColIndA,
-                                                   const float* x,
-                                                   const float* beta,
-                                                   float* y,
-                                                   size_t* bufferSizeInBytes,
-                                                   cudaStream_t stream)
-{
-  CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseCsrmvEx_bufferSize(handle,
-                                    alg,
-                                    transA,
-                                    m,
-                                    n,
-                                    nnz,
-                                    alpha,
-                                    CUDA_R_32F,
-                                    descrA,
-                                    csrValA,
-                                    CUDA_R_32F,
-                                    csrRowPtrA,
-                                    csrColIndA,
-                                    x,
-                                    CUDA_R_32F,
-                                    beta,
-                                    CUDA_R_32F,
-                                    y,
-                                    CUDA_R_32F,
-                                    CUDA_R_32F,
-                                    bufferSizeInBytes);
-}
-template <>
-inline cusparseStatus_t cusparsecsrmvex_bufferSize(cusparseHandle_t handle,
-                                                   cusparseAlgMode_t alg,
-                                                   cusparseOperation_t transA,
-                                                   int m,
-                                                   int n,
-                                                   int nnz,
-                                                   const double* alpha,
-                                                   const cusparseMatDescr_t descrA,
-                                                   const double* csrValA,
-                                                   const int* csrRowPtrA,
-                                                   const int* csrColIndA,
-                                                   const double* x,
-                                                   const double* beta,
-                                                   double* y,
-                                                   size_t* bufferSizeInBytes,
-                                                   cudaStream_t stream)
-{
-  CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseCsrmvEx_bufferSize(handle,
-                                    alg,
-                                    transA,
-                                    m,
-                                    n,
-                                    nnz,
-                                    alpha,
-                                    CUDA_R_64F,
-                                    descrA,
-                                    csrValA,
-                                    CUDA_R_64F,
-                                    csrRowPtrA,
-                                    csrColIndA,
-                                    x,
-                                    CUDA_R_64F,
-                                    beta,
-                                    CUDA_R_64F,
-                                    y,
-                                    CUDA_R_64F,
-                                    CUDA_R_64F,
-                                    bufferSizeInBytes);
+cusparseStatus_t cusparsecsrmvex_bufferSize(
+  cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
+  int m, int n, int nnz, const T* alpha, const cusparseMatDescr_t descrA,
+  const T* csrValA, const int* csrRowPtrA, const int* csrColIndA, const T* x,
+  const T* beta, T* y, size_t* bufferSizeInBytes, cudaStream_t stream);
+template <>
+inline cusparseStatus_t cusparsecsrmvex_bufferSize(
+  cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
+  int m, int n, int nnz, const float* alpha, const cusparseMatDescr_t descrA,
+  const float* csrValA, const int* csrRowPtrA, const int* csrColIndA,
+  const float* x, const float* beta, float* y, size_t* bufferSizeInBytes,
+  cudaStream_t stream) {
+  CUSPARSE_CHECK(cusparseSetStream(handle, stream));
+  return cusparseCsrmvEx_bufferSize(
+    handle, alg, transA, m, n, nnz, alpha, CUDA_R_32F, descrA, csrValA,
+    CUDA_R_32F, csrRowPtrA, csrColIndA, x, CUDA_R_32F, beta, CUDA_R_32F, y,
+    CUDA_R_32F, CUDA_R_32F, bufferSizeInBytes);
+}
+template <>
+inline cusparseStatus_t cusparsecsrmvex_bufferSize(
+  cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
+  int m, int n, int nnz, const double* alpha, const cusparseMatDescr_t descrA,
+  const double* csrValA, const int* csrRowPtrA, const int* csrColIndA,
+  const double* x, const double* beta, double* y, size_t* bufferSizeInBytes,
+  cudaStream_t stream) {
+  CUSPARSE_CHECK(cusparseSetStream(handle, stream));
+  return cusparseCsrmvEx_bufferSize(
+    handle, alg, transA, m, n, nnz, alpha, CUDA_R_64F, descrA, csrValA,
+    CUDA_R_64F, csrRowPtrA, csrColIndA, x, CUDA_R_64F, beta, CUDA_R_64F, y,
+    CUDA_R_64F, CUDA_R_64F, bufferSizeInBytes);
 }
 
 template <typename T>
-cusparseStatus_t cusparsecsrmvex(cusparseHandle_t handle,
-                                 cusparseAlgMode_t alg,
-                                 cusparseOperation_t transA,
-                                 int m,
-                                 int n,
-                                 int nnz,
-                                 const T* alpha,
-                                 const cusparseMatDescr_t descrA,
-                                 const T* csrValA,
-                                 const int* csrRowPtrA,
-                                 const int* csrColIndA,
-                                 const T* x,
-                                 const T* beta,
-                                 T* y,
-                                 T* buffer,
-                                 cudaStream_t stream);
-template <>
-inline cusparseStatus_t cusparsecsrmvex(cusparseHandle_t handle,
-                                        cusparseAlgMode_t alg,
-                                        cusparseOperation_t transA,
-                                        int m,
-                                        int n,
-                                        int nnz,
-                                        const float* alpha,
-                                        const cusparseMatDescr_t descrA,
-                                        const float* csrValA,
-                                        const int* csrRowPtrA,
-                                        const int* csrColIndA,
-                                        const float* x,
-                                        const float* beta,
-                                        float* y,
-                                        float* buffer,
-                                        cudaStream_t stream)
-{
-  CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseCsrmvEx(handle,
-                         alg,
-                         transA,
-                         m,
-                         n,
-                         nnz,
-                         alpha,
-                         CUDA_R_32F,
-                         descrA,
-                         csrValA,
-                         CUDA_R_32F,
-                         csrRowPtrA,
-                         csrColIndA,
-                         x,
-                         CUDA_R_32F,
-                         beta,
-                         CUDA_R_32F,
-                         y,
-                         CUDA_R_32F,
-                         CUDA_R_32F,
-                         buffer);
-}
-template <>
-inline cusparseStatus_t cusparsecsrmvex(cusparseHandle_t handle,
-                                        cusparseAlgMode_t alg,
-                                        cusparseOperation_t transA,
-                                        int m,
-                                        int n,
-                                        int nnz,
-                                        const double* alpha,
-                                        const cusparseMatDescr_t descrA,
-                                        const double* csrValA,
-                                        const int* csrRowPtrA,
-                                        const int* csrColIndA,
-                                        const double* x,
-                                        const double* beta,
-                                        double* y,
-                                        double* buffer,
-                                        cudaStream_t stream)
-{
-  CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseCsrmvEx(handle,
-                         alg,
-                         transA,
-                         m,
-                         n,
-                         nnz,
-                         alpha,
-                         CUDA_R_64F,
-                         descrA,
-                         csrValA,
-                         CUDA_R_64F,
-                         csrRowPtrA,
-                         csrColIndA,
-                         x,
-                         CUDA_R_64F,
-                         beta,
-                         CUDA_R_64F,
-                         y,
-                         CUDA_R_64F,
-                         CUDA_R_64F,
-                         buffer);
+cusparseStatus_t cusparsecsrmvex(
+  cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
+  int m, int n, int nnz, const T* alpha, const cusparseMatDescr_t descrA,
+  const T* csrValA, const int* csrRowPtrA, const int* csrColIndA, const T* x,
+  const T* beta, T* y, T* buffer, cudaStream_t stream);
+template <>
+inline cusparseStatus_t cusparsecsrmvex(
+  cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
+  int m, int n, int nnz, const float* alpha, const cusparseMatDescr_t descrA,
+  const float* csrValA, const int* csrRowPtrA, const int* csrColIndA,
+  const float* x, const float* beta, float* y, float* buffer,
+  cudaStream_t stream) {
+  CUSPARSE_CHECK(cusparseSetStream(handle, stream));
+  return cusparseCsrmvEx(handle, alg, transA, m, n, nnz, alpha, CUDA_R_32F,
+                         descrA, csrValA, CUDA_R_32F, csrRowPtrA, csrColIndA, x,
+                         CUDA_R_32F, beta, CUDA_R_32F, y, CUDA_R_32F,
+                         CUDA_R_32F, buffer);
+}
+template <>
+inline cusparseStatus_t cusparsecsrmvex(
+  cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
+  int m, int n, int nnz, const double* alpha, const cusparseMatDescr_t descrA,
+  const double* csrValA, const int* csrRowPtrA, const int* csrColIndA,
+  const double* x, const double* beta, double* y, double* buffer,
+  cudaStream_t stream) {
+  CUSPARSE_CHECK(cusparseSetStream(handle, stream));
+  return cusparseCsrmvEx(handle, alg, transA, m, n, nnz, alpha, CUDA_R_64F,
+                         descrA, csrValA, CUDA_R_64F, csrRowPtrA, csrColIndA, x,
+                         CUDA_R_64F, beta, CUDA_R_64F, y, CUDA_R_64F,
+                         CUDA_R_64F, buffer);
 }
 
 /** @} */
@@ -1035,180 +637,68 @@ inline cusparseStatus_t cusparsecsrmvex(cusparseHandle_t handle,
  */
 
 template <typename T>
-cusparseStatus_t cusparsecsr2csc_bufferSize(cusparseHandle_t handle,
-                                            int m,
-                                            int n,
-                                            int nnz,
-                                            const T* csrVal,
-                                            const int* csrRowPtr,
-                                            const int* csrColInd,
-                                            void* cscVal,
-                                            int* cscColPtr,
-                                            int* cscRowInd,
-                                            cusparseAction_t copyValues,
-                                            cusparseIndexBase_t idxBase,
-                                            cusparseCsr2CscAlg_t alg,
-                                            size_t* bufferSize,
-                                            cudaStream_t stream);
+cusparseStatus_t cusparsecsr2csc_bufferSize(
+  cusparseHandle_t handle, int m, int n, int nnz, const T* csrVal,
+  const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr,
+  int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase,
+  cusparseCsr2CscAlg_t alg, size_t* bufferSize, cudaStream_t stream);
 
 template <>
-inline cusparseStatus_t cusparsecsr2csc_bufferSize(cusparseHandle_t handle,
-                                                   int m,
-                                                   int n,
-                                                   int nnz,
-                                                   const float* csrVal,
-                                                   const int* csrRowPtr,
-                                                   const int* csrColInd,
-                                                   void* cscVal,
-                                                   int* cscColPtr,
-                                                   int* cscRowInd,
-                                                   cusparseAction_t copyValues,
-                                                   cusparseIndexBase_t idxBase,
-                                                   cusparseCsr2CscAlg_t alg,
-                                                   size_t* bufferSize,
-                                                   cudaStream_t stream)
-{
+inline cusparseStatus_t cusparsecsr2csc_bufferSize(
+  cusparseHandle_t handle, int m, int n, int nnz, const float* csrVal,
+  const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr,
+  int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase,
+  cusparseCsr2CscAlg_t alg, size_t* bufferSize, cudaStream_t stream) {
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 
-  return cusparseCsr2cscEx2_bufferSize(handle,
-                                       m,
-                                       n,
-                                       nnz,
-                                       csrVal,
-                                       csrRowPtr,
-                                       csrColInd,
-                                       cscVal,
-                                       cscColPtr,
-                                       cscRowInd,
-                                       CUDA_R_32F,
-                                       copyValues,
-                                       idxBase,
-                                       alg,
-                                       bufferSize);
+  return cusparseCsr2cscEx2_bufferSize(
+    handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscColPtr,
+    cscRowInd, CUDA_R_32F, copyValues, idxBase, alg, bufferSize);
 }
 template <>
-inline cusparseStatus_t cusparsecsr2csc_bufferSize(cusparseHandle_t handle,
-                                                   int m,
-                                                   int n,
-                                                   int nnz,
-                                                   const double* csrVal,
-                                                   const int* csrRowPtr,
-                                                   const int* csrColInd,
-                                                   void* cscVal,
-                                                   int* cscColPtr,
-                                                   int* cscRowInd,
-                                                   cusparseAction_t copyValues,
-                                                   cusparseIndexBase_t idxBase,
-                                                   cusparseCsr2CscAlg_t alg,
-                                                   size_t* bufferSize,
-                                                   cudaStream_t stream)
-{
+inline cusparseStatus_t cusparsecsr2csc_bufferSize(
+  cusparseHandle_t handle, int m, int n, int nnz, const double* csrVal,
+  const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr,
+  int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase,
+  cusparseCsr2CscAlg_t alg, size_t* bufferSize, cudaStream_t stream) {
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 
-  return cusparseCsr2cscEx2_bufferSize(handle,
-                                       m,
-                                       n,
-                                       nnz,
-                                       csrVal,
-                                       csrRowPtr,
-                                       csrColInd,
-                                       cscVal,
-                                       cscColPtr,
-                                       cscRowInd,
-                                       CUDA_R_64F,
-                                       copyValues,
-                                       idxBase,
-                                       alg,
-                                       bufferSize);
+  return cusparseCsr2cscEx2_bufferSize(
+    handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscColPtr,
+    cscRowInd, CUDA_R_64F, copyValues, idxBase, alg, bufferSize);
 }
 
 template <typename T>
-cusparseStatus_t cusparsecsr2csc(cusparseHandle_t handle,
-                                 int m,
-                                 int n,
-                                 int nnz,
-                                 const T* csrVal,
-                                 const int* csrRowPtr,
-                                 const int* csrColInd,
-                                 void* cscVal,
-                                 int* cscColPtr,
-                                 int* cscRowInd,
-                                 cusparseAction_t copyValues,
-                                 cusparseIndexBase_t idxBase,
-                                 cusparseCsr2CscAlg_t alg,
-                                 void* buffer,
-                                 cudaStream_t stream);
+cusparseStatus_t cusparsecsr2csc(
+  cusparseHandle_t handle, int m, int n, int nnz, const T* csrVal,
+  const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr,
+  int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase,
+  cusparseCsr2CscAlg_t alg, void* buffer, cudaStream_t stream);
 
 template <>
-inline cusparseStatus_t cusparsecsr2csc(cusparseHandle_t handle,
-                                        int m,
-                                        int n,
-                                        int nnz,
-                                        const float* csrVal,
-                                        const int* csrRowPtr,
-                                        const int* csrColInd,
-                                        void* cscVal,
-                                        int* cscColPtr,
-                                        int* cscRowInd,
-                                        cusparseAction_t copyValues,
-                                        cusparseIndexBase_t idxBase,
-                                        cusparseCsr2CscAlg_t alg,
-                                        void* buffer,
-                                        cudaStream_t stream)
-{
+inline cusparseStatus_t cusparsecsr2csc(
+  cusparseHandle_t handle, int m, int n, int nnz, const float* csrVal,
+  const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr,
+  int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase,
+  cusparseCsr2CscAlg_t alg, void* buffer, cudaStream_t stream) {
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 
-  return cusparseCsr2cscEx2(handle,
-                            m,
-                            n,
-                            nnz,
-                            csrVal,
-                            csrRowPtr,
-                            csrColInd,
-                            cscVal,
-                            cscColPtr,
-                            cscRowInd,
-                            CUDA_R_32F,
-                            copyValues,
-                            idxBase,
-                            alg,
-                            buffer);
+  return cusparseCsr2cscEx2(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd,
+                            cscVal, cscColPtr, cscRowInd, CUDA_R_32F,
+                            copyValues, idxBase, alg, buffer);
 }
 
 template <>
-inline cusparseStatus_t cusparsecsr2csc(cusparseHandle_t handle,
-                                        int m,
-                                        int n,
-                                        int nnz,
-                                        const double* csrVal,
-                                        const int* csrRowPtr,
-                                        const int* csrColInd,
-                                        void* cscVal,
-                                        int* cscColPtr,
-                                        int* cscRowInd,
-                                        cusparseAction_t copyValues,
-                                        cusparseIndexBase_t idxBase,
-                                        cusparseCsr2CscAlg_t alg,
-                                        void* buffer,
-                                        cudaStream_t stream)
-{
+inline cusparseStatus_t cusparsecsr2csc(
+  cusparseHandle_t handle, int m, int n, int nnz, const double* csrVal,
+  const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr,
+  int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase,
+  cusparseCsr2CscAlg_t alg, void* buffer, cudaStream_t stream) {
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 
-  return cusparseCsr2cscEx2(handle,
-                            m,
-                            n,
-                            nnz,
-                            csrVal,
-                            csrRowPtr,
-                            csrColInd,
-                            cscVal,
-                            cscColPtr,
-                            cscRowInd,
-                            CUDA_R_64F,
-                            copyValues,
-                            idxBase,
-                            alg,
-                            buffer);
+  return cusparseCsr2cscEx2(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd,
+                            cscVal, cscColPtr, cscRowInd, CUDA_R_64F,
+                            copyValues, idxBase, alg, buffer);
 }
 
 /** @} */
@@ -1219,329 +709,120 @@ inline cusparseStatus_t cusparsecsr2csc(cusparseHandle_t handle,
  */
 
 template <typename T>
-cusparseStatus_t cusparsecsrgemm2_buffersizeext(cusparseHandle_t handle,
-                                                int m,
-                                                int n,
-                                                int k,
-                                                const T* alpha,
-                                                const T* beta,
-                                                const cusparseMatDescr_t matA,
-                                                int nnzA,
-                                                const int* rowindA,
-                                                const int* indicesA,
-                                                const cusparseMatDescr_t matB,
-                                                int nnzB,
-                                                const int* rowindB,
-                                                const int* indicesB,
-                                                const cusparseMatDescr_t matD,
-                                                int nnzD,
-                                                const int* rowindD,
-                                                const int* indicesD,
-                                                csrgemm2Info_t info,
-                                                size_t* pBufferSizeInBytes,
-                                                cudaStream_t stream);
-
-template <>
-inline cusparseStatus_t cusparsecsrgemm2_buffersizeext(cusparseHandle_t handle,
-                                                       int m,
-                                                       int n,
-                                                       int k,
-                                                       const float* alpha,
-                                                       const float* beta,
-                                                       const cusparseMatDescr_t matA,
-                                                       int nnzA,
-                                                       const int* rowindA,
-                                                       const int* indicesA,
-                                                       const cusparseMatDescr_t matB,
-                                                       int nnzB,
-                                                       const int* rowindB,
-                                                       const int* indicesB,
-                                                       const cusparseMatDescr_t matD,
-                                                       int nnzD,
-                                                       const int* rowindD,
-                                                       const int* indicesD,
-                                                       csrgemm2Info_t info,
-                                                       size_t* pBufferSizeInBytes,
-                                                       cudaStream_t stream)
-{
+cusparseStatus_t cusparsecsrgemm2_buffersizeext(
+  cusparseHandle_t handle, int m, int n, int k, const T* alpha, const T* beta,
+  const cusparseMatDescr_t matA, int nnzA, const int* rowindA,
+  const int* indicesA, const cusparseMatDescr_t matB, int nnzB,
+  const int* rowindB, const int* indicesB, const cusparseMatDescr_t matD,
+  int nnzD, const int* rowindD, const int* indicesD, csrgemm2Info_t info,
+  size_t* pBufferSizeInBytes, cudaStream_t stream);
+
+template <>
+inline cusparseStatus_t cusparsecsrgemm2_buffersizeext(
+  cusparseHandle_t handle, int m, int n, int k, const float* alpha,
+  const float* beta, const cusparseMatDescr_t matA, int nnzA,
+  const int* rowindA, const int* indicesA, const cusparseMatDescr_t matB,
+  int nnzB, const int* rowindB, const int* indicesB,
+  const cusparseMatDescr_t matD, int nnzD, const int* rowindD,
+  const int* indicesD, csrgemm2Info_t info, size_t* pBufferSizeInBytes,
+  cudaStream_t stream) {
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-  return cusparseScsrgemm2_bufferSizeExt(handle,
-                                         m,
-                                         n,
-                                         k,
-                                         alpha,
-                                         matA,
-                                         nnzA,
-                                         rowindA,
-                                         indicesA,
-                                         matB,
-                                         nnzB,
-                                         rowindB,
-                                         indicesB,
-                                         beta,
-                                         matD,
-                                         nnzD,
-                                         rowindD,
-                                         indicesD,
-                                         info,
-                                         pBufferSizeInBytes);
+  return cusparseScsrgemm2_bufferSizeExt(
+    handle, m, n, k, alpha, matA, nnzA, rowindA, indicesA, matB, nnzB, rowindB,
+    indicesB, beta, matD, nnzD, rowindD, indicesD, info, pBufferSizeInBytes);
 #pragma GCC diagnostic pop
 }
 
 template <>
-inline cusparseStatus_t cusparsecsrgemm2_buffersizeext(cusparseHandle_t handle,
-                                                       int m,
-                                                       int n,
-                                                       int k,
-                                                       const double* alpha,
-                                                       const double* beta,
-                                                       const cusparseMatDescr_t matA,
-                                                       int nnzA,
-                                                       const int* rowindA,
-                                                       const int* indicesA,
-                                                       const cusparseMatDescr_t matB,
-                                                       int nnzB,
-                                                       const int* rowindB,
-                                                       const int* indicesB,
-                                                       const cusparseMatDescr_t matD,
-                                                       int nnzD,
-                                                       const int* rowindD,
-                                                       const int* indicesD,
-                                                       csrgemm2Info_t info,
-                                                       size_t* pBufferSizeInBytes,
-                                                       cudaStream_t stream)
-{
+inline cusparseStatus_t cusparsecsrgemm2_buffersizeext(
+  cusparseHandle_t handle, int m, int n, int k, const double* alpha,
+  const double* beta, const cusparseMatDescr_t matA, int nnzA,
+  const int* rowindA, const int* indicesA, const cusparseMatDescr_t matB,
+  int nnzB, const int* rowindB, const int* indicesB,
+  const cusparseMatDescr_t matD, int nnzD, const int* rowindD,
+  const int* indicesD, csrgemm2Info_t info, size_t* pBufferSizeInBytes,
+  cudaStream_t stream) {
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-  return cusparseDcsrgemm2_bufferSizeExt(handle,
-                                         m,
-                                         n,
-                                         k,
-                                         alpha,
-                                         matA,
-                                         nnzA,
-                                         rowindA,
-                                         indicesA,
-                                         matB,
-                                         nnzB,
-                                         rowindB,
-                                         indicesB,
-                                         beta,
-                                         matD,
-                                         nnzD,
-                                         rowindD,
-                                         indicesD,
-                                         info,
-                                         pBufferSizeInBytes);
+  return cusparseDcsrgemm2_bufferSizeExt(
+    handle, m, n, k, alpha, matA, nnzA, rowindA, indicesA, matB, nnzB, rowindB,
+    indicesB, beta, matD, nnzD, rowindD, indicesD, info, pBufferSizeInBytes);
 #pragma GCC diagnostic pop
 }
 
-inline cusparseStatus_t cusparsecsrgemm2nnz(cusparseHandle_t handle,
-                                            int m,
-                                            int n,
-                                            int k,
-                                            const cusparseMatDescr_t matA,
-                                            int nnzA,
-                                            const int* rowindA,
-                                            const int* indicesA,
-                                            const cusparseMatDescr_t matB,
-                                            int nnzB,
-                                            const int* rowindB,
-                                            const int* indicesB,
-                                            const cusparseMatDescr_t matD,
-                                            int nnzD,
-                                            const int* rowindD,
-                                            const int* indicesD,
-                                            const cusparseMatDescr_t matC,
-                                            int* rowindC,
-                                            int* nnzC,
-                                            const csrgemm2Info_t info,
-                                            void* pBuffer,
-                                            cudaStream_t stream)
-{
+inline cusparseStatus_t cusparsecsrgemm2nnz(
+  cusparseHandle_t handle, int m, int n, int k, const cusparseMatDescr_t matA,
+  int nnzA, const int* rowindA, const int* indicesA,
+  const cusparseMatDescr_t matB, int nnzB, const int* rowindB,
+  const int* indicesB, const cusparseMatDescr_t matD, int nnzD,
+  const int* rowindD, const int* indicesD, const cusparseMatDescr_t matC,
+  int* rowindC, int* nnzC, const csrgemm2Info_t info, void* pBuffer,
+  cudaStream_t stream) {
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-  return cusparseXcsrgemm2Nnz(handle,
-                              m,
-                              n,
-                              k,
-                              matA,
-                              nnzA,
-                              rowindA,
-                              indicesA,
-                              matB,
-                              nnzB,
-                              rowindB,
-                              indicesB,
-                              matD,
-                              nnzD,
-                              rowindD,
-                              indicesD,
-                              matC,
-                              rowindC,
-                              nnzC,
-                              info,
+  return cusparseXcsrgemm2Nnz(handle, m, n, k, matA, nnzA, rowindA, indicesA,
+                              matB, nnzB, rowindB, indicesB, matD, nnzD,
+                              rowindD, indicesD, matC, rowindC, nnzC, info,
                               pBuffer);
 #pragma GCC diagnostic pop
 }
 
 template <typename T>
-cusparseStatus_t cusparsecsrgemm2(cusparseHandle_t handle,
-                                  int m,
-                                  int n,
-                                  int k,
-                                  const T* alpha,
-                                  const cusparseMatDescr_t descrA,
-                                  int nnzA,
-                                  const T* csrValA,
-                                  const int* csrRowPtrA,
-                                  const int* csrColIndA,
-                                  const cusparseMatDescr_t descrB,
-                                  int nnzB,
-                                  const T* csrValB,
-                                  const int* csrRowPtrB,
-                                  const int* csrColIndB,
-                                  const T* beta,
-                                  const cusparseMatDescr_t descrD,
-                                  int nnzD,
-                                  const T* csrValD,
-                                  const int* csrRowPtrD,
-                                  const int* csrColIndD,
-                                  const cusparseMatDescr_t descrC,
-                                  T* csrValC,
-                                  const int* csrRowPtrC,
-                                  int* csrColIndC,
-                                  const csrgemm2Info_t info,
-                                  void* pBuffer,
-                                  cudaStream_t stream);
-
-template <>
-inline cusparseStatus_t cusparsecsrgemm2(cusparseHandle_t handle,
-                                         int m,
-                                         int n,
-                                         int k,
-                                         const float* alpha,
-                                         const cusparseMatDescr_t descrA,
-                                         int nnzA,
-                                         const float* csrValA,
-                                         const int* csrRowPtrA,
-                                         const int* csrColIndA,
-                                         const cusparseMatDescr_t descrB,
-                                         int nnzB,
-                                         const float* csrValB,
-                                         const int* csrRowPtrB,
-                                         const int* csrColIndB,
-                                         const float* beta,
-                                         const cusparseMatDescr_t descrD,
-                                         int nnzD,
-                                         const float* csrValD,
-                                         const int* csrRowPtrD,
-                                         const int* csrColIndD,
-                                         const cusparseMatDescr_t descrC,
-                                         float* csrValC,
-                                         const int* csrRowPtrC,
-                                         int* csrColIndC,
-                                         const csrgemm2Info_t info,
-                                         void* pBuffer,
-                                         cudaStream_t stream)
-{
+cusparseStatus_t cusparsecsrgemm2(
+  cusparseHandle_t handle, int m, int n, int k, const T* alpha,
+  const cusparseMatDescr_t descrA, int nnzA, const T* csrValA,
+  const int* csrRowPtrA, const int* csrColIndA, const cusparseMatDescr_t descrB,
+  int nnzB, const T* csrValB, const int* csrRowPtrB, const int* csrColIndB,
+  const T* beta, const cusparseMatDescr_t descrD, int nnzD, const T* csrValD,
+  const int* csrRowPtrD, const int* csrColIndD, const cusparseMatDescr_t descrC,
+  T* csrValC, const int* csrRowPtrC, int* csrColIndC, const csrgemm2Info_t info,
+  void* pBuffer, cudaStream_t stream);
+
+template <>
+inline cusparseStatus_t cusparsecsrgemm2(
+  cusparseHandle_t handle, int m, int n, int k, const float* alpha,
+  const cusparseMatDescr_t descrA, int nnzA, const float* csrValA,
+  const int* csrRowPtrA, const int* csrColIndA, const cusparseMatDescr_t descrB,
+  int nnzB, const float* csrValB, const int* csrRowPtrB, const int* csrColIndB,
+  const float* beta, const cusparseMatDescr_t descrD, int nnzD,
+  const float* csrValD, const int* csrRowPtrD, const int* csrColIndD,
+  const cusparseMatDescr_t descrC, float* csrValC, const int* csrRowPtrC,
+  int* csrColIndC, const csrgemm2Info_t info, void* pBuffer,
+  cudaStream_t stream) {
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-  return cusparseScsrgemm2(handle,
-                           m,
-                           n,
-                           k,
-                           alpha,
-                           descrA,
-                           nnzA,
-                           csrValA,
-                           csrRowPtrA,
-                           csrColIndA,
-                           descrB,
-                           nnzB,
-                           csrValB,
-                           csrRowPtrB,
-                           csrColIndB,
-                           beta,
-                           descrD,
-                           nnzD,
-                           csrValD,
-                           csrRowPtrD,
-                           csrColIndD,
-                           descrC,
-                           csrValC,
-                           csrRowPtrC,
-                           csrColIndC,
-                           info,
-                           pBuffer);
+  return cusparseScsrgemm2(handle, m, n, k, alpha, descrA, nnzA, csrValA,
+                           csrRowPtrA, csrColIndA, descrB, nnzB, csrValB,
+                           csrRowPtrB, csrColIndB, beta, descrD, nnzD, csrValD,
+                           csrRowPtrD, csrColIndD, descrC, csrValC, csrRowPtrC,
+                           csrColIndC, info, pBuffer);
 #pragma GCC diagnostic pop
 }
 
 template <>
-inline cusparseStatus_t cusparsecsrgemm2(cusparseHandle_t handle,
-                                         int m,
-                                         int n,
-                                         int k,
-                                         const double* alpha,
-                                         const cusparseMatDescr_t descrA,
-                                         int nnzA,
-                                         const double* csrValA,
-                                         const int* csrRowPtrA,
-                                         const int* csrColIndA,
-                                         const cusparseMatDescr_t descrB,
-                                         int nnzB,
-                                         const double* csrValB,
-                                         const int* csrRowPtrB,
-                                         const int* csrColIndB,
-                                         const double* beta,
-                                         const cusparseMatDescr_t descrD,
-                                         int nnzD,
-                                         const double* csrValD,
-                                         const int* csrRowPtrD,
-                                         const int* csrColIndD,
-                                         const cusparseMatDescr_t descrC,
-                                         double* csrValC,
-                                         const int* csrRowPtrC,
-                                         int* csrColIndC,
-                                         const csrgemm2Info_t info,
-                                         void* pBuffer,
-                                         cudaStream_t stream)
-{
+inline cusparseStatus_t cusparsecsrgemm2(
+  cusparseHandle_t handle, int m, int n, int k, const double* alpha,
+  const cusparseMatDescr_t descrA, int nnzA, const double* csrValA,
+  const int* csrRowPtrA, const int* csrColIndA, const cusparseMatDescr_t descrB,
+  int nnzB, const double* csrValB, const int* csrRowPtrB, const int* csrColIndB,
+  const double* beta, const cusparseMatDescr_t descrD, int nnzD,
+  const double* csrValD, const int* csrRowPtrD, const int* csrColIndD,
+  const cusparseMatDescr_t descrC, double* csrValC, const int* csrRowPtrC,
+  int* csrColIndC, const csrgemm2Info_t info, void* pBuffer,
+  cudaStream_t stream) {
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-  return cusparseDcsrgemm2(handle,
-                           m,
-                           n,
-                           k,
-                           alpha,
-                           descrA,
-                           nnzA,
-                           csrValA,
-                           csrRowPtrA,
-                           csrColIndA,
-                           descrB,
-                           nnzB,
-                           csrValB,
-                           csrRowPtrB,
-                           csrColIndB,
-                           beta,
-                           descrD,
-                           nnzD,
-                           csrValD,
-                           csrRowPtrD,
-                           csrColIndD,
-                           descrC,
-                           csrValC,
-                           csrRowPtrC,
-                           csrColIndC,
-                           info,
-                           pBuffer);
+  return cusparseDcsrgemm2(handle, m, n, k, alpha, descrA, nnzA, csrValA,
+                           csrRowPtrA, csrColIndA, descrB, nnzB, csrValB,
+                           csrRowPtrB, csrColIndB, beta, descrD, nnzD, csrValD,
+                           csrRowPtrD, csrColIndD, descrC, csrValC, csrRowPtrC,
+                           csrColIndC, info, pBuffer);
 #pragma GCC diagnostic pop
 }
 
@@ -1553,46 +834,33 @@ inline cusparseStatus_t cusparsecsrgemm2(cusparseHandle_t handle,
  */
 
 template <typename T>
-cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle,
-                                   int m,
-                                   int n,
+cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, int m, int n,
                                    const cusparseMatDescr_t descrA,
-                                   const T* csrValA,
-                                   const int* csrRowPtrA,
-                                   const int* csrColIndA,
-                                   T* A,
-                                   int lda,
+                                   const T* csrValA, const int* csrRowPtrA,
+                                   const int* csrColIndA, T* A, int lda,
                                    cudaStream_t stream);
 
 template <>
-inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle,
-                                          int m,
-                                          int n,
+inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, int m, int n,
                                           const cusparseMatDescr_t descrA,
                                           const float* csrValA,
                                           const int* csrRowPtrA,
-                                          const int* csrColIndA,
-                                          float* A,
-                                          int lda,
-                                          cudaStream_t stream)
-{
+                                          const int* csrColIndA, float* A,
+                                          int lda, cudaStream_t stream) {
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseScsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A, lda);
+  return cusparseScsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA,
+                            csrColIndA, A, lda);
 }
 template <>
-inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle,
-                                          int m,
-                                          int n,
+inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, int m, int n,
                                           const cusparseMatDescr_t descrA,
                                           const double* csrValA,
                                           const int* csrRowPtrA,
-                                          const int* csrColIndA,
-                                          double* A,
-                                          int lda,
-                                          cudaStream_t stream)
-{
+                                          const int* csrColIndA, double* A,
+                                          int lda, cudaStream_t stream) {
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseDcsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A, lda);
+  return cusparseDcsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA,
+                            csrColIndA, A, lda);
 }
 
 /** @} */
diff --git a/cpp/include/raft/sparse/distance/bin_distance.cuh b/cpp/include/raft/sparse/distance/bin_distance.cuh
index aef19122da..f3109556b7 100644
--- a/cpp/include/raft/sparse/distance/bin_distance.cuh
+++ b/cpp/include/raft/sparse/distance/bin_distance.cuh
@@ -37,11 +37,9 @@ namespace distance {
 
 // @TODO: Move this into sparse prims (coo_norm)
 template <typename value_idx, typename value_t>
-__global__ void compute_binary_row_norm_kernel(value_t* out,
-                                               const value_idx* __restrict__ coo_rows,
-                                               const value_t* __restrict__ data,
-                                               value_idx nnz)
-{
+__global__ void compute_binary_row_norm_kernel(
+  value_t *out, const value_idx *__restrict__ coo_rows,
+  const value_t *__restrict__ data, value_idx nnz) {
   value_idx i = blockDim.x * blockIdx.x + threadIdx.x;
   if (i < nnz) {
     // We do conditional here only because it's
@@ -53,64 +51,55 @@ __global__ void compute_binary_row_norm_kernel(value_t* out,
 }
 
 template <typename value_idx, typename value_t, typename expansion_f>
-__global__ void compute_binary_warp_kernel(value_t* __restrict__ C,
-                                           const value_t* __restrict__ Q_norms,
-                                           const value_t* __restrict__ R_norms,
-                                           value_idx n_rows,
-                                           value_idx n_cols,
-                                           expansion_f expansion_func)
-{
+__global__ void compute_binary_warp_kernel(value_t *__restrict__ C,
+                                           const value_t *__restrict__ Q_norms,
+                                           const value_t *__restrict__ R_norms,
+                                           value_idx n_rows, value_idx n_cols,
+                                           expansion_f expansion_func) {
   value_idx tid = blockDim.x * blockIdx.x + threadIdx.x;
-  value_idx i   = tid / n_cols;
-  value_idx j   = tid % n_cols;
+  value_idx i = tid / n_cols;
+  value_idx j = tid % n_cols;
 
   if (i >= n_rows || j >= n_cols) return;
 
-  value_t q_norm            = Q_norms[i];
-  value_t r_norm            = R_norms[j];
-  value_t dot               = C[(size_t)i * n_cols + j];
+  value_t q_norm = Q_norms[i];
+  value_t r_norm = R_norms[j];
+  value_t dot = C[(size_t)i * n_cols + j];
   C[(size_t)i * n_cols + j] = expansion_func(dot, q_norm, r_norm);
 }
 
-template <typename value_idx, typename value_t, typename expansion_f, int tpb = 1024>
-void compute_binary(value_t* C,
-                    const value_t* Q_norms,
-                    const value_t* R_norms,
-                    value_idx n_rows,
-                    value_idx n_cols,
-                    expansion_f expansion_func,
-                    cudaStream_t stream)
-{
+template <typename value_idx, typename value_t, typename expansion_f,
+          int tpb = 1024>
+void compute_binary(value_t *C, const value_t *Q_norms, const value_t *R_norms,
+                    value_idx n_rows, value_idx n_cols,
+                    expansion_f expansion_func, cudaStream_t stream) {
   int blocks = raft::ceildiv<size_t>((size_t)n_rows * n_cols, tpb);
   compute_binary_warp_kernel<<<blocks, tpb, 0, stream>>>(
     C, Q_norms, R_norms, n_rows, n_cols, expansion_func);
 }
 
-template <typename value_idx, typename value_t, typename expansion_f, int tpb = 1024>
-void compute_bin_distance(value_t* out,
-                          const value_idx* Q_coo_rows,
-                          const value_t* Q_data,
-                          value_idx Q_nnz,
-                          const value_idx* R_coo_rows,
-                          const value_t* R_data,
-                          value_idx R_nnz,
-                          value_idx m,
-                          value_idx n,
+template <typename value_idx, typename value_t, typename expansion_f,
+          int tpb = 1024>
+void compute_bin_distance(value_t *out, const value_idx *Q_coo_rows,
+                          const value_t *Q_data, value_idx Q_nnz,
+                          const value_idx *R_coo_rows, const value_t *R_data,
+                          value_idx R_nnz, value_idx m, value_idx n,
                           std::shared_ptr<raft::mr::device::allocator> alloc,
-                          cudaStream_t stream,
-                          expansion_f expansion_func)
-{
+                          cudaStream_t stream, expansion_f expansion_func) {
   rmm::device_uvector<value_t> Q_norms(m, stream);
   rmm::device_uvector<value_t> R_norms(n, stream);
-  CUDA_CHECK(cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t)));
-  CUDA_CHECK(cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t)));
+  CUDA_CHECK(
+    cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t)));
+  CUDA_CHECK(
+    cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t)));
 
   compute_binary_row_norm_kernel<<<raft::ceildiv(Q_nnz, tpb), tpb, 0, stream>>>(
     Q_norms.data(), Q_coo_rows, Q_data, Q_nnz);
   compute_binary_row_norm_kernel<<<raft::ceildiv(R_nnz, tpb), tpb, 0, stream>>>(
     R_norms.data(), R_coo_rows, R_data, R_nnz);
 
-  compute_binary(out, Q_norms.data(), R_norms.data(), m, n, expansion_func, stream);
+  compute_binary(out, Q_norms.data(), R_norms.data(), m, n, expansion_func,
+                 stream);
 }
 
 /**
@@ -120,52 +109,44 @@ void compute_bin_distance(value_t* out,
 template <typename value_idx = int, typename value_t = float>
 class jaccard_expanded_distances_t : public distances_t<value_t> {
  public:
-  explicit jaccard_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
-    : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config)
-  {
-  }
+  explicit jaccard_expanded_distances_t(
+    const distances_config_t<value_idx, value_t> &config)
+    : config_(&config),
+      workspace(0, config.handle.get_stream()),
+      ip_dists(config) {}
 
-  void compute(value_t* out_dists)
-  {
+  void compute(value_t *out_dists) {
     ip_dists.compute(out_dists);
 
-    value_idx* b_indices = ip_dists.b_rows_coo();
-    value_t* b_data      = ip_dists.b_data_coo();
+    value_idx *b_indices = ip_dists.b_rows_coo();
+    value_t *b_data = ip_dists.b_data_coo();
 
-    rmm::device_uvector<value_idx> search_coo_rows(config_->a_nnz, config_->handle.get_stream());
-    raft::sparse::convert::csr_to_coo(config_->a_indptr,
-                                      config_->a_nrows,
-                                      search_coo_rows.data(),
-                                      config_->a_nnz,
+    rmm::device_uvector<value_idx> search_coo_rows(
+      config_->a_nnz, config_->handle.get_stream());
+    raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
+                                      search_coo_rows.data(), config_->a_nnz,
                                       config_->handle.get_stream());
 
-    compute_bin_distance(out_dists,
-                         search_coo_rows.data(),
-                         config_->a_data,
-                         config_->a_nnz,
-                         b_indices,
-                         b_data,
-                         config_->b_nnz,
-                         config_->a_nrows,
-                         config_->b_nrows,
-                         config_->handle.get_device_allocator(),
-                         config_->handle.get_stream(),
-                         [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
-                           value_t q_r_union = q_norm + r_norm;
-                           value_t denom     = q_r_union - dot;
-
-                           value_t jacc = ((denom != 0) * dot) / ((denom == 0) + denom);
-
-                           // flip the similarity when both rows are 0
-                           bool both_empty = q_r_union == 0;
-                           return 1 - ((!both_empty * jacc) + both_empty);
-                         });
+    compute_bin_distance(
+      out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz,
+      b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows,
+      config_->handle.get_device_allocator(), config_->handle.get_stream(),
+      [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
+        value_t q_r_union = q_norm + r_norm;
+        value_t denom = q_r_union - dot;
+
+        value_t jacc = ((denom != 0) * dot) / ((denom == 0) + denom);
+
+        // flip the similarity when both rows are 0
+        bool both_empty = q_r_union == 0;
+        return 1 - ((!both_empty * jacc) + both_empty);
+      });
   }
 
   ~jaccard_expanded_distances_t() = default;
 
  private:
-  const distances_config_t<value_idx, value_t>* config_;
+  const distances_config_t<value_idx, value_t> *config_;
   rmm::device_uvector<char> workspace;
   ip_distances_t<value_idx, value_t> ip_dists;
 };
@@ -177,48 +158,40 @@ class jaccard_expanded_distances_t : public distances_t<value_t> {
 template <typename value_idx = int, typename value_t = float>
 class dice_expanded_distances_t : public distances_t<value_t> {
  public:
-  explicit dice_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
-    : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config)
-  {
-  }
+  explicit dice_expanded_distances_t(
+    const distances_config_t<value_idx, value_t> &config)
+    : config_(&config),
+      workspace(0, config.handle.get_stream()),
+      ip_dists(config) {}
 
-  void compute(value_t* out_dists)
-  {
+  void compute(value_t *out_dists) {
     ip_dists.compute(out_dists);
 
-    value_idx* b_indices = ip_dists.b_rows_coo();
-    value_t* b_data      = ip_dists.b_data_coo();
+    value_idx *b_indices = ip_dists.b_rows_coo();
+    value_t *b_data = ip_dists.b_data_coo();
 
-    rmm::device_uvector<value_idx> search_coo_rows(config_->a_nnz, config_->handle.get_stream());
-    raft::sparse::convert::csr_to_coo(config_->a_indptr,
-                                      config_->a_nrows,
-                                      search_coo_rows.data(),
-                                      config_->a_nnz,
+    rmm::device_uvector<value_idx> search_coo_rows(
+      config_->a_nnz, config_->handle.get_stream());
+    raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
+                                      search_coo_rows.data(), config_->a_nnz,
                                       config_->handle.get_stream());
 
-    compute_bin_distance(out_dists,
-                         search_coo_rows.data(),
-                         config_->a_data,
-                         config_->a_nnz,
-                         b_indices,
-                         b_data,
-                         config_->b_nnz,
-                         config_->a_nrows,
-                         config_->b_nrows,
-                         config_->handle.get_device_allocator(),
-                         config_->handle.get_stream(),
-                         [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
-                           value_t q_r_union = q_norm + r_norm;
-                           value_t dice      = (2 * dot) / q_r_union;
-                           bool both_empty   = q_r_union == 0;
-                           return 1 - ((!both_empty * dice) + both_empty);
-                         });
+    compute_bin_distance(
+      out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz,
+      b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows,
+      config_->handle.get_device_allocator(), config_->handle.get_stream(),
+      [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
+        value_t q_r_union = q_norm + r_norm;
+        value_t dice = (2 * dot) / q_r_union;
+        bool both_empty = q_r_union == 0;
+        return 1 - ((!both_empty * dice) + both_empty);
+      });
   }
 
   ~dice_expanded_distances_t() = default;
 
  private:
-  const distances_config_t<value_idx, value_t>* config_;
+  const distances_config_t<value_idx, value_t> *config_;
   rmm::device_uvector<char> workspace;
   ip_distances_t<value_idx, value_t> ip_dists;
 };
diff --git a/cpp/include/raft/sparse/distance/common.h b/cpp/include/raft/sparse/distance/common.h
index 29c823bcdb..1c55412eec 100644
--- a/cpp/include/raft/sparse/distance/common.h
+++ b/cpp/include/raft/sparse/distance/common.h
@@ -24,31 +24,31 @@ namespace distance {
 
 template <typename value_idx, typename value_t>
 struct distances_config_t {
-  distances_config_t(const raft::handle_t& handle_) : handle(handle_) {}
+  distances_config_t(const raft::handle_t &handle_) : handle(handle_) {}
 
   // left side
   value_idx a_nrows;
   value_idx a_ncols;
   value_idx a_nnz;
-  value_idx* a_indptr;
-  value_idx* a_indices;
-  value_t* a_data;
+  value_idx *a_indptr;
+  value_idx *a_indices;
+  value_t *a_data;
 
   // right side
   value_idx b_nrows;
   value_idx b_ncols;
   value_idx b_nnz;
-  value_idx* b_indptr;
-  value_idx* b_indices;
-  value_t* b_data;
+  value_idx *b_indptr;
+  value_idx *b_indices;
+  value_t *b_data;
 
-  const raft::handle_t& handle;
+  const raft::handle_t &handle;
 };
 
 template <typename value_t>
 class distances_t {
  public:
-  virtual void compute(value_t* out) {}
+  virtual void compute(value_t *out) {}
   virtual ~distances_t() = default;
 };
 
diff --git a/cpp/include/raft/sparse/distance/coo_spmv.cuh b/cpp/include/raft/sparse/distance/coo_spmv.cuh
index cdf1be0c68..3a78f9ada0 100644
--- a/cpp/include/raft/sparse/distance/coo_spmv.cuh
+++ b/cpp/include/raft/sparse/distance/coo_spmv.cuh
@@ -41,29 +41,19 @@ namespace raft {
 namespace sparse {
 namespace distance {
 
-template <typename value_idx,
-          typename value_t,
-          int threads_per_block = 1024,
-          typename product_f,
-          typename accum_f,
-          typename write_f,
+template <typename value_idx, typename value_t, int threads_per_block = 1024,
+          typename product_f, typename accum_f, typename write_f,
           typename strategy_t>
 inline void balanced_coo_pairwise_generalized_spmv(
-  value_t* out_dists,
-  const distances_config_t<value_idx, value_t>& config_,
-  value_idx* coo_rows_b,
-  product_f product_func,
-  accum_f accum_func,
-  write_f write_func,
-  strategy_t strategy,
-  int chunk_size = 500000)
-{
-  CUDA_CHECK(cudaMemsetAsync(out_dists,
-                             0,
-                             sizeof(value_t) * config_.a_nrows * config_.b_nrows,
-                             config_.handle.get_stream()));
-
-  strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size);
+  value_t *out_dists, const distances_config_t<value_idx, value_t> &config_,
+  value_idx *coo_rows_b, product_f product_func, accum_f accum_func,
+  write_f write_func, strategy_t strategy, int chunk_size = 500000) {
+  CUDA_CHECK(cudaMemsetAsync(
+    out_dists, 0, sizeof(value_t) * config_.a_nrows * config_.b_nrows,
+    config_.handle.get_stream()));
+
+  strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func,
+                    chunk_size);
 };
 
 /**
@@ -99,55 +89,39 @@ inline void balanced_coo_pairwise_generalized_spmv(
  *            this value was found through profiling and represents a reasonable
  *            setting for both large and small densities
  */
-template <typename value_idx,
-          typename value_t,
-          int threads_per_block = 1024,
-          typename product_f,
-          typename accum_f,
-          typename write_f>
+template <typename value_idx, typename value_t, int threads_per_block = 1024,
+          typename product_f, typename accum_f, typename write_f>
 inline void balanced_coo_pairwise_generalized_spmv(
-  value_t* out_dists,
-  const distances_config_t<value_idx, value_t>& config_,
-  value_idx* coo_rows_b,
-  product_f product_func,
-  accum_f accum_func,
-  write_f write_func,
-  int chunk_size = 500000)
-{
-  CUDA_CHECK(cudaMemsetAsync(out_dists,
-                             0,
-                             sizeof(value_t) * config_.a_nrows * config_.b_nrows,
-                             config_.handle.get_stream()));
+  value_t *out_dists, const distances_config_t<value_idx, value_t> &config_,
+  value_idx *coo_rows_b, product_f product_func, accum_f accum_func,
+  write_f write_func, int chunk_size = 500000) {
+  CUDA_CHECK(cudaMemsetAsync(
+    out_dists, 0, sizeof(value_t) * config_.a_nrows * config_.b_nrows,
+    config_.handle.get_stream()));
 
   int max_cols = max_cols_per_block<value_idx, value_t>();
 
   if (max_cols > config_.a_ncols) {
-    dense_smem_strategy<value_idx, value_t, threads_per_block> strategy(config_);
-    strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size);
+    dense_smem_strategy<value_idx, value_t, threads_per_block> strategy(
+      config_);
+    strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func,
+                      write_func, chunk_size);
   } else {
     hash_strategy<value_idx, value_t, threads_per_block> strategy(config_);
-    strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size);
+    strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func,
+                      write_func, chunk_size);
   }
 };
 
-template <typename value_idx,
-          typename value_t,
-          int threads_per_block = 1024,
-          typename product_f,
-          typename accum_f,
-          typename write_f,
+template <typename value_idx, typename value_t, int threads_per_block = 1024,
+          typename product_f, typename accum_f, typename write_f,
           typename strategy_t>
 inline void balanced_coo_pairwise_generalized_spmv_rev(
-  value_t* out_dists,
-  const distances_config_t<value_idx, value_t>& config_,
-  value_idx* coo_rows_a,
-  product_f product_func,
-  accum_f accum_func,
-  write_f write_func,
-  strategy_t strategy,
-  int chunk_size = 500000)
-{
-  strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size);
+  value_t *out_dists, const distances_config_t<value_idx, value_t> &config_,
+  value_idx *coo_rows_a, product_f product_func, accum_f accum_func,
+  write_f write_func, strategy_t strategy, int chunk_size = 500000) {
+  strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func,
+                        write_func, chunk_size);
 };
 
 /**
@@ -186,30 +160,24 @@ inline void balanced_coo_pairwise_generalized_spmv_rev(
  *            this value was found through profiling and represents a reasonable
  *            setting for both large and small densities
  */
-template <typename value_idx,
-          typename value_t,
-          int threads_per_block = 1024,
-          typename product_f,
-          typename accum_f,
-          typename write_f>
+template <typename value_idx, typename value_t, int threads_per_block = 1024,
+          typename product_f, typename accum_f, typename write_f>
 inline void balanced_coo_pairwise_generalized_spmv_rev(
-  value_t* out_dists,
-  const distances_config_t<value_idx, value_t>& config_,
-  value_idx* coo_rows_a,
-  product_f product_func,
-  accum_f accum_func,
-  write_f write_func,
-  int chunk_size = 500000)
-{
+  value_t *out_dists, const distances_config_t<value_idx, value_t> &config_,
+  value_idx *coo_rows_a, product_f product_func, accum_f accum_func,
+  write_f write_func, int chunk_size = 500000) {
   // try dense first
   int max_cols = max_cols_per_block<value_idx, value_t>();
 
   if (max_cols > config_.b_ncols) {
-    dense_smem_strategy<value_idx, value_t, threads_per_block> strategy(config_);
-    strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size);
+    dense_smem_strategy<value_idx, value_t, threads_per_block> strategy(
+      config_);
+    strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func,
+                          write_func, chunk_size);
   } else {
     hash_strategy<value_idx, value_t, threads_per_block> strategy(config_);
-    strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size);
+    strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func,
+                          write_func, chunk_size);
   }
 };
 
diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh
index 7a83e73183..5ace978a23 100644
--- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh
+++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/base_strategy.cuh
@@ -32,114 +32,58 @@ namespace distance {
 template <typename value_idx, typename value_t, int tpb>
 class coo_spmv_strategy {
  public:
-  coo_spmv_strategy(const distances_config_t<value_idx, value_t>& config_) : config(config_)
-  {
+  coo_spmv_strategy(const distances_config_t<value_idx, value_t> &config_)
+    : config(config_) {
     smem = raft::getSharedMemPerBlock();
   }
 
-  template <typename strategy_t,
-            typename indptr_it,
-            typename product_f,
-            typename accum_f,
-            typename write_f>
-  void _dispatch_base(strategy_t& strategy,
-                      int smem_dim,
-                      indptr_it& a_indptr,
-                      value_t* out_dists,
-                      value_idx* coo_rows_b,
-                      product_f product_func,
-                      accum_f accum_func,
-                      write_f write_func,
-                      int chunk_size,
-                      int n_blocks,
-                      int n_blocks_per_row)
-  {
-    CUDA_CHECK(cudaFuncSetCacheConfig(balanced_coo_generalized_spmv_kernel<strategy_t,
-                                                                           indptr_it,
-                                                                           value_idx,
-                                                                           value_t,
-                                                                           false,
-                                                                           tpb,
-                                                                           product_f,
-                                                                           accum_f,
-                                                                           write_f>,
-                                      cudaFuncCachePreferShared));
+  template <typename strategy_t, typename indptr_it, typename product_f,
+            typename accum_f, typename write_f>
+  void _dispatch_base(strategy_t &strategy, int smem_dim, indptr_it &a_indptr,
+                      value_t *out_dists, value_idx *coo_rows_b,
+                      product_f product_func, accum_f accum_func,
+                      write_f write_func, int chunk_size, int n_blocks,
+                      int n_blocks_per_row) {
+    CUDA_CHECK(cudaFuncSetCacheConfig(
+      balanced_coo_generalized_spmv_kernel<strategy_t, indptr_it, value_idx,
+                                           value_t, false, tpb, product_f,
+                                           accum_f, write_f>,
+      cudaFuncCachePreferShared));
 
-    balanced_coo_generalized_spmv_kernel<strategy_t, indptr_it, value_idx, value_t, false, tpb>
-      <<<n_blocks, tpb, smem, config.handle.get_stream()>>>(strategy,
-                                                            a_indptr,
-                                                            config.a_indices,
-                                                            config.a_data,
-                                                            config.a_nnz,
-                                                            coo_rows_b,
-                                                            config.b_indices,
-                                                            config.b_data,
-                                                            config.a_nrows,
-                                                            config.b_nrows,
-                                                            smem_dim,
-                                                            config.b_nnz,
-                                                            out_dists,
-                                                            n_blocks_per_row,
-                                                            chunk_size,
-                                                            config.b_ncols,
-                                                            product_func,
-                                                            accum_func,
-                                                            write_func);
+    balanced_coo_generalized_spmv_kernel<strategy_t, indptr_it, value_idx,
+                                         value_t, false, tpb>
+      <<<n_blocks, tpb, smem, config.handle.get_stream()>>>(
+        strategy, a_indptr, config.a_indices, config.a_data, config.a_nnz,
+        coo_rows_b, config.b_indices, config.b_data, config.a_nrows,
+        config.b_nrows, smem_dim, config.b_nnz, out_dists, n_blocks_per_row,
+        chunk_size, config.b_ncols, product_func, accum_func, write_func);
   }
 
-  template <typename strategy_t,
-            typename indptr_it,
-            typename product_f,
-            typename accum_f,
-            typename write_f>
-  void _dispatch_base_rev(strategy_t& strategy,
-                          int smem_dim,
-                          indptr_it& b_indptr,
-                          value_t* out_dists,
-                          value_idx* coo_rows_a,
-                          product_f product_func,
-                          accum_f accum_func,
-                          write_f write_func,
-                          int chunk_size,
-                          int n_blocks,
-                          int n_blocks_per_row)
-  {
-    CUDA_CHECK(cudaFuncSetCacheConfig(balanced_coo_generalized_spmv_kernel<strategy_t,
-                                                                           indptr_it,
-                                                                           value_idx,
-                                                                           value_t,
-                                                                           true,
-                                                                           tpb,
-                                                                           product_f,
-                                                                           accum_f,
-                                                                           write_f>,
-                                      cudaFuncCachePreferShared));
+  template <typename strategy_t, typename indptr_it, typename product_f,
+            typename accum_f, typename write_f>
+  void _dispatch_base_rev(strategy_t &strategy, int smem_dim,
+                          indptr_it &b_indptr, value_t *out_dists,
+                          value_idx *coo_rows_a, product_f product_func,
+                          accum_f accum_func, write_f write_func,
+                          int chunk_size, int n_blocks, int n_blocks_per_row) {
+    CUDA_CHECK(cudaFuncSetCacheConfig(
+      balanced_coo_generalized_spmv_kernel<strategy_t, indptr_it, value_idx,
+                                           value_t, true, tpb, product_f,
+                                           accum_f, write_f>,
+      cudaFuncCachePreferShared));
 
-    balanced_coo_generalized_spmv_kernel<strategy_t, indptr_it, value_idx, value_t, true, tpb>
-      <<<n_blocks, tpb, smem, config.handle.get_stream()>>>(strategy,
-                                                            b_indptr,
-                                                            config.b_indices,
-                                                            config.b_data,
-                                                            config.b_nnz,
-                                                            coo_rows_a,
-                                                            config.a_indices,
-                                                            config.a_data,
-                                                            config.b_nrows,
-                                                            config.a_nrows,
-                                                            smem_dim,
-                                                            config.a_nnz,
-                                                            out_dists,
-                                                            n_blocks_per_row,
-                                                            chunk_size,
-                                                            config.a_ncols,
-                                                            product_func,
-                                                            accum_func,
-                                                            write_func);
+    balanced_coo_generalized_spmv_kernel<strategy_t, indptr_it, value_idx,
+                                         value_t, true, tpb>
+      <<<n_blocks, tpb, smem, config.handle.get_stream()>>>(
+        strategy, b_indptr, config.b_indices, config.b_data, config.b_nnz,
+        coo_rows_a, config.a_indices, config.a_data, config.b_nrows,
+        config.a_nrows, smem_dim, config.a_nnz, out_dists, n_blocks_per_row,
+        chunk_size, config.a_ncols, product_func, accum_func, write_func);
   }
 
  protected:
   int smem;
-  const distances_config_t<value_idx, value_t>& config;
+  const distances_config_t<value_idx, value_t> &config;
 };
 
 }  // namespace distance
diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh
index 6586067b56..44c3833f96 100644
--- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh
+++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/coo_mask_row_iterators.cuh
@@ -29,15 +29,11 @@ namespace distance {
 template <typename value_idx>
 class mask_row_it {
  public:
-  mask_row_it(const value_idx* full_indptr_,
-              const value_idx& n_rows_,
-              value_idx* mask_row_idx_ = NULL)
-    : full_indptr(full_indptr_), mask_row_idx(mask_row_idx_), n_rows(n_rows_)
-  {
-  }
+  mask_row_it(const value_idx *full_indptr_, const value_idx &n_rows_,
+              value_idx *mask_row_idx_ = NULL)
+    : full_indptr(full_indptr_), mask_row_idx(mask_row_idx_), n_rows(n_rows_) {}
 
-  __device__ inline value_idx get_row_idx(const int& n_blocks_nnz_b)
-  {
+  __device__ inline value_idx get_row_idx(const int &n_blocks_nnz_b) {
     if (mask_row_idx != NULL) {
       return mask_row_idx[blockIdx.x / n_blocks_nnz_b];
     } else {
@@ -45,49 +41,37 @@ class mask_row_it {
     }
   }
 
-  __device__ inline void get_row_offsets(const value_idx& row_idx,
-                                         value_idx& start_offset,
-                                         value_idx& stop_offset,
-                                         const value_idx& n_blocks_nnz_b,
-                                         bool& first_a_chunk,
-                                         bool& last_a_chunk)
-  {
+  __device__ inline void get_row_offsets(
+    const value_idx &row_idx, value_idx &start_offset, value_idx &stop_offset,
+    const value_idx &n_blocks_nnz_b, bool &first_a_chunk, bool &last_a_chunk) {
     start_offset = full_indptr[row_idx];
-    stop_offset  = full_indptr[row_idx + 1] - 1;
+    stop_offset = full_indptr[row_idx + 1] - 1;
   }
 
-  __device__ constexpr inline void get_indices_boundary(const value_idx* indices,
-                                                        value_idx& indices_len,
-                                                        value_idx& start_offset,
-                                                        value_idx& stop_offset,
-                                                        value_idx& start_index,
-                                                        value_idx& stop_index,
-                                                        bool& first_a_chunk,
-                                                        bool& last_a_chunk)
-  {
+  __device__ constexpr inline void get_indices_boundary(
+    const value_idx *indices, value_idx &indices_len, value_idx &start_offset,
+    value_idx &stop_offset, value_idx &start_index, value_idx &stop_index,
+    bool &first_a_chunk, bool &last_a_chunk) {
     // do nothing;
   }
 
-  __device__ constexpr inline bool check_indices_bounds(value_idx& start_index_a,
-                                                        value_idx& stop_index_a,
-                                                        value_idx& index_b)
-  {
+  __device__ constexpr inline bool check_indices_bounds(
+    value_idx &start_index_a, value_idx &stop_index_a, value_idx &index_b) {
     return true;
   }
 
   const value_idx *full_indptr, &n_rows;
-  value_idx* mask_row_idx;
+  value_idx *mask_row_idx;
 };
 
 template <typename value_idx>
-__global__ void fill_chunk_indices_kernel(value_idx* n_chunks_per_row,
-                                          value_idx* chunk_indices,
-                                          value_idx n_rows)
-{
+__global__ void fill_chunk_indices_kernel(value_idx *n_chunks_per_row,
+                                          value_idx *chunk_indices,
+                                          value_idx n_rows) {
   auto tid = threadIdx.x + blockIdx.x * blockDim.x;
   if (tid < n_rows) {
     auto start = n_chunks_per_row[tid];
-    auto end   = n_chunks_per_row[tid + 1];
+    auto end = n_chunks_per_row[tid + 1];
 
 #pragma unroll
     for (int i = start; i < end; i++) {
@@ -99,89 +83,73 @@ __global__ void fill_chunk_indices_kernel(value_idx* n_chunks_per_row,
 template <typename value_idx>
 class chunked_mask_row_it : public mask_row_it<value_idx> {
  public:
-  chunked_mask_row_it(const value_idx* full_indptr_,
-                      const value_idx& n_rows_,
-                      value_idx* mask_row_idx_,
-                      int row_chunk_size_,
-                      const value_idx* n_chunks_per_row_,
-                      const value_idx* chunk_indices_,
+  chunked_mask_row_it(const value_idx *full_indptr_, const value_idx &n_rows_,
+                      value_idx *mask_row_idx_, int row_chunk_size_,
+                      const value_idx *n_chunks_per_row_,
+                      const value_idx *chunk_indices_,
                       const cudaStream_t stream_)
     : mask_row_it<value_idx>(full_indptr_, n_rows_, mask_row_idx_),
       row_chunk_size(row_chunk_size_),
       n_chunks_per_row(n_chunks_per_row_),
       chunk_indices(chunk_indices_),
-      stream(stream_)
-  {
-  }
+      stream(stream_) {}
 
-  static void init(const value_idx* indptr,
-                   const value_idx* mask_row_idx,
-                   const value_idx& n_rows,
-                   const int row_chunk_size,
-                   rmm::device_uvector<value_idx>& n_chunks_per_row,
-                   rmm::device_uvector<value_idx>& chunk_indices,
-                   cudaStream_t stream)
-  {
+  static void init(const value_idx *indptr, const value_idx *mask_row_idx,
+                   const value_idx &n_rows, const int row_chunk_size,
+                   rmm::device_uvector<value_idx> &n_chunks_per_row,
+                   rmm::device_uvector<value_idx> &chunk_indices,
+                   cudaStream_t stream) {
     auto policy = rmm::exec_policy(stream);
 
     constexpr value_idx first_element = 0;
     n_chunks_per_row.set_element_async(0, first_element, stream);
     n_chunks_per_row_functor chunk_functor(indptr, row_chunk_size);
-    thrust::transform(
-      policy, mask_row_idx, mask_row_idx + n_rows, n_chunks_per_row.begin() + 1, chunk_functor);
+    thrust::transform(policy, mask_row_idx, mask_row_idx + n_rows,
+                      n_chunks_per_row.begin() + 1, chunk_functor);
 
-    thrust::inclusive_scan(
-      policy, n_chunks_per_row.begin() + 1, n_chunks_per_row.end(), n_chunks_per_row.begin() + 1);
+    thrust::inclusive_scan(policy, n_chunks_per_row.begin() + 1,
+                           n_chunks_per_row.end(),
+                           n_chunks_per_row.begin() + 1);
 
-    raft::update_host(&total_row_blocks, n_chunks_per_row.data() + n_rows, 1, stream);
+    raft::update_host(&total_row_blocks, n_chunks_per_row.data() + n_rows, 1,
+                      stream);
 
     fill_chunk_indices(n_rows, n_chunks_per_row, chunk_indices, stream);
   }
 
-  __device__ inline value_idx get_row_idx(const int& n_blocks_nnz_b)
-  {
+  __device__ inline value_idx get_row_idx(const int &n_blocks_nnz_b) {
     return this->mask_row_idx[chunk_indices[blockIdx.x / n_blocks_nnz_b]];
   }
 
-  __device__ inline void get_row_offsets(const value_idx& row_idx,
-                                         value_idx& start_offset,
-                                         value_idx& stop_offset,
-                                         const int& n_blocks_nnz_b,
-                                         bool& first_a_chunk,
-                                         bool& last_a_chunk)
-  {
-    auto chunk_index    = blockIdx.x / n_blocks_nnz_b;
-    auto chunk_val      = chunk_indices[chunk_index];
-    auto prev_n_chunks  = n_chunks_per_row[chunk_val];
+  __device__ inline void get_row_offsets(
+    const value_idx &row_idx, value_idx &start_offset, value_idx &stop_offset,
+    const int &n_blocks_nnz_b, bool &first_a_chunk, bool &last_a_chunk) {
+    auto chunk_index = blockIdx.x / n_blocks_nnz_b;
+    auto chunk_val = chunk_indices[chunk_index];
+    auto prev_n_chunks = n_chunks_per_row[chunk_val];
     auto relative_chunk = chunk_index - prev_n_chunks;
-    first_a_chunk       = relative_chunk == 0;
+    first_a_chunk = relative_chunk == 0;
 
     start_offset = this->full_indptr[row_idx] + relative_chunk * row_chunk_size;
-    stop_offset  = start_offset + row_chunk_size;
+    stop_offset = start_offset + row_chunk_size;
 
     auto final_stop_offset = this->full_indptr[row_idx + 1];
 
     last_a_chunk = stop_offset >= final_stop_offset;
-    stop_offset  = last_a_chunk ? final_stop_offset - 1 : stop_offset - 1;
+    stop_offset = last_a_chunk ? final_stop_offset - 1 : stop_offset - 1;
   }
 
-  __device__ inline void get_indices_boundary(const value_idx* indices,
-                                              value_idx& row_idx,
-                                              value_idx& start_offset,
-                                              value_idx& stop_offset,
-                                              value_idx& start_index,
-                                              value_idx& stop_index,
-                                              bool& first_a_chunk,
-                                              bool& last_a_chunk)
-  {
+  __device__ inline void get_indices_boundary(
+    const value_idx *indices, value_idx &row_idx, value_idx &start_offset,
+    value_idx &stop_offset, value_idx &start_index, value_idx &stop_index,
+    bool &first_a_chunk, bool &last_a_chunk) {
     start_index = first_a_chunk ? start_index : indices[start_offset - 1] + 1;
-    stop_index  = last_a_chunk ? stop_index : indices[stop_offset];
+    stop_index = last_a_chunk ? stop_index : indices[stop_offset];
   }
 
-  __device__ inline bool check_indices_bounds(value_idx& start_index_a,
-                                              value_idx& stop_index_a,
-                                              value_idx& index_b)
-  {
+  __device__ inline bool check_indices_bounds(value_idx &start_index_a,
+                                              value_idx &stop_index_a,
+                                              value_idx &index_b) {
     return (index_b >= start_index_a && index_b <= stop_index_a);
   }
 
@@ -192,34 +160,30 @@ class chunked_mask_row_it : public mask_row_it<value_idx> {
 
   struct n_chunks_per_row_functor {
    public:
-    n_chunks_per_row_functor(const value_idx* indptr_, value_idx row_chunk_size_)
-      : indptr(indptr_), row_chunk_size(row_chunk_size_)
-    {
-    }
+    n_chunks_per_row_functor(const value_idx *indptr_,
+                             value_idx row_chunk_size_)
+      : indptr(indptr_), row_chunk_size(row_chunk_size_) {}
 
-    __host__ __device__ value_idx operator()(const value_idx& i)
-    {
+    __host__ __device__ value_idx operator()(const value_idx &i) {
       auto degree = indptr[i + 1] - indptr[i];
       return raft::ceildiv(degree, (value_idx)row_chunk_size);
     }
 
-    const value_idx* indptr;
+    const value_idx *indptr;
     value_idx row_chunk_size;
   };
 
  private:
-  static void fill_chunk_indices(const value_idx& n_rows,
-                                 rmm::device_uvector<value_idx>& n_chunks_per_row,
-                                 rmm::device_uvector<value_idx>& chunk_indices,
-                                 cudaStream_t stream)
-  {
+  static void fill_chunk_indices(
+    const value_idx &n_rows, rmm::device_uvector<value_idx> &n_chunks_per_row,
+    rmm::device_uvector<value_idx> &chunk_indices, cudaStream_t stream) {
     auto n_threads = std::min(n_rows, 256);
-    auto n_blocks  = raft::ceildiv(n_rows, (value_idx)n_threads);
+    auto n_blocks = raft::ceildiv(n_rows, (value_idx)n_threads);
 
     chunk_indices.resize(total_row_blocks, stream);
 
-    fill_chunk_indices_kernel<value_idx>
-      <<<n_blocks, n_threads, 0, stream>>>(n_chunks_per_row.data(), chunk_indices.data(), n_rows);
+    fill_chunk_indices_kernel<value_idx><<<n_blocks, n_threads, 0, stream>>>(
+      n_chunks_per_row.data(), chunk_indices.data(), n_rows);
   }
 };
 
diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/dense_smem_strategy.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/dense_smem_strategy.cuh
index aac98d6b02..c463654a3b 100644
--- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/dense_smem_strategy.cuh
+++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/dense_smem_strategy.cuh
@@ -25,91 +25,71 @@ namespace distance {
 template <typename value_idx, typename value_t, int tpb>
 class dense_smem_strategy : public coo_spmv_strategy<value_idx, value_t, tpb> {
  public:
-  using smem_type   = value_t*;
+  using smem_type = value_t *;
   using insert_type = smem_type;
-  using find_type   = smem_type;
+  using find_type = smem_type;
 
-  dense_smem_strategy(const distances_config_t<value_idx, value_t>& config_)
-    : coo_spmv_strategy<value_idx, value_t, tpb>(config_)
-  {
-  }
+  dense_smem_strategy(const distances_config_t<value_idx, value_t> &config_)
+    : coo_spmv_strategy<value_idx, value_t, tpb>(config_) {}
 
-  inline static int smem_per_block(int n_cols)
-  {
-    return (n_cols * sizeof(value_t)) + ((1024 / raft::warp_size()) * sizeof(value_t));
+  inline static int smem_per_block(int n_cols) {
+    return (n_cols * sizeof(value_t)) +
+           ((1024 / raft::warp_size()) * sizeof(value_t));
   }
 
   template <typename product_f, typename accum_f, typename write_f>
-  void dispatch(value_t* out_dists,
-                value_idx* coo_rows_b,
-                product_f product_func,
-                accum_f accum_func,
-                write_f write_func,
-                int chunk_size)
-  {
-    auto n_blocks_per_row = raft::ceildiv(this->config.b_nnz, chunk_size * 1024);
-    auto n_blocks         = this->config.a_nrows * n_blocks_per_row;
-
-    mask_row_it<value_idx> a_indptr(this->config.a_indptr, this->config.a_nrows);
-
-    this->_dispatch_base(*this,
-                         this->config.b_ncols,
-                         a_indptr,
-                         out_dists,
-                         coo_rows_b,
-                         product_func,
-                         accum_func,
-                         write_func,
-                         chunk_size,
-                         n_blocks,
-                         n_blocks_per_row);
+  void dispatch(value_t *out_dists, value_idx *coo_rows_b,
+                product_f product_func, accum_f accum_func, write_f write_func,
+                int chunk_size) {
+    auto n_blocks_per_row =
+      raft::ceildiv(this->config.b_nnz, chunk_size * 1024);
+    auto n_blocks = this->config.a_nrows * n_blocks_per_row;
+
+    mask_row_it<value_idx> a_indptr(this->config.a_indptr,
+                                    this->config.a_nrows);
+
+    this->_dispatch_base(*this, this->config.b_ncols, a_indptr, out_dists,
+                         coo_rows_b, product_func, accum_func, write_func,
+                         chunk_size, n_blocks, n_blocks_per_row);
   }
 
   template <typename product_f, typename accum_f, typename write_f>
-  void dispatch_rev(value_t* out_dists,
-                    value_idx* coo_rows_a,
-                    product_f product_func,
-                    accum_f accum_func,
-                    write_f write_func,
-                    int chunk_size)
-  {
-    auto n_blocks_per_row = raft::ceildiv(this->config.a_nnz, chunk_size * 1024);
-    auto n_blocks         = this->config.b_nrows * n_blocks_per_row;
-
-    mask_row_it<value_idx> b_indptr(this->config.b_indptr, this->config.b_nrows);
-
-    this->_dispatch_base_rev(*this,
-                             this->config.a_ncols,
-                             b_indptr,
-                             out_dists,
-                             coo_rows_a,
-                             product_func,
-                             accum_func,
-                             write_func,
-                             chunk_size,
-                             n_blocks,
-                             n_blocks_per_row);
+  void dispatch_rev(value_t *out_dists, value_idx *coo_rows_a,
+                    product_f product_func, accum_f accum_func,
+                    write_f write_func, int chunk_size) {
+    auto n_blocks_per_row =
+      raft::ceildiv(this->config.a_nnz, chunk_size * 1024);
+    auto n_blocks = this->config.b_nrows * n_blocks_per_row;
+
+    mask_row_it<value_idx> b_indptr(this->config.b_indptr,
+                                    this->config.b_nrows);
+
+    this->_dispatch_base_rev(*this, this->config.a_ncols, b_indptr, out_dists,
+                             coo_rows_a, product_func, accum_func, write_func,
+                             chunk_size, n_blocks, n_blocks_per_row);
   }
 
-  __device__ inline insert_type init_insert(smem_type cache, const value_idx& cache_size)
-  {
+  __device__ inline insert_type init_insert(smem_type cache,
+                                            const value_idx &cache_size) {
     for (int k = threadIdx.x; k < cache_size; k += blockDim.x) {
       cache[k] = 0.0;
     }
     return cache;
   }
 
-  __device__ inline void insert(insert_type cache, const value_idx& key, const value_t& value)
-  {
+  __device__ inline void insert(insert_type cache, const value_idx &key,
+                                const value_t &value) {
     cache[key] = value;
   }
 
-  __device__ inline find_type init_find(smem_type cache, const value_idx& cache_size)
-  {
+  __device__ inline find_type init_find(smem_type cache,
+                                        const value_idx &cache_size) {
     return cache;
   }
 
-  __device__ inline value_t find(find_type cache, const value_idx& key) { return cache[key]; }
+  __device__ inline value_t find(find_type cache, const value_idx &key) {
+    return cache[key];
+  }
 };
 
 }  // namespace distance
diff --git a/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh b/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh
index 3f8f4b21ad..1295d24103 100644
--- a/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh
+++ b/cpp/include/raft/sparse/distance/coo_spmv_strategies/hash_strategy.cuh
@@ -1,18 +1,18 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
+  * Copyright (c) 2021, NVIDIA CORPORATION.
+  *
+  * Licensed under the Apache License, Version 2.0 (the "License");
+  * you may not use this file except in compliance with the License.
+  * You may obtain a copy of the License at
+  *
+  *     http://www.apache.org/licenses/LICENSE-2.0
+  *
+  * Unless required by applicable law or agreed to in writing, software
+  * distributed under the License is distributed on an "AS IS" BASIS,
+  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  * See the License for the specific language governing permissions and
+  * limitations under the License.
+  */
 
 #pragma once
 
@@ -38,238 +38,177 @@ template <typename value_idx, typename value_t, int tpb>
 class hash_strategy : public coo_spmv_strategy<value_idx, value_t, tpb> {
  public:
   using insert_type =
-    typename cuco::static_map<value_idx, value_t, cuda::thread_scope_block>::device_mutable_view;
-  using smem_type = typename insert_type::slot_type*;
+    typename cuco::static_map<value_idx, value_t,
+                              cuda::thread_scope_block>::device_mutable_view;
+  using smem_type = typename insert_type::slot_type *;
   using find_type =
-    typename cuco::static_map<value_idx, value_t, cuda::thread_scope_block>::device_view;
+    typename cuco::static_map<value_idx, value_t,
+                              cuda::thread_scope_block>::device_view;
 
-  hash_strategy(const distances_config_t<value_idx, value_t>& config_,
-                float capacity_threshold_ = 0.5,
-                int map_size_             = get_map_size())
+  hash_strategy(const distances_config_t<value_idx, value_t> &config_,
+                float capacity_threshold_ = 0.5, int map_size_ = get_map_size())
     : coo_spmv_strategy<value_idx, value_t, tpb>(config_),
       capacity_threshold(capacity_threshold_),
-      map_size(map_size_)
-  {
-  }
+      map_size(map_size_) {}
 
-  void chunking_needed(const value_idx* indptr,
-                       const value_idx n_rows,
-                       rmm::device_uvector<value_idx>& mask_indptr,
-                       std::tuple<value_idx, value_idx>& n_rows_divided,
-                       cudaStream_t stream)
-  {
+  void chunking_needed(const value_idx *indptr, const value_idx n_rows,
+                       rmm::device_uvector<value_idx> &mask_indptr,
+                       std::tuple<value_idx, value_idx> &n_rows_divided,
+                       cudaStream_t stream) {
     auto policy = rmm::exec_policy(stream);
 
-    auto less                   = thrust::copy_if(policy,
-                                thrust::make_counting_iterator(value_idx(0)),
-                                thrust::make_counting_iterator(n_rows),
-                                mask_indptr.data(),
-                                fits_in_hash_table(indptr, 0, capacity_threshold * map_size));
+    auto less = thrust::copy_if(
+      policy, thrust::make_counting_iterator(value_idx(0)),
+      thrust::make_counting_iterator(n_rows), mask_indptr.data(),
+      fits_in_hash_table(indptr, 0, capacity_threshold * map_size));
     std::get<0>(n_rows_divided) = less - mask_indptr.data();
 
     auto more = thrust::copy_if(
-      policy,
-      thrust::make_counting_iterator(value_idx(0)),
-      thrust::make_counting_iterator(n_rows),
-      less,
-      fits_in_hash_table(
-        indptr, capacity_threshold * map_size, std::numeric_limits<value_idx>::max()));
+      policy, thrust::make_counting_iterator(value_idx(0)),
+      thrust::make_counting_iterator(n_rows), less,
+      fits_in_hash_table(indptr, capacity_threshold * map_size,
+                         std::numeric_limits<value_idx>::max()));
     std::get<1>(n_rows_divided) = more - less;
   }
 
   template <typename product_f, typename accum_f, typename write_f>
-  void dispatch(value_t* out_dists,
-                value_idx* coo_rows_b,
-                product_f product_func,
-                accum_f accum_func,
-                write_f write_func,
-                int chunk_size)
-  {
+  void dispatch(value_t *out_dists, value_idx *coo_rows_b,
+                product_f product_func, accum_f accum_func, write_f write_func,
+                int chunk_size) {
     auto n_blocks_per_row = raft::ceildiv(this->config.b_nnz, chunk_size * tpb);
-    rmm::device_uvector<value_idx> mask_indptr(this->config.a_nrows,
-                                               this->config.handle.get_stream());
+    rmm::device_uvector<value_idx> mask_indptr(
+      this->config.a_nrows, this->config.handle.get_stream());
     std::tuple<value_idx, value_idx> n_rows_divided;
 
-    chunking_needed(this->config.a_indptr,
-                    this->config.a_nrows,
-                    mask_indptr,
-                    n_rows_divided,
-                    this->config.handle.get_stream());
+    chunking_needed(this->config.a_indptr, this->config.a_nrows, mask_indptr,
+                    n_rows_divided, this->config.handle.get_stream());
 
     auto less_rows = std::get<0>(n_rows_divided);
     if (less_rows > 0) {
-      mask_row_it<value_idx> less(this->config.a_indptr, less_rows, mask_indptr.data());
+      mask_row_it<value_idx> less(this->config.a_indptr, less_rows,
+                                  mask_indptr.data());
 
       auto n_less_blocks = less_rows * n_blocks_per_row;
-      this->_dispatch_base(*this,
-                           map_size,
-                           less,
-                           out_dists,
-                           coo_rows_b,
-                           product_func,
-                           accum_func,
-                           write_func,
-                           chunk_size,
-                           n_less_blocks,
-                           n_blocks_per_row);
+      this->_dispatch_base(*this, map_size, less, out_dists, coo_rows_b,
+                           product_func, accum_func, write_func, chunk_size,
+                           n_less_blocks, n_blocks_per_row);
     }
 
     auto more_rows = std::get<1>(n_rows_divided);
     if (more_rows > 0) {
-      rmm::device_uvector<value_idx> n_chunks_per_row(more_rows + 1,
-                                                      this->config.handle.get_stream());
-      rmm::device_uvector<value_idx> chunk_indices(0, this->config.handle.get_stream());
-      chunked_mask_row_it<value_idx>::init(this->config.a_indptr,
-                                           mask_indptr.data() + less_rows,
-                                           more_rows,
-                                           capacity_threshold * map_size,
-                                           n_chunks_per_row,
-                                           chunk_indices,
-                                           this->config.handle.get_stream());
-
-      chunked_mask_row_it<value_idx> more(this->config.a_indptr,
-                                          more_rows,
-                                          mask_indptr.data() + less_rows,
-                                          capacity_threshold * map_size,
-                                          n_chunks_per_row.data(),
-                                          chunk_indices.data(),
-                                          this->config.handle.get_stream());
+      rmm::device_uvector<value_idx> n_chunks_per_row(
+        more_rows + 1, this->config.handle.get_stream());
+      rmm::device_uvector<value_idx> chunk_indices(
+        0, this->config.handle.get_stream());
+      chunked_mask_row_it<value_idx>::init(
+        this->config.a_indptr, mask_indptr.data() + less_rows, more_rows,
+        capacity_threshold * map_size, n_chunks_per_row, chunk_indices,
+        this->config.handle.get_stream());
+
+      chunked_mask_row_it<value_idx> more(
+        this->config.a_indptr, more_rows, mask_indptr.data() + less_rows,
+        capacity_threshold * map_size, n_chunks_per_row.data(),
+        chunk_indices.data(), this->config.handle.get_stream());
 
       auto n_more_blocks = more.total_row_blocks * n_blocks_per_row;
-      this->_dispatch_base(*this,
-                           map_size,
-                           more,
-                           out_dists,
-                           coo_rows_b,
-                           product_func,
-                           accum_func,
-                           write_func,
-                           chunk_size,
-                           n_more_blocks,
-                           n_blocks_per_row);
+      this->_dispatch_base(*this, map_size, more, out_dists, coo_rows_b,
+                           product_func, accum_func, write_func, chunk_size,
+                           n_more_blocks, n_blocks_per_row);
     }
   }
 
   template <typename product_f, typename accum_f, typename write_f>
-  void dispatch_rev(value_t* out_dists,
-                    value_idx* coo_rows_a,
-                    product_f product_func,
-                    accum_f accum_func,
-                    write_f write_func,
-                    int chunk_size)
-  {
+  void dispatch_rev(value_t *out_dists, value_idx *coo_rows_a,
+                    product_f product_func, accum_f accum_func,
+                    write_f write_func, int chunk_size) {
     auto n_blocks_per_row = raft::ceildiv(this->config.a_nnz, chunk_size * tpb);
-    rmm::device_uvector<value_idx> mask_indptr(this->config.b_nrows,
-                                               this->config.handle.get_stream());
+    rmm::device_uvector<value_idx> mask_indptr(
+      this->config.b_nrows, this->config.handle.get_stream());
     std::tuple<value_idx, value_idx> n_rows_divided;
 
-    chunking_needed(this->config.b_indptr,
-                    this->config.b_nrows,
-                    mask_indptr,
-                    n_rows_divided,
-                    this->config.handle.get_stream());
+    chunking_needed(this->config.b_indptr, this->config.b_nrows, mask_indptr,
+                    n_rows_divided, this->config.handle.get_stream());
 
     auto less_rows = std::get<0>(n_rows_divided);
     if (less_rows > 0) {
-      mask_row_it<value_idx> less(this->config.b_indptr, less_rows, mask_indptr.data());
+      mask_row_it<value_idx> less(this->config.b_indptr, less_rows,
+                                  mask_indptr.data());
 
       auto n_less_blocks = less_rows * n_blocks_per_row;
-      this->_dispatch_base_rev(*this,
-                               map_size,
-                               less,
-                               out_dists,
-                               coo_rows_a,
-                               product_func,
-                               accum_func,
-                               write_func,
-                               chunk_size,
-                               n_less_blocks,
-                               n_blocks_per_row);
+      this->_dispatch_base_rev(*this, map_size, less, out_dists, coo_rows_a,
+                               product_func, accum_func, write_func, chunk_size,
+                               n_less_blocks, n_blocks_per_row);
     }
 
     auto more_rows = std::get<1>(n_rows_divided);
     if (more_rows > 0) {
-      rmm::device_uvector<value_idx> n_chunks_per_row(more_rows + 1,
-                                                      this->config.handle.get_stream());
-      rmm::device_uvector<value_idx> chunk_indices(0, this->config.handle.get_stream());
-      chunked_mask_row_it<value_idx>::init(this->config.b_indptr,
-                                           mask_indptr.data() + less_rows,
-                                           more_rows,
-                                           capacity_threshold * map_size,
-                                           n_chunks_per_row,
-                                           chunk_indices,
-                                           this->config.handle.get_stream());
-
-      chunked_mask_row_it<value_idx> more(this->config.b_indptr,
-                                          more_rows,
-                                          mask_indptr.data() + less_rows,
-                                          capacity_threshold * map_size,
-                                          n_chunks_per_row.data(),
-                                          chunk_indices.data(),
-                                          this->config.handle.get_stream());
+      rmm::device_uvector<value_idx> n_chunks_per_row(
+        more_rows + 1, this->config.handle.get_stream());
+      rmm::device_uvector<value_idx> chunk_indices(
+        0, this->config.handle.get_stream());
+      chunked_mask_row_it<value_idx>::init(
+        this->config.b_indptr, mask_indptr.data() + less_rows, more_rows,
+        capacity_threshold * map_size, n_chunks_per_row, chunk_indices,
+        this->config.handle.get_stream());
+
+      chunked_mask_row_it<value_idx> more(
+        this->config.b_indptr, more_rows, mask_indptr.data() + less_rows,
+        capacity_threshold * map_size, n_chunks_per_row.data(),
+        chunk_indices.data(), this->config.handle.get_stream());
 
       auto n_more_blocks = more.total_row_blocks * n_blocks_per_row;
-      this->_dispatch_base_rev(*this,
-                               map_size,
-                               more,
-                               out_dists,
-                               coo_rows_a,
-                               product_func,
-                               accum_func,
-                               write_func,
-                               chunk_size,
-                               n_more_blocks,
-                               n_blocks_per_row);
+      this->_dispatch_base_rev(*this, map_size, more, out_dists, coo_rows_a,
+                               product_func, accum_func, write_func, chunk_size,
+                               n_more_blocks, n_blocks_per_row);
     }
   }
 
-  __device__ inline insert_type init_insert(smem_type cache, const value_idx& cache_size)
-  {
+  __device__ inline insert_type init_insert(smem_type cache,
+                                            const value_idx &cache_size) {
     return insert_type::make_from_uninitialized_slots(
       cooperative_groups::this_thread_block(), cache, cache_size, -1, 0);
   }
 
-  __device__ inline void insert(insert_type cache, const value_idx& key, const value_t& value)
-  {
+  __device__ inline void insert(insert_type cache, const value_idx &key,
+                                const value_t &value) {
     auto success = cache.insert(cuco::pair<value_idx, value_t>(key, value));
   }
 
-  __device__ inline find_type init_find(smem_type cache, const value_idx& cache_size)
-  {
+  __device__ inline find_type init_find(smem_type cache,
+                                        const value_idx &cache_size) {
     return find_type(cache, cache_size, -1, 0);
   }
 
-  __device__ inline value_t find(find_type cache, const value_idx& key)
-  {
+  __device__ inline value_t find(find_type cache, const value_idx &key) {
     auto a_pair = cache.find(key);
 
     value_t a_col = 0.0;
-    if (a_pair != cache.end()) { a_col = a_pair->second; }
+    if (a_pair != cache.end()) {
+      a_col = a_pair->second;
+    }
     return a_col;
   }
 
   struct fits_in_hash_table {
    public:
-    fits_in_hash_table(const value_idx* indptr_, value_idx degree_l_, value_idx degree_r_)
-      : indptr(indptr_), degree_l(degree_l_), degree_r(degree_r_)
-    {
-    }
+    fits_in_hash_table(const value_idx *indptr_, value_idx degree_l_,
+                       value_idx degree_r_)
+      : indptr(indptr_), degree_l(degree_l_), degree_r(degree_r_) {}
 
-    __host__ __device__ bool operator()(const value_idx& i)
-    {
+    __host__ __device__ bool operator()(const value_idx &i) {
       auto degree = indptr[i + 1] - indptr[i];
 
       return degree >= degree_l && degree < degree_r;
     }
 
    private:
-    const value_idx* indptr;
+    const value_idx *indptr;
     const value_idx degree_l, degree_r;
   };
 
-  inline static int get_map_size()
-  {
-    return (raft::getSharedMemPerBlock() - ((tpb / raft::warp_size()) * sizeof(value_t))) /
+  inline static int get_map_size() {
+    return (raft::getSharedMemPerBlock() -
+            ((tpb / raft::warp_size()) * sizeof(value_t))) /
            sizeof(typename insert_type::slot_type);
   }
 
diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh
index b12252ab25..51f9a05394 100644
--- a/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh
+++ b/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh
@@ -27,88 +27,68 @@ namespace sparse {
 namespace distance {
 
 /**
- * Load-balanced sparse-matrix-sparse-matrix multiplication (SPMM) kernel with
- * sparse-matrix-sparse-vector multiplication layout (SPMV).
- * This is intended to be scheduled n_chunks_b times for each row of a.
- * The steps are as follows:
- *
- * 1. Load row from A into dense vector in shared memory.
- *    This can be further chunked in the future if necessary to support larger
- *    column sizes.
- * 2. Threads of block all step through chunks of B in parallel.
- *    When a new row is encountered in row_indices_b, a segmented
- *    reduction is performed across the warps and then across the
- *    block and the final value written out to host memory.
- *
- * Reference: https://www.icl.utk.edu/files/publications/2020/icl-utk-1421-2020.pdf
- *
- * @tparam value_idx index type
- * @tparam value_t value type
- * @tparam tpb threads per block configured on launch
- * @tparam rev if this is true, the reduce/accumulate functions are only
- *         executed when A[col] == 0.0. when executed before/after !rev
- *         and A & B are reversed, this allows the full symmetric difference
- *         and intersection to be computed.
- * @tparam kv_t data type stored in shared mem cache
- * @tparam product_f reduce function type (semiring product() function).
- *                  accepts two arguments of value_t and returns a value_t
- * @tparam accum_f accumulation function type (semiring sum() function).
- *                 accepts two arguments of value_t and returns a value_t
- * @tparam write_f function to write value out. this should be mathematically
- *                 equivalent to the accumulate function but implemented as
- *                 an atomic operation on global memory. Accepts two arguments
- *                 of value_t* and value_t and updates the value given by the
- *                 pointer.
- * @param[in] indptrA column pointer array for A
- * @param[in] indicesA column indices array for A
- * @param[in] dataA data array for A
- * @param[in] rowsB coo row array for B
- * @param[in] indicesB column indices array for B
- * @param[in] dataB data array for B
- * @param[in] m number of rows in A
- * @param[in] n number of rows in B
- * @param[in] dim number of features
- * @param[in] nnz_b number of nonzeros in B
- * @param[out] out array of size m*n
- * @param[in] n_blocks_per_row number of blocks of B per row of A
- * @param[in] chunk_size number of nnz for B to use for each row of A
- * @param[in] buffer_size amount of smem to use for each row of A
- * @param[in] product_func semiring product() function
- * @param[in] accum_func semiring sum() function
- * @param[in] write_func atomic semiring sum() function
- */
-template <typename strategy_t,
-          typename indptr_it,
-          typename value_idx,
-          typename value_t,
-          bool rev,
-          int tpb,
-          typename product_f,
-          typename accum_f,
-          typename write_f>
-__global__ void balanced_coo_generalized_spmv_kernel(strategy_t strategy,
-                                                     indptr_it indptrA,
-                                                     value_idx* indicesA,
-                                                     value_t* dataA,
-                                                     value_idx nnz_a,
-                                                     value_idx* rowsB,
-                                                     value_idx* indicesB,
-                                                     value_t* dataB,
-                                                     value_idx m,
-                                                     value_idx n,
-                                                     int dim,
-                                                     value_idx nnz_b,
-                                                     value_t* out,
-                                                     int n_blocks_per_row,
-                                                     int chunk_size,
-                                                     value_idx b_ncols,
-                                                     product_f product_func,
-                                                     accum_f accum_func,
-                                                     write_f write_func)
-{
+  * Load-balanced sparse-matrix-sparse-matrix multiplication (SPMM) kernel with
+  * sparse-matrix-sparse-vector multiplication layout (SPMV).
+  * This is intended to be scheduled n_chunks_b times for each row of a.
+  * The steps are as follows:
+  *
+  * 1. Load row from A into dense vector in shared memory.
+  *    This can be further chunked in the future if necessary to support larger
+  *    column sizes.
+  * 2. Threads of block all step through chunks of B in parallel.
+  *    When a new row is encountered in row_indices_b, a segmented
+  *    reduction is performed across the warps and then across the
+  *    block and the final value written out to global memory.
+  *
+  * Reference: https://www.icl.utk.edu/files/publications/2020/icl-utk-1421-2020.pdf
+  *
+  * @tparam value_idx index type
+  * @tparam value_t value type
+  * @tparam tpb threads per block configured on launch
+  * @tparam rev if this is true, the reduce/accumulate functions are only
+  *         executed when A[col] == 0.0. When executed before/after !rev
+  *         and A & B are reversed, this allows the full symmetric difference
+  *         and intersection to be computed.
+  * @tparam kv_t data type stored in shared mem cache
+  * @tparam product_f reduce function type (semiring product() function).
+  *                  accepts two arguments of value_t and returns a value_t
+  * @tparam accum_f accumulation function type (semiring sum() function).
+  *                 accepts two arguments of value_t and returns a value_t
+  * @tparam write_f function to write value out. This should be mathematically
+  *                 equivalent to the accumulate function but implemented as
+  *                 an atomic operation on global memory. Accepts two arguments
+  *                 of value_t* and value_t and updates the value given by the
+  *                 pointer.
+  * @param[in] indptrA column pointer array for A
+  * @param[in] indicesA column indices array for A
+  * @param[in] dataA data array for A
+  * @param[in] rowsB coo row array for B
+  * @param[in] indicesB column indices array for B
+  * @param[in] dataB data array for B
+  * @param[in] m number of rows in A
+  * @param[in] n number of rows in B
+  * @param[in] dim number of features
+  * @param[in] nnz_b number of nonzeros in B
+  * @param[out] out array of size m*n
+  * @param[in] n_blocks_per_row number of blocks of B per row of A
+  * @param[in] chunk_size number of nnz for B to use for each row of A
+  * @param[in] buffer_size amount of smem to use for each row of A
+  * @param[in] product_func semiring product() function
+  * @param[in] accum_func semiring sum() function
+  * @param[in] write_func atomic semiring sum() function
+  */
+template <typename strategy_t, typename indptr_it, typename value_idx,
+          typename value_t, bool rev, int tpb, typename product_f,
+          typename accum_f, typename write_f>
+__global__ void balanced_coo_generalized_spmv_kernel(
+  strategy_t strategy, indptr_it indptrA, value_idx *indicesA, value_t *dataA,
+  value_idx nnz_a, value_idx *rowsB, value_idx *indicesB, value_t *dataB,
+  value_idx m, value_idx n, int dim, value_idx nnz_b, value_t *out,
+  int n_blocks_per_row, int chunk_size, value_idx b_ncols,
+  product_f product_func, accum_f accum_func, write_f write_func) {
   typedef cub::WarpReduce<value_t> warp_reduce;
 
-  value_idx cur_row_a        = indptrA.get_row_idx(n_blocks_per_row);
+  value_idx cur_row_a = indptrA.get_row_idx(n_blocks_per_row);
   value_idx cur_chunk_offset = blockIdx.x % n_blocks_per_row;
 
   // chunk starting offset
@@ -116,17 +96,18 @@ __global__ void balanced_coo_generalized_spmv_kernel(strategy_t strategy,
   // how many total cols will be processed by this block (should be <= chunk_size * n_threads)
   value_idx active_chunk_size = min(chunk_size * tpb, nnz_b - ind_offset);
 
-  int tid     = threadIdx.x;
+  int tid = threadIdx.x;
   int warp_id = tid / raft::warp_size();
 
   // compute id relative to current warp
   unsigned int lane_id = tid & (raft::warp_size() - 1);
-  value_idx ind        = ind_offset + threadIdx.x;
+  value_idx ind = ind_offset + threadIdx.x;
 
   extern __shared__ char smem[];
 
-  typename strategy_t::smem_type A                = (typename strategy_t::smem_type)(smem);
-  typename warp_reduce::TempStorage* temp_storage = (typename warp_reduce::TempStorage*)(A + dim);
+  typename strategy_t::smem_type A = (typename strategy_t::smem_type)(smem);
+  typename warp_reduce::TempStorage *temp_storage =
+    (typename warp_reduce::TempStorage *)(A + dim);
 
   auto inserter = strategy.init_insert(A, dim);
 
@@ -134,12 +115,13 @@ __global__ void balanced_coo_generalized_spmv_kernel(strategy_t strategy,
 
   value_idx start_offset_a, stop_offset_a;
   bool first_a_chunk, last_a_chunk;
-  indptrA.get_row_offsets(
-    cur_row_a, start_offset_a, stop_offset_a, n_blocks_per_row, first_a_chunk, last_a_chunk);
+  indptrA.get_row_offsets(cur_row_a, start_offset_a, stop_offset_a,
+                          n_blocks_per_row, first_a_chunk, last_a_chunk);
 
   // Convert current row vector in A to dense
   for (int i = tid; i <= (stop_offset_a - start_offset_a); i += blockDim.x) {
-    strategy.insert(inserter, indicesA[start_offset_a + i], dataA[start_offset_a + i]);
+    strategy.insert(inserter, indicesA[start_offset_a + i],
+                    dataA[start_offset_a + i]);
   }
 
   __syncthreads();
@@ -150,36 +132,34 @@ __global__ void balanced_coo_generalized_spmv_kernel(strategy_t strategy,
   if (ind >= nnz_b) return;
 
   value_idx start_index_a = 0, stop_index_a = b_ncols - 1;
-  indptrA.get_indices_boundary(indicesA,
-                               cur_row_a,
-                               start_offset_a,
-                               stop_offset_a,
-                               start_index_a,
-                               stop_index_a,
-                               first_a_chunk,
-                               last_a_chunk);
+  indptrA.get_indices_boundary(indicesA, cur_row_a, start_offset_a,
+                               stop_offset_a, start_index_a, stop_index_a,
+                               first_a_chunk, last_a_chunk);
 
   value_idx cur_row_b = -1;
-  value_t c           = 0.0;
+  value_t c = 0.0;
 
   auto warp_red = warp_reduce(*(temp_storage + warp_id));
 
   if (tid < active_chunk_size) {
     cur_row_b = rowsB[ind];
 
-    auto index_b   = indicesB[ind];
-    auto in_bounds = indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b);
+    auto index_b = indicesB[ind];
+    auto in_bounds =
+      indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b);
 
     if (in_bounds) {
       value_t a_col = strategy.find(finder, index_b);
-      if (!rev || a_col == 0.0) { c = product_func(a_col, dataB[ind]); }
+      if (!rev || a_col == 0.0) {
+        c = product_func(a_col, dataB[ind]);
+      }
     }
   }
 
   // loop through chunks in parallel, reducing when a new row is
   // encountered by each thread
   for (int i = tid; i < active_chunk_size; i += blockDim.x) {
-    value_idx ind_next   = ind + blockDim.x;
+    value_idx ind_next = ind + blockDim.x;
     value_idx next_row_b = -1;
 
     if (i + blockDim.x < active_chunk_size) next_row_b = rowsB[ind_next];
@@ -190,13 +170,14 @@ __global__ void balanced_coo_generalized_spmv_kernel(strategy_t strategy,
       // grab the threads currently participating in loops.
       // because any other threads should have returned already.
       unsigned int peer_group = __match_any_sync(0xffffffff, cur_row_b);
-      bool is_leader          = get_lowest_peer(peer_group) == lane_id;
-      value_t v               = warp_red.HeadSegmentedReduce(c, is_leader, accum_func);
+      bool is_leader = get_lowest_peer(peer_group) == lane_id;
+      value_t v = warp_red.HeadSegmentedReduce(c, is_leader, accum_func);
 
       // thread with lowest lane id among peers writes out
       if (is_leader && v != 0.0) {
         // this conditional should be uniform, since rev is constant
-        size_t idx = !rev ? (size_t)cur_row_a * n + cur_row_b : (size_t)cur_row_b * m + cur_row_a;
+        size_t idx = !rev ? (size_t)cur_row_a * n + cur_row_b
+                          : (size_t)cur_row_b * m + cur_row_a;
         write_func(out + idx, v);
       }
 
@@ -206,12 +187,15 @@ __global__ void balanced_coo_generalized_spmv_kernel(strategy_t strategy,
     if (next_row_b != -1) {
       ind = ind_next;
 
-      auto index_b   = indicesB[ind];
-      auto in_bounds = indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b);
+      auto index_b = indicesB[ind];
+      auto in_bounds =
+        indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b);
       if (in_bounds) {
         value_t a_col = strategy.find(finder, index_b);
 
-        if (!rev || a_col == 0.0) { c = accum_func(c, product_func(a_col, dataB[ind])); }
+        if (!rev || a_col == 0.0) {
+          c = accum_func(c, product_func(a_col, dataB[ind]));
+        }
       }
 
       cur_row_b = next_row_b;
diff --git a/cpp/include/raft/sparse/distance/distance.cuh b/cpp/include/raft/sparse/distance/distance.cuh
index 228a62ed7a..a1974b3666 100644
--- a/cpp/include/raft/sparse/distance/distance.cuh
+++ b/cpp/include/raft/sparse/distance/distance.cuh
@@ -74,17 +74,16 @@ static const std::unordered_set<raft::distance::DistanceType> supportedDistance{
  * @param[in] metric distance metric to use
  */
 template <typename value_idx = int, typename value_t = float>
-void pairwiseDistance(value_t* out,
+void pairwiseDistance(value_t *out,
                       distances_config_t<value_idx, value_t> input_config,
-                      raft::distance::DistanceType metric,
-                      float metric_arg)
-{
+                      raft::distance::DistanceType metric, float metric_arg) {
   switch (metric) {
     case raft::distance::DistanceType::L2Expanded:
       l2_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::L2SqrtExpanded:
-      l2_sqrt_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
+      l2_sqrt_expanded_distances_t<value_idx, value_t>(input_config)
+        .compute(out);
       break;
     case raft::distance::DistanceType::InnerProduct:
       ip_distances_t<value_idx, value_t>(input_config).compute(out);
@@ -93,49 +92,62 @@ void pairwiseDistance(value_t* out,
       l2_unexpanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::L2SqrtUnexpanded:
-      l2_sqrt_unexpanded_distances_t<value_idx, value_t>(input_config).compute(out);
+      l2_sqrt_unexpanded_distances_t<value_idx, value_t>(input_config)
+        .compute(out);
       break;
     case raft::distance::DistanceType::L1:
       l1_unexpanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::LpUnexpanded:
-      lp_unexpanded_distances_t<value_idx, value_t>(input_config, metric_arg).compute(out);
+      lp_unexpanded_distances_t<value_idx, value_t>(input_config, metric_arg)
+        .compute(out);
       break;
     case raft::distance::DistanceType::Linf:
-      linf_unexpanded_distances_t<value_idx, value_t>(input_config).compute(out);
+      linf_unexpanded_distances_t<value_idx, value_t>(input_config)
+        .compute(out);
       break;
     case raft::distance::DistanceType::Canberra:
-      canberra_unexpanded_distances_t<value_idx, value_t>(input_config).compute(out);
+      canberra_unexpanded_distances_t<value_idx, value_t>(input_config)
+        .compute(out);
       break;
     case raft::distance::DistanceType::JaccardExpanded:
-      jaccard_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
+      jaccard_expanded_distances_t<value_idx, value_t>(input_config)
+        .compute(out);
       break;
     case raft::distance::DistanceType::CosineExpanded:
-      cosine_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
+      cosine_expanded_distances_t<value_idx, value_t>(input_config)
+        .compute(out);
       break;
     case raft::distance::DistanceType::HellingerExpanded:
-      hellinger_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
+      hellinger_expanded_distances_t<value_idx, value_t>(input_config)
+        .compute(out);
       break;
     case raft::distance::DistanceType::DiceExpanded:
       dice_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::CorrelationExpanded:
-      correlation_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
+      correlation_expanded_distances_t<value_idx, value_t>(input_config)
+        .compute(out);
       break;
     case raft::distance::DistanceType::RusselRaoExpanded:
-      russelrao_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
+      russelrao_expanded_distances_t<value_idx, value_t>(input_config)
+        .compute(out);
       break;
     case raft::distance::DistanceType::HammingUnexpanded:
-      hamming_unexpanded_distances_t<value_idx, value_t>(input_config).compute(out);
+      hamming_unexpanded_distances_t<value_idx, value_t>(input_config)
+        .compute(out);
       break;
     case raft::distance::DistanceType::JensenShannon:
-      jensen_shannon_unexpanded_distances_t<value_idx, value_t>(input_config).compute(out);
+      jensen_shannon_unexpanded_distances_t<value_idx, value_t>(input_config)
+        .compute(out);
       break;
     case raft::distance::DistanceType::KLDivergence:
-      kl_divergence_unexpanded_distances_t<value_idx, value_t>(input_config).compute(out);
+      kl_divergence_unexpanded_distances_t<value_idx, value_t>(input_config)
+        .compute(out);
       break;
 
-    default: THROW("Unsupported distance: %d", metric);
+    default:
+      THROW("Unsupported distance: %d", metric);
   }
 }
 
diff --git a/cpp/include/raft/sparse/distance/ip_distance.cuh b/cpp/include/raft/sparse/distance/ip_distance.cuh
index 8d77f9f5b5..882ccba027 100644
--- a/cpp/include/raft/sparse/distance/ip_distance.cuh
+++ b/cpp/include/raft/sparse/distance/ip_distance.cuh
@@ -45,13 +45,10 @@ class ip_distances_t : public distances_t<value_t> {
    * Computes simple sparse inner product distances as sum(x_y * y_k)
    * @param[in] config specifies inputs, outputs, and sizes
    */
-  ip_distances_t(const distances_config_t<value_idx, value_t>& config)
-    : config_(&config), coo_rows_b(config.b_nnz, config.handle.get_stream())
-  {
-    raft::sparse::convert::csr_to_coo(config_->b_indptr,
-                                      config_->b_nrows,
-                                      coo_rows_b.data(),
-                                      config_->b_nnz,
+  ip_distances_t(const distances_config_t<value_idx, value_t> &config)
+    : config_(&config), coo_rows_b(config.b_nnz, config.handle.get_stream()) {
+    raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows,
+                                      coo_rows_b.data(), config_->b_nnz,
                                       config_->handle.get_stream());
   }
 
@@ -59,21 +56,21 @@ class ip_distances_t : public distances_t<value_t> {
    * Performs pairwise distance computation and computes output distances
    * @param out_distances dense output matrix (size a_nrows * b_nrows)
    */
-  void compute(value_t* out_distances)
-  {
+  void compute(value_t *out_distances) {
     /**
-     * Compute pairwise distances and return dense matrix in row-major format
-     */
+	   * Compute pairwise distances and return dense matrix in row-major format
+	   */
     balanced_coo_pairwise_generalized_spmv<value_idx, value_t>(
-      out_distances, *config_, coo_rows_b.data(), Product(), Sum(), AtomicAdd());
+      out_distances, *config_, coo_rows_b.data(), Product(), Sum(),
+      AtomicAdd());
   }
 
-  value_idx* b_rows_coo() { return coo_rows_b.data(); }
+  value_idx *b_rows_coo() { return coo_rows_b.data(); }
 
-  value_t* b_data_coo() { return config_->b_data; }
+  value_t *b_data_coo() { return config_->b_data; }
 
  private:
-  const distances_config_t<value_idx, value_t>* config_;
+  const distances_config_t<value_idx, value_t> *config_;
   rmm::device_uvector<value_idx> coo_rows_b;
 };
 };  // END namespace distance
diff --git a/cpp/include/raft/sparse/distance/l2_distance.cuh b/cpp/include/raft/sparse/distance/l2_distance.cuh
index a9a2d1ee91..8886d4c9df 100644
--- a/cpp/include/raft/sparse/distance/l2_distance.cuh
+++ b/cpp/include/raft/sparse/distance/l2_distance.cuh
@@ -41,36 +41,35 @@ namespace distance {
 
 // @TODO: Move this into sparse prims (coo_norm)
 template <typename value_idx, typename value_t>
-__global__ void compute_row_norm_kernel(value_t* out,
-                                        const value_idx* __restrict__ coo_rows,
-                                        const value_t* __restrict__ data,
-                                        value_idx nnz)
-{
+__global__ void compute_row_norm_kernel(value_t *out,
+                                        const value_idx *__restrict__ coo_rows,
+                                        const value_t *__restrict__ data,
+                                        value_idx nnz) {
   value_idx i = blockDim.x * blockIdx.x + threadIdx.x;
-  if (i < nnz) { atomicAdd(&out[coo_rows[i]], data[i] * data[i]); }
+  if (i < nnz) {
+    atomicAdd(&out[coo_rows[i]], data[i] * data[i]);
+  }
 }
 
 template <typename value_idx, typename value_t>
-__global__ void compute_row_sum_kernel(value_t* out,
-                                       const value_idx* __restrict__ coo_rows,
-                                       const value_t* __restrict__ data,
-                                       value_idx nnz)
-{
+__global__ void compute_row_sum_kernel(value_t *out,
+                                       const value_idx *__restrict__ coo_rows,
+                                       const value_t *__restrict__ data,
+                                       value_idx nnz) {
   value_idx i = blockDim.x * blockIdx.x + threadIdx.x;
-  if (i < nnz) { atomicAdd(&out[coo_rows[i]], data[i]); }
+  if (i < nnz) {
+    atomicAdd(&out[coo_rows[i]], data[i]);
+  }
 }
 
 template <typename value_idx, typename value_t, typename expansion_f>
-__global__ void compute_euclidean_warp_kernel(value_t* __restrict__ C,
-                                              const value_t* __restrict__ Q_sq_norms,
-                                              const value_t* __restrict__ R_sq_norms,
-                                              value_idx n_rows,
-                                              value_idx n_cols,
-                                              expansion_f expansion_func)
-{
+__global__ void compute_euclidean_warp_kernel(
+  value_t *__restrict__ C, const value_t *__restrict__ Q_sq_norms,
+  const value_t *__restrict__ R_sq_norms, value_idx n_rows, value_idx n_cols,
+  expansion_f expansion_func) {
   value_idx tid = blockDim.x * blockIdx.x + threadIdx.x;
-  value_idx i   = tid / n_cols;
-  value_idx j   = tid % n_cols;
+  value_idx i = tid / n_cols;
+  value_idx j = tid % n_cols;
 
   if (i >= n_rows || j >= n_cols) return;
 
@@ -84,29 +83,25 @@ __global__ void compute_euclidean_warp_kernel(value_t* __restrict__ C,
 }
 
 template <typename value_idx, typename value_t>
-__global__ void compute_correlation_warp_kernel(value_t* __restrict__ C,
-                                                const value_t* __restrict__ Q_sq_norms,
-                                                const value_t* __restrict__ R_sq_norms,
-                                                const value_t* __restrict__ Q_norms,
-                                                const value_t* __restrict__ R_norms,
-                                                value_idx n_rows,
-                                                value_idx n_cols,
-                                                value_idx n)
-{
+__global__ void compute_correlation_warp_kernel(
+  value_t *__restrict__ C, const value_t *__restrict__ Q_sq_norms,
+  const value_t *__restrict__ R_sq_norms, const value_t *__restrict__ Q_norms,
+  const value_t *__restrict__ R_norms, value_idx n_rows, value_idx n_cols,
+  value_idx n) {
   value_idx tid = blockDim.x * blockIdx.x + threadIdx.x;
-  value_idx i   = tid / n_cols;
-  value_idx j   = tid % n_cols;
+  value_idx i = tid / n_cols;
+  value_idx j = tid % n_cols;
 
   if (i >= n_rows || j >= n_cols) return;
 
-  value_t dot  = C[(size_t)i * n_cols + j];
+  value_t dot = C[(size_t)i * n_cols + j];
   value_t Q_l1 = Q_norms[i];
   value_t R_l1 = R_norms[j];
 
   value_t Q_l2 = Q_sq_norms[i];
   value_t R_l2 = R_sq_norms[j];
 
-  value_t numer   = n * dot - (Q_l1 * R_l1);
+  value_t numer = n * dot - (Q_l1 * R_l1);
   value_t Q_denom = n * Q_l2 - (Q_l1 * Q_l1);
   value_t R_denom = n * R_l2 - (R_l1 * R_l1);
 
@@ -116,77 +111,58 @@ __global__ void compute_correlation_warp_kernel(value_t* __restrict__ C,
   C[(size_t)i * n_cols + j] = val * (fabs(val) >= 0.0001);
 }
 
-template <typename value_idx, typename value_t, int tpb = 256, typename expansion_f>
-void compute_euclidean(value_t* C,
-                       const value_t* Q_sq_norms,
-                       const value_t* R_sq_norms,
-                       value_idx n_rows,
-                       value_idx n_cols,
-                       cudaStream_t stream,
-                       expansion_f expansion_func)
-{
+template <typename value_idx, typename value_t, int tpb = 256,
+          typename expansion_f>
+void compute_euclidean(value_t *C, const value_t *Q_sq_norms,
+                       const value_t *R_sq_norms, value_idx n_rows,
+                       value_idx n_cols, cudaStream_t stream,
+                       expansion_f expansion_func) {
   int blocks = raft::ceildiv<size_t>((size_t)n_rows * n_cols, tpb);
   compute_euclidean_warp_kernel<<<blocks, tpb, 0, stream>>>(
     C, Q_sq_norms, R_sq_norms, n_rows, n_cols, expansion_func);
 }
 
-template <typename value_idx, typename value_t, int tpb = 256, typename expansion_f>
-void compute_l2(value_t* out,
-                const value_idx* Q_coo_rows,
-                const value_t* Q_data,
-                value_idx Q_nnz,
-                const value_idx* R_coo_rows,
-                const value_t* R_data,
-                value_idx R_nnz,
-                value_idx m,
-                value_idx n,
+template <typename value_idx, typename value_t, int tpb = 256,
+          typename expansion_f>
+void compute_l2(value_t *out, const value_idx *Q_coo_rows,
+                const value_t *Q_data, value_idx Q_nnz,
+                const value_idx *R_coo_rows, const value_t *R_data,
+                value_idx R_nnz, value_idx m, value_idx n,
                 std::shared_ptr<raft::mr::device::allocator> alloc,
-                cudaStream_t stream,
-                expansion_f expansion_func)
-{
+                cudaStream_t stream, expansion_f expansion_func) {
   rmm::device_uvector<value_t> Q_sq_norms(m, stream);
   rmm::device_uvector<value_t> R_sq_norms(n, stream);
-  CUDA_CHECK(cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t)));
-  CUDA_CHECK(cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t)));
+  CUDA_CHECK(
+    cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t)));
+  CUDA_CHECK(
+    cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t)));
 
   compute_row_norm_kernel<<<raft::ceildiv(Q_nnz, tpb), tpb, 0, stream>>>(
     Q_sq_norms.data(), Q_coo_rows, Q_data, Q_nnz);
   compute_row_norm_kernel<<<raft::ceildiv(R_nnz, tpb), tpb, 0, stream>>>(
     R_sq_norms.data(), R_coo_rows, R_data, R_nnz);
 
-  compute_euclidean(out, Q_sq_norms.data(), R_sq_norms.data(), m, n, stream, expansion_func);
+  compute_euclidean(out, Q_sq_norms.data(), R_sq_norms.data(), m, n, stream,
+                    expansion_func);
 }
 
 template <typename value_idx, typename value_t, int tpb = 256>
-void compute_correlation(value_t* C,
-                         const value_t* Q_sq_norms,
-                         const value_t* R_sq_norms,
-                         const value_t* Q_norms,
-                         const value_t* R_norms,
-                         value_idx n_rows,
-                         value_idx n_cols,
-                         value_idx n,
-                         cudaStream_t stream)
-{
+void compute_correlation(value_t *C, const value_t *Q_sq_norms,
+                         const value_t *R_sq_norms, const value_t *Q_norms,
+                         const value_t *R_norms, value_idx n_rows,
+                         value_idx n_cols, value_idx n, cudaStream_t stream) {
   int blocks = raft::ceildiv<size_t>((size_t)n_rows * n_cols, tpb);
   compute_correlation_warp_kernel<<<blocks, tpb, 0, stream>>>(
     C, Q_sq_norms, R_sq_norms, Q_norms, R_norms, n_rows, n_cols, n);
 }
 
 template <typename value_idx, typename value_t, int tpb = 256>
-void compute_corr(value_t* out,
-                  const value_idx* Q_coo_rows,
-                  const value_t* Q_data,
-                  value_idx Q_nnz,
-                  const value_idx* R_coo_rows,
-                  const value_t* R_data,
-                  value_idx R_nnz,
-                  value_idx m,
-                  value_idx n,
-                  value_idx n_cols,
+void compute_corr(value_t *out, const value_idx *Q_coo_rows,
+                  const value_t *Q_data, value_idx Q_nnz,
+                  const value_idx *R_coo_rows, const value_t *R_data,
+                  value_idx R_nnz, value_idx m, value_idx n, value_idx n_cols,
                   std::shared_ptr<raft::mr::device::allocator> alloc,
-                  cudaStream_t stream)
-{
+                  cudaStream_t stream) {
   // sum_sq for std dev
   rmm::device_uvector<value_t> Q_sq_norms(m, stream);
   rmm::device_uvector<value_t> R_sq_norms(n, stream);
@@ -195,11 +171,15 @@ void compute_corr(value_t* out,
   rmm::device_uvector<value_t> Q_norms(m, stream);
   rmm::device_uvector<value_t> R_norms(n, stream);
 
-  CUDA_CHECK(cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t)));
-  CUDA_CHECK(cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t)));
+  CUDA_CHECK(
+    cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t)));
+  CUDA_CHECK(
+    cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t)));
 
-  CUDA_CHECK(cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t)));
-  CUDA_CHECK(cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t)));
+  CUDA_CHECK(
+    cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t)));
+  CUDA_CHECK(
+    cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t)));
 
   compute_row_norm_kernel<<<raft::ceildiv(Q_nnz, tpb), tpb, 0, stream>>>(
     Q_sq_norms.data(), Q_coo_rows, Q_data, Q_nnz);
@@ -211,15 +191,8 @@ void compute_corr(value_t* out,
   compute_row_sum_kernel<<<raft::ceildiv(R_nnz, tpb), tpb, 0, stream>>>(
     R_norms.data(), R_coo_rows, R_data, R_nnz);
 
-  compute_correlation(out,
-                      Q_sq_norms.data(),
-                      R_sq_norms.data(),
-                      Q_norms.data(),
-                      R_norms.data(),
-                      m,
-                      n,
-                      n_cols,
-                      stream);
+  compute_correlation(out, Q_sq_norms.data(), R_sq_norms.data(), Q_norms.data(),
+                      R_norms.data(), m, n, n_cols, stream);
 }
 
 /**
@@ -229,45 +202,35 @@ void compute_corr(value_t* out,
 template <typename value_idx = int, typename value_t = float>
 class l2_expanded_distances_t : public distances_t<value_t> {
  public:
-  explicit l2_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
-    : config_(&config), ip_dists(config)
-  {
-  }
+  explicit l2_expanded_distances_t(
+    const distances_config_t<value_idx, value_t> &config)
+    : config_(&config), ip_dists(config) {}
 
-  void compute(value_t* out_dists)
-  {
+  void compute(value_t *out_dists) {
     ip_dists.compute(out_dists);
 
-    value_idx* b_indices = ip_dists.b_rows_coo();
-    value_t* b_data      = ip_dists.b_data_coo();
+    value_idx *b_indices = ip_dists.b_rows_coo();
+    value_t *b_data = ip_dists.b_data_coo();
 
-    rmm::device_uvector<value_idx> search_coo_rows(config_->a_nnz, config_->handle.get_stream());
-    raft::sparse::convert::csr_to_coo(config_->a_indptr,
-                                      config_->a_nrows,
-                                      search_coo_rows.data(),
-                                      config_->a_nnz,
+    rmm::device_uvector<value_idx> search_coo_rows(
+      config_->a_nnz, config_->handle.get_stream());
+    raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
+                                      search_coo_rows.data(), config_->a_nnz,
                                       config_->handle.get_stream());
 
-    compute_l2(out_dists,
-               search_coo_rows.data(),
-               config_->a_data,
-               config_->a_nnz,
-               b_indices,
-               b_data,
-               config_->b_nnz,
-               config_->a_nrows,
-               config_->b_nrows,
-               config_->handle.get_device_allocator(),
-               config_->handle.get_stream(),
-               [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
-                 return -2 * dot + q_norm + r_norm;
-               });
+    compute_l2(
+      out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz,
+      b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows,
+      config_->handle.get_device_allocator(), config_->handle.get_stream(),
+      [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
+        return -2 * dot + q_norm + r_norm;
+      });
   }
 
   ~l2_expanded_distances_t() = default;
 
  protected:
-  const distances_config_t<value_idx, value_t>* config_;
+  const distances_config_t<value_idx, value_t> *config_;
   ip_distances_t<value_idx, value_t> ip_dists;
 };
 
@@ -276,21 +239,18 @@ class l2_expanded_distances_t : public distances_t<value_t> {
  * The expanded form is more efficient for sparse data.
  */
 template <typename value_idx = int, typename value_t = float>
-class l2_sqrt_expanded_distances_t : public l2_expanded_distances_t<value_idx, value_t> {
+class l2_sqrt_expanded_distances_t
+  : public l2_expanded_distances_t<value_idx, value_t> {
  public:
-  explicit l2_sqrt_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
-    : l2_expanded_distances_t<value_idx, value_t>(config)
-  {
-  }
+  explicit l2_sqrt_expanded_distances_t(
+    const distances_config_t<value_idx, value_t> &config)
+    : l2_expanded_distances_t<value_idx, value_t>(config) {}
 
-  void compute(value_t* out_dists) override
-  {
+  void compute(value_t *out_dists) override {
     l2_expanded_distances_t<value_idx, value_t>::compute(out_dists);
     // Sqrt Post-processing
     raft::linalg::unaryOp<value_t>(
-      out_dists,
-      out_dists,
-      this->config_->a_nrows * this->config_->b_nrows,
+      out_dists, out_dists, this->config_->a_nrows * this->config_->b_nrows,
       [] __device__(value_t input) {
         int neg = input < 0 ? -1 : 1;
         return sqrt(abs(input) * neg);
@@ -304,35 +264,25 @@ class l2_sqrt_expanded_distances_t : public l2_expanded_distances_t<value_idx, v
 template <typename value_idx, typename value_t>
 class correlation_expanded_distances_t : public distances_t<value_t> {
  public:
-  explicit correlation_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
-    : config_(&config), ip_dists(config)
-  {
-  }
+  explicit correlation_expanded_distances_t(
+    const distances_config_t<value_idx, value_t> &config)
+    : config_(&config), ip_dists(config) {}
 
-  void compute(value_t* out_dists)
-  {
+  void compute(value_t *out_dists) {
     ip_dists.compute(out_dists);
 
-    value_idx* b_indices = ip_dists.b_rows_coo();
-    value_t* b_data      = ip_dists.b_data_coo();
+    value_idx *b_indices = ip_dists.b_rows_coo();
+    value_t *b_data = ip_dists.b_data_coo();
 
-    rmm::device_uvector<value_idx> search_coo_rows(config_->a_nnz, config_->handle.get_stream());
-    raft::sparse::convert::csr_to_coo(config_->a_indptr,
-                                      config_->a_nrows,
-                                      search_coo_rows.data(),
-                                      config_->a_nnz,
+    rmm::device_uvector<value_idx> search_coo_rows(
+      config_->a_nnz, config_->handle.get_stream());
+    raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
+                                      search_coo_rows.data(), config_->a_nnz,
                                       config_->handle.get_stream());
 
-    compute_corr(out_dists,
-                 search_coo_rows.data(),
-                 config_->a_data,
-                 config_->a_nnz,
-                 b_indices,
-                 b_data,
-                 config_->b_nnz,
-                 config_->a_nrows,
-                 config_->b_nrows,
-                 config_->b_ncols,
+    compute_corr(out_dists, search_coo_rows.data(), config_->a_data,
+                 config_->a_nnz, b_indices, b_data, config_->b_nnz,
+                 config_->a_nrows, config_->b_nrows, config_->b_ncols,
                  config_->handle.get_device_allocator(),
                  config_->handle.get_stream());
   }
@@ -340,62 +290,54 @@ class correlation_expanded_distances_t : public distances_t<value_t> {
   ~correlation_expanded_distances_t() = default;
 
  protected:
-  const distances_config_t<value_idx, value_t>* config_;
+  const distances_config_t<value_idx, value_t> *config_;
   ip_distances_t<value_idx, value_t> ip_dists;
 };
 
 /**
- * Cosine distance using the expanded form: 1 - ( sum(x_k * y_k) / (sqrt(sum(x_k)^2) *
- * sqrt(sum(y_k)^2))) The expanded form is more efficient for sparse data.
+ * Cosine distance using the expanded form: 1 - ( sum(x_k * y_k) / (sqrt(sum(x_k)^2) * sqrt(sum(y_k)^2)))
+ * The expanded form is more efficient for sparse data.
  */
 template <typename value_idx = int, typename value_t = float>
 class cosine_expanded_distances_t : public distances_t<value_t> {
  public:
-  explicit cosine_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
-    : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config)
-  {
-  }
+  explicit cosine_expanded_distances_t(
+    const distances_config_t<value_idx, value_t> &config)
+    : config_(&config),
+      workspace(0, config.handle.get_stream()),
+      ip_dists(config) {}
 
-  void compute(value_t* out_dists)
-  {
+  void compute(value_t *out_dists) {
     ip_dists.compute(out_dists);
 
-    value_idx* b_indices = ip_dists.b_rows_coo();
-    value_t* b_data      = ip_dists.b_data_coo();
+    value_idx *b_indices = ip_dists.b_rows_coo();
+    value_t *b_data = ip_dists.b_data_coo();
 
-    rmm::device_uvector<value_idx> search_coo_rows(config_->a_nnz, config_->handle.get_stream());
-    raft::sparse::convert::csr_to_coo(config_->a_indptr,
-                                      config_->a_nrows,
-                                      search_coo_rows.data(),
-                                      config_->a_nnz,
+    rmm::device_uvector<value_idx> search_coo_rows(
+      config_->a_nnz, config_->handle.get_stream());
+    raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
+                                      search_coo_rows.data(), config_->a_nnz,
                                       config_->handle.get_stream());
 
-    compute_l2(out_dists,
-               search_coo_rows.data(),
-               config_->a_data,
-               config_->a_nnz,
-               b_indices,
-               b_data,
-               config_->b_nnz,
-               config_->a_nrows,
-               config_->b_nrows,
-               config_->handle.get_device_allocator(),
-               config_->handle.get_stream(),
-               [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
-                 value_t norms = sqrt(q_norm) * sqrt(r_norm);
-                 // deal with potential for 0 in denominator by forcing 0/1 instead
-                 value_t cos = ((norms != 0) * dot) / ((norms == 0) + norms);
-
-                 // flip the similarity when both rows are 0
-                 bool both_empty = (q_norm == 0) && (r_norm == 0);
-                 return 1 - ((!both_empty * cos) + both_empty);
-               });
+    compute_l2(
+      out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz,
+      b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows,
+      config_->handle.get_device_allocator(), config_->handle.get_stream(),
+      [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
+        value_t norms = sqrt(q_norm) * sqrt(r_norm);
+        // deal with potential for 0 in denominator by forcing 0/1 instead
+        value_t cos = ((norms != 0) * dot) / ((norms == 0) + norms);
+
+        // flip the similarity when both rows are 0
+        bool both_empty = (q_norm == 0) && (r_norm == 0);
+        return 1 - ((!both_empty * cos) + both_empty);
+      });
   }
 
   ~cosine_expanded_distances_t() = default;
 
  private:
-  const distances_config_t<value_idx, value_t>* config_;
+  const distances_config_t<value_idx, value_t> *config_;
   rmm::device_uvector<char> workspace;
   ip_distances_t<value_idx, value_t> ip_dists;
 };
@@ -412,34 +354,25 @@ class cosine_expanded_distances_t : public distances_t<value_t> {
 template <typename value_idx = int, typename value_t = float>
 class hellinger_expanded_distances_t : public distances_t<value_t> {
  public:
-  explicit hellinger_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
-    : config_(&config), workspace(0, config.handle.get_stream())
-  {
-  }
+  explicit hellinger_expanded_distances_t(
+    const distances_config_t<value_idx, value_t> &config)
+    : config_(&config), workspace(0, config.handle.get_stream()) {}
 
-  void compute(value_t* out_dists)
-  {
+  void compute(value_t *out_dists) {
     rmm::device_uvector<value_idx> coo_rows(max(config_->b_nnz, config_->a_nnz),
                                             config_->handle.get_stream());
 
-    raft::sparse::convert::csr_to_coo(config_->b_indptr,
-                                      config_->b_nrows,
-                                      coo_rows.data(),
-                                      config_->b_nnz,
+    raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows,
+                                      coo_rows.data(), config_->b_nnz,
                                       config_->handle.get_stream());
 
     balanced_coo_pairwise_generalized_spmv<value_idx, value_t>(
-      out_dists,
-      *config_,
-      coo_rows.data(),
-      [] __device__(value_t a, value_t b) { return sqrt(a) * sqrt(b); },
-      Sum(),
+      out_dists, *config_, coo_rows.data(),
+      [] __device__(value_t a, value_t b) { return sqrt(a) * sqrt(b); }, Sum(),
       AtomicAdd());
 
     raft::linalg::unaryOp<value_t>(
-      out_dists,
-      out_dists,
-      config_->a_nrows * config_->b_nrows,
+      out_dists, out_dists, config_->a_nrows * config_->b_nrows,
       [=] __device__(value_t input) {
         // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative
         bool rectifier = (1 - input) > 0;
@@ -451,43 +384,42 @@ class hellinger_expanded_distances_t : public distances_t<value_t> {
   ~hellinger_expanded_distances_t() = default;
 
  private:
-  const distances_config_t<value_idx, value_t>* config_;
+  const distances_config_t<value_idx, value_t> *config_;
   rmm::device_uvector<char> workspace;
 };
 
 template <typename value_idx = int, typename value_t = float>
 class russelrao_expanded_distances_t : public distances_t<value_t> {
  public:
-  explicit russelrao_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
-    : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config)
-  {
-  }
+  explicit russelrao_expanded_distances_t(
+    const distances_config_t<value_idx, value_t> &config)
+    : config_(&config),
+      workspace(0, config.handle.get_stream()),
+      ip_dists(config) {}
 
-  void compute(value_t* out_dists)
-  {
+  void compute(value_t *out_dists) {
     ip_dists.compute(out_dists);
 
-    value_t n_cols     = config_->a_ncols;
+    value_t n_cols = config_->a_ncols;
     value_t n_cols_inv = 1.0 / n_cols;
     raft::linalg::unaryOp<value_t>(
-      out_dists,
-      out_dists,
-      config_->a_nrows * config_->b_nrows,
+      out_dists, out_dists, config_->a_nrows * config_->b_nrows,
       [=] __device__(value_t input) { return (n_cols - input) * n_cols_inv; },
       config_->handle.get_stream());
 
-    auto exec_policy  = rmm::exec_policy(config_->handle.get_stream());
-    auto diags        = thrust::counting_iterator<value_idx>(0);
+    auto exec_policy = rmm::exec_policy(config_->handle.get_stream());
+    auto diags = thrust::counting_iterator<value_idx>(0);
     value_idx b_nrows = config_->b_nrows;
-    thrust::for_each(exec_policy, diags, diags + config_->a_nrows, [=] __device__(value_idx input) {
-      out_dists[input * b_nrows + input] = 0.0;
-    });
+    thrust::for_each(exec_policy, diags, diags + config_->a_nrows,
+                     [=] __device__(value_idx input) {
+                       out_dists[input * b_nrows + input] = 0.0;
+                     });
   }
 
   ~russelrao_expanded_distances_t() = default;
 
  private:
-  const distances_config_t<value_idx, value_t>* config_;
+  const distances_config_t<value_idx, value_t> *config_;
   rmm::device_uvector<char> workspace;
   ip_distances_t<value_idx, value_t> ip_dists;
 };
diff --git a/cpp/include/raft/sparse/distance/lp_distance.cuh b/cpp/include/raft/sparse/distance/lp_distance.cuh
index 7f9511ff03..885d55ee50 100644
--- a/cpp/include/raft/sparse/distance/lp_distance.cuh
+++ b/cpp/include/raft/sparse/distance/lp_distance.cuh
@@ -38,33 +38,23 @@ namespace raft {
 namespace sparse {
 namespace distance {
 
-template <typename value_idx = int,
-          typename value_t   = float,
-          typename product_f,
-          typename accum_f,
-          typename write_f>
-void unexpanded_lp_distances(value_t* out_dists,
-                             const distances_config_t<value_idx, value_t>* config_,
-                             product_f product_func,
-                             accum_f accum_func,
-                             write_f write_func)
-{
+template <typename value_idx = int, typename value_t = float,
+          typename product_f, typename accum_f, typename write_f>
+void unexpanded_lp_distances(
+  value_t *out_dists, const distances_config_t<value_idx, value_t> *config_,
+  product_f product_func, accum_f accum_func, write_f write_func) {
   rmm::device_uvector<value_idx> coo_rows(max(config_->b_nnz, config_->a_nnz),
                                           config_->handle.get_stream());
 
-  raft::sparse::convert::csr_to_coo(config_->b_indptr,
-                                    config_->b_nrows,
-                                    coo_rows.data(),
-                                    config_->b_nnz,
+  raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows,
+                                    coo_rows.data(), config_->b_nnz,
                                     config_->handle.get_stream());
 
   balanced_coo_pairwise_generalized_spmv<value_idx, value_t>(
     out_dists, *config_, coo_rows.data(), product_func, accum_func, write_func);
 
-  raft::sparse::convert::csr_to_coo(config_->a_indptr,
-                                    config_->a_nrows,
-                                    coo_rows.data(),
-                                    config_->a_nnz,
+  raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
+                                    coo_rows.data(), config_->a_nnz,
                                     config_->handle.get_stream());
 
   balanced_coo_pairwise_generalized_spmv_rev<value_idx, value_t>(
@@ -81,51 +71,48 @@ void unexpanded_lp_distances(value_t* out_dists,
 template <typename value_idx = int, typename value_t = float>
 class l1_unexpanded_distances_t : public distances_t<value_t> {
  public:
-  l1_unexpanded_distances_t(const distances_config_t<value_idx, value_t>& config) : config_(&config)
-  {
-  }
+  l1_unexpanded_distances_t(
+    const distances_config_t<value_idx, value_t> &config)
+    : config_(&config) {}
 
-  void compute(value_t* out_dists)
-  {
-    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, AbsDiff(), Sum(), AtomicAdd());
+  void compute(value_t *out_dists) {
+    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, AbsDiff(),
+                                                Sum(), AtomicAdd());
   }
 
  private:
-  const distances_config_t<value_idx, value_t>* config_;
+  const distances_config_t<value_idx, value_t> *config_;
 };
 
 template <typename value_idx = int, typename value_t = float>
 class l2_unexpanded_distances_t : public distances_t<value_t> {
  public:
-  l2_unexpanded_distances_t(const distances_config_t<value_idx, value_t>& config) : config_(&config)
-  {
-  }
+  l2_unexpanded_distances_t(
+    const distances_config_t<value_idx, value_t> &config)
+    : config_(&config) {}
 
-  void compute(value_t* out_dists)
-  {
-    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, SqDiff(), Sum(), AtomicAdd());
+  void compute(value_t *out_dists) {
+    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, SqDiff(),
+                                                Sum(), AtomicAdd());
   }
 
  protected:
-  const distances_config_t<value_idx, value_t>* config_;
+  const distances_config_t<value_idx, value_t> *config_;
 };
 
 template <typename value_idx = int, typename value_t = float>
-class l2_sqrt_unexpanded_distances_t : public l2_unexpanded_distances_t<value_idx, value_t> {
+class l2_sqrt_unexpanded_distances_t
+  : public l2_unexpanded_distances_t<value_idx, value_t> {
  public:
-  l2_sqrt_unexpanded_distances_t(const distances_config_t<value_idx, value_t>& config)
-    : l2_unexpanded_distances_t<value_idx, value_t>(config)
-  {
-  }
+  l2_sqrt_unexpanded_distances_t(
+    const distances_config_t<value_idx, value_t> &config)
+    : l2_unexpanded_distances_t<value_idx, value_t>(config) {}
 
-  void compute(value_t* out_dists)
-  {
+  void compute(value_t *out_dists) {
     l2_unexpanded_distances_t<value_idx, value_t>::compute(out_dists);
     // Sqrt Post-processing
     raft::linalg::unaryOp<value_t>(
-      out_dists,
-      out_dists,
-      this->config_->a_nrows * this->config_->b_nrows,
+      out_dists, out_dists, this->config_->a_nrows * this->config_->b_nrows,
       [] __device__(value_t input) {
         int neg = input < 0 ? -1 : 1;
         return sqrt(abs(input) * neg);
@@ -137,33 +124,29 @@ class l2_sqrt_unexpanded_distances_t : public l2_unexpanded_distances_t<value_id
 template <typename value_idx = int, typename value_t = float>
 class linf_unexpanded_distances_t : public distances_t<value_t> {
  public:
-  explicit linf_unexpanded_distances_t(const distances_config_t<value_idx, value_t>& config)
-    : config_(&config)
-  {
-  }
+  explicit linf_unexpanded_distances_t(
+    const distances_config_t<value_idx, value_t> &config)
+    : config_(&config) {}
 
-  void compute(value_t* out_dists)
-  {
-    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, AbsDiff(), Max(), AtomicMax());
+  void compute(value_t *out_dists) {
+    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, AbsDiff(),
+                                                Max(), AtomicMax());
   }
 
  private:
-  const distances_config_t<value_idx, value_t>* config_;
+  const distances_config_t<value_idx, value_t> *config_;
 };
 
 template <typename value_idx = int, typename value_t = float>
 class canberra_unexpanded_distances_t : public distances_t<value_t> {
  public:
-  explicit canberra_unexpanded_distances_t(const distances_config_t<value_idx, value_t>& config)
-    : config_(&config)
-  {
-  }
+  explicit canberra_unexpanded_distances_t(
+    const distances_config_t<value_idx, value_t> &config)
+    : config_(&config) {}
 
-  void compute(value_t* out_dists)
-  {
+  void compute(value_t *out_dists) {
     unexpanded_lp_distances<value_idx, value_t>(
-      out_dists,
-      config_,
+      out_dists, config_,
       [] __device__(value_t a, value_t b) {
         value_t d = fabs(a) + fabs(b);
 
@@ -171,82 +154,70 @@ class canberra_unexpanded_distances_t : public distances_t<value_t> {
         // forcing 1/0 instead
         return ((d != 0) * fabs(a - b)) / (d + (d == 0));
       },
-      Sum(),
-      AtomicAdd());
+      Sum(), AtomicAdd());
   }
 
  private:
-  const distances_config_t<value_idx, value_t>* config_;
+  const distances_config_t<value_idx, value_t> *config_;
 };
 
 template <typename value_idx = int, typename value_t = float>
 class lp_unexpanded_distances_t : public distances_t<value_t> {
  public:
-  explicit lp_unexpanded_distances_t(const distances_config_t<value_idx, value_t>& config,
-                                     value_t p_)
-    : config_(&config), p(p_)
-  {
-  }
+  explicit lp_unexpanded_distances_t(
+    const distances_config_t<value_idx, value_t> &config, value_t p_)
+    : config_(&config), p(p_) {}
 
-  void compute(value_t* out_dists)
-  {
-    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, PDiff(p), Sum(), AtomicAdd());
+  void compute(value_t *out_dists) {
+    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, PDiff(p),
+                                                Sum(), AtomicAdd());
 
     float one_over_p = 1.0f / p;
     raft::linalg::unaryOp<value_t>(
-      out_dists,
-      out_dists,
-      config_->a_nrows * config_->b_nrows,
+      out_dists, out_dists, config_->a_nrows * config_->b_nrows,
       [=] __device__(value_t input) { return pow(input, one_over_p); },
       config_->handle.get_stream());
   }
 
  private:
-  const distances_config_t<value_idx, value_t>* config_;
+  const distances_config_t<value_idx, value_t> *config_;
   value_t p;
 };
 
 template <typename value_idx = int, typename value_t = float>
 class hamming_unexpanded_distances_t : public distances_t<value_t> {
  public:
-  explicit hamming_unexpanded_distances_t(const distances_config_t<value_idx, value_t>& config)
-    : config_(&config)
-  {
-  }
+  explicit hamming_unexpanded_distances_t(
+    const distances_config_t<value_idx, value_t> &config)
+    : config_(&config) {}
 
-  void compute(value_t* out_dists)
-  {
-    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, NotEqual(), Sum(), AtomicAdd());
+  void compute(value_t *out_dists) {
+    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, NotEqual(),
+                                                Sum(), AtomicAdd());
 
     value_t n_cols = 1.0 / config_->a_ncols;
     raft::linalg::unaryOp<value_t>(
-      out_dists,
-      out_dists,
-      config_->a_nrows * config_->b_nrows,
+      out_dists, out_dists, config_->a_nrows * config_->b_nrows,
       [=] __device__(value_t input) { return input * n_cols; },
       config_->handle.get_stream());
   }
 
  private:
-  const distances_config_t<value_idx, value_t>* config_;
+  const distances_config_t<value_idx, value_t> *config_;
 };
 
 template <typename value_idx = int, typename value_t = float>
 class jensen_shannon_unexpanded_distances_t : public distances_t<value_t> {
  public:
   explicit jensen_shannon_unexpanded_distances_t(
-    const distances_config_t<value_idx, value_t>& config)
-    : config_(&config)
-  {
-  }
+    const distances_config_t<value_idx, value_t> &config)
+    : config_(&config) {}
 
-  void compute(value_t* out_dists)
-  {
+  void compute(value_t *out_dists) {
     unexpanded_lp_distances<value_idx, value_t>(
-      out_dists,
-      config_,
+      out_dists, config_,
       [] __device__(value_t a, value_t b) {
-        value_t m   = 0.5f * (a + b);
+        value_t m = 0.5f * (a + b);
         bool a_zero = a == 0;
         bool b_zero = b == 0;
 
@@ -256,61 +227,49 @@ class jensen_shannon_unexpanded_distances_t : public distances_t<value_t> {
         bool x_zero = x == 0;
         bool y_zero = y == 0;
 
-        return (-a * (!x_zero * log(x + x_zero))) + (-b * (!y_zero * log(y + y_zero)));
+        return (-a * (!x_zero * log(x + x_zero))) +
+               (-b * (!y_zero * log(y + y_zero)));
       },
-      Sum(),
-      AtomicAdd());
+      Sum(), AtomicAdd());
 
     raft::linalg::unaryOp<value_t>(
-      out_dists,
-      out_dists,
-      config_->a_nrows * config_->b_nrows,
+      out_dists, out_dists, config_->a_nrows * config_->b_nrows,
       [=] __device__(value_t input) { return sqrt(0.5 * input); },
       config_->handle.get_stream());
   }
 
  private:
-  const distances_config_t<value_idx, value_t>* config_;
+  const distances_config_t<value_idx, value_t> *config_;
 };
 
 template <typename value_idx = int, typename value_t = float>
 class kl_divergence_unexpanded_distances_t : public distances_t<value_t> {
  public:
   explicit kl_divergence_unexpanded_distances_t(
-    const distances_config_t<value_idx, value_t>& config)
-    : config_(&config)
-  {
-  }
+    const distances_config_t<value_idx, value_t> &config)
+    : config_(&config) {}
 
-  void compute(value_t* out_dists)
-  {
+  void compute(value_t *out_dists) {
     rmm::device_uvector<value_idx> coo_rows(max(config_->b_nnz, config_->a_nnz),
                                             config_->handle.get_stream());
 
-    raft::sparse::convert::csr_to_coo(config_->b_indptr,
-                                      config_->b_nrows,
-                                      coo_rows.data(),
-                                      config_->b_nnz,
+    raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows,
+                                      coo_rows.data(), config_->b_nnz,
                                       config_->handle.get_stream());
 
     balanced_coo_pairwise_generalized_spmv<value_idx, value_t>(
-      out_dists,
-      *config_,
-      coo_rows.data(),
-      [] __device__(value_t a, value_t b) { return a * log(a / b); },
-      Sum(),
+      out_dists, *config_, coo_rows.data(),
+      [] __device__(value_t a, value_t b) { return a * log(a / b); }, Sum(),
       AtomicAdd());
 
     raft::linalg::unaryOp<value_t>(
-      out_dists,
-      out_dists,
-      config_->a_nrows * config_->b_nrows,
+      out_dists, out_dists, config_->a_nrows * config_->b_nrows,
       [=] __device__(value_t input) { return 0.5 * input; },
       config_->handle.get_stream());
   }
 
  private:
-  const distances_config_t<value_idx, value_t>* config_;
+  const distances_config_t<value_idx, value_t> *config_;
 };
 
 };  // END namespace distance
diff --git a/cpp/include/raft/sparse/distance/operators.cuh b/cpp/include/raft/sparse/distance/operators.cuh
index 3a9d0ba879..89acda8b1a 100644
--- a/cpp/include/raft/sparse/distance/operators.cuh
+++ b/cpp/include/raft/sparse/distance/operators.cuh
@@ -24,24 +24,21 @@ namespace distance {
 
 struct Sum {
   template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b)
-  {
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
     return a + b;
   }
 };
 
 struct NotEqual {
   template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b)
-  {
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
     return a != b;
   }
 };
 
 struct SqDiff {
   template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b)
-  {
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
     return (a - b) * (a - b);
   }
 };
@@ -52,48 +49,44 @@ struct PDiff {
   PDiff(float p_) : p(p_) {}
 
   template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b)
-  {
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
     return pow(a - b, p);
   }
 };
 
 struct Max {
   template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b)
-  {
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
     return fmax(a, b);
   }
 };
 
 struct AtomicAdd {
   template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t* a, value_t b)
-  {
+  __host__ __device__ __forceinline__ value_t operator()(value_t *a,
+                                                         value_t b) {
     return atomicAdd(a, b);
   }
 };
 
 struct AtomicMax {
   template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t* a, value_t b)
-  {
+  __host__ __device__ __forceinline__ value_t operator()(value_t *a,
+                                                         value_t b) {
     return atomicMax(a, b);
   }
 };
 
 struct Product {
   template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b)
-  {
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
     return a * b;
   }
 };
 
 struct AbsDiff {
   template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b)
-  {
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
     return fabs(a - b);
   }
 };
diff --git a/cpp/include/raft/sparse/distance/utils.cuh b/cpp/include/raft/sparse/distance/utils.cuh
index d78b927e46..6b6d77a2d5 100644
--- a/cpp/include/raft/sparse/distance/utils.cuh
+++ b/cpp/include/raft/sparse/distance/utils.cuh
@@ -34,10 +34,10 @@ namespace distance {
  * @return the maximum number of columns that can be stored in smem
  */
 template <typename value_idx, typename value_t, int tpb = 1024>
-inline int max_cols_per_block()
-{
+inline int max_cols_per_block() {
   // max cols = (total smem available - cub reduction smem)
-  return (raft::getSharedMemPerBlock() - ((tpb / raft::warp_size()) * sizeof(value_t))) /
+  return (raft::getSharedMemPerBlock() -
+          ((tpb / raft::warp_size()) * sizeof(value_t))) /
          sizeof(value_t);
 }
 
diff --git a/cpp/include/raft/sparse/hierarchy/common.h b/cpp/include/raft/sparse/hierarchy/common.h
index 1738dd7498..29f541498b 100644
--- a/cpp/include/raft/sparse/hierarchy/common.h
+++ b/cpp/include/raft/sparse/hierarchy/common.h
@@ -37,15 +37,13 @@ class linkage_output {
   value_idx n_leaves;
   value_idx n_connected_components;
 
-  value_idx* labels;  // size: m
+  value_idx *labels;  // size: m
 
-  value_idx* children;  // size: (m-1, 2)
+  value_idx *children;  // size: (m-1, 2)
 };
 
-class linkage_output_int_float : public linkage_output<int, float> {
-};
-class linkage_output__int64_float : public linkage_output<int64_t, float> {
-};
+class linkage_output_int_float : public linkage_output<int, float> {};
+class linkage_output__int64_float : public linkage_output<int64_t, float> {};
 
 };  // namespace hierarchy
 };  // namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh
index 95df7f4642..1ac075489a 100644
--- a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh
+++ b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh
@@ -42,32 +42,31 @@ class UnionFind {
   value_idx n_indices;
 
   UnionFind(value_idx N_)
-    : n_indices(2 * N_ - 1), parent(2 * N_ - 1, -1), size(2 * N_ - 1, 1), next_label(N_)
-  {
+    : n_indices(2 * N_ - 1),
+      parent(2 * N_ - 1, -1),
+      size(2 * N_ - 1, 1),
+      next_label(N_) {
     memset(size.data() + N_, 0, (size.size() - N_) * sizeof(value_idx));
   }
 
-  value_idx find(value_idx n)
-  {
+  value_idx find(value_idx n) {
     value_idx p;
     p = n;
 
-    while (parent[n] != -1)
-      n = parent[n];
+    while (parent[n] != -1) n = parent[n];
 
     // path compression
     while (parent[p] != n) {
-      p                                   = parent[p == -1 ? n_indices - 1 : p];
+      p = parent[p == -1 ? n_indices - 1 : p];
       parent[p == -1 ? n_indices - 1 : p] = n;
     }
     return n;
   }
 
-  void perform_union(value_idx m, value_idx n)
-  {
+  void perform_union(value_idx m, value_idx n) {
     size[next_label] = size[m] + size[n];
-    parent[m]        = next_label;
-    parent[n]        = next_label;
+    parent[m] = next_label;
+    parent[n] = next_label;
 
     next_label += 1;
   }
@@ -96,17 +95,12 @@ class UnionFind {
  * @param[out] out_size cluster sizes of output
  */
 template <typename value_idx, typename value_t>
-void build_dendrogram_host(const handle_t& handle,
-                           const value_idx* rows,
-                           const value_idx* cols,
-                           const value_t* data,
-                           size_t nnz,
-                           value_idx* children,
-                           value_t* out_delta,
-                           value_idx* out_size)
-{
+void build_dendrogram_host(const handle_t &handle, const value_idx *rows,
+                           const value_idx *cols, const value_t *data,
+                           size_t nnz, value_idx *children, value_t *out_delta,
+                           value_idx *out_size) {
   auto d_alloc = handle.get_device_allocator();
-  auto stream  = handle.get_stream();
+  auto stream = handle.get_stream();
 
   value_idx n_edges = nnz;
 
@@ -127,8 +121,8 @@ void build_dendrogram_host(const handle_t& handle,
   UnionFind<value_idx, value_t> U(nnz + 1);
 
   for (value_idx i = 0; i < nnz; i++) {
-    value_idx a   = mst_src_h[i];
-    value_idx b   = mst_dst_h[i];
+    value_idx a = mst_src_h[i];
+    value_idx b = mst_dst_h[i];
     value_t delta = mst_weights_h[i];
 
     value_idx aa = U.find(a);
@@ -136,10 +130,10 @@ void build_dendrogram_host(const handle_t& handle,
 
     value_idx children_idx = i * 2;
 
-    children_h[children_idx]     = aa;
+    children_h[children_idx] = aa;
     children_h[children_idx + 1] = bb;
-    out_delta_h[i]               = delta;
-    out_size_h[i]                = U.size[aa] + U.size[bb];
+    out_delta_h[i] = delta;
+    out_size_h[i] = U.size[aa] + U.size[bb];
 
     U.perform_union(aa, bb);
   }
@@ -150,15 +144,13 @@ void build_dendrogram_host(const handle_t& handle,
 }
 
 template <typename value_idx>
-__global__ void write_levels_kernel(const value_idx* children,
-                                    value_idx* parents,
-                                    value_idx n_vertices)
-{
+__global__ void write_levels_kernel(const value_idx *children,
+                                    value_idx *parents, value_idx n_vertices) {
   value_idx tid = blockDim.x * blockIdx.x + threadIdx.x;
   if (tid < n_vertices) {
     value_idx level = tid / 2;
     value_idx child = children[tid];
-    parents[child]  = level;
+    parents[child] = level;
   }
 }
 
@@ -174,17 +166,14 @@ __global__ void write_levels_kernel(const value_idx* children,
  * @param labels
  */
 template <typename value_idx>
-__global__ void inherit_labels(const value_idx* children,
-                               const value_idx* levels,
-                               size_t n_leaves,
-                               value_idx* labels,
-                               int cut_level,
-                               value_idx n_vertices)
-{
+__global__ void inherit_labels(const value_idx *children,
+                               const value_idx *levels, size_t n_leaves,
+                               value_idx *labels, int cut_level,
+                               value_idx n_vertices) {
   value_idx tid = blockDim.x * blockIdx.x + threadIdx.x;
 
   if (tid < n_vertices) {
-    value_idx node      = children[tid];
+    value_idx node = children[tid];
     value_idx cur_level = tid / 2;
 
     /**
@@ -194,12 +183,12 @@ __global__ void inherit_labels(const value_idx* children,
     if (cur_level > cut_level) return;
 
     value_idx cur_parent = node;
-    value_idx label      = labels[cur_parent];
+    value_idx label = labels[cur_parent];
 
     while (label == -1) {
       cur_parent = cur_level + n_leaves;
-      cur_level  = levels[cur_parent];
-      label      = labels[cur_parent];
+      cur_level = levels[cur_parent];
+      label = labels[cur_parent];
     }
 
     labels[node] = label;
@@ -208,16 +197,15 @@ __global__ void inherit_labels(const value_idx* children,
 
 template <typename value_idx>
 struct init_label_roots {
-  init_label_roots(value_idx* labels_) : labels(labels_) {}
+  init_label_roots(value_idx *labels_) : labels(labels_) {}
 
   template <typename Tuple>
-  __host__ __device__ void operator()(Tuple t)
-  {
+  __host__ __device__ void operator()(Tuple t) {
     labels[thrust::get<1>(t)] = thrust::get<0>(t);
   }
 
  private:
-  value_idx* labels;
+  value_idx *labels;
 };
 
 /**
@@ -233,14 +221,11 @@ struct init_label_roots {
  * @param n_leaves
  */
 template <typename value_idx, int tpb = 256>
-void extract_flattened_clusters(const raft::handle_t& handle,
-                                value_idx* labels,
-                                const value_idx* children,
-                                size_t n_clusters,
-                                size_t n_leaves)
-{
-  auto d_alloc       = handle.get_device_allocator();
-  auto stream        = handle.get_stream();
+void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels,
+                                const value_idx *children, size_t n_clusters,
+                                size_t n_leaves) {
+  auto d_alloc = handle.get_device_allocator();
+  auto stream = handle.get_stream();
   auto thrust_policy = rmm::exec_policy(rmm::cuda_stream_view{stream});
 
   // Handle special case where n_clusters == 1
@@ -258,8 +243,10 @@ void extract_flattened_clusters(const raft::handle_t& handle,
 
     size_t n_edges = (n_leaves - 1) * 2;
 
-    thrust::device_ptr<const value_idx> d_ptr = thrust::device_pointer_cast(children);
-    value_idx n_vertices = *(thrust::max_element(thrust_policy, d_ptr, d_ptr + n_edges)) + 1;
+    thrust::device_ptr<const value_idx> d_ptr =
+      thrust::device_pointer_cast(children);
+    value_idx n_vertices =
+      *(thrust::max_element(thrust_policy, d_ptr, d_ptr + n_edges)) + 1;
 
     // Prevent potential infinite loop from labeling disconnected
     // connectivities graph.
@@ -270,7 +257,8 @@ void extract_flattened_clusters(const raft::handle_t& handle,
     rmm::device_uvector<value_idx> levels(n_vertices, stream);
 
     value_idx n_blocks = ceildiv(n_vertices, (value_idx)tpb);
-    write_levels_kernel<<<n_blocks, tpb, 0, stream>>>(children, levels.data(), n_vertices);
+    write_levels_kernel<<<n_blocks, tpb, 0, stream>>>(children, levels.data(),
+                                                      n_vertices);
     /**
      * Step 1: Find label roots:
      *
@@ -284,26 +272,27 @@ void extract_flattened_clusters(const raft::handle_t& handle,
     rmm::device_uvector<value_idx> label_roots(child_size, stream);
 
     value_idx children_cpy_start = n_edges - child_size;
-    raft::copy_async(label_roots.data(), children + children_cpy_start, child_size, stream);
+    raft::copy_async(label_roots.data(), children + children_cpy_start,
+                     child_size, stream);
 
-    thrust::sort(thrust_policy,
-                 label_roots.data(),
+    thrust::sort(thrust_policy, label_roots.data(),
                  label_roots.data() + (child_size),
                  thrust::greater<value_idx>());
 
     rmm::device_uvector<value_idx> tmp_labels(n_vertices, stream);
 
     // Init labels to -1
-    thrust::fill(thrust_policy, tmp_labels.data(), tmp_labels.data() + n_vertices, -1);
+    thrust::fill(thrust_policy, tmp_labels.data(),
+                 tmp_labels.data() + n_vertices, -1);
 
     // Write labels for cluster roots to "labels"
     thrust::counting_iterator<uint> first(0);
 
-    auto z_iter = thrust::make_zip_iterator(
-      thrust::make_tuple(first, label_roots.data() + (label_roots.size() - n_clusters)));
+    auto z_iter = thrust::make_zip_iterator(thrust::make_tuple(
+      first, label_roots.data() + (label_roots.size() - n_clusters)));
 
-    thrust::for_each(
-      thrust_policy, z_iter, z_iter + n_clusters, init_label_roots<value_idx>(tmp_labels.data()));
+    thrust::for_each(thrust_policy, z_iter, z_iter + n_clusters,
+                     init_label_roots<value_idx>(tmp_labels.data()));
 
     /**
      * Step 2: Propagate labels by having children iterate through their parents
@@ -313,8 +302,9 @@ void extract_flattened_clusters(const raft::handle_t& handle,
      */
     value_idx cut_level = (n_edges / 2) - (n_clusters - 1);
 
-    inherit_labels<<<n_blocks, tpb, 0, stream>>>(
-      children, levels.data(), n_leaves, tmp_labels.data(), cut_level, n_vertices);
+    inherit_labels<<<n_blocks, tpb, 0, stream>>>(children, levels.data(),
+                                                 n_leaves, tmp_labels.data(),
+                                                 cut_level, n_vertices);
 
     // copy tmp labels to actual labels
     raft::copy_async(labels, tmp_labels.data(), n_leaves, stream);
diff --git a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh
index 096f1c650f..7cf959dda6 100644
--- a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh
+++ b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh
@@ -37,17 +37,14 @@ namespace raft {
 namespace hierarchy {
 namespace detail {
 
-template <raft::hierarchy::LinkageDistance dist_type, typename value_idx, typename value_t>
+template <raft::hierarchy::LinkageDistance dist_type, typename value_idx,
+          typename value_t>
 struct distance_graph_impl {
-  void run(const raft::handle_t& handle,
-           const value_t* X,
-           size_t m,
-           size_t n,
+  void run(const raft::handle_t &handle, const value_t *X, size_t m, size_t n,
            raft::distance::DistanceType metric,
-           rmm::device_uvector<value_idx>& indptr,
-           rmm::device_uvector<value_idx>& indices,
-           rmm::device_uvector<value_t>& data,
-           int c);
+           rmm::device_uvector<value_idx> &indptr,
+           rmm::device_uvector<value_idx> &indices,
+           rmm::device_uvector<value_t> &data, int c);
 };
 
 /**
@@ -56,51 +53,50 @@ struct distance_graph_impl {
  * @tparam value_t
  */
 template <typename value_idx, typename value_t>
-struct distance_graph_impl<raft::hierarchy::LinkageDistance::KNN_GRAPH, value_idx, value_t> {
-  void run(const raft::handle_t& handle,
-           const value_t* X,
-           size_t m,
-           size_t n,
+struct distance_graph_impl<raft::hierarchy::LinkageDistance::KNN_GRAPH,
+                           value_idx, value_t> {
+  void run(const raft::handle_t &handle, const value_t *X, size_t m, size_t n,
            raft::distance::DistanceType metric,
-           rmm::device_uvector<value_idx>& indptr,
-           rmm::device_uvector<value_idx>& indices,
-           rmm::device_uvector<value_t>& data,
-           int c)
-  {
-    auto d_alloc     = handle.get_device_allocator();
-    auto stream      = handle.get_stream();
+           rmm::device_uvector<value_idx> &indptr,
+           rmm::device_uvector<value_idx> &indices,
+           rmm::device_uvector<value_t> &data, int c) {
+    auto d_alloc = handle.get_device_allocator();
+    auto stream = handle.get_stream();
     auto exec_policy = rmm::exec_policy(rmm::cuda_stream_view{stream});
 
     // Need to symmetrize knn into undirected graph
     raft::sparse::COO<value_t, value_idx> knn_graph_coo(d_alloc, stream);
 
-    raft::sparse::selection::knn_graph(handle, X, m, n, metric, knn_graph_coo, c);
+    raft::sparse::selection::knn_graph(handle, X, m, n, metric, knn_graph_coo,
+                                       c);
 
     indices.resize(knn_graph_coo.nnz, stream);
     data.resize(knn_graph_coo.nnz, stream);
 
     // self-loops get max distance
-    auto transform_in = thrust::make_zip_iterator(
-      thrust::make_tuple(knn_graph_coo.rows(), knn_graph_coo.cols(), knn_graph_coo.vals()));
-
-    thrust::transform(exec_policy,
-                      transform_in,
-                      transform_in + knn_graph_coo.nnz,
-                      knn_graph_coo.vals(),
-                      [=] __device__(const thrust::tuple<value_idx, value_idx, value_t>& tup) {
-                        bool self_loop = thrust::get<0>(tup) == thrust::get<1>(tup);
-                        return (self_loop * std::numeric_limits<value_t>::max()) +
-                               (!self_loop * thrust::get<2>(tup));
-                      });
-
-    raft::sparse::convert::sorted_coo_to_csr(
-      knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), m + 1, d_alloc, stream);
+    auto transform_in = thrust::make_zip_iterator(thrust::make_tuple(
+      knn_graph_coo.rows(), knn_graph_coo.cols(), knn_graph_coo.vals()));
+
+    thrust::transform(
+      exec_policy, transform_in, transform_in + knn_graph_coo.nnz,
+      knn_graph_coo.vals(),
+      [=] __device__(const thrust::tuple<value_idx, value_idx, value_t> &tup) {
+        bool self_loop = thrust::get<0>(tup) == thrust::get<1>(tup);
+        return (self_loop * std::numeric_limits<value_t>::max()) +
+               (!self_loop * thrust::get<2>(tup));
+      });
+
+    raft::sparse::convert::sorted_coo_to_csr(knn_graph_coo.rows(),
+                                             knn_graph_coo.nnz, indptr.data(),
+                                             m + 1, d_alloc, stream);
 
     // TODO: Wouldn't need to copy here if we could compute knn
     // graph directly on the device uvectors
     // ref: https://github.com/rapidsai/raft/issues/227
-    raft::copy_async(indices.data(), knn_graph_coo.cols(), knn_graph_coo.nnz, stream);
-    raft::copy_async(data.data(), knn_graph_coo.vals(), knn_graph_coo.nnz, stream);
+    raft::copy_async(indices.data(), knn_graph_coo.cols(), knn_graph_coo.nnz,
+                     stream);
+    raft::copy_async(data.data(), knn_graph_coo.vals(), knn_graph_coo.nnz,
+                     stream);
   }
 };
 
@@ -120,17 +116,13 @@ struct distance_graph_impl<raft::hierarchy::LinkageDistance::KNN_GRAPH, value_id
  * @param[out] c constant 'c' used for nearest neighbors-based distances
  *             which will guarantee k <= log(n) + c
  */
-template <typename value_idx, typename value_t, raft::hierarchy::LinkageDistance dist_type>
-void get_distance_graph(const raft::handle_t& handle,
-                        const value_t* X,
-                        size_t m,
-                        size_t n,
-                        raft::distance::DistanceType metric,
-                        rmm::device_uvector<value_idx>& indptr,
-                        rmm::device_uvector<value_idx>& indices,
-                        rmm::device_uvector<value_t>& data,
-                        int c)
-{
+template <typename value_idx, typename value_t,
+          raft::hierarchy::LinkageDistance dist_type>
+void get_distance_graph(const raft::handle_t &handle, const value_t *X,
+                        size_t m, size_t n, raft::distance::DistanceType metric,
+                        rmm::device_uvector<value_idx> &indptr,
+                        rmm::device_uvector<value_idx> &indices,
+                        rmm::device_uvector<value_t> &data, int c) {
   auto stream = handle.get_stream();
 
   indptr.resize(m + 1, stream);
diff --git a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh
index f939e87484..765a5ad77f 100644
--- a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh
+++ b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh
@@ -37,10 +37,9 @@ namespace hierarchy {
 namespace detail {
 
 template <typename value_idx, typename value_t>
-void merge_msts(raft::Graph_COO<value_idx, value_idx, value_t>& coo1,
-                raft::Graph_COO<value_idx, value_idx, value_t>& coo2,
-                cudaStream_t stream)
-{
+void merge_msts(raft::Graph_COO<value_idx, value_idx, value_t> &coo1,
+                raft::Graph_COO<value_idx, value_idx, value_t> &coo2,
+                cudaStream_t stream) {
   /** Add edges to existing mst **/
   int final_nnz = coo2.n_edges + coo1.n_edges;
 
@@ -51,9 +50,12 @@ void merge_msts(raft::Graph_COO<value_idx, value_idx, value_t>& coo1,
   /**
    * Construct final edge list
    */
-  raft::copy_async(coo1.src.data() + coo1.n_edges, coo2.src.data(), coo2.n_edges, stream);
-  raft::copy_async(coo1.dst.data() + coo1.n_edges, coo2.dst.data(), coo2.n_edges, stream);
-  raft::copy_async(coo1.weights.data() + coo1.n_edges, coo2.weights.data(), coo2.n_edges, stream);
+  raft::copy_async(coo1.src.data() + coo1.n_edges, coo2.src.data(),
+                   coo2.n_edges, stream);
+  raft::copy_async(coo1.dst.data() + coo1.n_edges, coo2.dst.data(),
+                   coo2.n_edges, stream);
+  raft::copy_async(coo1.weights.data() + coo1.n_edges, coo2.weights.data(),
+                   coo2.n_edges, stream);
 
   coo1.n_edges = final_nnz;
 }
@@ -72,18 +74,14 @@ void merge_msts(raft::Graph_COO<value_idx, value_idx, value_t>& coo1,
  * @return updated MST edge list
  */
 template <typename value_idx, typename value_t, typename red_op>
-void connect_knn_graph(
-  const raft::handle_t& handle,
-  const value_t* X,
-  raft::Graph_COO<value_idx, value_idx, value_t>& msf,
-  size_t m,
-  size_t n,
-  value_idx* color,
-  red_op reduction_op,
-  raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded)
-{
+void connect_knn_graph(const raft::handle_t &handle, const value_t *X,
+                       raft::Graph_COO<value_idx, value_idx, value_t> &msf,
+                       size_t m, size_t n, value_idx *color,
+                       red_op reduction_op,
+                       raft::distance::DistanceType metric =
+                         raft::distance::DistanceType::L2SqrtExpanded) {
   auto d_alloc = handle.get_device_allocator();
-  auto stream  = handle.get_stream();
+  auto stream = handle.get_stream();
 
   raft::sparse::COO<value_t, value_idx> connected_edges(d_alloc, stream);
 
@@ -91,21 +89,15 @@ void connect_knn_graph(
     handle, connected_edges, X, color, m, n, reduction_op);
 
   rmm::device_uvector<value_idx> indptr2(m + 1, stream);
-  raft::sparse::convert::sorted_coo_to_csr(
-    connected_edges.rows(), connected_edges.nnz, indptr2.data(), m + 1, d_alloc, stream);
+  raft::sparse::convert::sorted_coo_to_csr(connected_edges.rows(),
+                                           connected_edges.nnz, indptr2.data(),
+                                           m + 1, d_alloc, stream);
 
   // On the second call, we hand the MST the original colors
   // and the new set of edges and let it restart the optimization process
-  auto new_mst = raft::mst::mst<value_idx, value_idx, value_t, double>(handle,
-                                                                       indptr2.data(),
-                                                                       connected_edges.cols(),
-                                                                       connected_edges.vals(),
-                                                                       m,
-                                                                       connected_edges.nnz,
-                                                                       color,
-                                                                       stream,
-                                                                       false,
-                                                                       false);
+  auto new_mst = raft::mst::mst<value_idx, value_idx, value_t, double>(
+    handle, indptr2.data(), connected_edges.cols(), connected_edges.vals(), m,
+    connected_edges.nnz, color, stream, false, false);
 
   merge_msts<value_idx, value_t>(msf, new_mst, stream);
 }
@@ -135,35 +127,29 @@ void connect_knn_graph(
  *  argument is really just a safeguard against the potential for infinite loops.
  */
 template <typename value_idx, typename value_t, typename red_op>
-void build_sorted_mst(
-  const raft::handle_t& handle,
-  const value_t* X,
-  const value_idx* indptr,
-  const value_idx* indices,
-  const value_t* pw_dists,
-  size_t m,
-  size_t n,
-  value_idx* mst_src,
-  value_idx* mst_dst,
-  value_t* mst_weight,
-  value_idx* color,
-  size_t nnz,
-  red_op reduction_op,
-  raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded,
-  int max_iter                        = 10)
-{
+void build_sorted_mst(const raft::handle_t &handle, const value_t *X,
+                      const value_idx *indptr, const value_idx *indices,
+                      const value_t *pw_dists, size_t m, size_t n,
+                      value_idx *mst_src, value_idx *mst_dst,
+                      value_t *mst_weight, value_idx *color, size_t nnz,
+                      red_op reduction_op,
+                      raft::distance::DistanceType metric =
+                        raft::distance::DistanceType::L2SqrtExpanded,
+                      int max_iter = 10) {
   auto d_alloc = handle.get_device_allocator();
-  auto stream  = handle.get_stream();
+  auto stream = handle.get_stream();
 
   // We want to have MST initialize colors on first call.
   auto mst_coo = raft::mst::mst<value_idx, value_idx, value_t, double>(
-    handle, indptr, indices, pw_dists, (value_idx)m, nnz, color, stream, false, true);
+    handle, indptr, indices, pw_dists, (value_idx)m, nnz, color, stream, false,
+    true);
 
-  int iters        = 1;
+  int iters = 1;
   int n_components = linkage::get_n_components(color, m, d_alloc, stream);
 
   while (n_components > 1 && iters < max_iter) {
-    connect_knn_graph<value_idx, value_t>(handle, X, mst_coo, m, n, color, reduction_op);
+    connect_knn_graph<value_idx, value_t>(handle, X, mst_coo, m, n, color,
+                                          reduction_op);
 
     iters++;
 
@@ -190,8 +176,9 @@ void build_sorted_mst(
                " or increase 'max_iter'",
                max_iter);
 
-  raft::sparse::op::coo_sort_by_weight(
-    mst_coo.src.data(), mst_coo.dst.data(), mst_coo.weights.data(), mst_coo.n_edges, stream);
+  raft::sparse::op::coo_sort_by_weight(mst_coo.src.data(), mst_coo.dst.data(),
+                                       mst_coo.weights.data(), mst_coo.n_edges,
+                                       stream);
 
   raft::copy_async(mst_src, mst_coo.src.data(), mst_coo.n_edges, stream);
   raft::copy_async(mst_dst, mst_coo.dst.data(), mst_coo.n_edges, stream);
diff --git a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp
index fe9538120f..01a033945c 100644
--- a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp
+++ b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp
@@ -44,26 +44,20 @@ static const size_t EMPTY = 0;
  * @param[in] n number of columns in X
  * @param[in] metric distance metrix to use when constructing connectivities graph
  * @param[out] out struct containing output dendrogram and cluster assignments
- * @param[in] c a constant used when constructing connectivities from knn graph. Allows the indirect
- control
+ * @param[in] c a constant used when constructing connectivities from knn graph. Allows the indirect control
  *            of k. The algorithm will set `k = log(n) + c`
  * @param[in] n_clusters number of clusters to assign data samples
  */
-template <typename value_idx,
-          typename value_t,
+template <typename value_idx, typename value_t,
           LinkageDistance dist_type = LinkageDistance::KNN_GRAPH>
-void single_linkage(const raft::handle_t& handle,
-                    const value_t* X,
-                    size_t m,
-                    size_t n,
-                    raft::distance::DistanceType metric,
-                    linkage_output<value_idx, value_t>* out,
-                    int c,
-                    size_t n_clusters)
-{
-  ASSERT(n_clusters <= m, "n_clusters must be less than or equal to the number of data points");
-
-  auto stream  = handle.get_stream();
+void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m,
+                    size_t n, raft::distance::DistanceType metric,
+                    linkage_output<value_idx, value_t> *out, int c,
+                    size_t n_clusters) {
+  ASSERT(n_clusters <= m,
+         "n_clusters must be less than or equal to the number of data points");
+
+  auto stream = handle.get_stream();
   auto d_alloc = handle.get_device_allocator();
 
   rmm::device_uvector<value_idx> indptr(EMPTY, stream);
@@ -85,20 +79,10 @@ void single_linkage(const raft::handle_t& handle,
    */
   rmm::device_uvector<value_idx> color(m, stream);
   raft::linkage::FixConnectivitiesRedOp<value_idx, value_t> op(color.data(), m);
-  detail::build_sorted_mst<value_idx, value_t>(handle,
-                                               X,
-                                               indptr.data(),
-                                               indices.data(),
-                                               pw_dists.data(),
-                                               m,
-                                               n,
-                                               mst_rows.data(),
-                                               mst_cols.data(),
-                                               mst_data.data(),
-                                               color.data(),
-                                               indices.size(),
-                                               op,
-                                               metric);
+  detail::build_sorted_mst<value_idx, value_t>(
+    handle, X, indptr.data(), indices.data(), pw_dists.data(), m, n,
+    mst_rows.data(), mst_cols.data(), mst_data.data(), color.data(),
+    indices.size(), op, metric);
 
   pw_dists.release();
 
@@ -110,19 +94,15 @@ void single_linkage(const raft::handle_t& handle,
   rmm::device_uvector<value_t> out_delta(n_edges, stream);
   rmm::device_uvector<value_idx> out_size(n_edges, stream);
   // Create dendrogram
-  detail::build_dendrogram_host<value_idx, value_t>(handle,
-                                                    mst_rows.data(),
-                                                    mst_cols.data(),
-                                                    mst_data.data(),
-                                                    n_edges,
-                                                    out->children,
-                                                    out_delta.data(),
-                                                    out_size.data());
-  detail::extract_flattened_clusters(handle, out->labels, out->children, n_clusters, m);
-
-  out->m                      = m;
-  out->n_clusters             = n_clusters;
-  out->n_leaves               = m;
+  detail::build_dendrogram_host<value_idx, value_t>(
+    handle, mst_rows.data(), mst_cols.data(), mst_data.data(), n_edges,
+    out->children, out_delta.data(), out_size.data());
+  detail::extract_flattened_clusters(handle, out->labels, out->children,
+                                     n_clusters, m);
+
+  out->m = m;
+  out->n_clusters = n_clusters;
+  out->n_leaves = m;
   out->n_connected_components = 1;
 }
 
diff --git a/cpp/include/raft/sparse/linalg/add.cuh b/cpp/include/raft/sparse/linalg/add.cuh
index 01735a102d..47b1ba6e41 100644
--- a/cpp/include/raft/sparse/linalg/add.cuh
+++ b/cpp/include/raft/sparse/linalg/add.cuh
@@ -40,47 +40,40 @@ namespace sparse {
 namespace linalg {
 
 template <typename T, int TPB_X = 128>
-__global__ void csr_add_calc_row_counts_kernel(const int* a_ind,
-                                               const int* a_indptr,
-                                               const T* a_val,
-                                               int nnz1,
-                                               const int* b_ind,
-                                               const int* b_indptr,
-                                               const T* b_val,
-                                               int nnz2,
-                                               int m,
-                                               int* out_rowcounts)
-{
+__global__ void csr_add_calc_row_counts_kernel(
+  const int *a_ind, const int *a_indptr, const T *a_val, int nnz1,
+  const int *b_ind, const int *b_indptr, const T *b_val, int nnz2, int m,
+  int *out_rowcounts) {
   // loop through columns in each set of rows and
   // calculate number of unique cols across both rows
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
 
   if (row < m) {
     int a_start_idx = a_ind[row];
-    int a_stop_idx  = get_stop_idx(row, m, nnz1, a_ind);
+    int a_stop_idx = get_stop_idx(row, m, nnz1, a_ind);
 
     int b_start_idx = b_ind[row];
-    int b_stop_idx  = get_stop_idx(row, m, nnz2, b_ind);
+    int b_stop_idx = get_stop_idx(row, m, nnz2, b_ind);
 
     /**
-     * Union of columns within each row of A and B so that we can scan through
-     * them, adding their values together.
-     */
+         * Union of columns within each row of A and B so that we can scan through
+         * them, adding their values together.
+         */
     int max_size = (a_stop_idx - a_start_idx) + (b_stop_idx - b_start_idx);
 
-    int* arr        = new int[max_size];
+    int *arr = new int[max_size];
     int cur_arr_idx = 0;
     for (int j = a_start_idx; j < a_stop_idx; j++) {
       arr[cur_arr_idx] = a_indptr[j];
       cur_arr_idx++;
     }
 
-    int arr_size   = cur_arr_idx;
+    int arr_size = cur_arr_idx;
     int final_size = arr_size;
 
     for (int j = b_start_idx; j < b_stop_idx; j++) {
       int cur_col = b_indptr[j];
-      bool found  = false;
+      bool found = false;
       for (int k = 0; k < arr_size; k++) {
         if (arr[k] == cur_col) {
           found = true;
@@ -88,7 +81,9 @@ __global__ void csr_add_calc_row_counts_kernel(const int* a_ind,
         }
       }
 
-      if (!found) { final_size++; }
+      if (!found) {
+        final_size++;
+      }
     }
 
     out_rowcounts[row] = final_size;
@@ -99,19 +94,11 @@ __global__ void csr_add_calc_row_counts_kernel(const int* a_ind,
 }
 
 template <typename T, int TPB_X = 128>
-__global__ void csr_add_kernel(const int* a_ind,
-                               const int* a_indptr,
-                               const T* a_val,
-                               int nnz1,
-                               const int* b_ind,
-                               const int* b_indptr,
-                               const T* b_val,
-                               int nnz2,
-                               int m,
-                               int* out_ind,
-                               int* out_indptr,
-                               T* out_val)
-{
+__global__ void csr_add_kernel(const int *a_ind, const int *a_indptr,
+                               const T *a_val, int nnz1, const int *b_ind,
+                               const int *b_indptr, const T *b_val, int nnz2,
+                               int m, int *out_ind, int *out_indptr,
+                               T *out_val) {
   // 1 thread per row
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
 
@@ -122,21 +109,21 @@ __global__ void csr_add_kernel(const int* a_ind,
     int a_stop_idx = get_stop_idx(row, m, nnz1, a_ind);
 
     int b_start_idx = b_ind[row];
-    int b_stop_idx  = get_stop_idx(row, m, nnz2, b_ind);
+    int b_stop_idx = get_stop_idx(row, m, nnz2, b_ind);
 
     int o_idx = out_ind[row];
 
     int cur_o_idx = o_idx;
     for (int j = a_start_idx; j < a_stop_idx; j++) {
       out_indptr[cur_o_idx] = a_indptr[j];
-      out_val[cur_o_idx]    = a_val[j];
+      out_val[cur_o_idx] = a_val[j];
       cur_o_idx++;
     }
 
     int arr_size = cur_o_idx - o_idx;
     for (int j = b_start_idx; j < b_stop_idx; j++) {
       int cur_col = b_indptr[j];
-      bool found  = false;
+      bool found = false;
       for (int k = o_idx; k < o_idx + arr_size; k++) {
         // If we found a match, sum the two values
         if (out_indptr[k] == cur_col) {
@@ -149,7 +136,7 @@ __global__ void csr_add_kernel(const int* a_ind,
       // if we didn't find a match, add the value for b
       if (!found) {
         out_indptr[o_idx + arr_size] = cur_col;
-        out_val[o_idx + arr_size]    = b_val[j];
+        out_val[o_idx + arr_size] = b_val[j];
         arr_size++;
       }
     }
@@ -173,36 +160,32 @@ __global__ void csr_add_kernel(const int* a_ind,
  * @param stream: cuda stream to use
  */
 template <typename T, int TPB_X = 128>
-size_t csr_add_calc_inds(const int* a_ind,
-                         const int* a_indptr,
-                         const T* a_val,
-                         int nnz1,
-                         const int* b_ind,
-                         const int* b_indptr,
-                         const T* b_val,
-                         int nnz2,
-                         int m,
-                         int* out_ind,
+size_t csr_add_calc_inds(const int *a_ind, const int *a_indptr, const T *a_val,
+                         int nnz1, const int *b_ind, const int *b_indptr,
+                         const T *b_val, int nnz2, int m, int *out_ind,
                          std::shared_ptr<raft::mr::device::allocator> d_alloc,
-                         cudaStream_t stream)
-{
+                         cudaStream_t stream) {
   dim3 grid(raft::ceildiv(m, TPB_X), 1, 1);
   dim3 blk(TPB_X, 1, 1);
 
   raft::mr::device::buffer<int> row_counts(d_alloc, stream, m + 1);
-  CUDA_CHECK(cudaMemsetAsync(row_counts.data(), 0, (m + 1) * sizeof(int), stream));
+  CUDA_CHECK(
+    cudaMemsetAsync(row_counts.data(), 0, (m + 1) * sizeof(int), stream));
 
-  csr_add_calc_row_counts_kernel<T, TPB_X><<<grid, blk, 0, stream>>>(
-    a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, b_val, nnz2, m, row_counts.data());
+  csr_add_calc_row_counts_kernel<T, TPB_X>
+    <<<grid, blk, 0, stream>>>(a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr,
+                               b_val, nnz2, m, row_counts.data());
 
   int cnnz = 0;
   raft::update_host(&cnnz, row_counts.data() + m, 1, stream);
   CUDA_CHECK(cudaStreamSynchronize(stream));
 
   // create csr compressed row index from row counts
-  thrust::device_ptr<int> row_counts_d = thrust::device_pointer_cast(row_counts.data());
-  thrust::device_ptr<int> c_ind_d      = thrust::device_pointer_cast(out_ind);
-  exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m, c_ind_d);
+  thrust::device_ptr<int> row_counts_d =
+    thrust::device_pointer_cast(row_counts.data());
+  thrust::device_ptr<int> c_ind_d = thrust::device_pointer_cast(out_ind);
+  exclusive_scan(thrust::cuda::par.on(stream), row_counts_d, row_counts_d + m,
+                 c_ind_d);
 
   return cnnz;
 }
@@ -225,25 +208,16 @@ size_t csr_add_calc_inds(const int* a_ind,
  * @param stream: cuda stream to use
  */
 template <typename T, int TPB_X = 128>
-void csr_add_finalize(const int* a_ind,
-                      const int* a_indptr,
-                      const T* a_val,
-                      int nnz1,
-                      const int* b_ind,
-                      const int* b_indptr,
-                      const T* b_val,
-                      int nnz2,
-                      int m,
-                      int* c_ind,
-                      int* c_indptr,
-                      T* c_val,
-                      cudaStream_t stream)
-{
+void csr_add_finalize(const int *a_ind, const int *a_indptr, const T *a_val,
+                      int nnz1, const int *b_ind, const int *b_indptr,
+                      const T *b_val, int nnz2, int m, int *c_ind,
+                      int *c_indptr, T *c_val, cudaStream_t stream) {
   dim3 grid(raft::ceildiv(m, TPB_X), 1, 1);
   dim3 blk(TPB_X, 1, 1);
 
-  csr_add_kernel<T, TPB_X><<<grid, blk, 0, stream>>>(
-    a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, b_val, nnz2, m, c_ind, c_indptr, c_val);
+  csr_add_kernel<T, TPB_X>
+    <<<grid, blk, 0, stream>>>(a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr,
+                               b_val, nnz2, m, c_ind, c_indptr, c_val);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
diff --git a/cpp/include/raft/sparse/linalg/degree.cuh b/cpp/include/raft/sparse/linalg/degree.cuh
index 77a9445ab1..9bd322c90a 100644
--- a/cpp/include/raft/sparse/linalg/degree.cuh
+++ b/cpp/include/raft/sparse/linalg/degree.cuh
@@ -44,10 +44,11 @@ namespace linalg {
  * @param results array to place results
  */
 template <int TPB_X = 64>
-__global__ void coo_degree_kernel(const int* rows, int nnz, int* results)
-{
+__global__ void coo_degree_kernel(const int *rows, int nnz, int *results) {
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
-  if (row < nnz) { raft::myAtomicAdd(results + rows[row], 1); }
+  if (row < nnz) {
+    raft::myAtomicAdd(results + rows[row], 1);
+  }
 }
 
 /**
@@ -59,8 +60,7 @@ __global__ void coo_degree_kernel(const int* rows, int nnz, int* results)
  * @param stream: cuda stream to use
  */
 template <int TPB_X = 64>
-void coo_degree(const int* rows, int nnz, int* results, cudaStream_t stream)
-{
+void coo_degree(const int *rows, int nnz, int *results, cudaStream_t stream) {
   dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1);
   dim3 blk_rc(TPB_X, 1, 1);
 
@@ -77,28 +77,31 @@ void coo_degree(const int* rows, int nnz, int* results, cudaStream_t stream)
  * @param stream: cuda stream to use
  */
 template <int TPB_X = 64, typename T>
-void coo_degree(COO<T>* in, int* results, cudaStream_t stream)
-{
+void coo_degree(COO<T> *in, int *results, cudaStream_t stream) {
   dim3 grid_rc(raft::ceildiv(in->nnz, TPB_X), 1, 1);
   dim3 blk_rc(TPB_X, 1, 1);
 
-  coo_degree_kernel<TPB_X><<<grid_rc, blk_rc, 0, stream>>>(in->rows(), in->nnz, results);
+  coo_degree_kernel<TPB_X>
+    <<<grid_rc, blk_rc, 0, stream>>>(in->rows(), in->nnz, results);
   CUDA_CHECK(cudaGetLastError());
 }
 
 template <int TPB_X = 64, typename T>
-__global__ void coo_degree_nz_kernel(const int* rows, const T* vals, int nnz, int* results)
-{
+__global__ void coo_degree_nz_kernel(const int *rows, const T *vals, int nnz,
+                                     int *results) {
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
-  if (row < nnz && vals[row] != 0.0) { raft::myAtomicAdd(results + rows[row], 1); }
+  if (row < nnz && vals[row] != 0.0) {
+    raft::myAtomicAdd(results + rows[row], 1);
+  }
 }
 
 template <int TPB_X = 64, typename T>
-__global__ void coo_degree_scalar_kernel(
-  const int* rows, const T* vals, int nnz, T scalar, int* results)
-{
+__global__ void coo_degree_scalar_kernel(const int *rows, const T *vals,
+                                         int nnz, T scalar, int *results) {
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
-  if (row < nnz && vals[row] != scalar) { raft::myAtomicAdd(results + rows[row], 1); }
+  if (row < nnz && vals[row] != scalar) {
+    raft::myAtomicAdd(results + rows[row], 1);
+  }
 }
 
 /**
@@ -111,12 +114,12 @@ __global__ void coo_degree_scalar_kernel(
  * @param stream: cuda stream to use
  */
 template <int TPB_X = 64, typename T>
-void coo_degree_scalar(COO<T>* in, T scalar, int* results, cudaStream_t stream)
-{
+void coo_degree_scalar(COO<T> *in, T scalar, int *results,
+                       cudaStream_t stream) {
   dim3 grid_rc(raft::ceildiv(in->nnz, TPB_X), 1, 1);
   dim3 blk_rc(TPB_X, 1, 1);
-  coo_degree_scalar_kernel<TPB_X, T>
-    <<<grid_rc, blk_rc, 0, stream>>>(in->rows(), in->vals(), in->nnz, scalar, results);
+  coo_degree_scalar_kernel<TPB_X, T><<<grid_rc, blk_rc, 0, stream>>>(
+    in->rows(), in->vals(), in->nnz, scalar, results);
   CUDA_CHECK(cudaGetLastError());
 }
 
@@ -132,9 +135,8 @@ void coo_degree_scalar(COO<T>* in, T scalar, int* results, cudaStream_t stream)
  * @param stream: cuda stream to use
  */
 template <int TPB_X = 64, typename T>
-void coo_degree_scalar(
-  const int* rows, const T* vals, int nnz, T scalar, int* results, cudaStream_t stream = 0)
-{
+void coo_degree_scalar(const int *rows, const T *vals, int nnz, T scalar,
+                       int *results, cudaStream_t stream = 0) {
   dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1);
   dim3 blk_rc(TPB_X, 1, 1);
   coo_degree_scalar_kernel<TPB_X, T>
@@ -152,11 +154,12 @@ void coo_degree_scalar(
  * @param stream: cuda stream to use
  */
 template <int TPB_X = 64, typename T>
-void coo_degree_nz(const int* rows, const T* vals, int nnz, int* results, cudaStream_t stream)
-{
+void coo_degree_nz(const int *rows, const T *vals, int nnz, int *results,
+                   cudaStream_t stream) {
   dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1);
   dim3 blk_rc(TPB_X, 1, 1);
-  coo_degree_nz_kernel<TPB_X, T><<<grid_rc, blk_rc, 0, stream>>>(rows, vals, nnz, results);
+  coo_degree_nz_kernel<TPB_X, T>
+    <<<grid_rc, blk_rc, 0, stream>>>(rows, vals, nnz, results);
 }
 
 /**
@@ -168,8 +171,7 @@ void coo_degree_nz(const int* rows, const T* vals, int nnz, int* results, cudaSt
  * @param stream: cuda stream to use
  */
 template <int TPB_X = 64, typename T>
-void coo_degree_nz(COO<T>* in, int* results, cudaStream_t stream)
-{
+void coo_degree_nz(COO<T> *in, int *results, cudaStream_t stream) {
   dim3 grid_rc(raft::ceildiv(in->nnz, TPB_X), 1, 1);
   dim3 blk_rc(TPB_X, 1, 1);
 
diff --git a/cpp/include/raft/sparse/linalg/norm.cuh b/cpp/include/raft/sparse/linalg/norm.cuh
index 59dc5ff3e4..bfcd3fd592 100644
--- a/cpp/include/raft/sparse/linalg/norm.cuh
+++ b/cpp/include/raft/sparse/linalg/norm.cuh
@@ -41,12 +41,10 @@ __global__ void csr_row_normalize_l1_kernel(
   // @TODO: This can be done much more parallel by
   // having threads in a warp compute the sum in parallel
   // over each row and then divide the values in parallel.
-  const int* ia,  // csr row ex_scan (sorted by row)
-  const T* vals,
-  int nnz,  // array of values and number of non-zeros
-  int m,    // num rows in csr
-  T* result)
-{  // output array
+  const int *ia,           // csr row ex_scan (sorted by row)
+  const T *vals, int nnz,  // array of values and number of non-zeros
+  int m,                   // num rows in csr
+  T *result) {             // output array
 
   // row-based matrix 1 thread per row
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
@@ -54,7 +52,7 @@ __global__ void csr_row_normalize_l1_kernel(
   // sum all vals_arr for row and divide each val by sum
   if (row < m) {
     int start_idx = ia[row];
-    int stop_idx  = 0;
+    int stop_idx = 0;
     if (row < m - 1) {
       stop_idx = ia[row + 1];
     } else
@@ -67,7 +65,7 @@ __global__ void csr_row_normalize_l1_kernel(
 
     for (int j = start_idx; j < stop_idx; j++) {
       if (sum != 0.0) {
-        T val     = vals[j];
+        T val = vals[j];
         result[j] = val / sum;
       } else {
         result[j] = 0.0;
@@ -87,18 +85,18 @@ __global__ void csr_row_normalize_l1_kernel(
  * @param stream: cuda stream to use
  */
 template <int TPB_X = 64, typename T>
-void csr_row_normalize_l1(const int* ia,  // csr row ex_scan (sorted by row)
-                          const T* vals,
+void csr_row_normalize_l1(const int *ia,  // csr row ex_scan (sorted by row)
+                          const T *vals,
                           int nnz,  // array of values and number of non-zeros
                           int m,    // num rows in csr
-                          T* result,
-                          cudaStream_t stream)
-{  // output array
+                          T *result,
+                          cudaStream_t stream) {  // output array
 
   dim3 grid(raft::ceildiv(m, TPB_X), 1, 1);
   dim3 blk(TPB_X, 1, 1);
 
-  csr_row_normalize_l1_kernel<TPB_X, T><<<grid, blk, 0, stream>>>(ia, vals, nnz, m, result);
+  csr_row_normalize_l1_kernel<TPB_X, T>
+    <<<grid, blk, 0, stream>>>(ia, vals, nnz, m, result);
   CUDA_CHECK(cudaGetLastError());
 }
 
@@ -107,12 +105,10 @@ __global__ void csr_row_normalize_max_kernel(
   // @TODO: This can be done much more parallel by
   // having threads in a warp compute the sum in parallel
   // over each row and then divide the values in parallel.
-  const int* ia,  // csr row ind array (sorted by row)
-  const T* vals,
-  int nnz,  // array of values and number of non-zeros
-  int m,    // num total rows in csr
-  T* result)
-{  // output array
+  const int *ia,           // csr row ind array (sorted by row)
+  const T *vals, int nnz,  // array of values and number of non-zeros
+  int m,                   // num total rows in csr
+  T *result) {             // output array
 
   // row-based matrix 1 thread per row
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
@@ -120,7 +116,7 @@ __global__ void csr_row_normalize_max_kernel(
   // find max across columns and divide
   if (row < m) {
     int start_idx = ia[row];
-    int stop_idx  = 0;
+    int stop_idx = 0;
     if (row < m - 1) {
       stop_idx = ia[row + 1];
     } else
@@ -134,7 +130,7 @@ __global__ void csr_row_normalize_max_kernel(
     // divide nonzeros in current row by max
     for (int j = start_idx; j < stop_idx; j++) {
       if (max != 0.0 && max > std::numeric_limits<float>::min()) {
-        T val     = vals[j];
+        T val = vals[j];
         result[j] = val / max;
       } else {
         result[j] = 0.0;
@@ -155,17 +151,16 @@ __global__ void csr_row_normalize_max_kernel(
  */
 
 template <int TPB_X = 64, typename T>
-void csr_row_normalize_max(const int* ia,  // csr row ind array (sorted by row)
-                           const T* vals,
+void csr_row_normalize_max(const int *ia,  // csr row ind array (sorted by row)
+                           const T *vals,
                            int nnz,  // array of values and number of non-zeros
                            int m,    // num total rows in csr
-                           T* result,
-                           cudaStream_t stream)
-{
+                           T *result, cudaStream_t stream) {
   dim3 grid(raft::ceildiv(m, TPB_X), 1, 1);
   dim3 blk(TPB_X, 1, 1);
 
-  csr_row_normalize_max_kernel<TPB_X, T><<<grid, blk, 0, stream>>>(ia, vals, nnz, m, result);
+  csr_row_normalize_max_kernel<TPB_X, T>
+    <<<grid, blk, 0, stream>>>(ia, vals, nnz, m, result);
   CUDA_CHECK(cudaGetLastError());
 }
 
diff --git a/cpp/include/raft/sparse/linalg/spectral.cuh b/cpp/include/raft/sparse/linalg/spectral.cuh
index 3b609d994f..15302f3b74 100644
--- a/cpp/include/raft/sparse/linalg/spectral.cuh
+++ b/cpp/include/raft/sparse/linalg/spectral.cuh
@@ -31,23 +31,16 @@ namespace sparse {
 namespace spectral {
 
 template <typename T>
-void fit_embedding(const raft::handle_t& handle,
-                   int* rows,
-                   int* cols,
-                   T* vals,
-                   int nnz,
-                   int n,
-                   int n_components,
-                   T* out,
-                   unsigned long long seed = 1234567)
-{
-  auto stream  = handle.get_stream();
+void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals,
+                   int nnz, int n, int n_components, T *out,
+                   unsigned long long seed = 1234567) {
+  auto stream = handle.get_stream();
   auto d_alloc = handle.get_device_allocator();
   raft::mr::device::buffer<int> src_offsets(d_alloc, stream, n + 1);
   raft::mr::device::buffer<int> dst_cols(d_alloc, stream, nnz);
   raft::mr::device::buffer<T> dst_vals(d_alloc, stream, nnz);
-  convert::coo_to_csr(
-    handle, rows, cols, vals, nnz, n, src_offsets.data(), dst_cols.data(), dst_vals.data());
+  convert::coo_to_csr(handle, rows, cols, vals, nnz, n, src_offsets.data(),
+                      dst_cols.data(), dst_vals.data());
 
   raft::mr::device::buffer<T> eigVals(d_alloc, stream, n_components + 1);
   raft::mr::device::buffer<T> eigVecs(d_alloc, stream, n * (n_components + 1));
@@ -61,53 +54,48 @@ void fit_embedding(const raft::handle_t& handle,
   using index_type = int;
   using value_type = T;
 
-  index_type* ro = src_offsets.data();
-  index_type* ci = dst_cols.data();
-  value_type* vs = dst_vals.data();
+  index_type *ro = src_offsets.data();
+  index_type *ci = dst_cols.data();
+  value_type *vs = dst_vals.data();
 
-  raft::matrix::sparse_matrix_t<index_type, value_type> const r_csr_m{handle, ro, ci, vs, n, nnz};
+  raft::matrix::sparse_matrix_t<index_type, value_type> const r_csr_m{
+    handle, ro, ci, vs, n, nnz};
 
-  index_type neigvs         = n_components + 1;
-  index_type maxiter        = 4000;  // default reset value (when set to 0);
-  value_type tol            = 0.01;
-  index_type restart_iter   = 15 + neigvs;  // what cugraph is using
-  auto t_exe_p              = thrust::cuda::par.on(stream);
+  index_type neigvs = n_components + 1;
+  index_type maxiter = 4000;  //default reset value (when set to 0);
+  value_type tol = 0.01;
+  index_type restart_iter = 15 + neigvs;  //what cugraph is using
+  auto t_exe_p = thrust::cuda::par.on(stream);
   using thrust_exe_policy_t = decltype(t_exe_p);
 
-  raft::eigen_solver_config_t<index_type, value_type> cfg{neigvs, maxiter, restart_iter, tol};
+  raft::eigen_solver_config_t<index_type, value_type> cfg{neigvs, maxiter,
+                                                          restart_iter, tol};
 
   cfg.seed = seed;
 
   raft::lanczos_solver_t<index_type, value_type> eig_solver{cfg};
 
-  // cluster computation here is irrelevant,
-  // hence define a no-op such solver to
-  // feed partition():
+  //cluster computation here is irrelevant,
+  //hence define a no-op such solver to
+  //feed partition():
   //
   struct no_op_cluster_solver_t {
     using index_type_t = index_type;
-    using size_type_t  = index_type;
+    using size_type_t = index_type;
     using value_type_t = value_type;
 
-    std::pair<value_type_t, index_type_t> solve(handle_t const& handle,
-                                                thrust_exe_policy_t t_exe_policy,
-                                                size_type_t n_obs_vecs,
-                                                size_type_t dim,
-                                                value_type_t const* __restrict__ obs,
-                                                index_type_t* __restrict__ codes) const
-    {
+    std::pair<value_type_t, index_type_t> solve(
+      handle_t const &handle, thrust_exe_policy_t t_exe_policy,
+      size_type_t n_obs_vecs, size_type_t dim,
+      value_type_t const *__restrict__ obs,
+      index_type_t *__restrict__ codes) const {
       return std::make_pair<value_type_t, index_type_t>(0, 0);
     }
   };
 
-  raft::spectral::partition(handle,
-                            t_exe_p,
-                            r_csr_m,
-                            eig_solver,
-                            no_op_cluster_solver_t{},
-                            labels.data(),
-                            eigVals.data(),
-                            eigVecs.data());
+  raft::spectral::partition(handle, t_exe_p, r_csr_m, eig_solver,
+                            no_op_cluster_solver_t{}, labels.data(),
+                            eigVals.data(), eigVecs.data());
 
   raft::copy<T>(out, eigVecs.data() + n, n * n_components, stream);
 
diff --git a/cpp/include/raft/sparse/linalg/symmetrize.cuh b/cpp/include/raft/sparse/linalg/symmetrize.cuh
index b9426c284a..5c2c78f0c3 100644
--- a/cpp/include/raft/sparse/linalg/symmetrize.cuh
+++ b/cpp/include/raft/sparse/linalg/symmetrize.cuh
@@ -49,34 +49,26 @@ namespace linalg {
 // TODO: value_idx param needs to be used for this once FAISS is updated to use float32
 // for indices so that the index types can be uniform
 template <int TPB_X = 128, typename T, typename Lambda>
-__global__ void coo_symmetrize_kernel(int* row_ind,
-                                      int* rows,
-                                      int* cols,
-                                      T* vals,
-                                      int* orows,
-                                      int* ocols,
-                                      T* ovals,
-                                      int n,
-                                      int cnnz,
-                                      Lambda reduction_op)
-{
+__global__ void coo_symmetrize_kernel(int *row_ind, int *rows, int *cols,
+                                      T *vals, int *orows, int *ocols, T *ovals,
+                                      int n, int cnnz, Lambda reduction_op) {
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
 
   if (row < n) {
     int start_idx = row_ind[row];  // each thread processes one row
-    int stop_idx  = get_stop_idx(row, n, cnnz, row_ind);
+    int stop_idx = get_stop_idx(row, n, cnnz, row_ind);
 
-    int row_nnz       = 0;
+    int row_nnz = 0;
     int out_start_idx = start_idx * 2;
 
     for (int idx = 0; idx < stop_idx - start_idx; idx++) {
       int cur_row = rows[idx + start_idx];
       int cur_col = cols[idx + start_idx];
-      T cur_val   = vals[idx + start_idx];
+      T cur_val = vals[idx + start_idx];
 
       int lookup_row = cur_col;
-      int t_start    = row_ind[lookup_row];  // Start at
-      int t_stop     = get_stop_idx(lookup_row, n, cnnz, row_ind);
+      int t_start = row_ind[lookup_row];  // Start at
+      int t_stop = get_stop_idx(lookup_row, n, cnnz, row_ind);
 
       T transpose = 0.0;
 
@@ -87,7 +79,7 @@ __global__ void coo_symmetrize_kernel(int* row_ind,
         // done in a different thread.
         if (cols[t_idx] == cur_row && rows[t_idx] == cur_col) {
           // If it exists already, set transposed value to existing value
-          transpose   = vals[t_idx];
+          transpose = vals[t_idx];
           found_match = true;
           break;
         }
@@ -134,12 +126,10 @@ __global__ void coo_symmetrize_kernel(int* row_ind,
  * @param stream: cuda stream to use
  */
 template <int TPB_X = 128, typename T, typename Lambda>
-void coo_symmetrize(COO<T>* in,
-                    COO<T>* out,
+void coo_symmetrize(COO<T> *in, COO<T> *out,
                     Lambda reduction_op,  // two-argument reducer
                     std::shared_ptr<raft::mr::device::allocator> d_alloc,
-                    cudaStream_t stream)
-{
+                    cudaStream_t stream) {
   dim3 grid(raft::ceildiv(in->n_rows, TPB_X), 1, 1);
   dim3 blk(TPB_X, 1, 1);
 
@@ -151,16 +141,9 @@ void coo_symmetrize(COO<T>* in,
 
   out->allocate(in->nnz * 2, in->n_rows, in->n_cols, true, stream);
 
-  coo_symmetrize_kernel<TPB_X, T><<<grid, blk, 0, stream>>>(in_row_ind.data(),
-                                                            in->rows(),
-                                                            in->cols(),
-                                                            in->vals(),
-                                                            out->rows(),
-                                                            out->cols(),
-                                                            out->vals(),
-                                                            in->n_rows,
-                                                            in->nnz,
-                                                            reduction_op);
+  coo_symmetrize_kernel<TPB_X, T><<<grid, blk, 0, stream>>>(
+    in_row_ind.data(), in->rows(), in->cols(), in->vals(), out->rows(),
+    out->cols(), out->vals(), in->n_rows, in->nnz, reduction_op);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -176,15 +159,14 @@ void coo_symmetrize(COO<T>* in,
  * @param row_sizes2: Input empty row sum 2 array(n) for faster reduction
  */
 template <typename value_idx = int64_t, typename value_t = float>
-__global__ static void symmetric_find_size(const value_t* restrict data,
-                                           const value_idx* restrict indices,
-                                           const value_idx n,
-                                           const int k,
-                                           value_idx* restrict row_sizes,
-                                           value_idx* restrict row_sizes2)
-{
+__global__ static void symmetric_find_size(const value_t *restrict data,
+                                           const value_idx *restrict indices,
+                                           const value_idx n, const int k,
+                                           value_idx *restrict row_sizes,
+                                           value_idx *restrict row_sizes2) {
   const auto row = blockIdx.x * blockDim.x + threadIdx.x;  // for every row
-  const auto j   = blockIdx.y * blockDim.y + threadIdx.y;  // for every item in row
+  const auto j =
+    blockIdx.y * blockDim.y + threadIdx.y;  // for every item in row
   if (row >= n || j >= k) return;
 
   const auto col = indices[row * k + j];
@@ -204,11 +186,9 @@ __global__ static void symmetric_find_size(const value_t* restrict data,
  * @param row_sizes2: Input row sum 2 array(n) for faster reduction
  */
 template <typename value_idx>
-__global__ static void reduce_find_size(const value_idx n,
-                                        const int k,
-                                        value_idx* restrict row_sizes,
-                                        const value_idx* restrict row_sizes2)
-{
+__global__ static void reduce_find_size(const value_idx n, const int k,
+                                        value_idx *restrict row_sizes,
+                                        const value_idx *restrict row_sizes2) {
   const auto i = (blockIdx.x * blockDim.x) + threadIdx.x;
   if (i >= n) return;
   row_sizes[i] += (row_sizes2[i] + k);
@@ -229,21 +209,20 @@ __global__ static void reduce_find_size(const value_idx n,
  * @param k: Number of n_neighbors
  */
 template <typename value_idx = int64_t, typename value_t = float>
-__global__ static void symmetric_sum(value_idx* restrict edges,
-                                     const value_t* restrict data,
-                                     const value_idx* restrict indices,
-                                     value_t* restrict VAL,
-                                     value_idx* restrict COL,
-                                     value_idx* restrict ROW,
-                                     const value_idx n,
-                                     const int k)
-{
+__global__ static void symmetric_sum(value_idx *restrict edges,
+                                     const value_t *restrict data,
+                                     const value_idx *restrict indices,
+                                     value_t *restrict VAL,
+                                     value_idx *restrict COL,
+                                     value_idx *restrict ROW, const value_idx n,
+                                     const int k) {
   const auto row = blockIdx.x * blockDim.x + threadIdx.x;  // for every row
-  const auto j   = blockIdx.y * blockDim.y + threadIdx.y;  // for every item in row
+  const auto j =
+    blockIdx.y * blockDim.y + threadIdx.y;  // for every item in row
   if (row >= n || j >= k) return;
 
-  const auto col       = indices[row * k + j];
-  const auto original  = atomicAdd(&edges[row], value_idx(1));
+  const auto col = indices[row * k + j];
+  const auto original = atomicAdd(&edges[row], value_idx(1));
   const auto transpose = atomicAdd(&edges[col], value_idx(1));
 
   VAL[transpose] = VAL[original] = data[row * k + j];
@@ -273,26 +252,26 @@ __global__ static void symmetric_sum(value_idx* restrict edges,
  * @param stream: Input cuda stream
  * @param d_alloc device allocator for temporary buffers
  */
-template <typename value_idx = int64_t, typename value_t = float, int TPB_X = 32, int TPB_Y = 32>
-void from_knn_symmetrize_matrix(const value_idx* restrict knn_indices,
-                                const value_t* restrict knn_dists,
-                                const value_idx n,
-                                const int k,
-                                COO<value_t, value_idx>* out,
-                                cudaStream_t stream,
-                                std::shared_ptr<raft::mr::device::allocator> d_alloc)
-{
+template <typename value_idx = int64_t, typename value_t = float,
+          int TPB_X = 32, int TPB_Y = 32>
+void from_knn_symmetrize_matrix(
+  const value_idx *restrict knn_indices, const value_t *restrict knn_dists,
+  const value_idx n, const int k, COO<value_t, value_idx> *out,
+  cudaStream_t stream, std::shared_ptr<raft::mr::device::allocator> d_alloc) {
   // (1) Find how much space needed in each row
   // We look through all datapoints and increment the count for each row.
   const dim3 threadsPerBlock(TPB_X, TPB_Y);
-  const dim3 numBlocks(raft::ceildiv(n, (value_idx)TPB_X), raft::ceildiv(k, TPB_Y));
+  const dim3 numBlocks(raft::ceildiv(n, (value_idx)TPB_X),
+                       raft::ceildiv(k, TPB_Y));
 
   // Notice n+1 since we can reuse these arrays for transpose_edges, original_edges in step (4)
   raft::mr::device::buffer<value_idx> row_sizes(d_alloc, stream, n);
-  CUDA_CHECK(cudaMemsetAsync(row_sizes.data(), 0, sizeof(value_idx) * n, stream));
+  CUDA_CHECK(
+    cudaMemsetAsync(row_sizes.data(), 0, sizeof(value_idx) * n, stream));
 
   raft::mr::device::buffer<value_idx> row_sizes2(d_alloc, stream, n);
-  CUDA_CHECK(cudaMemsetAsync(row_sizes2.data(), 0, sizeof(value_idx) * n, stream));
+  CUDA_CHECK(
+    cudaMemsetAsync(row_sizes2.data(), 0, sizeof(value_idx) * n, stream));
 
   symmetric_find_size<<<numBlocks, threadsPerBlock, 0, stream>>>(
     knn_dists, knn_indices, n, k, row_sizes.data(), row_sizes2.data());
@@ -313,12 +292,14 @@ void from_knn_symmetrize_matrix(const value_idx* restrict knn_indices,
   // This mirrors CSR matrix's row Pointer, were maximum bounds for each row
   // are calculated as the cumulative rolling sum of the previous rows.
   // Notice reusing old row_sizes2 memory
-  value_idx* edges                          = row_sizes2.data();
-  thrust::device_ptr<value_idx> __edges     = thrust::device_pointer_cast(edges);
-  thrust::device_ptr<value_idx> __row_sizes = thrust::device_pointer_cast(row_sizes.data());
+  value_idx *edges = row_sizes2.data();
+  thrust::device_ptr<value_idx> __edges = thrust::device_pointer_cast(edges);
+  thrust::device_ptr<value_idx> __row_sizes =
+    thrust::device_pointer_cast(row_sizes.data());
 
   // Rolling cumulative sum
-  thrust::exclusive_scan(thrust::cuda::par.on(stream), __row_sizes, __row_sizes + n, __edges);
+  thrust::exclusive_scan(thrust::cuda::par.on(stream), __row_sizes,
+                         __row_sizes + n, __edges);
 
   // (5) Perform final data + data.T operation in tandem with memcpying
   symmetric_sum<<<numBlocks, threadsPerBlock, 0, stream>>>(
@@ -330,17 +311,11 @@ void from_knn_symmetrize_matrix(const value_idx* restrict knn_indices,
  * Symmetrizes a COO matrix
  */
 template <typename value_idx, typename value_t>
-void symmetrize(const raft::handle_t& handle,
-                const value_idx* rows,
-                const value_idx* cols,
-                const value_t* vals,
-                size_t m,
-                size_t n,
-                size_t nnz,
-                raft::sparse::COO<value_t, value_idx>& out)
-{
+void symmetrize(const raft::handle_t &handle, const value_idx *rows,
+                const value_idx *cols, const value_t *vals, size_t m, size_t n,
+                size_t nnz, raft::sparse::COO<value_t, value_idx> &out) {
   auto d_alloc = handle.get_device_allocator();
-  auto stream  = handle.get_stream();
+  auto stream = handle.get_stream();
 
   // copy rows to cols and cols to rows
   rmm::device_uvector<value_idx> symm_rows(nnz * 2, stream);
@@ -356,17 +331,13 @@ void symmetrize(const raft::handle_t& handle,
   raft::copy_async(symm_vals.data() + nnz, vals, nnz, stream);
 
   // sort COO
-  raft::sparse::op::coo_sort((value_idx)m,
-                             (value_idx)n,
-                             (value_idx)nnz * 2,
-                             symm_rows.data(),
-                             symm_cols.data(),
-                             symm_vals.data(),
-                             d_alloc,
-                             stream);
-
-  raft::sparse::op::max_duplicates(
-    handle, out, symm_rows.data(), symm_cols.data(), symm_vals.data(), nnz * 2, m, n);
+  raft::sparse::op::coo_sort((value_idx)m, (value_idx)n, (value_idx)nnz * 2,
+                             symm_rows.data(), symm_cols.data(),
+                             symm_vals.data(), d_alloc, stream);
+
+  raft::sparse::op::max_duplicates(handle, out, symm_rows.data(),
+                                   symm_cols.data(), symm_vals.data(), nnz * 2,
+                                   m, n);
 }
 
 };  // end NAMESPACE linalg
diff --git a/cpp/include/raft/sparse/linalg/transpose.h b/cpp/include/raft/sparse/linalg/transpose.h
index ce90eb6702..6afe4ca8f6 100644
--- a/cpp/include/raft/sparse/linalg/transpose.h
+++ b/cpp/include/raft/sparse/linalg/transpose.h
@@ -57,55 +57,29 @@ namespace linalg {
  * @param[in] stream : Cuda stream for ordering events
  */
 template <typename value_idx, typename value_t>
-void csr_transpose(cusparseHandle_t handle,
-                   const value_idx* csr_indptr,
-                   const value_idx* csr_indices,
-                   const value_t* csr_data,
-                   value_idx* csc_indptr,
-                   value_idx* csc_indices,
-                   value_t* csc_data,
-                   value_idx csr_nrows,
-                   value_idx csr_ncols,
+void csr_transpose(cusparseHandle_t handle, const value_idx *csr_indptr,
+                   const value_idx *csr_indices, const value_t *csr_data,
+                   value_idx *csc_indptr, value_idx *csc_indices,
+                   value_t *csc_data, value_idx csr_nrows, value_idx csr_ncols,
                    value_idx nnz,
                    std::shared_ptr<raft::mr::device::allocator> allocator,
-                   cudaStream_t stream)
-{
+                   cudaStream_t stream) {
   size_t convert_csc_workspace_size = 0;
 
-  CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc_bufferSize(handle,
-                                                          csr_nrows,
-                                                          csr_ncols,
-                                                          nnz,
-                                                          csr_data,
-                                                          csr_indptr,
-                                                          csr_indices,
-                                                          csc_data,
-                                                          csc_indptr,
-                                                          csc_indices,
-                                                          CUSPARSE_ACTION_NUMERIC,
-                                                          CUSPARSE_INDEX_BASE_ZERO,
-                                                          CUSPARSE_CSR2CSC_ALG1,
-                                                          &convert_csc_workspace_size,
-                                                          stream));
+  CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc_bufferSize(
+    handle, csr_nrows, csr_ncols, nnz, csr_data, csr_indptr, csr_indices,
+    csc_data, csc_indptr, csc_indices, CUSPARSE_ACTION_NUMERIC,
+    CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1,
+    &convert_csc_workspace_size, stream));
 
   raft::mr::device::buffer<char> convert_csc_workspace(
     allocator, stream, convert_csc_workspace_size);
 
-  CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc(handle,
-                                               csr_nrows,
-                                               csr_ncols,
-                                               nnz,
-                                               csr_data,
-                                               csr_indptr,
-                                               csr_indices,
-                                               csc_data,
-                                               csc_indptr,
-                                               csc_indices,
-                                               CUSPARSE_ACTION_NUMERIC,
-                                               CUSPARSE_INDEX_BASE_ZERO,
-                                               CUSPARSE_CSR2CSC_ALG1,
-                                               convert_csc_workspace.data(),
-                                               stream));
+  CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc(
+    handle, csr_nrows, csr_ncols, nnz, csr_data, csr_indptr, csr_indices,
+    csc_data, csc_indptr, csc_indices, CUSPARSE_ACTION_NUMERIC,
+    CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1,
+    convert_csc_workspace.data(), stream));
 }
 
 };  // end NAMESPACE linalg
diff --git a/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh b/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh
index 36d426029b..f0d30b0cb7 100644
--- a/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh
+++ b/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh
@@ -28,16 +28,10 @@ namespace mst {
 namespace detail {
 
 template <typename vertex_t, typename edge_t, typename alteration_t>
-__global__ void kernel_min_edge_per_vertex(const edge_t* offsets,
-                                           const vertex_t* indices,
-                                           const alteration_t* weights,
-                                           const vertex_t* color,
-                                           const vertex_t* color_index,
-                                           edge_t* new_mst_edge,
-                                           const bool* mst_edge,
-                                           alteration_t* min_edge_color,
-                                           const vertex_t v)
-{
+__global__ void kernel_min_edge_per_vertex(
+  const edge_t* offsets, const vertex_t* indices, const alteration_t* weights,
+  const vertex_t* color, const vertex_t* color_index, edge_t* new_mst_edge,
+  const bool* mst_edge, alteration_t* min_edge_color, const vertex_t v) {
   edge_t tid = threadIdx.x + blockIdx.x * blockDim.x;
 
   unsigned warp_id = tid / 32;
@@ -47,14 +41,14 @@ __global__ void kernel_min_edge_per_vertex(const edge_t* offsets,
   __shared__ alteration_t min_edge_weight[32];
   __shared__ vertex_t min_color[32];
 
-  min_edge_index[lane_id]  = std::numeric_limits<edge_t>::max();
+  min_edge_index[lane_id] = std::numeric_limits<edge_t>::max();
   min_edge_weight[lane_id] = std::numeric_limits<alteration_t>::max();
-  min_color[lane_id]       = std::numeric_limits<vertex_t>::max();
+  min_color[lane_id] = std::numeric_limits<vertex_t>::max();
 
   __syncthreads();
 
   vertex_t self_color_idx = color_index[warp_id];
-  vertex_t self_color     = color[self_color_idx];
+  vertex_t self_color = color[self_color_idx];
 
   // find the minimum edge associated per row
   // each thread in warp holds the minimum edge for
@@ -62,20 +56,20 @@ __global__ void kernel_min_edge_per_vertex(const edge_t* offsets,
   if (warp_id < v) {
     // one row is associated with one warp
     edge_t row_start = offsets[warp_id];
-    edge_t row_end   = offsets[warp_id + 1];
+    edge_t row_end = offsets[warp_id + 1];
 
     // assuming one warp per row
     // find min for each thread in warp
     for (edge_t e = row_start + lane_id; e < row_end; e += 32) {
       alteration_t curr_edge_weight = weights[e];
-      vertex_t successor_color_idx  = color_index[indices[e]];
-      vertex_t successor_color      = color[successor_color_idx];
+      vertex_t successor_color_idx = color_index[indices[e]];
+      vertex_t successor_color = color[successor_color_idx];
 
       if (!mst_edge[e] && self_color != successor_color) {
         if (curr_edge_weight < min_edge_weight[lane_id]) {
-          min_color[lane_id]       = successor_color;
+          min_color[lane_id] = successor_color;
           min_edge_weight[lane_id] = curr_edge_weight;
-          min_edge_index[lane_id]  = e;
+          min_edge_index[lane_id] = e;
         }
       }
     }
@@ -88,9 +82,9 @@ __global__ void kernel_min_edge_per_vertex(const edge_t* offsets,
   for (int offset = 16; offset > 0; offset >>= 1) {
     if (lane_id < offset) {
       if (min_edge_weight[lane_id] > min_edge_weight[lane_id + offset]) {
-        min_color[lane_id]       = min_color[lane_id + offset];
+        min_color[lane_id] = min_color[lane_id + offset];
         min_edge_weight[lane_id] = min_edge_weight[lane_id + offset];
-        min_edge_index[lane_id]  = min_edge_index[lane_id + offset];
+        min_edge_index[lane_id] = min_edge_index[lane_id + offset];
       }
     }
     __syncthreads();
@@ -108,26 +102,19 @@ __global__ void kernel_min_edge_per_vertex(const edge_t* offsets,
   }
 }
 
-template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
-__global__ void min_edge_per_supervertex(const vertex_t* color,
-                                         const vertex_t* color_index,
-                                         edge_t* new_mst_edge,
-                                         bool* mst_edge,
-                                         const vertex_t* indices,
-                                         const weight_t* weights,
-                                         const alteration_t* altered_weights,
-                                         vertex_t* temp_src,
-                                         vertex_t* temp_dst,
-                                         weight_t* temp_weights,
-                                         const alteration_t* min_edge_color,
-                                         const vertex_t v,
-                                         bool symmetrize_output)
-{
+template <typename vertex_t, typename edge_t, typename weight_t,
+          typename alteration_t>
+__global__ void min_edge_per_supervertex(
+  const vertex_t* color, const vertex_t* color_index, edge_t* new_mst_edge,
+  bool* mst_edge, const vertex_t* indices, const weight_t* weights,
+  const alteration_t* altered_weights, vertex_t* temp_src, vertex_t* temp_dst,
+  weight_t* temp_weights, const alteration_t* min_edge_color, const vertex_t v,
+  bool symmetrize_output) {
   auto tid = get_1D_idx<vertex_t>();
   if (tid < v) {
     vertex_t vertex_color_idx = color_index[tid];
-    vertex_t vertex_color     = color[vertex_color_idx];
-    edge_t edge_idx           = new_mst_edge[tid];
+    vertex_t vertex_color = color[vertex_color_idx];
+    edge_t edge_idx = new_mst_edge[tid];
 
     // check if valid outgoing edge was found
     // find minimum edge is same as minimum edge of whole supervertex
@@ -142,27 +129,32 @@ __global__ void min_edge_per_supervertex(const vertex_t* color,
         auto dst = indices[edge_idx];
         if (!symmetrize_output) {
           auto dst_edge_idx = new_mst_edge[dst];
-          auto dst_color    = color[color_index[dst]];
+          auto dst_color = color[color_index[dst]];
 
           // vertices added each other
           // only if destination has found an edge
           // the edge points back to source
           // the edge is minimum edge found for dst color
-          if (dst_edge_idx != std::numeric_limits<edge_t>::max() && indices[dst_edge_idx] == tid &&
+          if (dst_edge_idx != std::numeric_limits<edge_t>::max() &&
+              indices[dst_edge_idx] == tid &&
               min_edge_color[dst_color] == altered_weights[dst_edge_idx]) {
-            if (vertex_color > dst_color) { add_edge = false; }
+            if (vertex_color > dst_color) {
+              add_edge = false;
+            }
           }
         }
 
         if (add_edge) {
-          temp_src[tid]      = tid;
-          temp_dst[tid]      = dst;
-          temp_weights[tid]  = weights[edge_idx];
+          temp_src[tid] = tid;
+          temp_dst[tid] = dst;
+          temp_weights[tid] = weights[edge_idx];
           mst_edge[edge_idx] = true;
         }
       }
 
-      if (!add_edge) { new_mst_edge[tid] = std::numeric_limits<edge_t>::max(); }
+      if (!add_edge) {
+        new_mst_edge[tid] = std::numeric_limits<edge_t>::max();
+      }
     }
   }
 }
@@ -170,13 +162,9 @@ __global__ void min_edge_per_supervertex(const vertex_t* color,
 template <typename vertex_t, typename edge_t, typename weight_t>
 __global__ void add_reverse_edge(const edge_t* new_mst_edge,
                                  const vertex_t* indices,
-                                 const weight_t* weights,
-                                 vertex_t* temp_src,
-                                 vertex_t* temp_dst,
-                                 weight_t* temp_weights,
-                                 const vertex_t v,
-                                 bool symmetrize_output)
-{
+                                 const weight_t* weights, vertex_t* temp_src,
+                                 vertex_t* temp_dst, weight_t* temp_weights,
+                                 const vertex_t v, bool symmetrize_output) {
   auto tid = get_1D_idx<vertex_t>();
 
   if (tid < v) {
@@ -198,7 +186,9 @@ __global__ void add_reverse_edge(const edge_t* new_mst_edge,
 
           // if vertices did not pick each other
           // add a reverse edge
-          if (tid != neighbor_vertex_neighbor) { reverse_needed = true; }
+          if (tid != neighbor_vertex_neighbor) {
+            reverse_needed = true;
+          }
         }
       }
 
@@ -207,8 +197,8 @@ __global__ void add_reverse_edge(const edge_t* new_mst_edge,
         // it is assumed the each vertex only picks one valid min edge
         // per cycle
         // hence, we store at index tid + v for the reverse edge scenario
-        temp_src[tid + v]     = neighbor_vertex;
-        temp_dst[tid + v]     = tid;
+        temp_src[tid + v] = neighbor_vertex;
+        temp_dst[tid + v] = tid;
         temp_weights[tid + v] = weights[edge_idx];
       }
     }
@@ -217,13 +207,11 @@ __global__ void add_reverse_edge(const edge_t* new_mst_edge,
 
 // executes for newly added mst edges and updates the colors of both vertices to the lower color
 template <typename vertex_t, typename edge_t>
-__global__ void min_pair_colors(const vertex_t v,
-                                const vertex_t* indices,
+__global__ void min_pair_colors(const vertex_t v, const vertex_t* indices,
                                 const edge_t* new_mst_edge,
                                 const vertex_t* color,
                                 const vertex_t* color_index,
-                                vertex_t* next_color)
-{
+                                vertex_t* next_color) {
   auto i = get_1D_idx<vertex_t>();
 
   if (i < v) {
@@ -232,9 +220,9 @@ __global__ void min_pair_colors(const vertex_t v,
     if (edge_idx != std::numeric_limits<edge_t>::max()) {
       vertex_t neighbor_vertex = indices[edge_idx];
       // vertex_t self_color = color[i];
-      vertex_t self_color_idx       = color_index[i];
-      vertex_t self_color           = color[self_color_idx];
-      vertex_t neighbor_color_idx   = color_index[neighbor_vertex];
+      vertex_t self_color_idx = color_index[i];
+      vertex_t self_color = color[self_color_idx];
+      vertex_t neighbor_color_idx = color_index[neighbor_vertex];
       vertex_t neighbor_super_color = color[neighbor_color_idx];
 
       // update my own color as source of edge
@@ -250,36 +238,33 @@ __global__ void min_pair_colors(const vertex_t v,
 
 // for each vertex, update color if it was changed in min_pair_colors kernel
 template <typename vertex_t>
-__global__ void update_colors(const vertex_t v,
-                              vertex_t* color,
+__global__ void update_colors(const vertex_t v, vertex_t* color,
                               const vertex_t* color_index,
-                              const vertex_t* next_color,
-                              bool* done)
-{
+                              const vertex_t* next_color, bool* done) {
   auto i = get_1D_idx<vertex_t>();
 
   if (i < v) {
-    vertex_t self_color     = color[i];
+    vertex_t self_color = color[i];
     vertex_t self_color_idx = color_index[i];
-    vertex_t new_color      = next_color[self_color_idx];
+    vertex_t new_color = next_color[self_color_idx];
 
     // update self color to new smaller color
     if (self_color > new_color) {
       color[i] = new_color;
-      *done    = false;
+      *done = false;
     }
   }
 }
 
 // point vertices to their final color index
 template <typename vertex_t>
-__global__ void final_color_indices(const vertex_t v, const vertex_t* color, vertex_t* color_index)
-{
+__global__ void final_color_indices(const vertex_t v, const vertex_t* color,
+                                    vertex_t* color_index) {
   auto i = get_1D_idx<vertex_t>();
 
   if (i < v) {
     vertex_t self_color_idx = color_index[i];
-    vertex_t self_color     = color[self_color_idx];
+    vertex_t self_color = color[self_color_idx];
 
     // if self color is not equal to self color index,
     // it means self is not supervertex
@@ -287,7 +272,7 @@ __global__ void final_color_indices(const vertex_t v, const vertex_t* color, ver
     // parent supervertex
     while (self_color_idx != self_color) {
       self_color_idx = color_index[self_color];
-      self_color     = color[self_color_idx];
+      self_color = color[self_color_idx];
     }
 
     // point to new supervertex
@@ -297,23 +282,22 @@ __global__ void final_color_indices(const vertex_t v, const vertex_t* color, ver
 
 // Alterate the weights, make all undirected edge weight unique while keeping Wuv == Wvu
 // Consider using curand device API instead of precomputed random_values array
-template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
-__global__ void alteration_kernel(const vertex_t v,
-                                  const edge_t e,
+template <typename vertex_t, typename edge_t, typename weight_t,
+          typename alteration_t>
+__global__ void alteration_kernel(const vertex_t v, const edge_t e,
                                   const edge_t* offsets,
                                   const vertex_t* indices,
-                                  const weight_t* weights,
-                                  alteration_t max,
+                                  const weight_t* weights, alteration_t max,
                                   alteration_t* random_values,
-                                  alteration_t* altered_weights)
-{
+                                  alteration_t* altered_weights) {
   auto row = get_1D_idx<vertex_t>();
   if (row < v) {
     auto row_begin = offsets[row];
-    auto row_end   = offsets[row + 1];
+    auto row_end = offsets[row + 1];
     for (auto i = row_begin; i < row_end; i++) {
-      auto column        = indices[i];
-      altered_weights[i] = weights[i] + max * (random_values[row] + random_values[column]);
+      auto column = indices[i];
+      altered_weights[i] =
+        weights[i] + max * (random_values[row] + random_values[column]);
     }
   }
 }
@@ -321,15 +305,17 @@ __global__ void alteration_kernel(const vertex_t v,
 template <typename vertex_t, typename edge_t>
 __global__ void kernel_count_new_mst_edges(const vertex_t* mst_src,
                                            edge_t* mst_edge_count,
-                                           const vertex_t v)
-{
+                                           const vertex_t v) {
   auto tid = get_1D_idx<vertex_t>();
 
   // count number of new mst edges added
-  bool predicate       = tid < v && (mst_src[tid] != std::numeric_limits<vertex_t>::max());
+  bool predicate =
+    tid < v && (mst_src[tid] != std::numeric_limits<vertex_t>::max());
   vertex_t block_count = __syncthreads_count(predicate);
 
-  if (threadIdx.x == 0 && block_count > 0) { atomicAdd(mst_edge_count, block_count); }
+  if (threadIdx.x == 0 && block_count > 0) {
+    atomicAdd(mst_edge_count, block_count);
+  }
 }
 
 }  // namespace detail
diff --git a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh
index 158f4cc314..c5ba4fcb4f 100644
--- a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh
+++ b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh
@@ -46,30 +46,21 @@ typedef std::chrono::high_resolution_clock Clock;
 
 // curand generator uniform
 inline curandStatus_t curand_generate_uniformX(curandGenerator_t generator,
-                                               float* outputPtr,
-                                               size_t n)
-{
+                                               float* outputPtr, size_t n) {
   return curandGenerateUniform(generator, outputPtr, n);
 }
 inline curandStatus_t curand_generate_uniformX(curandGenerator_t generator,
-                                               double* outputPtr,
-                                               size_t n)
-{
+                                               double* outputPtr, size_t n) {
   return curandGenerateUniformDouble(generator, outputPtr, n);
 }
 
-template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
-MST_solver<vertex_t, edge_t, weight_t, alteration_t>::MST_solver(const raft::handle_t& handle_,
-                                                                 const edge_t* offsets_,
-                                                                 const vertex_t* indices_,
-                                                                 const weight_t* weights_,
-                                                                 const vertex_t v_,
-                                                                 const edge_t e_,
-                                                                 vertex_t* color_,
-                                                                 cudaStream_t stream_,
-                                                                 bool symmetrize_output_,
-                                                                 bool initialize_colors_,
-                                                                 int iterations_)
+template <typename vertex_t, typename edge_t, typename weight_t,
+          typename alteration_t>
+MST_solver<vertex_t, edge_t, weight_t, alteration_t>::MST_solver(
+  const raft::handle_t& handle_, const edge_t* offsets_,
+  const vertex_t* indices_, const weight_t* weights_, const vertex_t v_,
+  const edge_t e_, vertex_t* color_, cudaStream_t stream_,
+  bool symmetrize_output_, bool initialize_colors_, int iterations_)
   : handle(handle_),
     offsets(offsets_),
     indices(indices_),
@@ -91,13 +82,12 @@ MST_solver<vertex_t, edge_t, weight_t, alteration_t>::MST_solver(const raft::han
     stream(stream_),
     symmetrize_output(symmetrize_output_),
     initialize_colors(initialize_colors_),
-    iterations(iterations_)
-{
-  max_blocks  = handle_.get_device_properties().maxGridSize[0];
+    iterations(iterations_) {
+  max_blocks = handle_.get_device_properties().maxGridSize[0];
   max_threads = handle_.get_device_properties().maxThreadsPerBlock;
-  sm_count    = handle_.get_device_properties().multiProcessorCount;
+  sm_count = handle_.get_device_properties().multiProcessorCount;
 
-  // Initially, color holds the vertex id as color
+  //Initially, color holds the vertex id as color
   auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream});
   if (initialize_colors_) {
     thrust::sequence(policy, color.begin(), color.end(), 0);
@@ -108,10 +98,10 @@ MST_solver<vertex_t, edge_t, weight_t, alteration_t>::MST_solver(const raft::han
   thrust::sequence(policy, next_color.begin(), next_color.end(), 0);
 }
 
-template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
+template <typename vertex_t, typename edge_t, typename weight_t,
+          typename alteration_t>
 raft::Graph_COO<vertex_t, edge_t, weight_t>
-MST_solver<vertex_t, edge_t, weight_t, alteration_t>::solve()
-{
+MST_solver<vertex_t, edge_t, weight_t, alteration_t>::solve() {
   RAFT_EXPECTS(v > 0, "0 vertices");
   RAFT_EXPECTS(e > 0, "0 edges");
   RAFT_EXPECTS(offsets != nullptr, "Null offsets.");
@@ -124,13 +114,12 @@ MST_solver<vertex_t, edge_t, weight_t, alteration_t>::solve()
 
   // Alterating the weights
   // this is done by identifying the lowest cost edge weight gap that is not 0, call this theta.
-  // For each edge, add noise that is less than theta. That is, generate a random number in the
-  // range [0.0, theta) and add it to each edge weight.
+  // For each edge, add noise that is less than theta. That is, generate a random number in the range [0.0, theta) and add it to each edge weight.
   alteration();
 
 #ifdef MST_TIME
   auto stop = Clock::now();
-  timer0    = duration_us(stop - start);
+  timer0 = duration_us(stop - start);
 #endif
 
   auto max_mst_edges = symmetrize_output ? 2 * v - 2 : v - 1;
@@ -179,8 +168,8 @@ MST_solver<vertex_t, edge_t, weight_t, alteration_t>::solve()
     if (curr_mst_edge_count == prev_mst_edge_count[0]) {
 #ifdef MST_TIME
       std::cout << "Iterations: " << i << std::endl;
-      std::cout << timer0 << "," << timer1 << "," << timer2 << "," << timer3 << "," << timer4 << ","
-                << timer5 << std::endl;
+      std::cout << timer0 << "," << timer1 << "," << timer2 << "," << timer3
+                << "," << timer4 << "," << timer5 << std::endl;
 #endif
       // exit here when reaching steady state
       break;
@@ -190,7 +179,8 @@ MST_solver<vertex_t, edge_t, weight_t, alteration_t>::solve()
     start = Clock::now();
 #endif
     // append the newly found MST edges to the final output
-    append_src_dst_pair(mst_result.src.data(), mst_result.dst.data(), mst_result.weights.data());
+    append_src_dst_pair(mst_result.src.data(), mst_result.dst.data(),
+                        mst_result.weights.data());
 #ifdef MST_TIME
     stop = Clock::now();
     timer4 += duration_us(stop - start);
@@ -211,7 +201,7 @@ MST_solver<vertex_t, edge_t, weight_t, alteration_t>::solve()
 
   // result packaging
   thrust::host_vector<edge_t> host_mst_edge_count = mst_edge_count;
-  mst_result.n_edges                              = host_mst_edge_count[0];
+  mst_result.n_edges = host_mst_edge_count[0];
   mst_result.src.resize(mst_result.n_edges, stream);
   mst_result.dst.resize(mst_result.n_edges, stream);
   mst_result.weights.resize(mst_result.n_edges, stream);
@@ -222,46 +212,50 @@ MST_solver<vertex_t, edge_t, weight_t, alteration_t>::solve()
 // ||y|-|x||
 template <typename weight_t>
 struct alteration_functor {
-  __host__ __device__ weight_t operator()(const thrust::tuple<weight_t, weight_t>& t)
-  {
+  __host__ __device__ weight_t
+  operator()(const thrust::tuple<weight_t, weight_t>& t) {
     auto x = thrust::get<0>(t);
     auto y = thrust::get<1>(t);
-    x      = x < 0 ? -x : x;
-    y      = y < 0 ? -y : y;
+    x = x < 0 ? -x : x;
+    y = y < 0 ? -y : y;
     return x < y ? y - x : x - y;
   }
 };
 
 // Compute the uper bound for the alteration
-template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
-alteration_t MST_solver<vertex_t, edge_t, weight_t, alteration_t>::alteration_max()
-{
+template <typename vertex_t, typename edge_t, typename weight_t,
+          typename alteration_t>
+alteration_t
+MST_solver<vertex_t, edge_t, weight_t, alteration_t>::alteration_max() {
   auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream});
   rmm::device_vector<weight_t> tmp(e);
   thrust::device_ptr<const weight_t> weights_ptr(weights);
   thrust::copy(policy, weights_ptr, weights_ptr + e, tmp.begin());
-  // sort tmp weights
+  //sort tmp weights
   thrust::sort(policy, tmp.begin(), tmp.end());
 
-  // remove duplicates
+  //remove duplicates
   auto new_end = thrust::unique(policy, tmp.begin(), tmp.end());
 
-  // min(a[i+1]-a[i])/2
-  auto begin = thrust::make_zip_iterator(thrust::make_tuple(tmp.begin(), tmp.begin() + 1));
-  auto end   = thrust::make_zip_iterator(thrust::make_tuple(new_end - 1, new_end));
-  auto init  = tmp[1] - tmp[0];
-  auto max   = thrust::transform_reduce(
-    policy, begin, end, alteration_functor<weight_t>(), init, thrust::minimum<weight_t>());
+  //min(a[i+1]-a[i])/2
+  auto begin =
+    thrust::make_zip_iterator(thrust::make_tuple(tmp.begin(), tmp.begin() + 1));
+  auto end =
+    thrust::make_zip_iterator(thrust::make_tuple(new_end - 1, new_end));
+  auto init = tmp[1] - tmp[0];
+  auto max =
+    thrust::transform_reduce(policy, begin, end, alteration_functor<weight_t>(),
+                             init, thrust::minimum<weight_t>());
   return max / static_cast<alteration_t>(2);
 }
 
 // Compute the alteration to make all undirected edge weight unique
 // Preserves weights order
-template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
-void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::alteration()
-{
+template <typename vertex_t, typename edge_t, typename weight_t,
+          typename alteration_t>
+void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::alteration() {
   auto nthreads = std::min(v, max_threads);
-  auto nblocks  = std::min((v + nthreads - 1) / nthreads, max_blocks);
+  auto nblocks = std::min((v + nthreads - 1) / nthreads, max_blocks);
 
   // maximum alteration that does not change realtive weights order
   alteration_t max = alteration_max();
@@ -275,32 +269,35 @@ void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::alteration()
   curandSetPseudoRandomGeneratorSeed(randGen, 1234567);
 
   // Initialize rand values
-  auto curand_status = curand_generate_uniformX(randGen, rand_values.data().get(), v);
+  auto curand_status =
+    curand_generate_uniformX(randGen, rand_values.data().get(), v);
   RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS, "MST: CURAND failed");
   curand_status = curandDestroyGenerator(randGen);
-  RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS, "MST: CURAND cleanup failed");
+  RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS,
+               "MST: CURAND cleanup failed");
 
-  // Alterate the weights, make all undirected edge weight unique while keeping Wuv == Wvu
+  //Alterate the weights, make all undirected edge weight unique while keeping Wuv == Wvu
   detail::alteration_kernel<<<nblocks, nthreads, 0, stream>>>(
-    v, e, offsets, indices, weights, max, rand_values.data().get(), altered_weights.data().get());
+    v, e, offsets, indices, weights, max, rand_values.data().get(),
+    altered_weights.data().get());
 }
 
 // updates colors of vertices by propagating the lower color to the higher
-template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
-void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::label_prop(vertex_t* mst_src,
-                                                                      vertex_t* mst_dst)
-{
+template <typename vertex_t, typename edge_t, typename weight_t,
+          typename alteration_t>
+void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::label_prop(
+  vertex_t* mst_src, vertex_t* mst_dst) {
   // update the colors of both ends its until there is no change in colors
   thrust::host_vector<edge_t> curr_mst_edge_count = mst_edge_count;
 
   auto min_pair_nthreads = std::min(v, (vertex_t)max_threads);
-  auto min_pair_nblocks =
-    std::min((v + min_pair_nthreads - 1) / min_pair_nthreads, (vertex_t)max_blocks);
+  auto min_pair_nblocks = std::min(
+    (v + min_pair_nthreads - 1) / min_pair_nthreads, (vertex_t)max_blocks);
 
   rmm::device_vector<bool> done(1, false);
 
   edge_t* new_mst_edge_ptr = new_mst_edge.data().get();
-  vertex_t* color_ptr      = color.data().get();
+  vertex_t* color_ptr = color.data().get();
   vertex_t* next_color_ptr = next_color.data().get();
 
   bool* done_ptr = done.data().get();
@@ -317,99 +314,84 @@ void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::label_prop(vertex_t*
     i++;
   }
 
-  detail::final_color_indices<<<min_pair_nblocks, min_pair_nthreads, 0, stream>>>(
-    v, color_ptr, color_index);
+  detail::
+    final_color_indices<<<min_pair_nblocks, min_pair_nthreads, 0, stream>>>(
+      v, color_ptr, color_index);
 #ifdef MST_TIME
   std::cout << "Label prop iterations: " << i << std::endl;
 #endif
 }
 
 // Finds the minimum edge from each vertex to the lowest color
-template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
-void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::min_edge_per_vertex()
-{
+template <typename vertex_t, typename edge_t, typename weight_t,
+          typename alteration_t>
+void MST_solver<vertex_t, edge_t, weight_t,
+                alteration_t>::min_edge_per_vertex() {
   auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream});
-  thrust::fill(
-    policy, min_edge_color.begin(), min_edge_color.end(), std::numeric_limits<alteration_t>::max());
-  thrust::fill(
-    policy, new_mst_edge.begin(), new_mst_edge.end(), std::numeric_limits<weight_t>::max());
+  thrust::fill(policy, min_edge_color.begin(), min_edge_color.end(),
+               std::numeric_limits<alteration_t>::max());
+  thrust::fill(policy, new_mst_edge.begin(), new_mst_edge.end(),
+               std::numeric_limits<weight_t>::max());
 
   int n_threads = 32;
 
-  vertex_t* color_ptr               = color.data().get();
-  edge_t* new_mst_edge_ptr          = new_mst_edge.data().get();
-  bool* mst_edge_ptr                = mst_edge.data().get();
-  alteration_t* min_edge_color_ptr  = min_edge_color.data().get();
+  vertex_t* color_ptr = color.data().get();
+  edge_t* new_mst_edge_ptr = new_mst_edge.data().get();
+  bool* mst_edge_ptr = mst_edge.data().get();
+  alteration_t* min_edge_color_ptr = min_edge_color.data().get();
   alteration_t* altered_weights_ptr = altered_weights.data().get();
 
-  detail::kernel_min_edge_per_vertex<<<v, n_threads, 0, stream>>>(offsets,
-                                                                  indices,
-                                                                  altered_weights_ptr,
-                                                                  color_ptr,
-                                                                  color_index,
-                                                                  new_mst_edge_ptr,
-                                                                  mst_edge_ptr,
-                                                                  min_edge_color_ptr,
-                                                                  v);
+  detail::kernel_min_edge_per_vertex<<<v, n_threads, 0, stream>>>(
+    offsets, indices, altered_weights_ptr, color_ptr, color_index,
+    new_mst_edge_ptr, mst_edge_ptr, min_edge_color_ptr, v);
 }
 
 // Finds the minimum edge from each supervertex to the lowest color
-template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
-void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::min_edge_per_supervertex()
-{
+template <typename vertex_t, typename edge_t, typename weight_t,
+          typename alteration_t>
+void MST_solver<vertex_t, edge_t, weight_t,
+                alteration_t>::min_edge_per_supervertex() {
   auto nthreads = std::min(v, max_threads);
-  auto nblocks  = std::min((v + nthreads - 1) / nthreads, max_blocks);
+  auto nblocks = std::min((v + nthreads - 1) / nthreads, max_blocks);
 
   auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream});
-  thrust::fill(policy, temp_src.begin(), temp_src.end(), std::numeric_limits<vertex_t>::max());
+  thrust::fill(policy, temp_src.begin(), temp_src.end(),
+               std::numeric_limits<vertex_t>::max());
 
-  vertex_t* color_ptr               = color.data().get();
-  edge_t* new_mst_edge_ptr          = new_mst_edge.data().get();
-  bool* mst_edge_ptr                = mst_edge.data().get();
-  alteration_t* min_edge_color_ptr  = min_edge_color.data().get();
+  vertex_t* color_ptr = color.data().get();
+  edge_t* new_mst_edge_ptr = new_mst_edge.data().get();
+  bool* mst_edge_ptr = mst_edge.data().get();
+  alteration_t* min_edge_color_ptr = min_edge_color.data().get();
   alteration_t* altered_weights_ptr = altered_weights.data().get();
-  vertex_t* temp_src_ptr            = temp_src.data().get();
-  vertex_t* temp_dst_ptr            = temp_dst.data().get();
-  weight_t* temp_weights_ptr        = temp_weights.data().get();
-
-  detail::min_edge_per_supervertex<<<nblocks, nthreads, 0, stream>>>(color_ptr,
-                                                                     color_index,
-                                                                     new_mst_edge_ptr,
-                                                                     mst_edge_ptr,
-                                                                     indices,
-                                                                     weights,
-                                                                     altered_weights_ptr,
-                                                                     temp_src_ptr,
-                                                                     temp_dst_ptr,
-                                                                     temp_weights_ptr,
-                                                                     min_edge_color_ptr,
-                                                                     v,
-                                                                     symmetrize_output);
+  vertex_t* temp_src_ptr = temp_src.data().get();
+  vertex_t* temp_dst_ptr = temp_dst.data().get();
+  weight_t* temp_weights_ptr = temp_weights.data().get();
+
+  detail::min_edge_per_supervertex<<<nblocks, nthreads, 0, stream>>>(
+    color_ptr, color_index, new_mst_edge_ptr, mst_edge_ptr, indices, weights,
+    altered_weights_ptr, temp_src_ptr, temp_dst_ptr, temp_weights_ptr,
+    min_edge_color_ptr, v, symmetrize_output);
 
   // the above kernel only adds directed mst edges in the case where
   // a pair of vertices don't pick the same min edge between them
   // so, now we add the reverse edge to make it undirected
   if (symmetrize_output) {
-    detail::add_reverse_edge<<<nblocks, nthreads, 0, stream>>>(new_mst_edge_ptr,
-                                                               indices,
-                                                               weights,
-                                                               temp_src_ptr,
-                                                               temp_dst_ptr,
-                                                               temp_weights_ptr,
-                                                               v,
-                                                               symmetrize_output);
+    detail::add_reverse_edge<<<nblocks, nthreads, 0, stream>>>(
+      new_mst_edge_ptr, indices, weights, temp_src_ptr, temp_dst_ptr,
+      temp_weights_ptr, v, symmetrize_output);
   }
 }
 
-template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
-void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::check_termination()
-{
+template <typename vertex_t, typename edge_t, typename weight_t,
+          typename alteration_t>
+void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::check_termination() {
   vertex_t nthreads = std::min(2 * v, (vertex_t)max_threads);
-  vertex_t nblocks  = std::min((2 * v + nthreads - 1) / nthreads, (vertex_t)max_blocks);
+  vertex_t nblocks =
+    std::min((2 * v + nthreads - 1) / nthreads, (vertex_t)max_blocks);
 
   // count number of new mst edges
   edge_t* mst_edge_count_ptr = mst_edge_count.data().get();
-  vertex_t* temp_src_ptr     = temp_src.data().get();
+  vertex_t* temp_src_ptr = temp_src.data().get();
 
   detail::kernel_count_new_mst_edges<<<nblocks, nthreads, 0, stream>>>(
     temp_src_ptr, mst_edge_count_ptr, 2 * v);
@@ -417,40 +399,36 @@ void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::check_termination()
 
 template <typename vertex_t, typename weight_t>
 struct new_edges_functor {
-  __host__ __device__ bool operator()(const thrust::tuple<vertex_t, vertex_t, weight_t>& t)
-  {
+  __host__ __device__ bool operator()(
+    const thrust::tuple<vertex_t, vertex_t, weight_t>& t) {
     auto src = thrust::get<0>(t);
 
     return src != std::numeric_limits<vertex_t>::max() ? true : false;
   }
 };
 
-template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
+template <typename vertex_t, typename edge_t, typename weight_t,
+          typename alteration_t>
 void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::append_src_dst_pair(
-  vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights)
-{
+  vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights) {
   auto policy = rmm::exec_policy(rmm::cuda_stream_view{stream});
 
   auto curr_mst_edge_count = prev_mst_edge_count[0];
 
   // iterator to end of mst edges added to final output in previous iteration
-  auto src_dst_zip_end =
-    thrust::make_zip_iterator(thrust::make_tuple(mst_src + curr_mst_edge_count,
-                                                 mst_dst + curr_mst_edge_count,
-                                                 mst_weights + curr_mst_edge_count));
+  auto src_dst_zip_end = thrust::make_zip_iterator(thrust::make_tuple(
+    mst_src + curr_mst_edge_count, mst_dst + curr_mst_edge_count,
+    mst_weights + curr_mst_edge_count));
 
   // iterator to new mst edges found
-  auto temp_src_dst_zip_begin = thrust::make_zip_iterator(
-    thrust::make_tuple(temp_src.begin(), temp_dst.begin(), temp_weights.begin()));
+  auto temp_src_dst_zip_begin = thrust::make_zip_iterator(thrust::make_tuple(
+    temp_src.begin(), temp_dst.begin(), temp_weights.begin()));
   auto temp_src_dst_zip_end = thrust::make_zip_iterator(
     thrust::make_tuple(temp_src.end(), temp_dst.end(), temp_weights.end()));
 
   // copy new mst edges to final output
-  thrust::copy_if(policy,
-                  temp_src_dst_zip_begin,
-                  temp_src_dst_zip_end,
-                  src_dst_zip_end,
-                  new_edges_functor<vertex_t, weight_t>());
+  thrust::copy_if(policy, temp_src_dst_zip_begin, temp_src_dst_zip_end,
+                  src_dst_zip_end, new_edges_functor<vertex_t, weight_t>());
 }
 
 }  // namespace mst
diff --git a/cpp/include/raft/sparse/mst/detail/utils.cuh b/cpp/include/raft/sparse/mst/detail/utils.cuh
index 24127c993f..8f755de459 100644
--- a/cpp/include/raft/sparse/mst/detail/utils.cuh
+++ b/cpp/include/raft/sparse/mst/detail/utils.cuh
@@ -26,29 +26,32 @@ namespace mst {
 namespace detail {
 
 template <typename idx_t>
-__device__ idx_t get_1D_idx()
-{
+__device__ idx_t get_1D_idx() {
   return blockIdx.x * blockDim.x + threadIdx.x;
 }
 
 // somewhat smart vector print
 template <typename T>
-void printv(rmm::device_vector<T>& vec, const std::string& name = "", const size_t displ = 5)
-{
+void printv(rmm::device_vector<T>& vec, const std::string& name = "",
+            const size_t displ = 5) {
 #ifdef MST_TIME
   std::cout.precision(15);
   std::cout << name << " size = " << vec.size() << std::endl;
   if (displ < vec.size()) {
-    thrust::copy(vec.begin(), vec.begin() + displ, std::ostream_iterator<T>(std::cout, " "));
+    thrust::copy(vec.begin(), vec.begin() + displ,
+                 std::ostream_iterator<T>(std::cout, " "));
     std::cout << " ... ";
-    thrust::copy(vec.end() - displ, vec.end(), std::ostream_iterator<T>(std::cout, " "));
+    thrust::copy(vec.end() - displ, vec.end(),
+                 std::ostream_iterator<T>(std::cout, " "));
   } else {
-    thrust::copy(vec.begin(), vec.end(), std::ostream_iterator<T>(std::cout, " "));
+    thrust::copy(vec.begin(), vec.end(),
+                 std::ostream_iterator<T>(std::cout, " "));
   }
   std::cout << std::endl << std::endl;
 #endif
 }
-#define duration_us(a) std::chrono::duration_cast<std::chrono::microseconds>(a).count()
+#define duration_us(a) \
+  std::chrono::duration_cast<std::chrono::microseconds>(a).count()
 
 }  // namespace detail
 }  // namespace mst
diff --git a/cpp/include/raft/sparse/mst/mst.cuh b/cpp/include/raft/sparse/mst/mst.cuh
index b49003467b..10c981445e 100644
--- a/cpp/include/raft/sparse/mst/mst.cuh
+++ b/cpp/include/raft/sparse/mst/mst.cuh
@@ -22,30 +22,16 @@
 namespace raft {
 namespace mst {
 
-template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t = weight_t>
-raft::Graph_COO<vertex_t, edge_t, weight_t> mst(const raft::handle_t& handle,
-                                                edge_t const* offsets,
-                                                vertex_t const* indices,
-                                                weight_t const* weights,
-                                                vertex_t const v,
-                                                edge_t const e,
-                                                vertex_t* color,
-                                                cudaStream_t stream,
-                                                bool symmetrize_output = true,
-                                                bool initialize_colors = true,
-                                                int iterations         = 0)
-{
-  MST_solver<vertex_t, edge_t, weight_t, alteration_t> mst_solver(handle,
-                                                                  offsets,
-                                                                  indices,
-                                                                  weights,
-                                                                  v,
-                                                                  e,
-                                                                  color,
-                                                                  stream,
-                                                                  symmetrize_output,
-                                                                  initialize_colors,
-                                                                  iterations);
+template <typename vertex_t, typename edge_t, typename weight_t,
+          typename alteration_t = weight_t>
+raft::Graph_COO<vertex_t, edge_t, weight_t> mst(
+  const raft::handle_t& handle, edge_t const* offsets, vertex_t const* indices,
+  weight_t const* weights, vertex_t const v, edge_t const e, vertex_t* color,
+  cudaStream_t stream, bool symmetrize_output = true,
+  bool initialize_colors = true, int iterations = 0) {
+  MST_solver<vertex_t, edge_t, weight_t, alteration_t> mst_solver(
+    handle, offsets, indices, weights, v, e, color, stream, symmetrize_output,
+    initialize_colors, iterations);
   return mst_solver.solve();
 }
 
diff --git a/cpp/include/raft/sparse/mst/mst_solver.cuh b/cpp/include/raft/sparse/mst/mst_solver.cuh
index e32bcfacac..833882ea0d 100644
--- a/cpp/include/raft/sparse/mst/mst_solver.cuh
+++ b/cpp/include/raft/sparse/mst/mst_solver.cuh
@@ -31,27 +31,20 @@ struct Graph_COO {
   edge_t n_edges;
 
   Graph_COO(vertex_t size, cudaStream_t stream)
-    : src(size, stream), dst(size, stream), weights(size, stream)
-  {
-  }
+    : src(size, stream), dst(size, stream), weights(size, stream) {}
 };
 
 namespace mst {
 
-template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
+template <typename vertex_t, typename edge_t, typename weight_t,
+          typename alteration_t>
 class MST_solver {
  public:
-  MST_solver(const raft::handle_t& handle_,
-             const edge_t* offsets_,
-             const vertex_t* indices_,
-             const weight_t* weights_,
-             const vertex_t v_,
-             const edge_t e_,
-             vertex_t* color_,
-             cudaStream_t stream_,
-             bool symmetrize_output_,
-             bool initialize_colors_,
-             int iterations_);
+  MST_solver(const raft::handle_t& handle_, const edge_t* offsets_,
+             const vertex_t* indices_, const weight_t* weights_,
+             const vertex_t v_, const edge_t e_, vertex_t* color_,
+             cudaStream_t stream_, bool symmetrize_output_,
+             bool initialize_colors_, int iterations_);
 
   raft::Graph_COO<vertex_t, edge_t, weight_t> solve();
 
@@ -63,7 +56,7 @@ class MST_solver {
   bool symmetrize_output, initialize_colors;
   int iterations;
 
-  // CSR
+  //CSR
   const edge_t* offsets;
   const vertex_t* indices;
   const weight_t* weights;
@@ -74,16 +67,20 @@ class MST_solver {
   vertex_t max_threads;
   vertex_t sm_count;
 
-  vertex_t* color_index;                             // represent each supervertex as a color
-  rmm::device_vector<alteration_t> min_edge_color;   // minimum incident edge weight per color
-  rmm::device_vector<edge_t> new_mst_edge;           // new minimum edge per vertex
-  rmm::device_vector<alteration_t> altered_weights;  // weights to be used for mst
-  rmm::device_vector<edge_t> mst_edge_count;  // total number of edges added after every iteration
+  vertex_t* color_index;  // represent each supervertex as a color
+  rmm::device_vector<alteration_t>
+    min_edge_color;  // minimum incident edge weight per color
+  rmm::device_vector<edge_t> new_mst_edge;  // new minimum edge per vertex
+  rmm::device_vector<alteration_t>
+    altered_weights;  // weights to be used for mst
   rmm::device_vector<edge_t>
-    prev_mst_edge_count;                    // total number of edges up to the previous iteration
-  rmm::device_vector<bool> mst_edge;        // mst output -  true if the edge belongs in mst
+    mst_edge_count;  // total number of edges added after every iteration
+  rmm::device_vector<edge_t>
+    prev_mst_edge_count;  // total number of edges up to the previous iteration
+  rmm::device_vector<bool>
+    mst_edge;  // mst output -  true if the edge belongs in mst
   rmm::device_vector<vertex_t> next_color;  //  next iteration color
-  rmm::device_vector<vertex_t> color;       // index of color that vertex points to
+  rmm::device_vector<vertex_t> color;  // index of color that vertex points to
 
   // new src-dst pairs found per iteration
   rmm::device_vector<vertex_t> temp_src;
@@ -96,7 +93,8 @@ class MST_solver {
   void check_termination();
   void alteration();
   alteration_t alteration_max();
-  void append_src_dst_pair(vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights);
+  void append_src_dst_pair(vertex_t* mst_src, vertex_t* mst_dst,
+                           weight_t* mst_weights);
 };
 
 }  // namespace mst
diff --git a/cpp/include/raft/sparse/op/filter.cuh b/cpp/include/raft/sparse/op/filter.cuh
index 397fecaaea..562d506cfe 100644
--- a/cpp/include/raft/sparse/op/filter.cuh
+++ b/cpp/include/raft/sparse/op/filter.cuh
@@ -42,23 +42,15 @@ namespace sparse {
 namespace op {
 
 template <int TPB_X, typename T>
-__global__ void coo_remove_scalar_kernel(const int* rows,
-                                         const int* cols,
-                                         const T* vals,
-                                         int nnz,
-                                         int* crows,
-                                         int* ccols,
-                                         T* cvals,
-                                         int* ex_scan,
-                                         int* cur_ex_scan,
-                                         int m,
-                                         T scalar)
-{
+__global__ void coo_remove_scalar_kernel(const int *rows, const int *cols,
+                                         const T *vals, int nnz, int *crows,
+                                         int *ccols, T *cvals, int *ex_scan,
+                                         int *cur_ex_scan, int m, T scalar) {
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
 
   if (row < m) {
-    int start       = cur_ex_scan[row];
-    int stop        = get_stop_idx(row, m, nnz, cur_ex_scan);
+    int start = cur_ex_scan[row];
+    int stop = get_stop_idx(row, m, nnz, cur_ex_scan);
     int cur_out_idx = ex_scan[row];
 
     for (int idx = start; idx < stop; idx++) {
@@ -90,51 +82,37 @@ __global__ void coo_remove_scalar_kernel(const int* rows,
  * @param stream: cuda stream to use
  */
 template <int TPB_X, typename T>
-void coo_remove_scalar(const int* rows,
-                       const int* cols,
-                       const T* vals,
-                       int nnz,
-                       int* crows,
-                       int* ccols,
-                       T* cvals,
-                       int* cnnz,
-                       int* cur_cnnz,
-                       T scalar,
-                       int n,
+void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz,
+                       int *crows, int *ccols, T *cvals, int *cnnz,
+                       int *cur_cnnz, T scalar, int n,
                        std::shared_ptr<raft::mr::device::allocator> d_alloc,
-                       cudaStream_t stream)
-{
+                       cudaStream_t stream) {
   raft::mr::device::buffer<int> ex_scan(d_alloc, stream, n);
   raft::mr::device::buffer<int> cur_ex_scan(d_alloc, stream, n);
 
   CUDA_CHECK(cudaMemsetAsync(ex_scan.data(), 0, n * sizeof(int), stream));
   CUDA_CHECK(cudaMemsetAsync(cur_ex_scan.data(), 0, n * sizeof(int), stream));
 
-  thrust::device_ptr<int> dev_cnnz    = thrust::device_pointer_cast(cnnz);
-  thrust::device_ptr<int> dev_ex_scan = thrust::device_pointer_cast(ex_scan.data());
-  thrust::exclusive_scan(thrust::cuda::par.on(stream), dev_cnnz, dev_cnnz + n, dev_ex_scan);
+  thrust::device_ptr<int> dev_cnnz = thrust::device_pointer_cast(cnnz);
+  thrust::device_ptr<int> dev_ex_scan =
+    thrust::device_pointer_cast(ex_scan.data());
+  thrust::exclusive_scan(thrust::cuda::par.on(stream), dev_cnnz, dev_cnnz + n,
+                         dev_ex_scan);
   CUDA_CHECK(cudaPeekAtLastError());
 
-  thrust::device_ptr<int> dev_cur_cnnz    = thrust::device_pointer_cast(cur_cnnz);
-  thrust::device_ptr<int> dev_cur_ex_scan = thrust::device_pointer_cast(cur_ex_scan.data());
-  thrust::exclusive_scan(
-    thrust::cuda::par.on(stream), dev_cur_cnnz, dev_cur_cnnz + n, dev_cur_ex_scan);
+  thrust::device_ptr<int> dev_cur_cnnz = thrust::device_pointer_cast(cur_cnnz);
+  thrust::device_ptr<int> dev_cur_ex_scan =
+    thrust::device_pointer_cast(cur_ex_scan.data());
+  thrust::exclusive_scan(thrust::cuda::par.on(stream), dev_cur_cnnz,
+                         dev_cur_cnnz + n, dev_cur_ex_scan);
   CUDA_CHECK(cudaPeekAtLastError());
 
   dim3 grid(raft::ceildiv(n, TPB_X), 1, 1);
   dim3 blk(TPB_X, 1, 1);
 
-  coo_remove_scalar_kernel<TPB_X><<<grid, blk, 0, stream>>>(rows,
-                                                            cols,
-                                                            vals,
-                                                            nnz,
-                                                            crows,
-                                                            ccols,
-                                                            cvals,
-                                                            dev_ex_scan.get(),
-                                                            dev_cur_ex_scan.get(),
-                                                            n,
-                                                            scalar);
+  coo_remove_scalar_kernel<TPB_X><<<grid, blk, 0, stream>>>(
+    rows, cols, vals, nnz, crows, ccols, cvals, dev_ex_scan.get(),
+    dev_cur_ex_scan.get(), n, scalar);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -148,44 +126,35 @@ void coo_remove_scalar(const int* rows,
  * @param stream: cuda stream to use
  */
 template <int TPB_X, typename T>
-void coo_remove_scalar(COO<T>* in,
-                       COO<T>* out,
-                       T scalar,
+void coo_remove_scalar(COO<T> *in, COO<T> *out, T scalar,
                        std::shared_ptr<raft::mr::device::allocator> d_alloc,
-                       cudaStream_t stream)
-{
+                       cudaStream_t stream) {
   raft::mr::device::buffer<int> row_count_nz(d_alloc, stream, in->n_rows);
   raft::mr::device::buffer<int> row_count(d_alloc, stream, in->n_rows);
 
-  CUDA_CHECK(cudaMemsetAsync(row_count_nz.data(), 0, in->n_rows * sizeof(int), stream));
-  CUDA_CHECK(cudaMemsetAsync(row_count.data(), 0, in->n_rows * sizeof(int), stream));
+  CUDA_CHECK(
+    cudaMemsetAsync(row_count_nz.data(), 0, in->n_rows * sizeof(int), stream));
+  CUDA_CHECK(
+    cudaMemsetAsync(row_count.data(), 0, in->n_rows * sizeof(int), stream));
 
   linalg::coo_degree<TPB_X>(in->rows(), in->nnz, row_count.data(), stream);
   CUDA_CHECK(cudaPeekAtLastError());
 
-  linalg::coo_degree_scalar<TPB_X>(
-    in->rows(), in->vals(), in->nnz, scalar, row_count_nz.data(), stream);
+  linalg::coo_degree_scalar<TPB_X>(in->rows(), in->vals(), in->nnz, scalar,
+                                   row_count_nz.data(), stream);
   CUDA_CHECK(cudaPeekAtLastError());
 
-  thrust::device_ptr<int> d_row_count_nz = thrust::device_pointer_cast(row_count_nz.data());
-  int out_nnz =
-    thrust::reduce(thrust::cuda::par.on(stream), d_row_count_nz, d_row_count_nz + in->n_rows);
+  thrust::device_ptr<int> d_row_count_nz =
+    thrust::device_pointer_cast(row_count_nz.data());
+  int out_nnz = thrust::reduce(thrust::cuda::par.on(stream), d_row_count_nz,
+                               d_row_count_nz + in->n_rows);
 
   out->allocate(out_nnz, in->n_rows, in->n_cols, false, stream);
 
-  coo_remove_scalar<TPB_X, T>(in->rows(),
-                              in->cols(),
-                              in->vals(),
-                              in->nnz,
-                              out->rows(),
-                              out->cols(),
-                              out->vals(),
-                              row_count_nz.data(),
-                              row_count.data(),
-                              scalar,
-                              in->n_rows,
-                              d_alloc,
-                              stream);
+  coo_remove_scalar<TPB_X, T>(in->rows(), in->cols(), in->vals(), in->nnz,
+                              out->rows(), out->cols(), out->vals(),
+                              row_count_nz.data(), row_count.data(), scalar,
+                              in->n_rows, d_alloc, stream);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -198,11 +167,9 @@ void coo_remove_scalar(COO<T>* in,
  * @param stream: cuda stream to use
  */
 template <int TPB_X, typename T>
-void coo_remove_zeros(COO<T>* in,
-                      COO<T>* out,
+void coo_remove_zeros(COO<T> *in, COO<T> *out,
                       std::shared_ptr<raft::mr::device::allocator> d_alloc,
-                      cudaStream_t stream)
-{
+                      cudaStream_t stream) {
   coo_remove_scalar<TPB_X, T>(in, out, T(0.0), d_alloc, stream);
 }
 
diff --git a/cpp/include/raft/sparse/op/reduce.cuh b/cpp/include/raft/sparse/op/reduce.cuh
index bc4d7bace5..53c9f89074 100644
--- a/cpp/include/raft/sparse/op/reduce.cuh
+++ b/cpp/include/raft/sparse/op/reduce.cuh
@@ -46,29 +46,25 @@ namespace sparse {
 namespace op {
 
 template <typename value_idx>
-__global__ void compute_duplicates_diffs_kernel(const value_idx* rows,
-                                                const value_idx* cols,
-                                                value_idx* diff,
-                                                size_t nnz)
-{
+__global__ void compute_duplicates_diffs_kernel(const value_idx *rows,
+                                                const value_idx *cols,
+                                                value_idx *diff, size_t nnz) {
   size_t tid = blockDim.x * blockIdx.x + threadIdx.x;
   if (tid >= nnz) return;
 
   value_idx d = 1;
-  if (tid == 0 || (rows[tid - 1] == rows[tid] && cols[tid - 1] == cols[tid])) d = 0;
+  if (tid == 0 || (rows[tid - 1] == rows[tid] && cols[tid - 1] == cols[tid]))
+    d = 0;
   diff[tid] = d;
 }
 
 template <typename value_idx, typename value_t>
-__global__ void max_duplicates_kernel(const value_idx* src_rows,
-                                      const value_idx* src_cols,
-                                      const value_t* src_vals,
-                                      const value_idx* index,
-                                      value_idx* out_rows,
-                                      value_idx* out_cols,
-                                      value_t* out_vals,
-                                      size_t nnz)
-{
+__global__ void max_duplicates_kernel(const value_idx *src_rows,
+                                      const value_idx *src_cols,
+                                      const value_t *src_vals,
+                                      const value_idx *index,
+                                      value_idx *out_rows, value_idx *out_cols,
+                                      value_t *out_vals, size_t nnz) {
   size_t tid = blockDim.x * blockIdx.x + threadIdx.x;
 
   if (tid < nnz) {
@@ -100,13 +96,13 @@ __global__ void max_duplicates_kernel(const value_idx* src_rows,
  * @param[in] stream cuda ops will be ordered wrt this stream
  */
 template <typename value_idx>
-void compute_duplicates_mask(
-  value_idx* mask, const value_idx* rows, const value_idx* cols, size_t nnz, cudaStream_t stream)
-{
+void compute_duplicates_mask(value_idx *mask, const value_idx *rows,
+                             const value_idx *cols, size_t nnz,
+                             cudaStream_t stream) {
   CUDA_CHECK(cudaMemsetAsync(mask, 0, nnz * sizeof(value_idx), stream));
 
-  compute_duplicates_diffs_kernel<<<raft::ceildiv(nnz, (size_t)256), 256, 0, stream>>>(
-    rows, cols, mask, nnz);
+  compute_duplicates_diffs_kernel<<<raft::ceildiv(nnz, (size_t)256), 256, 0,
+                                    stream>>>(rows, cols, mask, nnz);
 }
 
 /**
@@ -126,17 +122,12 @@ void compute_duplicates_mask(
  * @param[in] stream cuda ops will be ordered wrt this stream
  */
 template <typename value_idx, typename value_t>
-void max_duplicates(const raft::handle_t& handle,
-                    raft::sparse::COO<value_t, value_idx>& out,
-                    const value_idx* rows,
-                    const value_idx* cols,
-                    const value_t* vals,
-                    size_t nnz,
-                    size_t m,
-                    size_t n)
-{
+void max_duplicates(const raft::handle_t &handle,
+                    raft::sparse::COO<value_t, value_idx> &out,
+                    const value_idx *rows, const value_idx *cols,
+                    const value_t *vals, size_t nnz, size_t m, size_t n) {
   auto d_alloc = handle.get_device_allocator();
-  auto stream  = handle.get_stream();
+  auto stream = handle.get_stream();
 
   auto exec_policy = rmm::exec_policy(rmm::cuda_stream_view{stream});
 
@@ -145,8 +136,8 @@ void max_duplicates(const raft::handle_t& handle,
 
   compute_duplicates_mask(diff.data(), rows, cols, nnz, stream);
 
-  thrust::exclusive_scan(
-    thrust::cuda::par.on(stream), diff.data(), diff.data() + diff.size(), diff.data());
+  thrust::exclusive_scan(thrust::cuda::par.on(stream), diff.data(),
+                         diff.data() + diff.size(), diff.data());
 
   // compute final size
   value_idx size = 0;
diff --git a/cpp/include/raft/sparse/op/row_op.cuh b/cpp/include/raft/sparse/op/row_op.cuh
index 194a878ac1..9e5034dc28 100644
--- a/cpp/include/raft/sparse/op/row_op.cuh
+++ b/cpp/include/raft/sparse/op/row_op.cuh
@@ -38,12 +38,12 @@ namespace sparse {
 namespace op {
 
 template <typename T, int TPB_X = 256, typename Lambda = auto(T, T, T)->void>
-__global__ void csr_row_op_kernel(const T* row_ind, T n_rows, T nnz, Lambda op)
-{
+__global__ void csr_row_op_kernel(const T *row_ind, T n_rows, T nnz,
+                                  Lambda op) {
   T row = blockIdx.x * TPB_X + threadIdx.x;
   if (row < n_rows) {
     T start_idx = row_ind[row];
-    T stop_idx  = row < n_rows - 1 ? row_ind[row + 1] : nnz;
+    T stop_idx = row < n_rows - 1 ? row_ind[row + 1] : nnz;
     op(row, start_idx, stop_idx);
   }
 }
@@ -59,12 +59,14 @@ __global__ void csr_row_op_kernel(const T* row_ind, T n_rows, T nnz, Lambda op)
  * @param op custom row operation functor accepting the row and beginning index.
  * @param stream cuda stream to use
  */
-template <typename Index_, int TPB_X = 256, typename Lambda = auto(Index_, Index_, Index_)->void>
-void csr_row_op(const Index_* row_ind, Index_ n_rows, Index_ nnz, Lambda op, cudaStream_t stream)
-{
+template <typename Index_, int TPB_X = 256,
+          typename Lambda = auto(Index_, Index_, Index_)->void>
+void csr_row_op(const Index_ *row_ind, Index_ n_rows, Index_ nnz, Lambda op,
+                cudaStream_t stream) {
   dim3 grid(raft::ceildiv(n_rows, Index_(TPB_X)), 1, 1);
   dim3 blk(TPB_X, 1, 1);
-  csr_row_op_kernel<Index_, TPB_X><<<grid, blk, 0, stream>>>(row_ind, n_rows, nnz, op);
+  csr_row_op_kernel<Index_, TPB_X>
+    <<<grid, blk, 0, stream>>>(row_ind, n_rows, nnz, op);
 
   CUDA_CHECK(cudaPeekAtLastError());
 }
diff --git a/cpp/include/raft/sparse/op/slice.h b/cpp/include/raft/sparse/op/slice.h
index 9bbe04cf34..46f4f41879 100644
--- a/cpp/include/raft/sparse/op/slice.h
+++ b/cpp/include/raft/sparse/op/slice.h
@@ -50,14 +50,10 @@ namespace op {
  * @param[in] stream : cuda stream for ordering events
  */
 template <typename value_idx>
-void csr_row_slice_indptr(value_idx start_row,
-                          value_idx stop_row,
-                          const value_idx* indptr,
-                          value_idx* indptr_out,
-                          value_idx* start_offset,
-                          value_idx* stop_offset,
-                          cudaStream_t stream)
-{
+void csr_row_slice_indptr(value_idx start_row, value_idx stop_row,
+                          const value_idx *indptr, value_idx *indptr_out,
+                          value_idx *start_offset, value_idx *stop_offset,
+                          cudaStream_t stream) {
   raft::update_host(start_offset, indptr + start_row, 1, stream);
   raft::update_host(stop_offset, indptr + stop_row + 1, 1, stream);
 
@@ -67,12 +63,11 @@ void csr_row_slice_indptr(value_idx start_row,
 
   // 0-based indexing so we need to add 1 to stop row. Because we want n_rows+1,
   // we add another 1 to stop row.
-  raft::copy_async(indptr_out, indptr + start_row, (stop_row + 2) - start_row, stream);
+  raft::copy_async(indptr_out, indptr + start_row, (stop_row + 2) - start_row,
+                   stream);
 
   raft::linalg::unaryOp<value_idx>(
-    indptr_out,
-    indptr_out,
-    (stop_row + 2) - start_row,
+    indptr_out, indptr_out, (stop_row + 2) - start_row,
     [s_offset] __device__(value_idx input) { return input - s_offset; },
     stream);
 }
@@ -90,15 +85,12 @@ void csr_row_slice_indptr(value_idx start_row,
  * @param[in] stream : cuda stream for ordering events
  */
 template <typename value_idx, typename value_t>
-void csr_row_slice_populate(value_idx start_offset,
-                            value_idx stop_offset,
-                            const value_idx* indices,
-                            const value_t* data,
-                            value_idx* indices_out,
-                            value_t* data_out,
-                            cudaStream_t stream)
-{
-  raft::copy(indices_out, indices + start_offset, stop_offset - start_offset, stream);
+void csr_row_slice_populate(value_idx start_offset, value_idx stop_offset,
+                            const value_idx *indices, const value_t *data,
+                            value_idx *indices_out, value_t *data_out,
+                            cudaStream_t stream) {
+  raft::copy(indices_out, indices + start_offset, stop_offset - start_offset,
+             stream);
   raft::copy(data_out, data + start_offset, stop_offset - start_offset, stream);
 }
 
diff --git a/cpp/include/raft/sparse/op/sort.h b/cpp/include/raft/sparse/op/sort.h
index 3cab24fc09..9dbe2b67c5 100644
--- a/cpp/include/raft/sparse/op/sort.h
+++ b/cpp/include/raft/sparse/op/sort.h
@@ -42,8 +42,7 @@ namespace op {
 
 struct TupleComp {
   template <typename one, typename two>
-  __host__ __device__ bool operator()(const one& t1, const two& t2)
-  {
+  __host__ __device__ bool operator()(const one &t1, const two &t2) {
     // sort first by each sample's color,
     if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
     if (thrust::get<0>(t1) > thrust::get<0>(t2)) return false;
@@ -67,21 +66,15 @@ struct TupleComp {
  * @param stream: cuda stream to use
  */
 template <typename T>
-void coo_sort(int m,
-              int n,
-              int nnz,
-              int* rows,
-              int* cols,
-              T* vals,
+void coo_sort(int m, int n, int nnz, int *rows, int *cols, T *vals,
               // TODO: Remove this
               std::shared_ptr<raft::mr::device::allocator> d_alloc,
-              cudaStream_t stream)
-{
+              cudaStream_t stream) {
   auto coo_indices = thrust::make_zip_iterator(thrust::make_tuple(rows, cols));
 
   // get all the colors in contiguous locations so we can map them to warps.
-  thrust::sort_by_key(
-    thrust::cuda::par.on(stream), coo_indices, coo_indices + nnz, vals, TupleComp());
+  thrust::sort_by_key(thrust::cuda::par.on(stream), coo_indices,
+                      coo_indices + nnz, vals, TupleComp());
 }
 
 /**
@@ -92,12 +85,12 @@ void coo_sort(int m,
  * @param stream: the cuda stream to use
  */
 template <typename T>
-void coo_sort(COO<T>* const in,
+void coo_sort(COO<T> *const in,
               // TODO: Remove this
               std::shared_ptr<raft::mr::device::allocator> d_alloc,
-              cudaStream_t stream)
-{
-  coo_sort<T>(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), in->vals(), d_alloc, stream);
+              cudaStream_t stream) {
+  coo_sort<T>(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(),
+              in->vals(), d_alloc, stream);
 }
 
 /**
@@ -111,16 +104,16 @@ void coo_sort(COO<T>* const in,
  * @param[in] stream cuda stream for which to order cuda operations
  */
 template <typename value_idx, typename value_t>
-void coo_sort_by_weight(
-  value_idx* rows, value_idx* cols, value_t* data, value_idx nnz, cudaStream_t stream)
-{
+void coo_sort_by_weight(value_idx *rows, value_idx *cols, value_t *data,
+                        value_idx nnz, cudaStream_t stream) {
   thrust::device_ptr<value_idx> t_rows = thrust::device_pointer_cast(rows);
   thrust::device_ptr<value_idx> t_cols = thrust::device_pointer_cast(cols);
-  thrust::device_ptr<value_t> t_data   = thrust::device_pointer_cast(data);
+  thrust::device_ptr<value_t> t_data = thrust::device_pointer_cast(data);
 
   auto first = thrust::make_zip_iterator(thrust::make_tuple(rows, cols));
 
-  thrust::sort_by_key(thrust::cuda::par.on(stream), t_data, t_data + nnz, first);
+  thrust::sort_by_key(thrust::cuda::par.on(stream), t_data, t_data + nnz,
+                      first);
 }
 };  // namespace op
 };  // end NAMESPACE sparse
diff --git a/cpp/include/raft/sparse/selection/connect_components.cuh b/cpp/include/raft/sparse/selection/connect_components.cuh
index ec8bec6eb3..8aae90f1d8 100644
--- a/cpp/include/raft/sparse/selection/connect_components.cuh
+++ b/cpp/include/raft/sparse/selection/connect_components.cuh
@@ -59,20 +59,17 @@ struct KeyValuePair {
   __host__ __device__ __forceinline__ KeyValuePair() {}
 
   /// Copy Constructor
-  __host__ __device__ __forceinline__ KeyValuePair(cub::KeyValuePair<_Key, _Value> kvp)
-    : key(kvp.key), value(kvp.value)
-  {
-  }
+  __host__ __device__ __forceinline__
+  KeyValuePair(cub::KeyValuePair<_Key, _Value> kvp)
+    : key(kvp.key), value(kvp.value) {}
 
   /// Constructor
-  __host__ __device__ __forceinline__ KeyValuePair(Key const& key, Value const& value)
-    : key(key), value(value)
-  {
-  }
+  __host__ __device__ __forceinline__ KeyValuePair(Key const &key,
+                                                   Value const &value)
+    : key(key), value(value) {}
 
   /// Inequality operator
-  __host__ __device__ __forceinline__ bool operator!=(const KeyValuePair& b)
-  {
+  __host__ __device__ __forceinline__ bool operator!=(const KeyValuePair &b) {
     return (value != b.value) || (key != b.key);
   }
 };
@@ -86,32 +83,31 @@ struct KeyValuePair {
  */
 template <typename value_idx, typename value_t>
 struct FixConnectivitiesRedOp {
-  value_idx* colors;
+  value_idx *colors;
   value_idx m;
 
-  FixConnectivitiesRedOp(value_idx* colors_, value_idx m_) : colors(colors_), m(m_){};
+  FixConnectivitiesRedOp(value_idx *colors_, value_idx m_)
+    : colors(colors_), m(m_){};
 
   typedef typename cub::KeyValuePair<value_idx, value_t> KVP;
-  DI void operator()(value_idx rit, KVP* out, const KVP& other)
-  {
-    if (rit < m && other.value < out->value && colors[rit] != colors[other.key]) {
-      out->key   = other.key;
+  DI void operator()(value_idx rit, KVP *out, const KVP &other) {
+    if (rit < m && other.value < out->value &&
+        colors[rit] != colors[other.key]) {
+      out->key = other.key;
       out->value = other.value;
     }
   }
 
-  DI KVP operator()(value_idx rit, const KVP& a, const KVP& b)
-  {
+  DI KVP operator()(value_idx rit, const KVP &a, const KVP &b) {
     if (rit < m && a.value < b.value && colors[rit] != colors[a.key]) {
       return a;
     } else
       return b;
   }
 
-  DI void init(value_t* out, value_t maxVal) { *out = maxVal; }
-  DI void init(KVP* out, value_t maxVal)
-  {
-    out->key   = -1;
+  DI void init(value_t *out, value_t maxVal) { *out = maxVal; }
+  DI void init(KVP *out, value_t maxVal) {
+    out->key = -1;
     out->value = maxVal;
   }
 };
@@ -123,8 +119,7 @@ struct FixConnectivitiesRedOp {
  */
 struct TupleComp {
   template <typename one, typename two>
-  __host__ __device__ bool operator()(const one& t1, const two& t2)
-  {
+  __host__ __device__ bool operator()(const one &t1, const two &t2) {
     // sort first by each sample's color,
     if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
     if (thrust::get<0>(t1) > thrust::get<0>(t2)) return false;
@@ -142,9 +137,13 @@ template <typename LabelT, typename DataT>
 struct CubKVPMinReduce {
   typedef cub::KeyValuePair<LabelT, DataT> KVP;
 
-  DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { return b.value < a.value ? b : a; }
+  DI KVP operator()(LabelT rit, const KVP &a, const KVP &b) {
+    return b.value < a.value ? b : a;
+  }
 
-  DI KVP operator()(const KVP& a, const KVP& b) { return b.value < a.value ? b : a; }
+  DI KVP operator()(const KVP &a, const KVP &b) {
+    return b.value < a.value ? b : a;
+  }
 
 };  // KVPMinReduce
 
@@ -159,14 +158,13 @@ struct CubKVPMinReduce {
  * @return total number of components
  */
 template <typename value_idx>
-value_idx get_n_components(value_idx* colors,
-                           size_t n_rows,
+value_idx get_n_components(value_idx *colors, size_t n_rows,
                            std::shared_ptr<raft::mr::device::allocator> d_alloc,
-                           cudaStream_t stream)
-{
-  value_idx* map_ids;
+                           cudaStream_t stream) {
+  value_idx *map_ids;
   int num_clusters;
-  raft::label::getUniquelabels(colors, n_rows, &map_ids, &num_clusters, stream, d_alloc);
+  raft::label::getUniquelabels(colors, n_rows, &map_ids, &num_clusters, stream,
+                               d_alloc);
   d_alloc->deallocate(map_ids, num_clusters * sizeof(value_idx), stream);
 
   return num_clusters;
@@ -179,12 +177,11 @@ value_idx get_n_components(value_idx* colors,
  */
 template <typename value_idx, typename value_t>
 struct LookupColorOp {
-  value_idx* colors;
+  value_idx *colors;
 
-  LookupColorOp(value_idx* colors_) : colors(colors_) {}
+  LookupColorOp(value_idx *colors_) : colors(colors_) {}
 
-  DI value_idx operator()(const cub::KeyValuePair<value_idx, value_t>& kvp)
-  {
+  DI value_idx operator()(const cub::KeyValuePair<value_idx, value_t> &kvp) {
     return colors[kvp.key];
   }
 };
@@ -194,8 +191,7 @@ struct LookupColorOp {
  * the given array of components
  * @tparam value_idx
  * @tparam value_t
- * @param[out] kvp mapping of closest neighbor vertex and distance for each vertex in the given
- * array of components
+ * @param[out] kvp mapping of closest neighbor vertex and distance for each vertex in the given array of components
  * @param[out] nn_colors components of nearest neighbors for each vertex
  * @param[in] colors components of each vertex
  * @param[in] X original dense data
@@ -205,39 +201,25 @@ struct LookupColorOp {
  * @param[in] stream cuda stream for which to order cuda operations
  */
 template <typename value_idx, typename value_t, typename red_op>
-void perform_1nn(cub::KeyValuePair<value_idx, value_t>* kvp,
-                 value_idx* nn_colors,
-                 value_idx* colors,
-                 const value_t* X,
-                 size_t n_rows,
-                 size_t n_cols,
+void perform_1nn(cub::KeyValuePair<value_idx, value_t> *kvp,
+                 value_idx *nn_colors, value_idx *colors, const value_t *X,
+                 size_t n_rows, size_t n_cols,
                  std::shared_ptr<raft::mr::device::allocator> d_alloc,
-                 cudaStream_t stream,
-                 red_op reduction_op)
-{
+                 cudaStream_t stream, red_op reduction_op) {
   rmm::device_uvector<int> workspace(n_rows, stream);
   rmm::device_uvector<value_t> x_norm(n_rows, stream);
 
-  raft::linalg::rowNorm(x_norm.data(), X, n_cols, n_rows, raft::linalg::L2Norm, true, stream);
-
-  raft::distance::fusedL2NN<value_t, cub::KeyValuePair<value_idx, value_t>, value_idx>(
-    kvp,
-    X,
-    X,
-    x_norm.data(),
-    x_norm.data(),
-    n_rows,
-    n_rows,
-    n_cols,
-    workspace.data(),
-    reduction_op,
-    reduction_op,
-    true,
-    true,
-    stream);
+  raft::linalg::rowNorm(x_norm.data(), X, n_cols, n_rows, raft::linalg::L2Norm,
+                        true, stream);
+
+  raft::distance::fusedL2NN<value_t, cub::KeyValuePair<value_idx, value_t>,
+                            value_idx>(
+    kvp, X, X, x_norm.data(), x_norm.data(), n_rows, n_rows, n_cols,
+    workspace.data(), reduction_op, reduction_op, true, true, stream);
 
   LookupColorOp<value_idx, value_t> extract_colors_op(colors);
-  thrust::transform(thrust::cuda::par.on(stream), kvp, kvp + n_rows, nn_colors, extract_colors_op);
+  thrust::transform(thrust::cuda::par.on(stream), kvp, kvp + n_rows, nn_colors,
+                    extract_colors_op);
 }
 
 /**
@@ -253,33 +235,27 @@ void perform_1nn(cub::KeyValuePair<value_idx, value_t>* kvp,
  * @param stream stream for which to order CUDA operations
  */
 template <typename value_idx, typename value_t>
-void sort_by_color(value_idx* colors,
-                   value_idx* nn_colors,
-                   cub::KeyValuePair<value_idx, value_t>* kvp,
-                   value_idx* src_indices,
-                   size_t n_rows,
-                   cudaStream_t stream)
-{
+void sort_by_color(value_idx *colors, value_idx *nn_colors,
+                   cub::KeyValuePair<value_idx, value_t> *kvp,
+                   value_idx *src_indices, size_t n_rows, cudaStream_t stream) {
   thrust::counting_iterator<value_idx> arg_sort_iter(0);
-  thrust::copy(thrust::cuda::par.on(stream), arg_sort_iter, arg_sort_iter + n_rows, src_indices);
+  thrust::copy(thrust::cuda::par.on(stream), arg_sort_iter,
+               arg_sort_iter + n_rows, src_indices);
 
-  auto keys = thrust::make_zip_iterator(
-    thrust::make_tuple(colors, nn_colors, (raft::linkage::KeyValuePair<value_idx, value_t>*)kvp));
+  auto keys = thrust::make_zip_iterator(thrust::make_tuple(
+    colors, nn_colors, (raft::linkage::KeyValuePair<value_idx, value_t> *)kvp));
   auto vals = thrust::make_zip_iterator(thrust::make_tuple(src_indices));
 
   // get all the colors in contiguous locations so we can map them to warps.
-  thrust::sort_by_key(thrust::cuda::par.on(stream), keys, keys + n_rows, vals, TupleComp());
+  thrust::sort_by_key(thrust::cuda::par.on(stream), keys, keys + n_rows, vals,
+                      TupleComp());
 }
 
 template <typename value_idx, typename value_t>
-__global__ void min_components_by_color_kernel(value_idx* out_rows,
-                                               value_idx* out_cols,
-                                               value_t* out_vals,
-                                               const value_idx* out_index,
-                                               const value_idx* indices,
-                                               const cub::KeyValuePair<value_idx, value_t>* kvp,
-                                               size_t nnz)
-{
+__global__ void min_components_by_color_kernel(
+  value_idx *out_rows, value_idx *out_cols, value_t *out_vals,
+  const value_idx *out_index, const value_idx *indices,
+  const cub::KeyValuePair<value_idx, value_t> *kvp, size_t nnz) {
   size_t tid = blockDim.x * blockIdx.x + threadIdx.x;
 
   if (tid >= nnz) return;
@@ -308,20 +284,19 @@ __global__ void min_components_by_color_kernel(value_idx* out_rows,
  * @param[in] stream cuda stream for which to order cuda operations
  */
 template <typename value_idx, typename value_t>
-void min_components_by_color(raft::sparse::COO<value_t, value_idx>& coo,
-                             const value_idx* out_index,
-                             const value_idx* indices,
-                             const cub::KeyValuePair<value_idx, value_t>* kvp,
-                             size_t nnz,
-                             cudaStream_t stream)
-{
+void min_components_by_color(raft::sparse::COO<value_t, value_idx> &coo,
+                             const value_idx *out_index,
+                             const value_idx *indices,
+                             const cub::KeyValuePair<value_idx, value_t> *kvp,
+                             size_t nnz, cudaStream_t stream) {
   /**
    * Arrays should be ordered by: colors_indptr->colors_n->kvp.value
    * so the last element of each column in the input CSR should be
    * the min.
    */
-  min_components_by_color_kernel<<<raft::ceildiv(nnz, (size_t)256), 256, 0, stream>>>(
-    coo.rows(), coo.cols(), coo.vals(), out_index, indices, kvp, nnz);
+  min_components_by_color_kernel<<<raft::ceildiv(nnz, (size_t)256), 256, 0,
+                                   stream>>>(coo.rows(), coo.cols(), coo.vals(),
+                                             out_index, indices, kvp, nnz);
 }
 
 /**
@@ -343,18 +318,14 @@ void min_components_by_color(raft::sparse::COO<value_t, value_idx>& coo,
  * @param[in] n_cols number of cols in X
  */
 template <typename value_idx, typename value_t, typename red_op>
-void connect_components(
-  const raft::handle_t& handle,
-  raft::sparse::COO<value_t, value_idx>& out,
-  const value_t* X,
-  const value_idx* orig_colors,
-  size_t n_rows,
-  size_t n_cols,
-  red_op reduction_op,
-  raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded)
-{
+void connect_components(const raft::handle_t &handle,
+                        raft::sparse::COO<value_t, value_idx> &out,
+                        const value_t *X, const value_idx *orig_colors,
+                        size_t n_rows, size_t n_cols, red_op reduction_op,
+                        raft::distance::DistanceType metric =
+                          raft::distance::DistanceType::L2SqrtExpanded) {
   auto d_alloc = handle.get_device_allocator();
-  auto stream  = handle.get_stream();
+  auto stream = handle.get_stream();
 
   RAFT_EXPECTS(metric == raft::distance::DistanceType::L2SqrtExpanded,
                "Fixing connectivities for an unconnected k-NN graph only "
@@ -364,52 +335,47 @@ void connect_components(
   raft::copy_async(colors.data(), orig_colors, n_rows, stream);
 
   // Normalize colors so they are drawn from a monotonically increasing set
-  raft::label::make_monotonic(colors.data(), colors.data(), n_rows, stream, d_alloc, true);
+  raft::label::make_monotonic(colors.data(), colors.data(), n_rows, stream,
+                              d_alloc, true);
 
-  value_idx n_components = get_n_components(colors.data(), n_rows, d_alloc, stream);
+  value_idx n_components =
+    get_n_components(colors.data(), n_rows, d_alloc, stream);
 
   /**
    * First compute 1-nn for all colors where the color of each data point
    * is guaranteed to be != color of its nearest neighbor.
    */
   rmm::device_uvector<value_idx> nn_colors(n_rows, stream);
-  rmm::device_uvector<cub::KeyValuePair<value_idx, value_t>> temp_inds_dists(n_rows, stream);
+  rmm::device_uvector<cub::KeyValuePair<value_idx, value_t>> temp_inds_dists(
+    n_rows, stream);
   rmm::device_uvector<value_idx> src_indices(n_rows, stream);
 
-  perform_1nn(temp_inds_dists.data(),
-              nn_colors.data(),
-              colors.data(),
-              X,
-              n_rows,
-              n_cols,
-              d_alloc,
-              stream,
-              reduction_op);
+  perform_1nn(temp_inds_dists.data(), nn_colors.data(), colors.data(), X,
+              n_rows, n_cols, d_alloc, stream, reduction_op);
 
   /**
    * Sort data points by color (neighbors are not sorted)
    */
   // max_color + 1 = number of connected components
   // sort nn_colors by key w/ original colors
-  sort_by_color(
-    colors.data(), nn_colors.data(), temp_inds_dists.data(), src_indices.data(), n_rows, stream);
+  sort_by_color(colors.data(), nn_colors.data(), temp_inds_dists.data(),
+                src_indices.data(), n_rows, stream);
 
   /**
    * Take the min for any duplicate colors
    */
   // Compute mask of duplicates
   rmm::device_uvector<value_idx> out_index(n_rows + 1, stream);
-  raft::sparse::op::compute_duplicates_mask(
-    out_index.data(), colors.data(), nn_colors.data(), n_rows, stream);
+  raft::sparse::op::compute_duplicates_mask(out_index.data(), colors.data(),
+                                            nn_colors.data(), n_rows, stream);
 
-  thrust::exclusive_scan(thrust::cuda::par.on(stream),
-                         out_index.data(),
-                         out_index.data() + out_index.size(),
-                         out_index.data());
+  thrust::exclusive_scan(thrust::cuda::par.on(stream), out_index.data(),
+                         out_index.data() + out_index.size(), out_index.data());
 
   // compute final size
   value_idx size = 0;
-  raft::update_host(&size, out_index.data() + (out_index.size() - 1), 1, stream);
+  raft::update_host(&size, out_index.data() + (out_index.size() - 1), 1,
+                    stream);
   CUDA_CHECK(cudaStreamSynchronize(stream));
 
   size++;
@@ -417,14 +383,14 @@ void connect_components(
   raft::sparse::COO<value_t, value_idx> min_edges(d_alloc, stream);
   min_edges.allocate(size, n_rows, n_rows, true, stream);
 
-  min_components_by_color(
-    min_edges, out_index.data(), src_indices.data(), temp_inds_dists.data(), n_rows, stream);
+  min_components_by_color(min_edges, out_index.data(), src_indices.data(),
+                          temp_inds_dists.data(), n_rows, stream);
 
   /**
    * Symmetrize resulting edge list
    */
-  raft::sparse::linalg::symmetrize(
-    handle, min_edges.rows(), min_edges.cols(), min_edges.vals(), n_rows, n_rows, size, out);
+  raft::sparse::linalg::symmetrize(handle, min_edges.rows(), min_edges.cols(),
+                                   min_edges.vals(), n_rows, n_rows, size, out);
 }
 
 };  // end namespace linkage
diff --git a/cpp/include/raft/sparse/selection/knn.cuh b/cpp/include/raft/sparse/selection/knn.cuh
index dbb24ee334..71fbb8ab3d 100644
--- a/cpp/include/raft/sparse/selection/knn.cuh
+++ b/cpp/include/raft/sparse/selection/knn.cuh
@@ -49,11 +49,9 @@ namespace selection {
 
 template <typename value_idx, typename value_t>
 struct csr_batcher_t {
-  csr_batcher_t(value_idx batch_size,
-                value_idx n_rows,
-                const value_idx* csr_indptr,
-                const value_idx* csr_indices,
-                const value_t* csr_data)
+  csr_batcher_t(value_idx batch_size, value_idx n_rows,
+                const value_idx *csr_indptr, const value_idx *csr_indices,
+                const value_t *csr_data)
     : batch_start_(0),
       batch_stop_(0),
       batch_rows_(0),
@@ -63,42 +61,32 @@ struct csr_batcher_t {
       csr_indices_(csr_indices),
       csr_data_(csr_data),
       batch_csr_start_offset_(0),
-      batch_csr_stop_offset_(0)
-  {
-  }
+      batch_csr_stop_offset_(0) {}
 
-  void set_batch(int batch_num)
-  {
+  void set_batch(int batch_num) {
     batch_start_ = batch_num * batch_size_;
-    batch_stop_  = batch_start_ + batch_size_ - 1;  // zero-based indexing
+    batch_stop_ = batch_start_ + batch_size_ - 1;  // zero-based indexing
 
-    if (batch_stop_ >= total_rows_) batch_stop_ = total_rows_ - 1;  // zero-based indexing
+    if (batch_stop_ >= total_rows_)
+      batch_stop_ = total_rows_ - 1;  // zero-based indexing
 
     batch_rows_ = (batch_stop_ - batch_start_) + 1;
   }
 
-  value_idx get_batch_csr_indptr_nnz(value_idx* batch_indptr, cudaStream_t stream)
-  {
-    raft::sparse::op::csr_row_slice_indptr(batch_start_,
-                                           batch_stop_,
-                                           csr_indptr_,
-                                           batch_indptr,
-                                           &batch_csr_start_offset_,
-                                           &batch_csr_stop_offset_,
-                                           stream);
+  value_idx get_batch_csr_indptr_nnz(value_idx *batch_indptr,
+                                     cudaStream_t stream) {
+    raft::sparse::op::csr_row_slice_indptr(
+      batch_start_, batch_stop_, csr_indptr_, batch_indptr,
+      &batch_csr_start_offset_, &batch_csr_stop_offset_, stream);
 
     return batch_csr_stop_offset_ - batch_csr_start_offset_;
   }
 
-  void get_batch_csr_indices_data(value_idx* csr_indices, value_t* csr_data, cudaStream_t stream)
-  {
-    raft::sparse::op::csr_row_slice_populate(batch_csr_start_offset_,
-                                             batch_csr_stop_offset_,
-                                             csr_indices_,
-                                             csr_data_,
-                                             csr_indices,
-                                             csr_data,
-                                             stream);
+  void get_batch_csr_indices_data(value_idx *csr_indices, value_t *csr_data,
+                                  cudaStream_t stream) {
+    raft::sparse::op::csr_row_slice_populate(
+      batch_csr_start_offset_, batch_csr_stop_offset_, csr_indices_, csr_data_,
+      csr_indices, csr_data, stream);
   }
 
   value_idx batch_rows() const { return batch_rows_; }
@@ -115,9 +103,9 @@ struct csr_batcher_t {
 
   value_idx total_rows_;
 
-  const value_idx* csr_indptr_;
-  const value_idx* csr_indices_;
-  const value_t* csr_data_;
+  const value_idx *csr_indptr_;
+  const value_idx *csr_indices_;
+  const value_t *csr_data_;
 
   value_idx batch_csr_start_offset_;
   value_idx batch_csr_stop_offset_;
@@ -126,26 +114,18 @@ struct csr_batcher_t {
 template <typename value_idx, typename value_t>
 class sparse_knn_t {
  public:
-  sparse_knn_t(const value_idx* idxIndptr_,
-               const value_idx* idxIndices_,
-               const value_t* idxData_,
-               size_t idxNNZ_,
-               int n_idx_rows_,
-               int n_idx_cols_,
-               const value_idx* queryIndptr_,
-               const value_idx* queryIndices_,
-               const value_t* queryData_,
-               size_t queryNNZ_,
-               int n_query_rows_,
-               int n_query_cols_,
-               value_idx* output_indices_,
-               value_t* output_dists_,
-               int k_,
-               const raft::handle_t& handle_,
-               size_t batch_size_index_             = 2 << 14,  // approx 1M
-               size_t batch_size_query_             = 2 << 14,
-               raft::distance::DistanceType metric_ = raft::distance::DistanceType::L2Expanded,
-               float metricArg_                     = 0)
+  sparse_knn_t(const value_idx *idxIndptr_, const value_idx *idxIndices_,
+               const value_t *idxData_, size_t idxNNZ_, int n_idx_rows_,
+               int n_idx_cols_, const value_idx *queryIndptr_,
+               const value_idx *queryIndices_, const value_t *queryData_,
+               size_t queryNNZ_, int n_query_rows_, int n_query_cols_,
+               value_idx *output_indices_, value_t *output_dists_, int k_,
+               const raft::handle_t &handle_,
+               size_t batch_size_index_ = 2 << 14,  // approx 1M
+               size_t batch_size_query_ = 2 << 14,
+               raft::distance::DistanceType metric_ =
+                 raft::distance::DistanceType::L2Expanded,
+               float metricArg_ = 0)
     : idxIndptr(idxIndptr_),
       idxIndices(idxIndices_),
       idxData(idxData_),
@@ -165,12 +145,9 @@ class sparse_knn_t {
       batch_size_index(batch_size_index_),
       batch_size_query(batch_size_query_),
       metric(metric_),
-      metricArg(metricArg_)
-  {
-  }
+      metricArg(metricArg_) {}
 
-  void run()
-  {
+  void run() {
     using namespace raft::sparse;
 
     int n_batches_query = raft::ceildiv((size_t)n_query_rows, batch_size_query);
@@ -181,33 +158,37 @@ class sparse_knn_t {
 
     for (int i = 0; i < n_batches_query; i++) {
       /**
-       * Compute index batch info
-       */
+        * Compute index batch info
+        */
       query_batcher.set_batch(i);
 
       /**
-       * Slice CSR to rows in batch
-       */
+        * Slice CSR to rows in batch
+        */
 
-      rmm::device_uvector<value_idx> query_batch_indptr(query_batcher.batch_rows() + 1,
-                                                        handle.get_stream());
+      rmm::device_uvector<value_idx> query_batch_indptr(
+        query_batcher.batch_rows() + 1, handle.get_stream());
 
-      value_idx n_query_batch_nnz =
-        query_batcher.get_batch_csr_indptr_nnz(query_batch_indptr.data(), handle.get_stream());
+      value_idx n_query_batch_nnz = query_batcher.get_batch_csr_indptr_nnz(
+        query_batch_indptr.data(), handle.get_stream());
 
-      rmm::device_uvector<value_idx> query_batch_indices(n_query_batch_nnz, handle.get_stream());
-      rmm::device_uvector<value_t> query_batch_data(n_query_batch_nnz, handle.get_stream());
+      rmm::device_uvector<value_idx> query_batch_indices(n_query_batch_nnz,
+                                                         handle.get_stream());
+      rmm::device_uvector<value_t> query_batch_data(n_query_batch_nnz,
+                                                    handle.get_stream());
 
-      query_batcher.get_batch_csr_indices_data(
-        query_batch_indices.data(), query_batch_data.data(), handle.get_stream());
+      query_batcher.get_batch_csr_indices_data(query_batch_indices.data(),
+                                               query_batch_data.data(),
+                                               handle.get_stream());
 
       // A 3-partition temporary merge space to scale the batching. 2 parts for subsequent
       // batches and 1 space for the results of the merge, which get copied back to the top
-      rmm::device_uvector<value_idx> merge_buffer_indices(0, handle.get_stream());
+      rmm::device_uvector<value_idx> merge_buffer_indices(0,
+                                                          handle.get_stream());
       rmm::device_uvector<value_t> merge_buffer_dists(0, handle.get_stream());
 
-      value_t* dists_merge_buffer_ptr;
-      value_idx* indices_merge_buffer_ptr;
+      value_t *dists_merge_buffer_ptr;
+      value_idx *indices_merge_buffer_ptr;
 
       int n_batches_idx = raft::ceildiv((size_t)n_idx_rows, batch_size_index);
       csr_batcher_t<value_idx, value_t> idx_batcher(
@@ -216,19 +197,22 @@ class sparse_knn_t {
       for (int j = 0; j < n_batches_idx; j++) {
         idx_batcher.set_batch(j);
 
-        merge_buffer_indices.resize(query_batcher.batch_rows() * k * 3, handle.get_stream());
-        merge_buffer_dists.resize(query_batcher.batch_rows() * k * 3, handle.get_stream());
+        merge_buffer_indices.resize(query_batcher.batch_rows() * k * 3,
+                                    handle.get_stream());
+        merge_buffer_dists.resize(query_batcher.batch_rows() * k * 3,
+                                  handle.get_stream());
 
         /**
-         * Slice CSR to rows in batch
-         */
-        rmm::device_uvector<value_idx> idx_batch_indptr(idx_batcher.batch_rows() + 1,
-                                                        handle.get_stream());
-        rmm::device_uvector<value_idx> idx_batch_indices(0, handle.get_stream());
+          * Slice CSR to rows in batch
+        */
+        rmm::device_uvector<value_idx> idx_batch_indptr(
+          idx_batcher.batch_rows() + 1, handle.get_stream());
+        rmm::device_uvector<value_idx> idx_batch_indices(0,
+                                                         handle.get_stream());
         rmm::device_uvector<value_t> idx_batch_data(0, handle.get_stream());
 
-        value_idx idx_batch_nnz =
-          idx_batcher.get_batch_csr_indptr_nnz(idx_batch_indptr.data(), handle.get_stream());
+        value_idx idx_batch_nnz = idx_batcher.get_batch_csr_indptr_nnz(
+          idx_batch_indptr.data(), handle.get_stream());
 
         idx_batch_indices.resize(idx_batch_nnz, handle.get_stream());
         idx_batch_data.resize(idx_batch_nnz, handle.get_stream());
@@ -237,126 +221,111 @@ class sparse_knn_t {
           idx_batch_indices.data(), idx_batch_data.data(), handle.get_stream());
 
         /**
-         * Compute distances
-         */
-        size_t dense_size = idx_batcher.batch_rows() * query_batcher.batch_rows();
-        rmm::device_uvector<value_t> batch_dists(dense_size, handle.get_stream());
-
-        CUDA_CHECK(cudaMemset(batch_dists.data(), 0, batch_dists.size() * sizeof(value_t)));
-
-        compute_distances(idx_batcher,
-                          query_batcher,
-                          idx_batch_nnz,
-                          n_query_batch_nnz,
-                          idx_batch_indptr.data(),
-                          idx_batch_indices.data(),
-                          idx_batch_data.data(),
-                          query_batch_indptr.data(),
-                          query_batch_indices.data(),
-                          query_batch_data.data(),
-                          batch_dists.data());
+           * Compute distances
+           */
+        size_t dense_size =
+          idx_batcher.batch_rows() * query_batcher.batch_rows();
+        rmm::device_uvector<value_t> batch_dists(dense_size,
+                                                 handle.get_stream());
+
+        CUDA_CHECK(cudaMemset(batch_dists.data(), 0,
+                              batch_dists.size() * sizeof(value_t)));
+
+        compute_distances(idx_batcher, query_batcher, idx_batch_nnz,
+                          n_query_batch_nnz, idx_batch_indptr.data(),
+                          idx_batch_indices.data(), idx_batch_data.data(),
+                          query_batch_indptr.data(), query_batch_indices.data(),
+                          query_batch_data.data(), batch_dists.data());
 
         // Build batch indices array
-        rmm::device_uvector<value_idx> batch_indices(batch_dists.size(), handle.get_stream());
+        rmm::device_uvector<value_idx> batch_indices(batch_dists.size(),
+                                                     handle.get_stream());
 
         // populate batch indices array
-        value_idx batch_rows = query_batcher.batch_rows(), batch_cols = idx_batcher.batch_rows();
+        value_idx batch_rows = query_batcher.batch_rows(),
+                  batch_cols = idx_batcher.batch_rows();
 
-        iota_fill(batch_indices.data(), batch_rows, batch_cols, handle.get_stream());
+        iota_fill(batch_indices.data(), batch_rows, batch_cols,
+                  handle.get_stream());
 
         /**
          * Perform k-selection on batch & merge with other k-selections
          */
         size_t merge_buffer_offset = batch_rows * k;
-        dists_merge_buffer_ptr     = merge_buffer_dists.data() + merge_buffer_offset;
-        indices_merge_buffer_ptr   = merge_buffer_indices.data() + merge_buffer_offset;
-
-        perform_k_selection(idx_batcher,
-                            query_batcher,
-                            batch_dists.data(),
-                            batch_indices.data(),
-                            dists_merge_buffer_ptr,
+        dists_merge_buffer_ptr =
+          merge_buffer_dists.data() + merge_buffer_offset;
+        indices_merge_buffer_ptr =
+          merge_buffer_indices.data() + merge_buffer_offset;
+
+        perform_k_selection(idx_batcher, query_batcher, batch_dists.data(),
+                            batch_indices.data(), dists_merge_buffer_ptr,
                             indices_merge_buffer_ptr);
 
-        value_t* dists_merge_buffer_tmp_ptr     = dists_merge_buffer_ptr;
-        value_idx* indices_merge_buffer_tmp_ptr = indices_merge_buffer_ptr;
+        value_t *dists_merge_buffer_tmp_ptr = dists_merge_buffer_ptr;
+        value_idx *indices_merge_buffer_tmp_ptr = indices_merge_buffer_ptr;
 
         // Merge results of difference batches if necessary
         if (idx_batcher.batch_start() > 0) {
-          size_t merge_buffer_tmp_out  = batch_rows * k * 2;
-          dists_merge_buffer_tmp_ptr   = merge_buffer_dists.data() + merge_buffer_tmp_out;
-          indices_merge_buffer_tmp_ptr = merge_buffer_indices.data() + merge_buffer_tmp_out;
-
-          merge_batches(idx_batcher,
-                        query_batcher,
-                        merge_buffer_dists.data(),
-                        merge_buffer_indices.data(),
-                        dists_merge_buffer_tmp_ptr,
+          size_t merge_buffer_tmp_out = batch_rows * k * 2;
+          dists_merge_buffer_tmp_ptr =
+            merge_buffer_dists.data() + merge_buffer_tmp_out;
+          indices_merge_buffer_tmp_ptr =
+            merge_buffer_indices.data() + merge_buffer_tmp_out;
+
+          merge_batches(idx_batcher, query_batcher, merge_buffer_dists.data(),
+                        merge_buffer_indices.data(), dists_merge_buffer_tmp_ptr,
                         indices_merge_buffer_tmp_ptr);
         }
 
         // copy merged output back into merge buffer partition for next iteration
         raft::copy_async<value_idx>(merge_buffer_indices.data(),
                                     indices_merge_buffer_tmp_ptr,
-                                    batch_rows * k,
-                                    handle.get_stream());
+                                    batch_rows * k, handle.get_stream());
         raft::copy_async<value_t>(merge_buffer_dists.data(),
-                                  dists_merge_buffer_tmp_ptr,
-                                  batch_rows * k,
+                                  dists_merge_buffer_tmp_ptr, batch_rows * k,
                                   handle.get_stream());
       }
 
       // Copy final merged batch to output array
-      raft::copy_async<value_idx>(output_indices + (rows_processed * k),
-                                  merge_buffer_indices.data(),
-                                  query_batcher.batch_rows() * k,
-                                  handle.get_stream());
-      raft::copy_async<value_t>(output_dists + (rows_processed * k),
-                                merge_buffer_dists.data(),
-                                query_batcher.batch_rows() * k,
-                                handle.get_stream());
+      raft::copy_async<value_idx>(
+        output_indices + (rows_processed * k), merge_buffer_indices.data(),
+        query_batcher.batch_rows() * k, handle.get_stream());
+      raft::copy_async<value_t>(
+        output_dists + (rows_processed * k), merge_buffer_dists.data(),
+        query_batcher.batch_rows() * k, handle.get_stream());
 
       rows_processed += query_batcher.batch_rows();
     }
   }
 
  private:
-  void merge_batches(csr_batcher_t<value_idx, value_t>& idx_batcher,
-                     csr_batcher_t<value_idx, value_t>& query_batcher,
-                     value_t* merge_buffer_dists,
-                     value_idx* merge_buffer_indices,
-                     value_t* out_dists,
-                     value_idx* out_indices)
-  {
+  void merge_batches(csr_batcher_t<value_idx, value_t> &idx_batcher,
+                     csr_batcher_t<value_idx, value_t> &query_batcher,
+                     value_t *merge_buffer_dists,
+                     value_idx *merge_buffer_indices, value_t *out_dists,
+                     value_idx *out_indices) {
     // build translation buffer to shift resulting indices by the batch
     std::vector<value_idx> id_ranges;
     id_ranges.push_back(0);
     id_ranges.push_back(idx_batcher.batch_start());
 
     rmm::device_uvector<value_idx> trans(id_ranges.size(), handle.get_stream());
-    raft::update_device(trans.data(), id_ranges.data(), id_ranges.size(), handle.get_stream());
+    raft::update_device(trans.data(), id_ranges.data(), id_ranges.size(),
+                        handle.get_stream());
 
     // combine merge buffers only if there's more than 1 partition to combine
-    raft::spatial::knn::knn_merge_parts(merge_buffer_dists,
-                                        merge_buffer_indices,
-                                        out_dists,
-                                        out_indices,
-                                        query_batcher.batch_rows(),
-                                        2,
-                                        k,
-                                        handle.get_stream(),
-                                        trans.data());
+    raft::spatial::knn::knn_merge_parts(
+      merge_buffer_dists, merge_buffer_indices, out_dists, out_indices,
+      query_batcher.batch_rows(), 2, k, handle.get_stream(), trans.data());
   }
 
   void perform_k_selection(csr_batcher_t<value_idx, value_t> idx_batcher,
                            csr_batcher_t<value_idx, value_t> query_batcher,
-                           value_t* batch_dists,
-                           value_idx* batch_indices,
-                           value_t* out_dists,
-                           value_idx* out_indices)
-  {
+                           value_t *batch_dists, value_idx *batch_indices,
+                           value_t *out_dists, value_idx *out_indices) {
     // populate batch indices array
-    value_idx batch_rows = query_batcher.batch_rows(), batch_cols = idx_batcher.batch_rows();
+    value_idx batch_rows = query_batcher.batch_rows(),
+              batch_cols = idx_batcher.batch_rows();
 
     // build translation buffer to shift resulting indices by the batch
     std::vector<value_idx> id_ranges;
@@ -371,60 +340,51 @@ class sparse_knn_t {
     if (metric == raft::distance::DistanceType::InnerProduct) ascending = false;
 
     // kernel to slice first (min) k cols and copy into batched merge buffer
-    select_k(batch_dists,
-             batch_indices,
-             batch_rows,
-             batch_cols,
-             out_dists,
-             out_indices,
-             ascending,
-             n_neighbors,
-             handle.get_stream());
+    select_k(batch_dists, batch_indices, batch_rows, batch_cols, out_dists,
+             out_indices, ascending, n_neighbors, handle.get_stream());
   }
 
-  void compute_distances(csr_batcher_t<value_idx, value_t>& idx_batcher,
-                         csr_batcher_t<value_idx, value_t>& query_batcher,
-                         size_t idx_batch_nnz,
-                         size_t query_batch_nnz,
-                         value_idx* idx_batch_indptr,
-                         value_idx* idx_batch_indices,
-                         value_t* idx_batch_data,
-                         value_idx* query_batch_indptr,
-                         value_idx* query_batch_indices,
-                         value_t* query_batch_data,
-                         value_t* batch_dists)
-  {
+  void compute_distances(csr_batcher_t<value_idx, value_t> &idx_batcher,
+                         csr_batcher_t<value_idx, value_t> &query_batcher,
+                         size_t idx_batch_nnz, size_t query_batch_nnz,
+                         value_idx *idx_batch_indptr,
+                         value_idx *idx_batch_indices, value_t *idx_batch_data,
+                         value_idx *query_batch_indptr,
+                         value_idx *query_batch_indices,
+                         value_t *query_batch_data, value_t *batch_dists) {
     /**
      * Compute distances
      */
-    raft::sparse::distance::distances_config_t<value_idx, value_t> dist_config(handle);
+    raft::sparse::distance::distances_config_t<value_idx, value_t> dist_config(
+      handle);
     dist_config.b_nrows = idx_batcher.batch_rows();
     dist_config.b_ncols = n_idx_cols;
-    dist_config.b_nnz   = idx_batch_nnz;
+    dist_config.b_nnz = idx_batch_nnz;
 
-    dist_config.b_indptr  = idx_batch_indptr;
+    dist_config.b_indptr = idx_batch_indptr;
     dist_config.b_indices = idx_batch_indices;
-    dist_config.b_data    = idx_batch_data;
+    dist_config.b_data = idx_batch_data;
 
     dist_config.a_nrows = query_batcher.batch_rows();
     dist_config.a_ncols = n_query_cols;
-    dist_config.a_nnz   = query_batch_nnz;
+    dist_config.a_nnz = query_batch_nnz;
 
-    dist_config.a_indptr  = query_batch_indptr;
+    dist_config.a_indptr = query_batch_indptr;
     dist_config.a_indices = query_batch_indices;
-    dist_config.a_data    = query_batch_data;
+    dist_config.a_data = query_batch_data;
 
     if (raft::sparse::distance::supportedDistance.find(metric) ==
         raft::sparse::distance::supportedDistance.end())
       THROW("DistanceType not supported: %d", metric);
 
-    raft::sparse::distance::pairwiseDistance(batch_dists, dist_config, metric, metricArg);
+    raft::sparse::distance::pairwiseDistance(batch_dists, dist_config, metric,
+                                             metricArg);
   }
 
   const value_idx *idxIndptr, *idxIndices, *queryIndptr, *queryIndices;
-  value_idx* output_indices;
+  value_idx *output_indices;
   const value_t *idxData, *queryData;
-  value_t* output_dists;
+  value_t *output_dists;
 
   size_t idxNNZ, queryNNZ, batch_size_index, batch_size_query;
 
@@ -434,76 +394,52 @@ class sparse_knn_t {
 
   int n_idx_rows, n_idx_cols, n_query_rows, n_query_cols, k;
 
-  const raft::handle_t& handle;
+  const raft::handle_t &handle;
 };
 
 /**
- * Search the sparse kNN for the k-nearest neighbors of a set of sparse query vectors
- * using some distance implementation
- * @param[in] idxIndptr csr indptr of the index matrix (size n_idx_rows + 1)
- * @param[in] idxIndices csr column indices array of the index matrix (size n_idx_nnz)
- * @param[in] idxData csr data array of the index matrix (size idxNNZ)
- * @param[in] idxNNA number of non-zeros for sparse index matrix
- * @param[in] n_idx_rows number of data samples in index matrix
- * @param[in] queryIndptr csr indptr of the query matrix (size n_query_rows + 1)
- * @param[in] queryIndices csr indices array of the query matrix (size queryNNZ)
- * @param[in] queryData csr data array of the query matrix (size queryNNZ)
- * @param[in] queryNNZ number of non-zeros for sparse query matrix
- * @param[in] n_query_rows number of data samples in query matrix
- * @param[in] n_query_cols number of features in query matrix
- * @param[out] output_indices dense matrix for output indices (size n_query_rows * k)
- * @param[out] output_dists dense matrix for output distances (size n_query_rows * k)
- * @param[in] k the number of neighbors to query
- * @param[in] cusparseHandle the initialized cusparseHandle instance to use
- * @param[in] allocator device allocator instance to use
- * @param[in] handle.get_stream() CUDA handle.get_stream() to order operations with respect to
- * @param[in] batch_size_index maximum number of rows to use from index matrix per batch
- * @param[in] batch_size_query maximum number of rows to use from query matrix per batch
- * @param[in] metric distance metric/measure to use
- * @param[in] metricArg potential argument for metric (currently unused)
- */
+   * Search the sparse kNN for the k-nearest neighbors of a set of sparse query vectors
+   * using some distance implementation
+   * @param[in] idxIndptr csr indptr of the index matrix (size n_idx_rows + 1)
+   * @param[in] idxIndices csr column indices array of the index matrix (size n_idx_nnz)
+   * @param[in] idxData csr data array of the index matrix (size idxNNZ)
+   * @param[in] idxNNA number of non-zeros for sparse index matrix
+   * @param[in] n_idx_rows number of data samples in index matrix
+   * @param[in] queryIndptr csr indptr of the query matrix (size n_query_rows + 1)
+   * @param[in] queryIndices csr indices array of the query matrix (size queryNNZ)
+   * @param[in] queryData csr data array of the query matrix (size queryNNZ)
+   * @param[in] queryNNZ number of non-zeros for sparse query matrix
+   * @param[in] n_query_rows number of data samples in query matrix
+   * @param[in] n_query_cols number of features in query matrix
+   * @param[out] output_indices dense matrix for output indices (size n_query_rows * k)
+   * @param[out] output_dists dense matrix for output distances (size n_query_rows * k)
+   * @param[in] k the number of neighbors to query
+   * @param[in] cusparseHandle the initialized cusparseHandle instance to use
+   * @param[in] allocator device allocator instance to use
+   * @param[in] handle.get_stream() CUDA handle.get_stream() to order operations with respect to
+   * @param[in] batch_size_index maximum number of rows to use from index matrix per batch
+   * @param[in] batch_size_query maximum number of rows to use from query matrix per batch
+   * @param[in] metric distance metric/measure to use
+   * @param[in] metricArg potential argument for metric (currently unused)
+   */
 template <typename value_idx = int, typename value_t = float, int TPB_X = 32>
-void brute_force_knn(const value_idx* idxIndptr,
-                     const value_idx* idxIndices,
-                     const value_t* idxData,
-                     size_t idxNNZ,
-                     int n_idx_rows,
-                     int n_idx_cols,
-                     const value_idx* queryIndptr,
-                     const value_idx* queryIndices,
-                     const value_t* queryData,
-                     size_t queryNNZ,
-                     int n_query_rows,
-                     int n_query_cols,
-                     value_idx* output_indices,
-                     value_t* output_dists,
-                     int k,
-                     const raft::handle_t& handle,
-                     size_t batch_size_index             = 2 << 14,  // approx 1M
-                     size_t batch_size_query             = 2 << 14,
-                     raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded,
-                     float metricArg                     = 0)
-{
-  sparse_knn_t<value_idx, value_t>(idxIndptr,
-                                   idxIndices,
-                                   idxData,
-                                   idxNNZ,
-                                   n_idx_rows,
-                                   n_idx_cols,
-                                   queryIndptr,
-                                   queryIndices,
-                                   queryData,
-                                   queryNNZ,
-                                   n_query_rows,
-                                   n_query_cols,
-                                   output_indices,
-                                   output_dists,
-                                   k,
-                                   handle,
-                                   batch_size_index,
-                                   batch_size_query,
-                                   metric,
-                                   metricArg)
+void brute_force_knn(const value_idx *idxIndptr, const value_idx *idxIndices,
+                     const value_t *idxData, size_t idxNNZ, int n_idx_rows,
+                     int n_idx_cols, const value_idx *queryIndptr,
+                     const value_idx *queryIndices, const value_t *queryData,
+                     size_t queryNNZ, int n_query_rows, int n_query_cols,
+                     value_idx *output_indices, value_t *output_dists, int k,
+                     const raft::handle_t &handle,
+                     size_t batch_size_index = 2 << 14,  // approx 1M
+                     size_t batch_size_query = 2 << 14,
+                     raft::distance::DistanceType metric =
+                       raft::distance::DistanceType::L2Expanded,
+                     float metricArg = 0) {
+  sparse_knn_t<value_idx, value_t>(
+    idxIndptr, idxIndices, idxData, idxNNZ, n_idx_rows, n_idx_cols, queryIndptr,
+    queryIndices, queryData, queryNNZ, n_query_rows, n_query_cols,
+    output_indices, output_dists, k, handle, batch_size_index, batch_size_query,
+    metric, metricArg)
     .run();
 }
 
diff --git a/cpp/include/raft/sparse/selection/knn_graph.cuh b/cpp/include/raft/sparse/selection/knn_graph.cuh
index 1308f5ce02..1cf225087a 100644
--- a/cpp/include/raft/sparse/selection/knn_graph.cuh
+++ b/cpp/include/raft/sparse/selection/knn_graph.cuh
@@ -45,34 +45,31 @@ namespace selection {
  * @param m
  */
 template <typename value_idx>
-__global__ void fill_indices(value_idx* indices, size_t m, size_t nnz)
-{
+__global__ void fill_indices(value_idx *indices, size_t m, size_t nnz) {
   value_idx tid = (blockIdx.x * blockDim.x) + threadIdx.x;
   if (tid >= nnz) return;
-  value_idx v  = tid / m;
+  value_idx v = tid / m;
   indices[tid] = v;
 }
 
 template <typename value_idx>
-value_idx build_k(value_idx n_samples, int c)
-{
+value_idx build_k(value_idx n_samples, int c) {
   // from "kNN-MST-Agglomerative: A fast & scalable graph-based data clustering
   // approach on GPU"
-  return min(n_samples, max((value_idx)2, (value_idx)floor(log2(n_samples)) + c));
+  return min(n_samples,
+             max((value_idx)2, (value_idx)floor(log2(n_samples)) + c));
 }
 
 template <typename in_t, typename out_t>
-__global__ void conv_indices_kernel(in_t* inds, out_t* out, size_t nnz)
-{
+__global__ void conv_indices_kernel(in_t *inds, out_t *out, size_t nnz) {
   size_t tid = blockDim.x * blockIdx.x + threadIdx.x;
   if (tid >= nnz) return;
-  out_t v  = inds[tid];
+  out_t v = inds[tid];
   out[tid] = v;
 }
 
 template <typename in_t, typename out_t, int tpb = 256>
-void conv_indices(in_t* inds, out_t* out, size_t size, cudaStream_t stream)
-{
+void conv_indices(in_t *inds, out_t *out, size_t size, cudaStream_t stream) {
   size_t blocks = ceildiv(size, (size_t)tpb);
   conv_indices_kernel<<<blocks, tpb, 0, stream>>>(inds, out, size);
 }
@@ -94,18 +91,13 @@ void conv_indices(in_t* inds, out_t* out, size_t size, cudaStream_t stream)
  * @param c
  */
 template <typename value_idx = int, typename value_t = float>
-void knn_graph(const handle_t& handle,
-               const value_t* X,
-               size_t m,
-               size_t n,
+void knn_graph(const handle_t &handle, const value_t *X, size_t m, size_t n,
                distance::DistanceType metric,
-               raft::sparse::COO<value_t, value_idx>& out,
-               int c = 15)
-{
+               raft::sparse::COO<value_t, value_idx> &out, int c = 15) {
   int k = build_k(m, c);
 
   auto d_alloc = handle.get_device_allocator();
-  auto stream  = handle.get_stream();
+  auto stream = handle.get_stream();
 
   size_t nnz = m * k;
 
@@ -116,8 +108,8 @@ void knn_graph(const handle_t& handle,
   size_t blocks = ceildiv(nnz, (size_t)256);
   fill_indices<value_idx><<<blocks, 256, 0, stream>>>(rows.data(), k, nnz);
 
-  std::vector<value_t*> inputs;
-  inputs.push_back(const_cast<value_t*>(X));
+  std::vector<value_t *> inputs;
+  inputs.push_back(const_cast<value_t *>(X));
 
   std::vector<int> sizes;
   sizes.push_back(m);
@@ -127,25 +119,15 @@ void knn_graph(const handle_t& handle,
   rmm::device_uvector<int64_t> int64_indices(nnz, stream);
 
   uint32_t knn_start = curTimeMillis();
-  raft::spatial::knn::brute_force_knn(handle,
-                                      inputs,
-                                      sizes,
-                                      n,
-                                      const_cast<value_t*>(X),
-                                      m,
-                                      int64_indices.data(),
-                                      data.data(),
-                                      k,
-                                      true,
-                                      true,
-                                      nullptr,
-                                      metric);
+  raft::spatial::knn::brute_force_knn(
+    handle, inputs, sizes, n, const_cast<value_t *>(X), m, int64_indices.data(),
+    data.data(), k, true, true, nullptr, metric);
 
   // convert from current knn's 64-bit to 32-bit.
   conv_indices(int64_indices.data(), indices.data(), nnz, stream);
 
-  raft::sparse::linalg::symmetrize(
-    handle, rows.data(), indices.data(), data.data(), m, k, nnz, out);
+  raft::sparse::linalg::symmetrize(handle, rows.data(), indices.data(),
+                                   data.data(), m, k, nnz, out);
 }
 
 };  // namespace selection
diff --git a/cpp/include/raft/sparse/selection/selection.cuh b/cpp/include/raft/sparse/selection/selection.cuh
index 190e06b2cd..6066a36289 100644
--- a/cpp/include/raft/sparse/selection/selection.cuh
+++ b/cpp/include/raft/sparse/selection/selection.cuh
@@ -39,33 +39,27 @@ namespace raft {
 namespace sparse {
 namespace selection {
 
-template <typename K, typename IndexType, bool select_min, int warp_q, int thread_q, int tpb>
-__global__ void select_k_kernel(K* inK,
-                                IndexType* inV,
-                                size_t n_rows,
-                                size_t n_cols,
-                                K* outK,
-                                IndexType* outV,
-                                K initK,
-                                IndexType initV,
-                                int k)
-{
+template <typename K, typename IndexType, bool select_min, int warp_q,
+          int thread_q, int tpb>
+__global__ void select_k_kernel(K *inK, IndexType *inV, size_t n_rows,
+                                size_t n_cols, K *outK, IndexType *outV,
+                                K initK, IndexType initV, int k) {
   constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize;
 
   __shared__ K smemK[kNumWarps * warp_q];
   __shared__ IndexType smemV[kNumWarps * warp_q];
 
-  faiss::gpu::
-    BlockSelect<K, IndexType, select_min, faiss::gpu::Comparator<K>, warp_q, thread_q, tpb>
-      heap(initK, initV, smemK, smemV, k);
+  faiss::gpu::BlockSelect<K, IndexType, select_min, faiss::gpu::Comparator<K>,
+                          warp_q, thread_q, tpb>
+    heap(initK, initV, smemK, smemV, k);
 
   // Grid is exactly sized to rows available
   int row = blockIdx.x;
-  int i   = threadIdx.x;
+  int i = threadIdx.x;
 
-  int idx             = row * n_cols;
-  K* inKStart         = inK + idx + i;
-  IndexType* inVStart = inV + idx + i;
+  int idx = row * n_cols;
+  K *inKStart = inK + idx + i;
+  IndexType *inVStart = inV + idx + i;
 
   // Whole warps must participate in the selection
   int limit = faiss::gpu::utils::roundDown(n_cols, faiss::gpu::kWarpSize);
@@ -92,31 +86,27 @@ __global__ void select_k_kernel(K* inK,
   }
 }
 
-template <typename value_idx = int, typename value_t = float, int warp_q, int thread_q>
-inline void select_k_impl(value_t* inK,
-                          value_idx* inV,
-                          size_t n_rows,
-                          size_t n_cols,
-                          value_t* outK,
-                          value_idx* outV,
-                          bool select_min,
-                          int k,
-                          cudaStream_t stream)
-{
+template <typename value_idx = int, typename value_t = float, int warp_q,
+          int thread_q>
+inline void select_k_impl(value_t *inK, value_idx *inV, size_t n_rows,
+                          size_t n_cols, value_t *outK, value_idx *outV,
+                          bool select_min, int k, cudaStream_t stream) {
   auto grid = dim3(n_rows);
 
   constexpr int n_threads = (warp_q <= 1024) ? 128 : 64;
-  auto block              = dim3(n_threads);
+  auto block = dim3(n_threads);
 
-  auto kInit =
-    select_min ? faiss::gpu::Limits<value_t>::getMax() : faiss::gpu::Limits<value_t>::getMin();
+  auto kInit = select_min ? faiss::gpu::Limits<value_t>::getMax()
+                          : faiss::gpu::Limits<value_t>::getMin();
   auto vInit = -1;
   if (select_min) {
     select_k_kernel<value_t, value_idx, false, warp_q, thread_q, n_threads>
-      <<<grid, block, 0, stream>>>(inK, inV, n_rows, n_cols, outK, outV, kInit, vInit, k);
+      <<<grid, block, 0, stream>>>(inK, inV, n_rows, n_cols, outK, outV, kInit,
+                                   vInit, k);
   } else {
     select_k_kernel<value_t, value_idx, true, warp_q, thread_q, n_threads>
-      <<<grid, block, 0, stream>>>(inK, inV, n_rows, n_cols, outK, outV, kInit, vInit, k);
+      <<<grid, block, 0, stream>>>(inK, inV, n_rows, n_cols, outK, outV, kInit,
+                                   vInit, k);
   }
   CUDA_CHECK(cudaGetLastError());
 }
@@ -136,37 +126,30 @@ inline void select_k_impl(value_t* inK,
  * @param[in] stream CUDA stream to use
  */
 template <typename value_idx = int, typename value_t = float>
-inline void select_k(value_t* inK,
-                     value_idx* inV,
-                     size_t n_rows,
-                     size_t n_cols,
-                     value_t* outK,
-                     value_idx* outV,
-                     bool select_min,
-                     int k,
-                     cudaStream_t stream)
-{
+inline void select_k(value_t *inK, value_idx *inV, size_t n_rows, size_t n_cols,
+                     value_t *outK, value_idx *outV, bool select_min, int k,
+                     cudaStream_t stream) {
   if (k == 1)
-    select_k_impl<value_idx, value_t, 1, 1>(
-      inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream);
+    select_k_impl<value_idx, value_t, 1, 1>(inK, inV, n_rows, n_cols, outK,
+                                            outV, select_min, k, stream);
   else if (k <= 32)
-    select_k_impl<value_idx, value_t, 32, 2>(
-      inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream);
+    select_k_impl<value_idx, value_t, 32, 2>(inK, inV, n_rows, n_cols, outK,
+                                             outV, select_min, k, stream);
   else if (k <= 64)
-    select_k_impl<value_idx, value_t, 64, 3>(
-      inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream);
+    select_k_impl<value_idx, value_t, 64, 3>(inK, inV, n_rows, n_cols, outK,
+                                             outV, select_min, k, stream);
   else if (k <= 128)
-    select_k_impl<value_idx, value_t, 128, 3>(
-      inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream);
+    select_k_impl<value_idx, value_t, 128, 3>(inK, inV, n_rows, n_cols, outK,
+                                              outV, select_min, k, stream);
   else if (k <= 256)
-    select_k_impl<value_idx, value_t, 256, 4>(
-      inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream);
+    select_k_impl<value_idx, value_t, 256, 4>(inK, inV, n_rows, n_cols, outK,
+                                              outV, select_min, k, stream);
   else if (k <= 512)
-    select_k_impl<value_idx, value_t, 512, 8>(
-      inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream);
+    select_k_impl<value_idx, value_t, 512, 8>(inK, inV, n_rows, n_cols, outK,
+                                              outV, select_min, k, stream);
   else if (k <= 1024)
-    select_k_impl<value_idx, value_t, 1024, 8>(
-      inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream);
+    select_k_impl<value_idx, value_t, 1024, 8>(inK, inV, n_rows, n_cols, outK,
+                                               outV, select_min, k, stream);
 }
 
 };  // namespace selection
diff --git a/cpp/include/raft/sparse/utils.h b/cpp/include/raft/sparse/utils.h
index 56e8832e0a..63578bf1f3 100644
--- a/cpp/include/raft/sparse/utils.h
+++ b/cpp/include/raft/sparse/utils.h
@@ -26,8 +26,7 @@ namespace sparse {
  * @param[in] ncols number of blocks to quantize
  */
 template <typename value_idx>
-inline int block_dim(value_idx ncols)
-{
+inline int block_dim(value_idx ncols) {
   int blockdim;
   if (ncols <= 32)
     blockdim = 32;
@@ -55,9 +54,9 @@ inline int block_dim(value_idx ncols)
  * @return
  */
 template <typename G>
-__device__ __inline__ unsigned int __match_any_sync(unsigned int init_mask, G key)
-{
-  unsigned int mask       = __ballot_sync(init_mask, true);
+__device__ __inline__ unsigned int __match_any_sync(unsigned int init_mask,
+                                                    G key) {
+  unsigned int mask = __ballot_sync(init_mask, true);
   unsigned int peer_group = 0;
   bool is_peer;
 
@@ -78,14 +77,12 @@ __device__ __inline__ unsigned int __match_any_sync(unsigned int init_mask, G ke
 }
 #endif
 
-__device__ __inline__ unsigned int get_lowest_peer(unsigned int peer_group)
-{
+__device__ __inline__ unsigned int get_lowest_peer(unsigned int peer_group) {
   return __ffs(peer_group) - 1;
 }
 
 template <typename value_idx>
-__global__ void iota_fill_block_kernel(value_idx* indices, value_idx ncols)
-{
+__global__ void iota_fill_block_kernel(value_idx *indices, value_idx ncols) {
   int row = blockIdx.x;
   int tid = threadIdx.x;
 
@@ -95,16 +92,15 @@ __global__ void iota_fill_block_kernel(value_idx* indices, value_idx ncols)
 }
 
 template <typename value_idx>
-void iota_fill(value_idx* indices, value_idx nrows, value_idx ncols, cudaStream_t stream)
-{
+void iota_fill(value_idx *indices, value_idx nrows, value_idx ncols,
+               cudaStream_t stream) {
   int blockdim = block_dim(ncols);
 
   iota_fill_block_kernel<<<nrows, blockdim, 0, stream>>>(indices, ncols);
 }
 
 template <typename T>
-__device__ int get_stop_idx(T row, T m, T nnz, const T* ind)
-{
+__device__ int get_stop_idx(T row, T m, T nnz, const T *ind) {
   int stop_idx = 0;
   if (row < (m - 1))
     stop_idx = ind[row + 1];
diff --git a/cpp/include/raft/spatial/knn/ann.hpp b/cpp/include/raft/spatial/knn/ann.hpp
index f77a56164d..77d7831b4a 100644
--- a/cpp/include/raft/spatial/knn/ann.hpp
+++ b/cpp/include/raft/spatial/knn/ann.hpp
@@ -45,16 +45,14 @@ using deviceAllocator = raft::mr::device::allocator;
  * @param[in] D the dimensionality of the index array
  */
 template <typename value_idx = int>
-inline void approx_knn_build_index(raft::handle_t& handle,
-                                   raft::spatial::knn::knnIndex* index,
-                                   knnIndexParam* params,
+inline void approx_knn_build_index(raft::handle_t &handle,
+                                   raft::spatial::knn::knnIndex *index,
+                                   knnIndexParam *params,
                                    raft::distance::DistanceType metric,
-                                   float metricArg,
-                                   float* index_array,
-                                   value_idx n,
-                                   value_idx D)
-{
-  detail::approx_knn_build_index(handle, index, params, metric, metricArg, index_array, n, D);
+                                   float metricArg, float *index_array,
+                                   value_idx n, value_idx D) {
+  detail::approx_knn_build_index(handle, index, params, metric, metricArg,
+                                 index_array, n, D);
 }
 
 /**
@@ -71,15 +69,12 @@ inline void approx_knn_build_index(raft::handle_t& handle,
  * @param[in] n number of rows in the query array
  */
 template <typename value_idx = int>
-inline void approx_knn_search(raft::handle_t& handle,
-                              float* distances,
-                              int64_t* indices,
-                              raft::spatial::knn::knnIndex* index,
-                              value_idx k,
-                              float* query_array,
-                              value_idx n)
-{
-  detail::approx_knn_search(handle, distances, indices, index, k, query_array, n);
+inline void approx_knn_search(raft::handle_t &handle, float *distances,
+                              int64_t *indices,
+                              raft::spatial::knn::knnIndex *index, value_idx k,
+                              float *query_array, value_idx n) {
+  detail::approx_knn_search(handle, distances, indices, index, k, query_array,
+                            n);
 }
 
 }  // namespace knn
diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.h
index 573a23181d..6a6c7751c2 100644
--- a/cpp/include/raft/spatial/knn/ann_common.h
+++ b/cpp/include/raft/spatial/knn/ann_common.h
@@ -26,14 +26,13 @@ namespace spatial {
 namespace knn {
 
 struct knnIndex {
-  faiss::gpu::GpuIndex* index;
+  faiss::gpu::GpuIndex *index;
   raft::distance::DistanceType metric;
   float metricArg;
 
-  faiss::gpu::StandardGpuResources* gpu_res;
+  faiss::gpu::StandardGpuResources *gpu_res;
   int device;
-  ~knnIndex()
-  {
+  ~knnIndex() {
     delete index;
     delete gpu_res;
   }
@@ -58,8 +57,7 @@ struct IVFParam : knnIndexParam {
   int nprobe;
 };
 
-struct IVFFlatParam : IVFParam {
-};
+struct IVFFlatParam : IVFParam {};
 
 struct IVFPQParam : IVFParam {
   int M;
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
index 7eb439c78b..6e4c99b646 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
@@ -56,107 +56,115 @@ namespace spatial {
 namespace knn {
 namespace detail {
 
-inline faiss::ScalarQuantizer::QuantizerType build_faiss_qtype(QuantizerType qtype)
-{
+inline faiss::ScalarQuantizer::QuantizerType build_faiss_qtype(
+  QuantizerType qtype) {
   switch (qtype) {
-    case QuantizerType::QT_8bit: return faiss::ScalarQuantizer::QuantizerType::QT_8bit;
+    case QuantizerType::QT_8bit:
+      return faiss::ScalarQuantizer::QuantizerType::QT_8bit;
     case QuantizerType::QT_8bit_uniform:
       return faiss::ScalarQuantizer::QuantizerType::QT_8bit_uniform;
     case QuantizerType::QT_4bit_uniform:
       return faiss::ScalarQuantizer::QuantizerType::QT_4bit_uniform;
-    case QuantizerType::QT_fp16: return faiss::ScalarQuantizer::QuantizerType::QT_fp16;
+    case QuantizerType::QT_fp16:
+      return faiss::ScalarQuantizer::QuantizerType::QT_fp16;
     case QuantizerType::QT_8bit_direct:
       return faiss::ScalarQuantizer::QuantizerType::QT_8bit_direct;
-    case QuantizerType::QT_6bit: return faiss::ScalarQuantizer::QuantizerType::QT_6bit;
-    default: return (faiss::ScalarQuantizer::QuantizerType)qtype;
+    case QuantizerType::QT_6bit:
+      return faiss::ScalarQuantizer::QuantizerType::QT_6bit;
+    default:
+      return (faiss::ScalarQuantizer::QuantizerType)qtype;
   }
 }
 
 template <typename IntType = int>
-void approx_knn_ivfflat_build_index(
-  knnIndex* index, IVFParam* params, raft::distance::DistanceType metric, IntType n, IntType D)
-{
+void approx_knn_ivfflat_build_index(knnIndex *index, IVFParam *params,
+                                    raft::distance::DistanceType metric,
+                                    IntType n, IntType D) {
   faiss::gpu::GpuIndexIVFFlatConfig config;
-  config.device                  = index->device;
+  config.device = index->device;
   faiss::MetricType faiss_metric = build_faiss_metric(metric);
-  faiss::gpu::GpuIndexIVFFlat* faiss_index =
-    new faiss::gpu::GpuIndexIVFFlat(index->gpu_res, D, params->nlist, faiss_metric, config);
+  faiss::gpu::GpuIndexIVFFlat *faiss_index = new faiss::gpu::GpuIndexIVFFlat(
+    index->gpu_res, D, params->nlist, faiss_metric, config);
   faiss_index->setNumProbes(params->nprobe);
   index->index = faiss_index;
 }
 
 template <typename IntType = int>
-void approx_knn_ivfpq_build_index(
-  knnIndex* index, IVFPQParam* params, raft::distance::DistanceType metric, IntType n, IntType D)
-{
+void approx_knn_ivfpq_build_index(knnIndex *index, IVFPQParam *params,
+                                  raft::distance::DistanceType metric,
+                                  IntType n, IntType D) {
   faiss::gpu::GpuIndexIVFPQConfig config;
-  config.device                          = index->device;
-  config.usePrecomputedTables            = params->usePrecomputedTables;
-  config.interleavedLayout               = params->n_bits != 8;
-  faiss::MetricType faiss_metric         = build_faiss_metric(metric);
-  faiss::gpu::GpuIndexIVFPQ* faiss_index = new faiss::gpu::GpuIndexIVFPQ(
-    index->gpu_res, D, params->nlist, params->M, params->n_bits, faiss_metric, config);
+  config.device = index->device;
+  config.usePrecomputedTables = params->usePrecomputedTables;
+  config.interleavedLayout = params->n_bits != 8;
+  faiss::MetricType faiss_metric = build_faiss_metric(metric);
+  faiss::gpu::GpuIndexIVFPQ *faiss_index =
+    new faiss::gpu::GpuIndexIVFPQ(index->gpu_res, D, params->nlist, params->M,
+                                  params->n_bits, faiss_metric, config);
   faiss_index->setNumProbes(params->nprobe);
   index->index = faiss_index;
 }
 
 template <typename IntType = int>
-void approx_knn_ivfsq_build_index(
-  knnIndex* index, IVFSQParam* params, raft::distance::DistanceType metric, IntType n, IntType D)
-{
+void approx_knn_ivfsq_build_index(knnIndex *index, IVFSQParam *params,
+                                  raft::distance::DistanceType metric,
+                                  IntType n, IntType D) {
   faiss::gpu::GpuIndexIVFScalarQuantizerConfig config;
-  config.device                                       = index->device;
-  faiss::MetricType faiss_metric                      = build_faiss_metric(metric);
-  faiss::ScalarQuantizer::QuantizerType faiss_qtype   = build_faiss_qtype(params->qtype);
-  faiss::gpu::GpuIndexIVFScalarQuantizer* faiss_index = new faiss::gpu::GpuIndexIVFScalarQuantizer(
-    index->gpu_res, D, params->nlist, faiss_qtype, faiss_metric, params->encodeResidual);
+  config.device = index->device;
+  faiss::MetricType faiss_metric = build_faiss_metric(metric);
+  faiss::ScalarQuantizer::QuantizerType faiss_qtype =
+    build_faiss_qtype(params->qtype);
+  faiss::gpu::GpuIndexIVFScalarQuantizer *faiss_index =
+    new faiss::gpu::GpuIndexIVFScalarQuantizer(index->gpu_res, D, params->nlist,
+                                               faiss_qtype, faiss_metric,
+                                               params->encodeResidual);
   faiss_index->setNumProbes(params->nprobe);
   index->index = faiss_index;
 }
 
 template <typename IntType = int>
-void approx_knn_build_index(raft::handle_t& handle,
-                            raft::spatial::knn::knnIndex* index,
-                            raft::spatial::knn::knnIndexParam* params,
+void approx_knn_build_index(raft::handle_t &handle,
+                            raft::spatial::knn::knnIndex *index,
+                            raft::spatial::knn::knnIndexParam *params,
                             raft::distance::DistanceType metric,
-                            float metricArg,
-                            float* index_array,
-                            IntType n,
-                            IntType D)
-{
+                            float metricArg, float *index_array, IntType n,
+                            IntType D) {
   int device;
   CUDA_CHECK(cudaGetDevice(&device));
 
-  faiss::gpu::StandardGpuResources* gpu_res = new faiss::gpu::StandardGpuResources();
+  faiss::gpu::StandardGpuResources *gpu_res =
+    new faiss::gpu::StandardGpuResources();
   gpu_res->noTempMemory();
   gpu_res->setDefaultStream(device, handle.get_stream());
-  index->gpu_res   = gpu_res;
-  index->device    = device;
-  index->index     = nullptr;
-  index->metric    = metric;
+  index->gpu_res = gpu_res;
+  index->device = device;
+  index->index = nullptr;
+  index->metric = metric;
   index->metricArg = metricArg;
 
   // perform preprocessing
   // k set to 0 (unused during preprocessing / revertion)
-  std::unique_ptr<MetricProcessor<float>> query_metric_processor = create_processor<float>(
-    metric, n, D, 0, false, handle.get_stream(), handle.get_device_allocator());
+  std::unique_ptr<MetricProcessor<float>> query_metric_processor =
+    create_processor<float>(metric, n, D, 0, false, handle.get_stream(),
+                            handle.get_device_allocator());
 
   query_metric_processor->preprocess(index_array);
 
-  if (dynamic_cast<IVFFlatParam*>(params)) {
-    IVFFlatParam* IVFFlat_param = dynamic_cast<IVFFlatParam*>(params);
+  if (dynamic_cast<IVFFlatParam *>(params)) {
+    IVFFlatParam *IVFFlat_param = dynamic_cast<IVFFlatParam *>(params);
     approx_knn_ivfflat_build_index(index, IVFFlat_param, metric, n, D);
     std::vector<float> h_index_array(n * D);
-    raft::update_host(h_index_array.data(), index_array, h_index_array.size(), handle.get_stream());
+    raft::update_host(h_index_array.data(), index_array, h_index_array.size(),
+                      handle.get_stream());
     query_metric_processor->revert(index_array);
     index->index->train(n, h_index_array.data());
     index->index->add(n, h_index_array.data());
   } else {
-    if (dynamic_cast<IVFPQParam*>(params)) {
-      IVFPQParam* IVFPQ_param = dynamic_cast<IVFPQParam*>(params);
+    if (dynamic_cast<IVFPQParam *>(params)) {
+      IVFPQParam *IVFPQ_param = dynamic_cast<IVFPQParam *>(params);
       approx_knn_ivfpq_build_index(index, IVFPQ_param, metric, n, D);
-    } else if (dynamic_cast<IVFSQParam*>(params)) {
-      IVFSQParam* IVFSQ_param = dynamic_cast<IVFSQParam*>(params);
+    } else if (dynamic_cast<IVFSQParam *>(params)) {
+      IVFSQParam *IVFSQ_param = dynamic_cast<IVFSQParam *>(params);
       approx_knn_ivfsq_build_index(index, IVFSQ_param, metric, n, D);
     } else {
       ASSERT(index->index, "KNN index could not be initialized");
@@ -169,23 +177,13 @@ void approx_knn_build_index(raft::handle_t& handle,
 }
 
 template <typename IntType = int>
-void approx_knn_search(raft::handle_t& handle,
-                       float* distances,
-                       int64_t* indices,
-                       raft::spatial::knn::knnIndex* index,
-                       IntType k,
-                       float* query_array,
-                       IntType n)
-{
+void approx_knn_search(raft::handle_t &handle, float *distances,
+                       int64_t *indices, raft::spatial::knn::knnIndex *index,
+                       IntType k, float *query_array, IntType n) {
   // perform preprocessing
   std::unique_ptr<MetricProcessor<float>> query_metric_processor =
-    create_processor<float>(index->metric,
-                            n,
-                            index->index->d,
-                            k,
-                            false,
-                            handle.get_stream(),
-                            handle.get_device_allocator());
+    create_processor<float>(index->metric, n, index->index->d, k, false,
+                            handle.get_stream(), handle.get_device_allocator());
 
   query_metric_processor->preprocess(query_array);
   index->index->search(n, query_array, k, distances, indices);
@@ -196,14 +194,13 @@ void approx_knn_search(raft::handle_t& handle,
       index->metric == raft::distance::DistanceType::L2SqrtUnexpanded ||
       index->metric == raft::distance::DistanceType::LpUnexpanded) {
     /**
-     * post-processing
-     */
+  * post-processing
+  */
     float p = 0.5;  // standard l2
-    if (index->metric == raft::distance::DistanceType::LpUnexpanded) p = 1.0 / index->metricArg;
+    if (index->metric == raft::distance::DistanceType::LpUnexpanded)
+      p = 1.0 / index->metricArg;
     raft::linalg::unaryOp<float>(
-      distances,
-      distances,
-      n * k,
+      distances, distances, n * k,
       [p] __device__(float input) { return powf(input, p); },
       handle.get_stream());
   }
diff --git a/cpp/include/raft/spatial/knn/detail/common_faiss.h b/cpp/include/raft/spatial/knn/detail/common_faiss.h
index 5618186dfc..0c0398a336 100644
--- a/cpp/include/raft/spatial/knn/detail/common_faiss.h
+++ b/cpp/include/raft/spatial/knn/detail/common_faiss.h
@@ -27,26 +27,37 @@ namespace spatial {
 namespace knn {
 namespace detail {
 
-inline faiss::MetricType build_faiss_metric(raft::distance::DistanceType metric)
-{
+inline faiss::MetricType build_faiss_metric(
+  raft::distance::DistanceType metric) {
   switch (metric) {
     case raft::distance::DistanceType::CosineExpanded:
       return faiss::MetricType::METRIC_INNER_PRODUCT;
     case raft::distance::DistanceType::CorrelationExpanded:
       return faiss::MetricType::METRIC_INNER_PRODUCT;
-    case raft::distance::DistanceType::L2Expanded: return faiss::MetricType::METRIC_L2;
-    case raft::distance::DistanceType::L2Unexpanded: return faiss::MetricType::METRIC_L2;
-    case raft::distance::DistanceType::L2SqrtExpanded: return faiss::MetricType::METRIC_L2;
-    case raft::distance::DistanceType::L2SqrtUnexpanded: return faiss::MetricType::METRIC_L2;
-    case raft::distance::DistanceType::L1: return faiss::MetricType::METRIC_L1;
-    case raft::distance::DistanceType::InnerProduct: return faiss::MetricType::METRIC_INNER_PRODUCT;
-    case raft::distance::DistanceType::LpUnexpanded: return faiss::MetricType::METRIC_Lp;
-    case raft::distance::DistanceType::Linf: return faiss::MetricType::METRIC_Linf;
-    case raft::distance::DistanceType::Canberra: return faiss::MetricType::METRIC_Canberra;
-    case raft::distance::DistanceType::BrayCurtis: return faiss::MetricType::METRIC_BrayCurtis;
+    case raft::distance::DistanceType::L2Expanded:
+      return faiss::MetricType::METRIC_L2;
+    case raft::distance::DistanceType::L2Unexpanded:
+      return faiss::MetricType::METRIC_L2;
+    case raft::distance::DistanceType::L2SqrtExpanded:
+      return faiss::MetricType::METRIC_L2;
+    case raft::distance::DistanceType::L2SqrtUnexpanded:
+      return faiss::MetricType::METRIC_L2;
+    case raft::distance::DistanceType::L1:
+      return faiss::MetricType::METRIC_L1;
+    case raft::distance::DistanceType::InnerProduct:
+      return faiss::MetricType::METRIC_INNER_PRODUCT;
+    case raft::distance::DistanceType::LpUnexpanded:
+      return faiss::MetricType::METRIC_Lp;
+    case raft::distance::DistanceType::Linf:
+      return faiss::MetricType::METRIC_Linf;
+    case raft::distance::DistanceType::Canberra:
+      return faiss::MetricType::METRIC_Canberra;
+    case raft::distance::DistanceType::BrayCurtis:
+      return faiss::MetricType::METRIC_BrayCurtis;
     case raft::distance::DistanceType::JensenShannon:
       return faiss::MetricType::METRIC_JensenShannon;
-    default: THROW("MetricType not supported: %d", metric);
+    default:
+      THROW("MetricType not supported: %d", metric);
   }
 }
 
diff --git a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh
index 049c11514c..7d87254cb6 100644
--- a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh
+++ b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh
@@ -35,8 +35,7 @@ namespace knn {
 namespace detail {
 
 template <typename value_t>
-DI value_t compute_haversine(value_t x1, value_t y1, value_t x2, value_t y2)
-{
+DI value_t compute_haversine(value_t x1, value_t y1, value_t x2, value_t y2) {
   value_t sin_0 = sin(0.5 * (x1 - y1));
   value_t sin_1 = sin(0.5 * (x2 - y2));
   value_t rdist = sin_0 * sin_0 + cos(x1) * cos(y1) * sin_1 * sin_1;
@@ -57,36 +56,34 @@ DI value_t compute_haversine(value_t x1, value_t y1, value_t x2, value_t y2)
  * @param[in] n_index_rows number of rows in index array
  * @param[in] k number of closest neighbors to return
  */
-template <typename value_idx, typename value_t, int warp_q = 1024, int thread_q = 8, int tpb = 128>
-__global__ void haversine_knn_kernel(value_idx* out_inds,
-                                     value_t* out_dists,
-                                     const value_t* index,
-                                     const value_t* query,
-                                     size_t n_index_rows,
-                                     int k)
-{
+template <typename value_idx, typename value_t, int warp_q = 1024,
+          int thread_q = 8, int tpb = 128>
+__global__ void haversine_knn_kernel(value_idx *out_inds, value_t *out_dists,
+                                     const value_t *index, const value_t *query,
+                                     size_t n_index_rows, int k) {
   constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize;
 
   __shared__ value_t smemK[kNumWarps * warp_q];
   __shared__ value_idx smemV[kNumWarps * warp_q];
 
-  faiss::gpu::
-    BlockSelect<value_t, value_idx, false, faiss::gpu::Comparator<value_t>, warp_q, thread_q, tpb>
-      heap(faiss::gpu::Limits<value_t>::getMax(), -1, smemK, smemV, k);
+  faiss::gpu::BlockSelect<value_t, value_idx, false,
+                          faiss::gpu::Comparator<value_t>, warp_q, thread_q,
+                          tpb>
+    heap(faiss::gpu::Limits<value_t>::getMax(), -1, smemK, smemV, k);
 
   // Grid is exactly sized to rows available
   int limit = faiss::gpu::utils::roundDown(n_index_rows, faiss::gpu::kWarpSize);
 
-  const value_t* query_ptr = query + (blockIdx.x * 2);
-  value_t x1               = query_ptr[0];
-  value_t x2               = query_ptr[1];
+  const value_t *query_ptr = query + (blockIdx.x * 2);
+  value_t x1 = query_ptr[0];
+  value_t x2 = query_ptr[1];
 
   int i = threadIdx.x;
 
   for (; i < limit; i += tpb) {
-    const value_t* idx_ptr = index + (i * 2);
-    value_t y1             = idx_ptr[0];
-    value_t y2             = idx_ptr[1];
+    const value_t *idx_ptr = index + (i * 2);
+    value_t y1 = idx_ptr[0];
+    value_t y2 = idx_ptr[1];
 
     value_t dist = compute_haversine(x1, y1, x2, y2);
 
@@ -95,9 +92,9 @@ __global__ void haversine_knn_kernel(value_idx* out_inds,
 
   // Handle last remainder fraction of a warp of elements
   if (i < n_index_rows) {
-    const value_t* idx_ptr = index + (i * 2);
-    value_t y1             = idx_ptr[0];
-    value_t y2             = idx_ptr[1];
+    const value_t *idx_ptr = index + (i * 2);
+    value_t y1 = idx_ptr[0];
+    value_t y2 = idx_ptr[1];
 
     value_t dist = compute_haversine(x1, y1, x2, y2);
 
@@ -108,7 +105,7 @@ __global__ void haversine_knn_kernel(value_idx* out_inds,
 
   for (int i = threadIdx.x; i < k; i += tpb) {
     out_dists[blockIdx.x * k + i] = smemK[i];
-    out_inds[blockIdx.x * k + i]  = smemV[i];
+    out_inds[blockIdx.x * k + i] = smemV[i];
   }
 }
 
@@ -129,15 +126,10 @@ __global__ void haversine_knn_kernel(value_idx* out_inds,
  * @param[in] stream stream to order kernel launch
  */
 template <typename value_idx, typename value_t>
-void haversine_knn(value_idx* out_inds,
-                   value_t* out_dists,
-                   const value_t* index,
-                   const value_t* query,
-                   size_t n_index_rows,
-                   size_t n_query_rows,
-                   int k,
-                   cudaStream_t stream)
-{
+void haversine_knn(value_idx *out_inds, value_t *out_dists,
+                   const value_t *index, const value_t *query,
+                   size_t n_index_rows, size_t n_query_rows, int k,
+                   cudaStream_t stream) {
   haversine_knn_kernel<<<n_query_rows, 128, 0, stream>>>(
     out_inds, out_dists, index, query, n_index_rows, k);
 }
diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
index a276ae45ad..09494e9eb1 100644
--- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
@@ -43,18 +43,13 @@ namespace spatial {
 namespace knn {
 namespace detail {
 
-template <typename value_idx = int64_t, typename value_t = float, int warp_q, int thread_q, int tpb>
-__global__ void knn_merge_parts_kernel(value_t* inK,
-                                       value_idx* inV,
-                                       value_t* outK,
-                                       value_idx* outV,
-                                       size_t n_samples,
-                                       int n_parts,
-                                       value_t initK,
-                                       value_idx initV,
-                                       int k,
-                                       value_idx* translations)
-{
+template <typename value_idx = int64_t, typename value_t = float, int warp_q,
+          int thread_q, int tpb>
+__global__ void knn_merge_parts_kernel(value_t *inK, value_idx *inV,
+                                       value_t *outK, value_idx *outV,
+                                       size_t n_samples, int n_parts,
+                                       value_t initK, value_idx initV, int k,
+                                       value_idx *translations) {
   constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize;
 
   __shared__ value_t smemK[kNumWarps * warp_q];
@@ -63,33 +58,34 @@ __global__ void knn_merge_parts_kernel(value_t* inK,
   /**
    * Uses shared memory
    */
-  faiss::gpu::
-    BlockSelect<value_t, value_idx, false, faiss::gpu::Comparator<value_t>, warp_q, thread_q, tpb>
-      heap(initK, initV, smemK, smemV, k);
+  faiss::gpu::BlockSelect<value_t, value_idx, false,
+                          faiss::gpu::Comparator<value_t>, warp_q, thread_q,
+                          tpb>
+    heap(initK, initV, smemK, smemV, k);
 
   // Grid is exactly sized to rows available
-  int row     = blockIdx.x;
+  int row = blockIdx.x;
   int total_k = k * n_parts;
 
   int i = threadIdx.x;
 
   // Get starting pointers for cols in current thread
-  int part       = i / k;
+  int part = i / k;
   size_t row_idx = (row * k) + (part * n_samples * k);
 
   int col = i % k;
 
-  value_t* inKStart   = inK + (row_idx + col);
-  value_idx* inVStart = inV + (row_idx + col);
+  value_t *inKStart = inK + (row_idx + col);
+  value_idx *inVStart = inV + (row_idx + col);
 
-  int limit             = faiss::gpu::utils::roundDown(total_k, faiss::gpu::kWarpSize);
+  int limit = faiss::gpu::utils::roundDown(total_k, faiss::gpu::kWarpSize);
   value_idx translation = 0;
 
   for (; i < limit; i += tpb) {
     translation = translations[part];
     heap.add(*inKStart, (*inVStart) + translation);
 
-    part    = (i + tpb) / k;
+    part = (i + tpb) / k;
     row_idx = (row * k) + (part * n_samples * k);
 
     col = (i + tpb) % k;
@@ -112,27 +108,22 @@ __global__ void knn_merge_parts_kernel(value_t* inK,
   }
 }
 
-template <typename value_idx = int64_t, typename value_t = float, int warp_q, int thread_q>
-inline void knn_merge_parts_impl(value_t* inK,
-                                 value_idx* inV,
-                                 value_t* outK,
-                                 value_idx* outV,
-                                 size_t n_samples,
-                                 int n_parts,
-                                 int k,
-                                 cudaStream_t stream,
-                                 value_idx* translations)
-{
+template <typename value_idx = int64_t, typename value_t = float, int warp_q,
+          int thread_q>
+inline void knn_merge_parts_impl(value_t *inK, value_idx *inV, value_t *outK,
+                                 value_idx *outV, size_t n_samples, int n_parts,
+                                 int k, cudaStream_t stream,
+                                 value_idx *translations) {
   auto grid = dim3(n_samples);
 
   constexpr int n_threads = (warp_q <= 1024) ? 128 : 64;
-  auto block              = dim3(n_threads);
+  auto block = dim3(n_threads);
 
   auto kInit = faiss::gpu::Limits<value_t>::getMax();
   auto vInit = -1;
   knn_merge_parts_kernel<value_idx, value_t, warp_q, thread_q, n_threads>
-    <<<grid, block, 0, stream>>>(
-      inK, inV, outK, outV, n_samples, n_parts, kInit, vInit, k, translations);
+    <<<grid, block, 0, stream>>>(inK, inV, outK, outV, n_samples, n_parts,
+                                 kInit, vInit, k, translations);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -151,16 +142,10 @@ inline void knn_merge_parts_impl(value_t* inK,
  * @param translations mapping of index offsets for each partition
  */
 template <typename value_idx = int64_t, typename value_t = float>
-inline void knn_merge_parts(value_t* inK,
-                            value_idx* inV,
-                            value_t* outK,
-                            value_idx* outV,
-                            size_t n_samples,
-                            int n_parts,
-                            int k,
-                            cudaStream_t stream,
-                            value_idx* translations)
-{
+inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK,
+                            value_idx *outV, size_t n_samples, int n_parts,
+                            int k, cudaStream_t stream,
+                            value_idx *translations) {
   if (k == 1)
     knn_merge_parts_impl<value_idx, value_t, 1, 1>(
       inK, inV, outK, outV, n_samples, n_parts, k, stream, translations);
@@ -210,33 +195,27 @@ inline void knn_merge_parts(value_t* inK,
  * @param[in] metricArg metric argument to use. Corresponds to the p arg for lp norm
  */
 template <typename IntType = int>
-void brute_force_knn_impl(
-  std::vector<float*>& input,
-  std::vector<int>& sizes,
-  IntType D,
-  float* search_items,
-  IntType n,
-  int64_t* res_I,
-  float* res_D,
-  IntType k,
-  std::shared_ptr<deviceAllocator> allocator,
-  cudaStream_t userStream,
-  cudaStream_t* internalStreams       = nullptr,
-  int n_int_streams                   = 0,
-  bool rowMajorIndex                  = true,
-  bool rowMajorQuery                  = true,
-  std::vector<int64_t>* translations  = nullptr,
-  raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded,
-  float metricArg                     = 0)
-{
-  ASSERT(input.size() == sizes.size(), "input and sizes vectors should be the same size");
-
-  std::vector<int64_t>* id_ranges;
+void brute_force_knn_impl(std::vector<float *> &input, std::vector<int> &sizes,
+                          IntType D, float *search_items, IntType n,
+                          int64_t *res_I, float *res_D, IntType k,
+                          std::shared_ptr<deviceAllocator> allocator,
+                          cudaStream_t userStream,
+                          cudaStream_t *internalStreams = nullptr,
+                          int n_int_streams = 0, bool rowMajorIndex = true,
+                          bool rowMajorQuery = true,
+                          std::vector<int64_t> *translations = nullptr,
+                          raft::distance::DistanceType metric =
+                            raft::distance::DistanceType::L2Expanded,
+                          float metricArg = 0) {
+  ASSERT(input.size() == sizes.size(),
+         "input and sizes vectors should be the same size");
+
+  std::vector<int64_t> *id_ranges;
   if (translations == nullptr) {
     // If we don't have explicit translations
     // for offsets of the indices, build them
     // from the local partitions
-    id_ranges       = new std::vector<int64_t>();
+    id_ranges = new std::vector<int64_t>();
     int64_t total_n = 0;
     for (size_t i = 0; i < input.size(); i++) {
       id_ranges->push_back(total_n);
@@ -249,27 +228,31 @@ void brute_force_knn_impl(
 
   // perform preprocessing
   std::unique_ptr<MetricProcessor<float>> query_metric_processor =
-    create_processor<float>(metric, n, D, k, rowMajorQuery, userStream, allocator);
+    create_processor<float>(metric, n, D, k, rowMajorQuery, userStream,
+                            allocator);
   query_metric_processor->preprocess(search_items);
 
-  std::vector<std::unique_ptr<MetricProcessor<float>>> metric_processors(input.size());
+  std::vector<std::unique_ptr<MetricProcessor<float>>> metric_processors(
+    input.size());
   for (size_t i = 0; i < input.size(); i++) {
-    metric_processors[i] =
-      create_processor<float>(metric, sizes[i], D, k, rowMajorQuery, userStream, allocator);
+    metric_processors[i] = create_processor<float>(
+      metric, sizes[i], D, k, rowMajorQuery, userStream, allocator);
     metric_processors[i]->preprocess(input[i]);
   }
 
   int device;
   CUDA_CHECK(cudaGetDevice(&device));
 
-  raft::mr::device::buffer<int64_t> trans(allocator, userStream, id_ranges->size());
-  raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(), userStream);
+  raft::mr::device::buffer<int64_t> trans(allocator, userStream,
+                                          id_ranges->size());
+  raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(),
+                      userStream);
 
   raft::mr::device::buffer<float> all_D(allocator, userStream, 0);
   raft::mr::device::buffer<int64_t> all_I(allocator, userStream, 0);
 
-  float* out_D   = res_D;
-  int64_t* out_I = res_I;
+  float *out_D = res_D;
+  int64_t *out_I = res_I;
 
   if (input.size() > 1) {
     all_D.resize(input.size() * k * n, userStream);
@@ -283,10 +266,11 @@ void brute_force_knn_impl(
   if (n_int_streams > 0) CUDA_CHECK(cudaStreamSynchronize(userStream));
 
   for (size_t i = 0; i < input.size(); i++) {
-    float* out_d_ptr   = out_D + (i * k * n);
-    int64_t* out_i_ptr = out_I + (i * k * n);
+    float *out_d_ptr = out_D + (i * k * n);
+    int64_t *out_i_ptr = out_I + (i * k * n);
 
-    cudaStream_t stream = raft::select_stream(userStream, internalStreams, n_int_streams, i);
+    cudaStream_t stream =
+      raft::select_stream(userStream, internalStreams, n_int_streams, i);
 
     switch (metric) {
       case raft::distance::DistanceType::Haversine:
@@ -295,7 +279,8 @@ void brute_force_knn_impl(
                "Haversine distance requires 2 dimensions "
                "(latitude / longitude).");
 
-        haversine_knn(out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], n, k, stream);
+        haversine_knn(out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], n,
+                      k, stream);
         break;
       default:
         faiss::MetricType m = build_faiss_metric(metric);
@@ -306,18 +291,18 @@ void brute_force_knn_impl(
         gpu_res.setDefaultStream(device, stream);
 
         faiss::gpu::GpuDistanceParams args;
-        args.metric          = m;
-        args.metricArg       = metricArg;
-        args.k               = k;
-        args.dims            = D;
-        args.vectors         = input[i];
+        args.metric = m;
+        args.metricArg = metricArg;
+        args.k = k;
+        args.dims = D;
+        args.vectors = input[i];
         args.vectorsRowMajor = rowMajorIndex;
-        args.numVectors      = sizes[i];
-        args.queries         = search_items;
+        args.numVectors = sizes[i];
+        args.queries = search_items;
         args.queriesRowMajor = rowMajorQuery;
-        args.numQueries      = n;
-        args.outDistances    = out_d_ptr;
-        args.outIndices      = out_i_ptr;
+        args.numQueries = n;
+        args.outDistances = out_d_ptr;
+        args.outIndices = out_i_ptr;
 
         /**
          * @todo: Until FAISS supports pluggable allocation strategies,
@@ -340,7 +325,8 @@ void brute_force_knn_impl(
   if (input.size() > 1 || translations != nullptr) {
     // This is necessary for proper index translations. If there are
     // no translations or partitions to combine, it can be skipped.
-    knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream, trans.data());
+    knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream,
+                    trans.data());
   }
 
   // Perform necessary post-processing
@@ -348,12 +334,14 @@ void brute_force_knn_impl(
       metric == raft::distance::DistanceType::L2SqrtUnexpanded ||
       metric == raft::distance::DistanceType::LpUnexpanded) {
     /**
-     * post-processing
-     */
+	* post-processing
+	*/
     float p = 0.5;  // standard l2
-    if (metric == raft::distance::DistanceType::LpUnexpanded) p = 1.0 / metricArg;
+    if (metric == raft::distance::DistanceType::LpUnexpanded)
+      p = 1.0 / metricArg;
     raft::linalg::unaryOp<float>(
-      res_D, res_D, n * k, [p] __device__(float input) { return powf(input, p); }, userStream);
+      res_D, res_D, n * k,
+      [p] __device__(float input) { return powf(input, p); }, userStream);
   }
 
   query_metric_processor->revert(search_items);
diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp
index 6e983d1f42..a645412c2f 100644
--- a/cpp/include/raft/spatial/knn/detail/processing.hpp
+++ b/cpp/include/raft/spatial/knn/detail/processing.hpp
@@ -39,11 +39,11 @@ using deviceAllocator = raft::mr::device::allocator;
 template <typename math_t>
 class MetricProcessor {
  public:
-  virtual void preprocess(math_t* data) {}
+  virtual void preprocess(math_t *data) {}
 
-  virtual void revert(math_t* data) {}
+  virtual void revert(math_t *data) {}
 
-  virtual void postprocess(math_t* data) {}
+  virtual void postprocess(math_t *data) {}
 
   virtual ~MetricProcessor() = default;
 };
@@ -60,10 +60,7 @@ class CosineMetricProcessor : public MetricProcessor<math_t> {
   raft::mr::device::buffer<math_t> colsums_;
 
  public:
-  CosineMetricProcessor(size_t n_rows,
-                        size_t n_cols,
-                        int k,
-                        bool row_major,
+  CosineMetricProcessor(size_t n_rows, size_t n_cols, int k, bool row_major,
                         cudaStream_t stream,
                         std::shared_ptr<deviceAllocator> allocator)
     : device_allocator_(allocator),
@@ -72,51 +69,30 @@ class CosineMetricProcessor : public MetricProcessor<math_t> {
       n_cols_(n_cols),
       n_rows_(n_rows),
       row_major_(row_major),
-      k_(k)
-  {
-  }
+      k_(k) {}
 
-  void preprocess(math_t* data)
-  {
-    raft::linalg::rowNorm(colsums_.data(),
-                          data,
-                          n_cols_,
-                          n_rows_,
-                          raft::linalg::NormType::L2Norm,
-                          row_major_,
-                          stream_,
+  void preprocess(math_t *data) {
+    raft::linalg::rowNorm(colsums_.data(), data, n_cols_, n_rows_,
+                          raft::linalg::NormType::L2Norm, row_major_, stream_,
                           [] __device__(math_t in) { return sqrtf(in); });
 
     raft::linalg::matrixVectorOp(
-      data,
-      data,
-      colsums_.data(),
-      n_cols_,
-      n_rows_,
-      row_major_,
-      false,
+      data, data, colsums_.data(), n_cols_, n_rows_, row_major_, false,
       [] __device__(math_t mat_in, math_t vec_in) { return mat_in / vec_in; },
       stream_);
   }
 
-  void revert(math_t* data)
-  {
+  void revert(math_t *data) {
     raft::linalg::matrixVectorOp(
-      data,
-      data,
-      colsums_.data(),
-      n_cols_,
-      n_rows_,
-      row_major_,
-      false,
+      data, data, colsums_.data(), n_cols_, n_rows_, row_major_, false,
       [] __device__(math_t mat_in, math_t vec_in) { return mat_in * vec_in; },
       stream_);
   }
 
-  void postprocess(math_t* data)
-  {
+  void postprocess(math_t *data) {
     raft::linalg::unaryOp(
-      data, data, k_ * n_rows_, [] __device__(math_t in) { return 1 - in; }, stream_);
+      data, data, k_ * n_rows_, [] __device__(math_t in) { return 1 - in; },
+      stream_);
   }
 
   ~CosineMetricProcessor() = default;
@@ -127,64 +103,43 @@ class CorrelationMetricProcessor : public CosineMetricProcessor<math_t> {
   using cosine = CosineMetricProcessor<math_t>;
 
  public:
-  CorrelationMetricProcessor(size_t n_rows,
-                             size_t n_cols,
-                             int k,
-                             bool row_major,
-                             cudaStream_t stream,
+  CorrelationMetricProcessor(size_t n_rows, size_t n_cols, int k,
+                             bool row_major, cudaStream_t stream,
                              std::shared_ptr<deviceAllocator> allocator)
-    : CosineMetricProcessor<math_t>(n_rows, n_cols, k, row_major, stream, allocator),
-      means_(allocator, stream, n_rows)
-  {
-  }
+    : CosineMetricProcessor<math_t>(n_rows, n_cols, k, row_major, stream,
+                                    allocator),
+      means_(allocator, stream, n_rows) {}
 
-  void preprocess(math_t* data)
-  {
+  void preprocess(math_t *data) {
     math_t normalizer_const = 1.0 / (math_t)cosine::n_cols_;
 
-    raft::linalg::reduce(means_.data(),
-                         data,
-                         cosine::n_cols_,
-                         cosine::n_rows_,
-                         (math_t)0.0,
-                         cosine::row_major_,
-                         true,
+    raft::linalg::reduce(means_.data(), data, cosine::n_cols_, cosine::n_rows_,
+                         (math_t)0.0, cosine::row_major_, true,
                          cosine::stream_);
 
     raft::linalg::unaryOp(
-      means_.data(),
-      means_.data(),
-      cosine::n_rows_,
+      means_.data(), means_.data(), cosine::n_rows_,
       [=] __device__(math_t in) { return in * normalizer_const; },
       cosine::stream_);
 
-    raft::stats::meanCenter(data,
-                            data,
-                            means_.data(),
-                            cosine::n_cols_,
-                            cosine::n_rows_,
-                            cosine::row_major_,
-                            false,
+    raft::stats::meanCenter(data, data, means_.data(), cosine::n_cols_,
+                            cosine::n_rows_, cosine::row_major_, false,
                             cosine::stream_);
 
     CosineMetricProcessor<math_t>::preprocess(data);
   }
 
-  void revert(math_t* data)
-  {
+  void revert(math_t *data) {
     CosineMetricProcessor<math_t>::revert(data);
 
-    raft::stats::meanAdd(data,
-                         data,
-                         means_.data(),
-                         cosine::n_cols_,
-                         cosine::n_rows_,
-                         cosine::row_major_,
-                         false,
+    raft::stats::meanAdd(data, data, means_.data(), cosine::n_cols_,
+                         cosine::n_rows_, cosine::row_major_, false,
                          cosine::stream_);
   }
 
-  void postprocess(math_t* data) { CosineMetricProcessor<math_t>::postprocess(data); }
+  void postprocess(math_t *data) {
+    CosineMetricProcessor<math_t>::postprocess(data);
+  }
 
   ~CorrelationMetricProcessor() = default;
 
@@ -194,36 +149,33 @@ class CorrelationMetricProcessor : public CosineMetricProcessor<math_t> {
 template <typename math_t>
 class DefaultMetricProcessor : public MetricProcessor<math_t> {
  public:
-  void preprocess(math_t* data) {}
+  void preprocess(math_t *data) {}
 
-  void revert(math_t* data) {}
+  void revert(math_t *data) {}
 
-  void postprocess(math_t* data) {}
+  void postprocess(math_t *data) {}
 
   ~DefaultMetricProcessor() = default;
 };
 
 template <typename math_t>
 inline std::unique_ptr<MetricProcessor<math_t>> create_processor(
-  distance::DistanceType metric,
-  int n,
-  int D,
-  int k,
-  bool rowMajorQuery,
-  cudaStream_t userStream,
-  std::shared_ptr<deviceAllocator> allocator)
-{
-  MetricProcessor<math_t>* mp = nullptr;
+  distance::DistanceType metric, int n, int D, int k, bool rowMajorQuery,
+  cudaStream_t userStream, std::shared_ptr<deviceAllocator> allocator) {
+  MetricProcessor<math_t> *mp = nullptr;
 
   switch (metric) {
     case distance::DistanceType::CosineExpanded:
-      mp = new CosineMetricProcessor<math_t>(n, D, k, rowMajorQuery, userStream, allocator);
+      mp = new CosineMetricProcessor<math_t>(n, D, k, rowMajorQuery, userStream,
+                                             allocator);
       break;
 
     case distance::DistanceType::CorrelationExpanded:
-      mp = new CorrelationMetricProcessor<math_t>(n, D, k, rowMajorQuery, userStream, allocator);
+      mp = new CorrelationMetricProcessor<math_t>(n, D, k, rowMajorQuery,
+                                                  userStream, allocator);
       break;
-    default: mp = new DefaultMetricProcessor<math_t>();
+    default:
+      mp = new DefaultMetricProcessor<math_t>();
   }
 
   return std::unique_ptr<MetricProcessor<math_t>>(mp);
diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp
index 42ee11ba5b..a3a1972c13 100644
--- a/cpp/include/raft/spatial/knn/knn.hpp
+++ b/cpp/include/raft/spatial/knn/knn.hpp
@@ -28,17 +28,12 @@ namespace knn {
 using deviceAllocator = raft::mr::device::allocator;
 
 template <typename value_idx = int64_t, typename value_t = float>
-inline void knn_merge_parts(value_t* inK,
-                            value_idx* inV,
-                            value_t* outK,
-                            value_idx* outV,
-                            size_t n_samples,
-                            int n_parts,
-                            int k,
-                            cudaStream_t stream,
-                            value_idx* translations)
-{
-  detail::knn_merge_parts(inK, inV, outK, outV, n_samples, n_parts, k, stream, translations);
+inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK,
+                            value_idx *outV, size_t n_samples, int n_parts,
+                            int k, cudaStream_t stream,
+                            value_idx *translations) {
+  detail::knn_merge_parts(inK, inV, outK, outV, n_samples, n_parts, k, stream,
+                          translations);
 }
 
 /**
@@ -64,42 +59,23 @@ inline void knn_merge_parts(value_t* inK,
  * @param[in] expanded should lp-based distances be returned in their expanded
  * 					 form (e.g., without raising to the 1/p power).
  */
-inline void brute_force_knn(raft::handle_t const& handle,
-                            std::vector<float*>& input,
-                            std::vector<int>& sizes,
-                            int D,
-                            float* search_items,
-                            int n,
-                            int64_t* res_I,
-                            float* res_D,
-                            int k,
-                            bool rowMajorIndex                 = true,
-                            bool rowMajorQuery                 = true,
-                            std::vector<int64_t>* translations = nullptr,
-                            distance::DistanceType metric = distance::DistanceType::L2Unexpanded,
-                            float metric_arg              = 2.0f)
-{
-  ASSERT(input.size() == sizes.size(), "input and sizes vectors must be the same size");
+inline void brute_force_knn(
+  raft::handle_t const &handle, std::vector<float *> &input,
+  std::vector<int> &sizes, int D, float *search_items, int n, int64_t *res_I,
+  float *res_D, int k, bool rowMajorIndex = true, bool rowMajorQuery = true,
+  std::vector<int64_t> *translations = nullptr,
+  distance::DistanceType metric = distance::DistanceType::L2Unexpanded,
+  float metric_arg = 2.0f) {
+  ASSERT(input.size() == sizes.size(),
+         "input and sizes vectors must be the same size");
 
   std::vector<cudaStream_t> int_streams = handle.get_internal_streams();
 
-  detail::brute_force_knn_impl(input,
-                               sizes,
-                               D,
-                               search_items,
-                               n,
-                               res_I,
-                               res_D,
-                               k,
-                               handle.get_device_allocator(),
-                               handle.get_stream(),
-                               int_streams.data(),
-                               handle.get_num_internal_streams(),
-                               rowMajorIndex,
-                               rowMajorQuery,
-                               translations,
-                               metric,
-                               metric_arg);
+  detail::brute_force_knn_impl(input, sizes, D, search_items, n, res_I, res_D,
+                               k, handle.get_device_allocator(),
+                               handle.get_stream(), int_streams.data(),
+                               handle.get_num_internal_streams(), rowMajorIndex,
+                               rowMajorQuery, translations, metric, metric_arg);
 }
 
 }  // namespace knn
diff --git a/cpp/include/raft/spectral/cluster_solvers.hpp b/cpp/include/raft/spectral/cluster_solvers.hpp
index 7032a0009e..922ae7cfab 100644
--- a/cpp/include/raft/spectral/cluster_solvers.hpp
+++ b/cpp/include/raft/spectral/cluster_solvers.hpp
@@ -24,7 +24,8 @@ using namespace matrix;
 
 // aggregate of control params for Eigen Solver:
 //
-template <typename index_type_t, typename value_type_t, typename size_type_t = index_type_t>
+template <typename index_type_t, typename value_type_t,
+          typename size_type_t = index_type_t>
 struct cluster_solver_config_t {
   size_type_t n_clusters;
   size_type_t maxIter;
@@ -34,37 +35,25 @@ struct cluster_solver_config_t {
   unsigned long long seed{123456};
 };
 
-template <typename index_type_t, typename value_type_t, typename size_type_t = index_type_t>
+template <typename index_type_t, typename value_type_t,
+          typename size_type_t = index_type_t>
 struct kmeans_solver_t {
-  explicit kmeans_solver_t(
-    cluster_solver_config_t<index_type_t, value_type_t, size_type_t> const& config)
-    : config_(config)
-  {
-  }
+  explicit kmeans_solver_t(cluster_solver_config_t<index_type_t, value_type_t,
+                                                   size_type_t> const& config)
+    : config_(config) {}
 
   template <typename thrust_exe_policy_t>
-  std::pair<value_type_t, index_type_t> solve(handle_t const& handle,
-                                              thrust_exe_policy_t t_exe_policy,
-                                              size_type_t n_obs_vecs,
-                                              size_type_t dim,
-                                              value_type_t const* __restrict__ obs,
-                                              index_type_t* __restrict__ codes) const
-  {
+  std::pair<value_type_t, index_type_t> solve(
+    handle_t const& handle, thrust_exe_policy_t t_exe_policy,
+    size_type_t n_obs_vecs, size_type_t dim,
+    value_type_t const* __restrict__ obs,
+    index_type_t* __restrict__ codes) const {
     RAFT_EXPECTS(obs != nullptr, "Null obs buffer.");
     RAFT_EXPECTS(codes != nullptr, "Null codes buffer.");
     value_type_t residual{};
     index_type_t iters{};
-    kmeans(handle,
-           t_exe_policy,
-           n_obs_vecs,
-           dim,
-           config_.n_clusters,
-           config_.tol,
-           config_.maxIter,
-           obs,
-           codes,
-           residual,
-           iters,
+    kmeans(handle, t_exe_policy, n_obs_vecs, dim, config_.n_clusters,
+           config_.tol, config_.maxIter, obs, codes, residual, iters,
            config_.seed);
     return std::make_pair(residual, iters);
   }
diff --git a/cpp/include/raft/spectral/eigen_solvers.hpp b/cpp/include/raft/spectral/eigen_solvers.hpp
index 156b996586..e36dca2e0c 100644
--- a/cpp/include/raft/spectral/eigen_solvers.hpp
+++ b/cpp/include/raft/spectral/eigen_solvers.hpp
@@ -23,7 +23,8 @@ using namespace matrix;
 
 // aggregate of control params for Eigen Solver:
 //
-template <typename index_type_t, typename value_type_t, typename size_type_t = index_type_t>
+template <typename index_type_t, typename value_type_t,
+          typename size_type_t = index_type_t>
 struct eigen_solver_config_t {
   size_type_t n_eigVecs;
   size_type_t maxIter;
@@ -33,59 +34,42 @@ struct eigen_solver_config_t {
 
   bool reorthogonalize{false};
   unsigned long long seed{
-    1234567};  // CAVEAT: this default value is now common to all instances of using seed in
-               // Lanczos; was not the case before: there were places where a default seed = 123456
-               // was used; this may trigger slightly different # solver iterations
+    1234567};  // CAVEAT: this default value is now common to all instances of using seed in Lanczos; was not the case before: there were places where a default seed = 123456 was used; this may trigger slightly different # solver iterations
 };
 
-template <typename index_type_t, typename value_type_t, typename size_type_t = index_type_t>
+template <typename index_type_t, typename value_type_t,
+          typename size_type_t = index_type_t>
 struct lanczos_solver_t {
-  explicit lanczos_solver_t(
-    eigen_solver_config_t<index_type_t, value_type_t, size_type_t> const& config)
-    : config_(config)
-  {
-  }
+  explicit lanczos_solver_t(eigen_solver_config_t<index_type_t, value_type_t,
+                                                  size_type_t> const& config)
+    : config_(config) {}
 
-  index_type_t solve_smallest_eigenvectors(handle_t const& handle,
-                                           sparse_matrix_t<index_type_t, value_type_t> const& A,
-                                           value_type_t* __restrict__ eigVals,
-                                           value_type_t* __restrict__ eigVecs) const
-  {
+  index_type_t solve_smallest_eigenvectors(
+    handle_t const& handle,
+    sparse_matrix_t<index_type_t, value_type_t> const& A,
+    value_type_t* __restrict__ eigVals,
+    value_type_t* __restrict__ eigVecs) const {
     RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer.");
     RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer.");
     index_type_t iters{};
-    computeSmallestEigenvectors(handle,
-                                A,
-                                config_.n_eigVecs,
-                                config_.maxIter,
-                                config_.restartIter,
-                                config_.tol,
-                                config_.reorthogonalize,
-                                iters,
-                                eigVals,
-                                eigVecs,
-                                config_.seed);
+    computeSmallestEigenvectors(handle, A, config_.n_eigVecs, config_.maxIter,
+                                config_.restartIter, config_.tol,
+                                config_.reorthogonalize, iters, eigVals,
+                                eigVecs, config_.seed);
     return iters;
   }
 
-  index_type_t solve_largest_eigenvectors(handle_t const& handle,
-                                          sparse_matrix_t<index_type_t, value_type_t> const& A,
-                                          value_type_t* __restrict__ eigVals,
-                                          value_type_t* __restrict__ eigVecs) const
-  {
+  index_type_t solve_largest_eigenvectors(
+    handle_t const& handle,
+    sparse_matrix_t<index_type_t, value_type_t> const& A,
+    value_type_t* __restrict__ eigVals,
+    value_type_t* __restrict__ eigVecs) const {
     RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer.");
     RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer.");
     index_type_t iters{};
-    computeLargestEigenvectors(handle,
-                               A,
-                               config_.n_eigVecs,
-                               config_.maxIter,
-                               config_.restartIter,
-                               config_.tol,
-                               config_.reorthogonalize,
-                               iters,
-                               eigVals,
-                               eigVecs,
+    computeLargestEigenvectors(handle, A, config_.n_eigVecs, config_.maxIter,
+                               config_.restartIter, config_.tol,
+                               config_.reorthogonalize, iters, eigVals, eigVecs,
                                config_.seed);
     return iters;
   }
diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp
index e0c3565b77..fb05bff3e2 100644
--- a/cpp/include/raft/spectral/kmeans.hpp
+++ b/cpp/include/raft/spectral/kmeans.hpp
@@ -44,15 +44,15 @@ using namespace raft::linalg;
 // Useful grid settings
 // =========================================================
 
-constexpr unsigned int BLOCK_SIZE      = 1024;
-constexpr unsigned int WARP_SIZE       = 32;
+constexpr unsigned int BLOCK_SIZE = 1024;
+constexpr unsigned int WARP_SIZE = 32;
 constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE);
 
 // =========================================================
 // CUDA kernels
 // =========================================================
 
-/**
+/** 
  *  @brief Compute distances between observation vectors and centroids
  *    Block dimensions should be (warpSize, 1,
  *    blockSize/warpSize). Ideally, the grid is large enough so there
@@ -76,13 +76,11 @@ constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE);
  *    initialized to zero.
  */
 template <typename index_type_t, typename value_type_t>
-static __global__ void computeDistances(index_type_t n,
-                                        index_type_t d,
-                                        index_type_t k,
-                                        const value_type_t* __restrict__ obs,
-                                        const value_type_t* __restrict__ centroids,
-                                        value_type_t* __restrict__ dists)
-{
+static __global__ void computeDistances(
+  index_type_t n, index_type_t d, index_type_t k,
+  const value_type_t* __restrict__ obs,
+  const value_type_t* __restrict__ centroids,
+  value_type_t* __restrict__ dists) {
   // Loop index
   index_type_t i;
 
@@ -117,10 +115,12 @@ static __global__ void computeDistances(index_type_t n,
 
         // Perform reduction on warp
         for (i = WARP_SIZE / 2; i > 0; i /= 2)
-          dist_private += __shfl_down_sync(warp_full_mask(), dist_private, i, 2 * i);
+          dist_private +=
+            __shfl_down_sync(warp_full_mask(), dist_private, i, 2 * i);
 
         // Write result to global memory
-        if (threadIdx.x == 0) atomicAdd(dists + IDX(gidz, gidy, n), dist_private);
+        if (threadIdx.x == 0)
+          atomicAdd(dists + IDX(gidz, gidy, n), dist_private);
 
         // Move to another observation vector
         gidz += blockDim.z * gridDim.z;
@@ -135,8 +135,8 @@ static __global__ void computeDistances(index_type_t n,
   }
 }
 
-/**
- *  @brief Find closest centroid to observation vectors.
+/** 
+ *  @brief Find closest centroid to observation vectors. 
  *    Block and grid dimensions should be 1-dimensional. Ideally the
  *    grid is large enough so there are n threads.
  *  @tparam index_type_t the type of data used for indexing.
@@ -157,12 +157,10 @@ static __global__ void computeDistances(index_type_t n,
  *    cluster. Entries must be initialized to zero.
  */
 template <typename index_type_t, typename value_type_t>
-static __global__ void minDistances(index_type_t n,
-                                    index_type_t k,
+static __global__ void minDistances(index_type_t n, index_type_t k,
                                     value_type_t* __restrict__ dists,
                                     index_type_t* __restrict__ codes,
-                                    index_type_t* __restrict__ clusterSizes)
-{
+                                    index_type_t* __restrict__ clusterSizes) {
   // Loop index
   index_type_t i, j;
 
@@ -181,8 +179,8 @@ static __global__ void minDistances(index_type_t n,
     dist_min = dists[IDX(i, 0, n)];
     for (j = 1; j < k; ++j) {
       dist_curr = dists[IDX(i, j, n)];
-      code_min  = (dist_curr < dist_min) ? j : code_min;
-      dist_min  = (dist_curr < dist_min) ? dist_curr : dist_min;
+      code_min = (dist_curr < dist_min) ? j : code_min;
+      dist_min = (dist_curr < dist_min) ? dist_curr : dist_min;
     }
 
     // Transfer result to global memory
@@ -197,8 +195,8 @@ static __global__ void minDistances(index_type_t n,
   }
 }
 
-/**
- *  @brief Check if newly computed distances are smaller than old distances.
+/** 
+ *  @brief Check if newly computed distances are smaller than old distances. 
  *    Block and grid dimensions should be 1-dimensional. Ideally the
  *    grid is large enough so there are n threads.
  *  @tparam index_type_t the type of data used for indexing.
@@ -221,8 +219,7 @@ static __global__ void minDistances2(index_type_t n,
                                      value_type_t* __restrict__ dists_old,
                                      const value_type_t* __restrict__ dists_new,
                                      index_type_t* __restrict__ codes_old,
-                                     index_type_t code_new)
-{
+                                     index_type_t code_new) {
   // Loop index
   index_type_t i = threadIdx.x + blockIdx.x * blockDim.x;
 
@@ -247,7 +244,7 @@ static __global__ void minDistances2(index_type_t n,
   }
 }
 
-/**
+/** 
  *  @brief Compute size of k-means clusters.
  *    Block and grid dimensions should be 1-dimensional. Ideally the
  *    grid is large enough so there are n threads.
@@ -259,11 +256,9 @@ static __global__ void minDistances2(index_type_t n,
  *    cluster. Entries must be initialized to zero.
  */
 template <typename index_type_t>
-static __global__ void computeClusterSizes(index_type_t n,
-                                           index_type_t k,
-                                           const index_type_t* __restrict__ codes,
-                                           index_type_t* __restrict__ clusterSizes)
-{
+static __global__ void computeClusterSizes(
+  index_type_t n, index_type_t k, const index_type_t* __restrict__ codes,
+  index_type_t* __restrict__ clusterSizes) {
   index_type_t i = threadIdx.x + blockIdx.x * blockDim.x;
   while (i < n) {
     atomicAdd(clusterSizes + codes[i], 1);
@@ -271,8 +266,8 @@ static __global__ void computeClusterSizes(index_type_t n,
   }
 }
 
-/**
- *  @brief Divide rows of centroid matrix by cluster sizes.
+/** 
+ *  @brief Divide rows of centroid matrix by cluster sizes. 
  *    Divides the ith column of the sum matrix by the size of the ith
  *    cluster. If the sum matrix has been initialized so that the ith
  *    row is the sum of all observation vectors in the ith cluster,
@@ -293,11 +288,9 @@ static __global__ void computeClusterSizes(index_type_t n,
  *    column is the mean position of a cluster).
  */
 template <typename index_type_t, typename value_type_t>
-static __global__ void divideCentroids(index_type_t d,
-                                       index_type_t k,
-                                       const index_type_t* __restrict__ clusterSizes,
-                                       value_type_t* __restrict__ centroids)
-{
+static __global__ void divideCentroids(
+  index_type_t d, index_type_t k, const index_type_t* __restrict__ clusterSizes,
+  value_type_t* __restrict__ centroids) {
   // Global indices
   index_type_t gidx, gidy;
 
@@ -348,17 +341,15 @@ static __global__ void divideCentroids(index_type_t d,
  *    coordinates.
  *  @return Zero if successful. Otherwise non-zero.
  */
-template <typename index_type_t, typename value_type_t, typename thrust_exe_pol_t>
+template <typename index_type_t, typename value_type_t,
+          typename thrust_exe_pol_t>
 static int chooseNewCentroid(handle_t const& handle,
                              thrust_exe_pol_t thrust_exec_policy,
-                             index_type_t n,
-                             index_type_t d,
-                             index_type_t k,
+                             index_type_t n, index_type_t d, index_type_t k,
                              value_type_t rand,
                              const value_type_t* __restrict__ obs,
                              value_type_t* __restrict__ dists,
-                             value_type_t* __restrict__ centroid)
-{
+                             value_type_t* __restrict__ centroid) {
   // Cumulative sum of distances
   value_type_t* distsCumSum = dists + n;
   // Residual sum of squares
@@ -367,43 +358,43 @@ static int chooseNewCentroid(handle_t const& handle,
   index_type_t obsIndex;
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream   = handle.get_stream();
+  auto stream = handle.get_stream();
 
   // Compute cumulative sum of distances
-  thrust::inclusive_scan(thrust_exec_policy,
-                         thrust::device_pointer_cast(dists),
+  thrust::inclusive_scan(thrust_exec_policy, thrust::device_pointer_cast(dists),
                          thrust::device_pointer_cast(dists + n),
                          thrust::device_pointer_cast(distsCumSum));
   CHECK_CUDA(stream);
-  CUDA_TRY(cudaMemcpyAsync(
-    &distsSum, distsCumSum + n - 1, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream));
+  CUDA_TRY(cudaMemcpyAsync(&distsSum, distsCumSum + n - 1, sizeof(value_type_t),
+                           cudaMemcpyDeviceToHost, stream));
 
   // Randomly choose observation vector
   //   Probabilities are proportional to square of distance to closest
   //   centroid (see k-means++ algorithm)
   //
-  // seg-faults due to Thrust bug
-  // on binary-search-like algorithms
-  // when run with stream dependent
-  // execution policies; fixed on Thrust GitHub
-  // hence replace w/ linear interpolation,
-  // until the Thrust issue gets resolved:
+  //seg-faults due to Thrust bug
+  //on binary-search-like algorithms
+  //when run with stream dependent
+  //execution policies; fixed on Thrust GitHub
+  //hence replace w/ linear interpolation,
+  //until the Thrust issue gets resolved:
   //
   // obsIndex = (thrust::lower_bound(
   //               thrust_exec_policy, thrust::device_pointer_cast(distsCumSum),
   //               thrust::device_pointer_cast(distsCumSum + n), distsSum * rand) -
   //             thrust::device_pointer_cast(distsCumSum));
   //
-  // linear interpolation logic:
+  //linear interpolation logic:
   //{
   value_type_t minSum{0};
-  CUDA_TRY(
-    cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream));
+  CUDA_TRY(cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t),
+                           cudaMemcpyDeviceToHost, stream));
   CHECK_CUDA(stream);
 
   if (distsSum > minSum) {
     value_type_t vIndex = static_cast<value_type_t>(n - 1);
-    obsIndex = static_cast<index_type_t>(vIndex * (distsSum * rand - minSum) / (distsSum - minSum));
+    obsIndex = static_cast<index_type_t>(vIndex * (distsSum * rand - minSum) /
+                                         (distsSum - minSum));
   } else {
     obsIndex = 0;
   }
@@ -414,23 +405,21 @@ static int chooseNewCentroid(handle_t const& handle,
   obsIndex = min(obsIndex, n - 1);
 
   // Record new centroid position
-  CUDA_TRY(cudaMemcpyAsync(centroid,
-                           obs + IDX(0, obsIndex, d),
-                           d * sizeof(value_type_t),
-                           cudaMemcpyDeviceToDevice,
+  CUDA_TRY(cudaMemcpyAsync(centroid, obs + IDX(0, obsIndex, d),
+                           d * sizeof(value_type_t), cudaMemcpyDeviceToDevice,
                            stream));
 
   return 0;
 }
 
 /**
- *  @brief Choose initial cluster centroids for k-means algorithm.
+ *  @brief Choose initial cluster centroids for k-means algorithm.  
  *    Centroids are randomly chosen with k-means++ algorithm
  *  @tparam index_type_t the type of data used for indexing.
  *  @tparam value_type_t the type of data used for weights, distances.
  *  @tparam thrust_exe_pol_t the type of thrust execution policy.
  *  @param handle the raft handle.
- *  @param  thrust_exec_policy thrust execution policy
+ *  @param  thrust_exec_policy thrust execution policy 
  *    (assumed to have same stream as handle.stream).
  *  @param n Number of observation vectors.
  *  @param d Dimension of observation vectors.
@@ -450,19 +439,14 @@ static int chooseNewCentroid(handle_t const& handle,
  *    distance between observation vectors and the closest centroid.
  *  @return Zero if successful. Otherwise non-zero.
  */
-template <typename index_type_t, typename value_type_t, typename thrust_exe_pol_t>
-static int initializeCentroids(handle_t const& handle,
-                               thrust_exe_pol_t thrust_exec_policy,
-                               index_type_t n,
-                               index_type_t d,
-                               index_type_t k,
-                               const value_type_t* __restrict__ obs,
-                               value_type_t* __restrict__ centroids,
-                               index_type_t* __restrict__ codes,
-                               index_type_t* __restrict__ clusterSizes,
-                               value_type_t* __restrict__ dists,
-                               unsigned long long seed)
-{
+template <typename index_type_t, typename value_type_t,
+          typename thrust_exe_pol_t>
+static int initializeCentroids(
+  handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, index_type_t n,
+  index_type_t d, index_type_t k, const value_type_t* __restrict__ obs,
+  value_type_t* __restrict__ centroids, index_type_t* __restrict__ codes,
+  index_type_t* __restrict__ clusterSizes, value_type_t* __restrict__ dists,
+  unsigned long long seed) {
   // -------------------------------------------------------
   // Variable declarations
   // -------------------------------------------------------
@@ -475,7 +459,7 @@ static int initializeCentroids(handle_t const& handle,
   thrust::uniform_real_distribution<value_type_t> uniformDist(0, 1);
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream   = handle.get_stream();
+  auto stream = handle.get_stream();
 
   constexpr index_type_t grid_lower_bound{65535};
 
@@ -487,43 +471,36 @@ static int initializeCentroids(handle_t const& handle,
   dim3 blockDim_warp{WARP_SIZE, 1, BSIZE_DIV_WSIZE};
 
   // CUDA grid dimensions
-  dim3 gridDim_warp{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound),
-                    1,
-                    min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound)};
+  dim3 gridDim_warp{
+    min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), 1,
+    min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound)};
 
   // CUDA grid dimensions
-  dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound), 1, 1};
+  dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound),
+                     1, 1};
 
   // Assign observation vectors to code 0
   CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream));
 
   // Choose first centroid
-  thrust::fill(thrust_exec_policy,
-               thrust::device_pointer_cast(dists),
-               thrust::device_pointer_cast(dists + n),
-               1);
+  thrust::fill(thrust_exec_policy, thrust::device_pointer_cast(dists),
+               thrust::device_pointer_cast(dists + n), 1);
   CHECK_CUDA(stream);
-  if (chooseNewCentroid(
-        handle, thrust_exec_policy, n, d, k, uniformDist(rng), obs, dists, centroids))
+  if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng),
+                        obs, dists, centroids))
     WARNING("error in k-means++ (could not pick centroid)");
 
   // Compute distances from first centroid
   CUDA_TRY(cudaMemsetAsync(dists, 0, n * sizeof(value_type_t), stream));
-  computeDistances<<<gridDim_warp, blockDim_warp, 0, stream>>>(n, d, 1, obs, centroids, dists);
+  computeDistances<<<gridDim_warp, blockDim_warp, 0, stream>>>(
+    n, d, 1, obs, centroids, dists);
   CHECK_CUDA(stream);
 
   // Choose remaining centroids
   for (i = 1; i < k; ++i) {
     // Choose ith centroid
-    if (chooseNewCentroid(handle,
-                          thrust_exec_policy,
-                          n,
-                          d,
-                          k,
-                          uniformDist(rng),
-                          obs,
-                          dists,
-                          centroids + IDX(0, i, d)))
+    if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k, uniformDist(rng),
+                          obs, dists, centroids + IDX(0, i, d)))
       WARNING("error in k-means++ (could not pick centroid)");
 
     // Compute distances from ith centroid
@@ -533,20 +510,22 @@ static int initializeCentroids(handle_t const& handle,
     CHECK_CUDA(stream);
 
     // Recompute minimum distances
-    minDistances2<<<gridDim_block, BLOCK_SIZE, 0, stream>>>(n, dists, dists + n, codes, i);
+    minDistances2<<<gridDim_block, BLOCK_SIZE, 0, stream>>>(n, dists, dists + n,
+                                                            codes, i);
     CHECK_CUDA(stream);
   }
 
   // Compute cluster sizes
   CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream));
-  computeClusterSizes<<<gridDim_block, BLOCK_SIZE, 0, stream>>>(n, k, codes, clusterSizes);
+  computeClusterSizes<<<gridDim_block, BLOCK_SIZE, 0, stream>>>(n, k, codes,
+                                                                clusterSizes);
   CHECK_CUDA(stream);
 
   return 0;
 }
 
-/**
- *  @brief Find cluster centroids closest to observation vectors.
+/** 
+ *  @brief Find cluster centroids closest to observation vectors. 
  *    Distance is measured with Euclidean norm.
  *  @tparam index_type_t the type of data used for indexing.
  *  @tparam value_type_t the type of data used for weights, distances.
@@ -574,21 +553,16 @@ static int initializeCentroids(handle_t const& handle,
  *    of squares of assignment.
  *  @return Zero if successful. Otherwise non-zero.
  */
-template <typename index_type_t, typename value_type_t, typename thrust_exe_pol_t>
-static int assignCentroids(handle_t const& handle,
-                           thrust_exe_pol_t thrust_exec_policy,
-                           index_type_t n,
-                           index_type_t d,
-                           index_type_t k,
-                           const value_type_t* __restrict__ obs,
-                           const value_type_t* __restrict__ centroids,
-                           value_type_t* __restrict__ dists,
-                           index_type_t* __restrict__ codes,
-                           index_type_t* __restrict__ clusterSizes,
-                           value_type_t* residual_host)
-{
+template <typename index_type_t, typename value_type_t,
+          typename thrust_exe_pol_t>
+static int assignCentroids(
+  handle_t const& handle, thrust_exe_pol_t thrust_exec_policy, index_type_t n,
+  index_type_t d, index_type_t k, const value_type_t* __restrict__ obs,
+  const value_type_t* __restrict__ centroids, value_type_t* __restrict__ dists,
+  index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes,
+  value_type_t* residual_host) {
   auto cublas_h = handle.get_cublas_handle();
-  auto stream   = handle.get_stream();
+  auto stream = handle.get_stream();
 
   // Compute distance between centroids and observation vectors
   CUDA_TRY(cudaMemsetAsync(dists, 0, n * k * sizeof(value_type_t), stream));
@@ -600,9 +574,11 @@ static int assignCentroids(handle_t const& handle,
   constexpr index_type_t grid_lower_bound{65535};
   gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound);
   gridDim.y = min(k, grid_lower_bound);
-  gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound);
+  gridDim.z =
+    min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound);
 
-  computeDistances<<<gridDim, blockDim, 0, stream>>>(n, d, k, obs, centroids, dists);
+  computeDistances<<<gridDim, blockDim, 0, stream>>>(n, d, k, obs, centroids,
+                                                     dists);
   CHECK_CUDA(stream);
 
   // Find centroid closest to each observation vector
@@ -610,21 +586,23 @@ static int assignCentroids(handle_t const& handle,
   blockDim.x = BLOCK_SIZE;
   blockDim.y = 1;
   blockDim.z = 1;
-  gridDim.x  = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound);
-  gridDim.y  = 1;
-  gridDim.z  = 1;
-  minDistances<<<gridDim, blockDim, 0, stream>>>(n, k, dists, codes, clusterSizes);
+  gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound);
+  gridDim.y = 1;
+  gridDim.z = 1;
+  minDistances<<<gridDim, blockDim, 0, stream>>>(n, k, dists, codes,
+                                                 clusterSizes);
   CHECK_CUDA(stream);
 
   // Compute residual sum of squares
-  *residual_host = thrust::reduce(
-    thrust_exec_policy, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n));
+  *residual_host =
+    thrust::reduce(thrust_exec_policy, thrust::device_pointer_cast(dists),
+                   thrust::device_pointer_cast(dists + n));
 
   return 0;
 }
 
-/**
- *  @brief Update cluster centroids for k-means algorithm.
+/** 
+ *  @brief Update cluster centroids for k-means algorithm. 
  *    All clusters are assumed to be non-empty.
  *  @tparam index_type_t the type of data used for indexing.
  *  @tparam value_type_t the type of data used for weights, distances.
@@ -650,31 +628,29 @@ static int assignCentroids(handle_t const& handle,
  *    Workspace.
  *  @return Zero if successful. Otherwise non-zero.
  */
-template <typename index_type_t, typename value_type_t, typename thrust_exe_pol_t>
+template <typename index_type_t, typename value_type_t,
+          typename thrust_exe_pol_t>
 static int updateCentroids(handle_t const& handle,
-                           thrust_exe_pol_t thrust_exec_policy,
-                           index_type_t n,
-                           index_type_t d,
-                           index_type_t k,
+                           thrust_exe_pol_t thrust_exec_policy, index_type_t n,
+                           index_type_t d, index_type_t k,
                            const value_type_t* __restrict__ obs,
                            const index_type_t* __restrict__ codes,
                            const index_type_t* __restrict__ clusterSizes,
                            value_type_t* __restrict__ centroids,
                            value_type_t* __restrict__ work,
-                           index_type_t* __restrict__ work_int)
-{
+                           index_type_t* __restrict__ work_int) {
   // -------------------------------------------------------
   // Variable declarations
   // -------------------------------------------------------
 
   // Useful constants
-  const value_type_t one  = 1;
+  const value_type_t one = 1;
   const value_type_t zero = 0;
 
   constexpr index_type_t grid_lower_bound{65535};
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream   = handle.get_stream();
+  auto stream = handle.get_stream();
 
   // Device memory
   thrust::device_ptr<value_type_t> obs_copy(work);
@@ -682,56 +658,34 @@ static int updateCentroids(handle_t const& handle,
   thrust::device_ptr<index_type_t> rows(work_int + d * n);
 
   // Take transpose of observation matrix
-  CUBLAS_CHECK(cublasgeam(cublas_h,
-                          CUBLAS_OP_T,
-                          CUBLAS_OP_N,
-                          n,
-                          d,
-                          &one,
-                          obs,
-                          d,
-                          &zero,
-                          (value_type_t*)NULL,
-                          n,
-                          thrust::raw_pointer_cast(obs_copy),
-                          n,
-                          stream));
+  CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, n, d, &one, obs,
+                          d, &zero, (value_type_t*)NULL, n,
+                          thrust::raw_pointer_cast(obs_copy), n, stream));
 
   // Cluster assigned to each observation matrix entry
   thrust::sequence(thrust_exec_policy, rows, rows + d * n);
   CHECK_CUDA(stream);
-  thrust::transform(thrust_exec_policy,
-                    rows,
-                    rows + d * n,
-                    thrust::make_constant_iterator<index_type_t>(n),
-                    rows,
+  thrust::transform(thrust_exec_policy, rows, rows + d * n,
+                    thrust::make_constant_iterator<index_type_t>(n), rows,
                     thrust::modulus<index_type_t>());
   CHECK_CUDA(stream);
-  thrust::gather(
-    thrust_exec_policy, rows, rows + d * n, thrust::device_pointer_cast(codes), codes_copy);
+  thrust::gather(thrust_exec_policy, rows, rows + d * n,
+                 thrust::device_pointer_cast(codes), codes_copy);
   CHECK_CUDA(stream);
 
   // Row associated with each observation matrix entry
   thrust::sequence(thrust_exec_policy, rows, rows + d * n);
   CHECK_CUDA(stream);
-  thrust::transform(thrust_exec_policy,
-                    rows,
-                    rows + d * n,
-                    thrust::make_constant_iterator<index_type_t>(n),
-                    rows,
+  thrust::transform(thrust_exec_policy, rows, rows + d * n,
+                    thrust::make_constant_iterator<index_type_t>(n), rows,
                     thrust::divides<index_type_t>());
   CHECK_CUDA(stream);
 
   // Sort and reduce to add observation vectors in same cluster
-  thrust::stable_sort_by_key(thrust_exec_policy,
-                             codes_copy,
-                             codes_copy + d * n,
+  thrust::stable_sort_by_key(thrust_exec_policy, codes_copy, codes_copy + d * n,
                              make_zip_iterator(make_tuple(obs_copy, rows)));
   CHECK_CUDA(stream);
-  thrust::reduce_by_key(thrust_exec_policy,
-                        rows,
-                        rows + d * n,
-                        obs_copy,
+  thrust::reduce_by_key(thrust_exec_policy, rows, rows + d * n, obs_copy,
                         codes_copy,  // Output to codes_copy is ignored
                         thrust::device_pointer_cast(centroids));
   CHECK_CUDA(stream);
@@ -742,11 +696,12 @@ static int updateCentroids(handle_t const& handle,
   dim3 blockDim{WARP_SIZE, BLOCK_SIZE / WARP_SIZE, 1};
 
   // CUDA grid dimensions
-  dim3 gridDim{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound),
-               min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound),
-               1};
+  dim3 gridDim{
+    min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound),
+    min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound), 1};
 
-  divideCentroids<<<gridDim, blockDim, 0, stream>>>(d, k, clusterSizes, centroids);
+  divideCentroids<<<gridDim, blockDim, 0, stream>>>(d, k, clusterSizes,
+                                                    centroids);
   CHECK_CUDA(stream);
 
   return 0;
@@ -760,8 +715,8 @@ namespace raft {
 // k-means algorithm
 // =========================================================
 
-/**
- *  @brief Find clusters with k-means algorithm.
+/** 
+ *  @brief Find clusters with k-means algorithm. 
  *    Initial centroids are chosen with k-means++ algorithm. Empty
  *    clusters are reinitialized by choosing new centroids with
  *    k-means++ algorithm.
@@ -799,24 +754,17 @@ namespace raft {
  *  @param seed random seed to be used.
  *  @return error flag.
  */
-template <typename index_type_t, typename value_type_t, typename thrust_exe_pol_t>
-int kmeans(handle_t const& handle,
-           thrust_exe_pol_t thrust_exec_policy,
-           index_type_t n,
-           index_type_t d,
-           index_type_t k,
-           value_type_t tol,
-           index_type_t maxiter,
-           const value_type_t* __restrict__ obs,
+template <typename index_type_t, typename value_type_t,
+          typename thrust_exe_pol_t>
+int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy,
+           index_type_t n, index_type_t d, index_type_t k, value_type_t tol,
+           index_type_t maxiter, const value_type_t* __restrict__ obs,
            index_type_t* __restrict__ codes,
            index_type_t* __restrict__ clusterSizes,
            value_type_t* __restrict__ centroids,
-           value_type_t* __restrict__ work,
-           index_type_t* __restrict__ work_int,
-           value_type_t* residual_host,
-           index_type_t* iters_host,
-           unsigned long long seed)
-{
+           value_type_t* __restrict__ work, index_type_t* __restrict__ work_int,
+           value_type_t* residual_host, index_type_t* iters_host,
+           unsigned long long seed) {
   // -------------------------------------------------------
   // Variable declarations
   // -------------------------------------------------------
@@ -838,120 +786,100 @@ int kmeans(handle_t const& handle,
   // -------------------------------------------------------
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream   = handle.get_stream();
+  auto stream = handle.get_stream();
 
   // Trivial cases
   if (k == 1) {
     CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream));
-    CUDA_TRY(
-      cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), cudaMemcpyHostToDevice, stream));
-    if (updateCentroids(
-          handle, thrust_exec_policy, n, d, k, obs, codes, clusterSizes, centroids, work, work_int))
+    CUDA_TRY(cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t),
+                             cudaMemcpyHostToDevice, stream));
+    if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes,
+                        clusterSizes, centroids, work, work_int))
       WARNING("could not compute k-means centroids");
 
     dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE};
 
     dim3 gridDim{
-      min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound),
-      1,
-      min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), grid_lower_bound)};
+      min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), 1,
+      min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE),
+          grid_lower_bound)};
 
     CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream));
-    computeDistances<<<gridDim, blockDim, 0, stream>>>(n, d, 1, obs, centroids, work);
+    computeDistances<<<gridDim, blockDim, 0, stream>>>(n, d, 1, obs, centroids,
+                                                       work);
     CHECK_CUDA(stream);
-    *residual_host = thrust::reduce(
-      thrust_exec_policy, thrust::device_pointer_cast(work), thrust::device_pointer_cast(work + n));
+    *residual_host =
+      thrust::reduce(thrust_exec_policy, thrust::device_pointer_cast(work),
+                     thrust::device_pointer_cast(work + n));
     CHECK_CUDA(stream);
     return 0;
   }
   if (n <= k) {
-    thrust::sequence(thrust_exec_policy,
-                     thrust::device_pointer_cast(codes),
+    thrust::sequence(thrust_exec_policy, thrust::device_pointer_cast(codes),
                      thrust::device_pointer_cast(codes + n));
     CHECK_CUDA(stream);
-    thrust::fill_n(thrust_exec_policy, thrust::device_pointer_cast(clusterSizes), n, 1);
+    thrust::fill_n(thrust_exec_policy,
+                   thrust::device_pointer_cast(clusterSizes), n, 1);
     CHECK_CUDA(stream);
 
     if (n < k)
-      CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0, (k - n) * sizeof(index_type_t), stream));
-    CUDA_TRY(cudaMemcpyAsync(
-      centroids, obs, d * n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream));
+      CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0,
+                               (k - n) * sizeof(index_type_t), stream));
+    CUDA_TRY(cudaMemcpyAsync(centroids, obs, d * n * sizeof(value_type_t),
+                             cudaMemcpyDeviceToDevice, stream));
     *residual_host = 0;
     return 0;
   }
 
   // Initialize cuBLAS
-  CUBLAS_CHECK(linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+  CUBLAS_CHECK(
+    linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
 
   // -------------------------------------------------------
   // k-means++ algorithm
   // -------------------------------------------------------
 
   // Choose initial cluster centroids
-  if (initializeCentroids(
-        handle, thrust_exec_policy, n, d, k, obs, centroids, codes, clusterSizes, work, seed))
+  if (initializeCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids,
+                          codes, clusterSizes, work, seed))
     WARNING("could not initialize k-means centroids");
 
   // Apply k-means iteration until convergence
   for (iter = 0; iter < maxiter; ++iter) {
     // Update cluster centroids
-    if (updateCentroids(
-          handle, thrust_exec_policy, n, d, k, obs, codes, clusterSizes, centroids, work, work_int))
+    if (updateCentroids(handle, thrust_exec_policy, n, d, k, obs, codes,
+                        clusterSizes, centroids, work, work_int))
       WARNING("could not update k-means centroids");
 
     // Determine centroid closest to each observation
     residualPrev = *residual_host;
-    if (assignCentroids(handle,
-                        thrust_exec_policy,
-                        n,
-                        d,
-                        k,
-                        obs,
-                        centroids,
-                        work,
-                        codes,
-                        clusterSizes,
-                        residual_host))
+    if (assignCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids,
+                        work, codes, clusterSizes, residual_host))
       WARNING("could not assign observation vectors to k-means clusters");
 
     // Reinitialize empty clusters with new centroids
-    index_type_t emptyCentroid = (thrust::find(thrust_exec_policy,
-                                               thrust::device_pointer_cast(clusterSizes),
-                                               thrust::device_pointer_cast(clusterSizes + k),
-                                               0) -
-                                  thrust::device_pointer_cast(clusterSizes));
+    index_type_t emptyCentroid =
+      (thrust::find(thrust_exec_policy,
+                    thrust::device_pointer_cast(clusterSizes),
+                    thrust::device_pointer_cast(clusterSizes + k), 0) -
+       thrust::device_pointer_cast(clusterSizes));
 
     // FIXME: emptyCentroid never reaches k (infinite loop) under certain
     // conditions, such as if obs is corrupt (as seen as a result of a
     // DataFrame column of NULL edge vals used to create the Graph)
     while (emptyCentroid < k) {
-      if (chooseNewCentroid(handle,
-                            thrust_exec_policy,
-                            n,
-                            d,
-                            k,
-                            uniformDist(rng),
-                            obs,
-                            work,
+      if (chooseNewCentroid(handle, thrust_exec_policy, n, d, k,
+                            uniformDist(rng), obs, work,
                             centroids + IDX(0, emptyCentroid, d)))
         WARNING("could not replace empty centroid");
-      if (assignCentroids(handle,
-                          thrust_exec_policy,
-                          n,
-                          d,
-                          k,
-                          obs,
-                          centroids,
-                          work,
-                          codes,
-                          clusterSizes,
-                          residual_host))
+      if (assignCentroids(handle, thrust_exec_policy, n, d, k, obs, centroids,
+                          work, codes, clusterSizes, residual_host))
         WARNING("could not assign observation vectors to k-means clusters");
-      emptyCentroid = (thrust::find(thrust_exec_policy,
-                                    thrust::device_pointer_cast(clusterSizes),
-                                    thrust::device_pointer_cast(clusterSizes + k),
-                                    0) -
-                       thrust::device_pointer_cast(clusterSizes));
+      emptyCentroid =
+        (thrust::find(thrust_exec_policy,
+                      thrust::device_pointer_cast(clusterSizes),
+                      thrust::device_pointer_cast(clusterSizes + k), 0) -
+         thrust::device_pointer_cast(clusterSizes));
       CHECK_CUDA(stream);
     }
 
@@ -963,13 +891,14 @@ int kmeans(handle_t const& handle,
   }
 
   // Warning if k-means has failed to converge
-  if (std::fabs(residualPrev - (*residual_host)) / n >= tol) WARNING("k-means failed to converge");
+  if (std::fabs(residualPrev - (*residual_host)) / n >= tol)
+    WARNING("k-means failed to converge");
 
   *iters_host = iter;
   return 0;
 }
 
-/**
+/** 
  *  @brief Find clusters with k-means algorithm.
  *    Initial centroids are chosen with k-means++ algorithm. Empty
  *    clusters are reinitialized by choosing new centroids with
@@ -997,20 +926,13 @@ int kmeans(handle_t const& handle,
  *  @param seed random seed to be used.
  *  @return error flag
  */
-template <typename index_type_t, typename value_type_t, typename thrust_exe_pol_t>
-int kmeans(handle_t const& handle,
-           thrust_exe_pol_t thrust_exec_policy,
-           index_type_t n,
-           index_type_t d,
-           index_type_t k,
-           value_type_t tol,
-           index_type_t maxiter,
-           const value_type_t* __restrict__ obs,
-           index_type_t* __restrict__ codes,
-           value_type_t& residual,
-           index_type_t& iters,
-           unsigned long long seed = 123456)
-{
+template <typename index_type_t, typename value_type_t,
+          typename thrust_exe_pol_t>
+int kmeans(handle_t const& handle, thrust_exe_pol_t thrust_exec_policy,
+           index_type_t n, index_type_t d, index_type_t k, value_type_t tol,
+           index_type_t maxiter, const value_type_t* __restrict__ obs,
+           index_type_t* __restrict__ codes, value_type_t& residual,
+           index_type_t& iters, unsigned long long seed = 123456) {
   using namespace matrix;
 
   // Check that parameters are valid
@@ -1027,22 +949,10 @@ int kmeans(handle_t const& handle,
   vector_t<index_type_t> work_int(handle, 2 * d * n);
 
   // Perform k-means
-  return kmeans<index_type_t, value_type_t>(handle,
-                                            thrust_exec_policy,
-                                            n,
-                                            d,
-                                            k,
-                                            tol,
-                                            maxiter,
-                                            obs,
-                                            codes,
-                                            clusterSizes.raw(),
-                                            centroids.raw(),
-                                            work.raw(),
-                                            work_int.raw(),
-                                            &residual,
-                                            &iters,
-                                            seed);
+  return kmeans<index_type_t, value_type_t>(
+    handle, thrust_exec_policy, n, d, k, tol, maxiter, obs, codes,
+    clusterSizes.raw(), centroids.raw(), work.raw(), work_int.raw(), &residual,
+    &iters, seed);
 }
 
 }  // namespace raft
diff --git a/cpp/include/raft/spectral/lapack.hpp b/cpp/include/raft/spectral/lapack.hpp
index 35fc22c770..d14bf05f37 100644
--- a/cpp/include/raft/spectral/lapack.hpp
+++ b/cpp/include/raft/spectral/lapack.hpp
@@ -21,125 +21,66 @@
 #include <raft/linalg/cusolver_wrappers.h>
 #include <raft/error.hpp>
 
-// for now; TODO: check if/where this `define` should be;
+//for now; TODO: check if/where this `define` should be;
 //
 #define USE_LAPACK
 
 namespace raft {
 
-#define lapackCheckError(status)                                                     \
-  {                                                                                  \
-    if (status < 0) {                                                                \
-      std::stringstream ss;                                                          \
-      ss << "Lapack error: argument number " << -status << " had an illegal value."; \
-      throw exception(ss.str());                                                     \
-    } else if (status > 0)                                                           \
-      RAFT_FAIL("Lapack error: internal error.");                                    \
+#define lapackCheckError(status)                        \
+  {                                                     \
+    if (status < 0) {                                   \
+      std::stringstream ss;                             \
+      ss << "Lapack error: argument number " << -status \
+         << " had an illegal value.";                   \
+      throw exception(ss.str());                        \
+    } else if (status > 0)                              \
+      RAFT_FAIL("Lapack error: internal error.");       \
   }
 
-extern "C" void sgeqrf_(
-  int* m, int* n, float* a, int* lda, float* tau, float* work, int* lwork, int* info);
-extern "C" void dgeqrf_(
-  int* m, int* n, double* a, int* lda, double* tau, double* work, int* lwork, int* info);
-extern "C" void sormqr_(char* side,
-                        char* trans,
-                        int* m,
-                        int* n,
-                        int* k,
-                        float* a,
-                        int* lda,
-                        const float* tau,
-                        float* c,
-                        int* ldc,
-                        float* work,
-                        int* lwork,
-                        int* info);
-extern "C" void dormqr_(char* side,
-                        char* trans,
-                        int* m,
-                        int* n,
-                        int* k,
-                        double* a,
-                        int* lda,
-                        const double* tau,
-                        double* c,
-                        int* ldc,
-                        double* work,
-                        int* lwork,
-                        int* info);
-extern "C" int dgeev_(char* jobvl,
-                      char* jobvr,
-                      int* n,
-                      double* a,
-                      int* lda,
-                      double* wr,
-                      double* wi,
-                      double* vl,
-                      int* ldvl,
-                      double* vr,
-                      int* ldvr,
-                      double* work,
-                      int* lwork,
-                      int* info);
-
-extern "C" int sgeev_(char* jobvl,
-                      char* jobvr,
-                      int* n,
-                      float* a,
-                      int* lda,
-                      float* wr,
-                      float* wi,
-                      float* vl,
-                      int* ldvl,
-                      float* vr,
-                      int* ldvr,
-                      float* work,
-                      int* lwork,
-                      int* info);
-
-extern "C" cusolverStatus_t cusolverDnSgemmHost(cublasOperation_t transa,
-                                                cublasOperation_t transb,
-                                                int m,
-                                                int n,
-                                                int k,
-                                                const float* alpha,
-                                                const float* A,
-                                                int lda,
-                                                const float* B,
-                                                int ldb,
-                                                const float* beta,
-                                                float* C,
-                                                int ldc);
-
-extern "C" cusolverStatus_t cusolverDnDgemmHost(cublasOperation_t transa,
-                                                cublasOperation_t transb,
-                                                int m,
-                                                int n,
-                                                int k,
-                                                const double* alpha,
-                                                const double* A,
-                                                int lda,
-                                                const double* B,
-                                                int ldb,
-                                                const double* beta,
-                                                double* C,
-                                                int ldc);
-
-extern "C" cusolverStatus_t cusolverDnSsterfHost(int n, float* d, float* e, int* info);
-
-extern "C" cusolverStatus_t cusolverDnDsterfHost(int n, double* d, double* e, int* info);
-
-extern "C" cusolverStatus_t cusolverDnSsteqrHost(
-  const signed char* compz, int n, float* d, float* e, float* z, int ldz, float* work, int* info);
-
-extern "C" cusolverStatus_t cusolverDnDsteqrHost(const signed char* compz,
-                                                 int n,
-                                                 double* d,
-                                                 double* e,
-                                                 double* z,
-                                                 int ldz,
-                                                 double* work,
-                                                 int* info);
+extern "C" void sgeqrf_(int *m, int *n, float *a, int *lda, float *tau,
+                        float *work, int *lwork, int *info);
+extern "C" void dgeqrf_(int *m, int *n, double *a, int *lda, double *tau,
+                        double *work, int *lwork, int *info);
+extern "C" void sormqr_(char *side, char *trans, int *m, int *n, int *k,
+                        float *a, int *lda, const float *tau, float *c,
+                        int *ldc, float *work, int *lwork, int *info);
+extern "C" void dormqr_(char *side, char *trans, int *m, int *n, int *k,
+                        double *a, int *lda, const double *tau, double *c,
+                        int *ldc, double *work, int *lwork, int *info);
+extern "C" int dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda,
+                      double *wr, double *wi, double *vl, int *ldvl, double *vr,
+                      int *ldvr, double *work, int *lwork, int *info);
+
+extern "C" int sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda,
+                      float *wr, float *wi, float *vl, int *ldvl, float *vr,
+                      int *ldvr, float *work, int *lwork, int *info);
+
+extern "C" cusolverStatus_t cusolverDnSgemmHost(
+  cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
+  const float *alpha, const float *A, int lda, const float *B, int ldb,
+  const float *beta, float *C, int ldc);
+
+extern "C" cusolverStatus_t cusolverDnDgemmHost(
+  cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
+  const double *alpha, const double *A, int lda, const double *B, int ldb,
+  const double *beta, double *C, int ldc);
+
+extern "C" cusolverStatus_t cusolverDnSsterfHost(int n, float *d, float *e,
+                                                 int *info);
+
+extern "C" cusolverStatus_t cusolverDnDsterfHost(int n, double *d, double *e,
+                                                 int *info);
+
+extern "C" cusolverStatus_t cusolverDnSsteqrHost(const signed char *compz,
+                                                 int n, float *d, float *e,
+                                                 float *z, int ldz, float *work,
+                                                 int *info);
+
+extern "C" cusolverStatus_t cusolverDnDsteqrHost(const signed char *compz,
+                                                 int n, double *d, double *e,
+                                                 double *z, int ldz,
+                                                 double *work, int *info);
 
 template <typename T>
 class Lapack {
@@ -150,339 +91,182 @@ class Lapack {
  public:
   static void check_lapack_enabled();
 
-  static void gemm(bool transa,
-                   bool transb,
-                   int m,
-                   int n,
-                   int k,
-                   T alpha,
-                   const T* A,
-                   int lda,
-                   const T* B,
-                   int ldb,
-                   T beta,
-                   T* C,
+  static void gemm(bool transa, bool transb, int m, int n, int k, T alpha,
+                   const T *A, int lda, const T *B, int ldb, T beta, T *C,
                    int ldc);
 
   // special QR for lanczos
-  static void sterf(int n, T* d, T* e);
-  static void steqr(char compz, int n, T* d, T* e, T* z, int ldz, T* work);
+  static void sterf(int n, T *d, T *e);
+  static void steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work);
 
   // QR
   // computes the QR factorization of a general matrix
-  static void geqrf(int m, int n, T* a, int lda, T* tau, T* work, int* lwork);
+  static void geqrf(int m, int n, T *a, int lda, T *tau, T *work, int *lwork);
   // Generates the real orthogonal matrix Q of the QR factorization formed by geqrf.
 
   // multiply C by implicit Q
-  static void ormqr(bool right_side,
-                    bool transq,
-                    int m,
-                    int n,
-                    int k,
-                    T* a,
-                    int lda,
-                    T* tau,
-                    T* c,
-                    int ldc,
-                    T* work,
-                    int* lwork);
-
-  static void geev(T* A, T* eigenvalues, int dim, int lda);
-  static void geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr);
-  static void geev(T* A,
-                   T* eigenvalues_r,
-                   T* eigenvalues_i,
-                   T* eigenvectors_r,
-                   T* eigenvectors_i,
-                   int dim,
-                   int lda,
+  static void ormqr(bool right_side, bool transq, int m, int n, int k, T *a,
+                    int lda, T *tau, T *c, int ldc, T *work, int *lwork);
+
+  static void geev(T *A, T *eigenvalues, int dim, int lda);
+  static void geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda,
                    int ldvr);
+  static void geev(T *A, T *eigenvalues_r, T *eigenvalues_i, T *eigenvectors_r,
+                   T *eigenvectors_i, int dim, int lda, int ldvr);
 
  private:
-  static void lapack_gemm(const char transa,
-                          const char transb,
-                          int m,
-                          int n,
-                          int k,
-                          float alpha,
-                          const float* a,
-                          int lda,
-                          const float* b,
-                          int ldb,
-                          float beta,
-                          float* c,
-                          int ldc)
-  {
-    cublasOperation_t cublas_transa = (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
-    cublasOperation_t cublas_transb = (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
-    cusolverDnSgemmHost(
-      cublas_transa, cublas_transb, m, n, k, &alpha, (float*)a, lda, (float*)b, ldb, &beta, c, ldc);
+  static void lapack_gemm(const char transa, const char transb, int m, int n,
+                          int k, float alpha, const float *a, int lda,
+                          const float *b, int ldb, float beta, float *c,
+                          int ldc) {
+    cublasOperation_t cublas_transa =
+      (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
+    cublasOperation_t cublas_transb =
+      (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
+    cusolverDnSgemmHost(cublas_transa, cublas_transb, m, n, k, &alpha,
+                        (float *)a, lda, (float *)b, ldb, &beta, c, ldc);
   }
 
-  static void lapack_gemm(const signed char transa,
-                          const signed char transb,
-                          int m,
-                          int n,
-                          int k,
-                          double alpha,
-                          const double* a,
-                          int lda,
-                          const double* b,
-                          int ldb,
-                          double beta,
-                          double* c,
-                          int ldc)
-  {
-    cublasOperation_t cublas_transa = (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
-    cublasOperation_t cublas_transb = (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
-    cusolverDnDgemmHost(cublas_transa,
-                        cublas_transb,
-                        m,
-                        n,
-                        k,
-                        &alpha,
-                        (double*)a,
-                        lda,
-                        (double*)b,
-                        ldb,
-                        &beta,
-                        c,
-                        ldc);
+  static void lapack_gemm(const signed char transa, const signed char transb,
+                          int m, int n, int k, double alpha, const double *a,
+                          int lda, const double *b, int ldb, double beta,
+                          double *c, int ldc) {
+    cublasOperation_t cublas_transa =
+      (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
+    cublasOperation_t cublas_transb =
+      (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
+    cusolverDnDgemmHost(cublas_transa, cublas_transb, m, n, k, &alpha,
+                        (double *)a, lda, (double *)b, ldb, &beta, c, ldc);
   }
 
-  static void lapack_sterf(int n, float* d, float* e, int* info)
-  {
+  static void lapack_sterf(int n, float *d, float *e, int *info) {
     cusolverDnSsterfHost(n, d, e, info);
   }
 
-  static void lapack_sterf(int n, double* d, double* e, int* info)
-  {
+  static void lapack_sterf(int n, double *d, double *e, int *info) {
     cusolverDnDsterfHost(n, d, e, info);
   }
 
-  static void lapack_steqr(
-    const signed char compz, int n, float* d, float* e, float* z, int ldz, float* work, int* info)
-  {
+  static void lapack_steqr(const signed char compz, int n, float *d, float *e,
+                           float *z, int ldz, float *work, int *info) {
     cusolverDnSsteqrHost(&compz, n, d, e, z, ldz, work, info);
   }
 
-  static void lapack_steqr(const signed char compz,
-                           int n,
-                           double* d,
-                           double* e,
-                           double* z,
-                           int ldz,
-                           double* work,
-                           int* info)
-  {
+  static void lapack_steqr(const signed char compz, int n, double *d, double *e,
+                           double *z, int ldz, double *work, int *info) {
     cusolverDnDsteqrHost(&compz, n, d, e, z, ldz, work, info);
   }
 
-  static void lapack_geqrf(
-    int m, int n, float* a, int lda, float* tau, float* work, int* lwork, int* info)
-  {
+  static void lapack_geqrf(int m, int n, float *a, int lda, float *tau,
+                           float *work, int *lwork, int *info) {
     sgeqrf_(&m, &n, a, &lda, tau, work, lwork, info);
   }
 
-  static void lapack_geqrf(
-    int m, int n, double* a, int lda, double* tau, double* work, int* lwork, int* info)
-  {
+  static void lapack_geqrf(int m, int n, double *a, int lda, double *tau,
+                           double *work, int *lwork, int *info) {
     dgeqrf_(&m, &n, a, &lda, tau, work, lwork, info);
   }
 
-  static void lapack_ormqr(char side,
-                           char trans,
-                           int m,
-                           int n,
-                           int k,
-                           float* a,
-                           int lda,
-                           float* tau,
-                           float* c,
-                           int ldc,
-                           float* work,
-                           int* lwork,
-                           int* info)
-  {
-    sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info);
+  static void lapack_ormqr(char side, char trans, int m, int n, int k, float *a,
+                           int lda, float *tau, float *c, int ldc, float *work,
+                           int *lwork, int *info) {
+    sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork,
+            info);
   }
 
-  static void lapack_ormqr(char side,
-                           char trans,
-                           int m,
-                           int n,
-                           int k,
-                           double* a,
-                           int lda,
-                           double* tau,
-                           double* c,
-                           int ldc,
-                           double* work,
-                           int* lwork,
-                           int* info)
-  {
-    dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info);
+  static void lapack_ormqr(char side, char trans, int m, int n, int k,
+                           double *a, int lda, double *tau, double *c, int ldc,
+                           double *work, int *lwork, int *info) {
+    dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork,
+            info);
   }
 
-  static int lapack_geev_dispatch(char* jobvl,
-                                  char* jobvr,
-                                  int* n,
-                                  double* a,
-                                  int* lda,
-                                  double* wr,
-                                  double* wi,
-                                  double* vl,
-                                  int* ldvl,
-                                  double* vr,
-                                  int* ldvr,
-                                  double* work,
-                                  int* lwork,
-                                  int* info)
-  {
-    return dgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, lwork, info);
+  static int lapack_geev_dispatch(char *jobvl, char *jobvr, int *n, double *a,
+                                  int *lda, double *wr, double *wi, double *vl,
+                                  int *ldvl, double *vr, int *ldvr,
+                                  double *work, int *lwork, int *info) {
+    return dgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work,
+                  lwork, info);
   }
 
-  static int lapack_geev_dispatch(char* jobvl,
-                                  char* jobvr,
-                                  int* n,
-                                  float* a,
-                                  int* lda,
-                                  float* wr,
-                                  float* wi,
-                                  float* vl,
-                                  int* ldvl,
-                                  float* vr,
-                                  int* ldvr,
-                                  float* work,
-                                  int* lwork,
-                                  int* info)
-  {
-    return sgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, lwork, info);
+  static int lapack_geev_dispatch(char *jobvl, char *jobvr, int *n, float *a,
+                                  int *lda, float *wr, float *wi, float *vl,
+                                  int *ldvl, float *vr, int *ldvr, float *work,
+                                  int *lwork, int *info) {
+    return sgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work,
+                  lwork, info);
   }
 
   // real eigenvalues
-  static void lapack_geev(T* A, T* eigenvalues, int dim, int lda)
-  {
+  static void lapack_geev(T *A, T *eigenvalues, int dim, int lda) {
     char job = 'N';
     std::vector<T> WI(dim);
-    int ldv       = 1;
-    T* vl         = 0;
+    int ldv = 1;
+    T *vl = 0;
     int work_size = 6 * dim;
     std::vector<T> work(work_size);
     int info;
-    lapack_geev_dispatch(&job,
-                         &job,
-                         &dim,
-                         A,
-                         &lda,
-                         eigenvalues,
-                         WI.data(),
-                         vl,
-                         &ldv,
-                         vl,
-                         &ldv,
-                         work.data(),
-                         &work_size,
-                         &info);
+    lapack_geev_dispatch(&job, &job, &dim, A, &lda, eigenvalues, WI.data(), vl,
+                         &ldv, vl, &ldv, work.data(), &work_size, &info);
     lapackCheckError(info);
   }
 
   // real eigenpairs
-  static void lapack_geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr)
-  {
+  static void lapack_geev(T *A, T *eigenvalues, T *eigenvectors, int dim,
+                          int lda, int ldvr) {
     char jobvl = 'N';
     char jobvr = 'V';
     std::vector<T> WI(dim);
     int work_size = 6 * dim;
-    T* vl         = 0;
-    int ldvl      = 1;
+    T *vl = 0;
+    int ldvl = 1;
     std::vector<T> work(work_size);
     int info;
-    lapack_geev_dispatch(&jobvl,
-                         &jobvr,
-                         &dim,
-                         A,
-                         &lda,
-                         eigenvalues,
-                         WI.data(),
-                         vl,
-                         &ldvl,
-                         eigenvectors,
-                         &ldvr,
-                         work.data(),
-                         &work_size,
-                         &info);
+    lapack_geev_dispatch(&jobvl, &jobvr, &dim, A, &lda, eigenvalues, WI.data(),
+                         vl, &ldvl, eigenvectors, &ldvr, work.data(),
+                         &work_size, &info);
     lapackCheckError(info);
   }
 
   // complex eigenpairs
-  static void lapack_geev(T* A,
-                          T* eigenvalues_r,
-                          T* eigenvalues_i,
-                          T* eigenvectors_r,
-                          T* eigenvectors_i,
-                          int dim,
-                          int lda,
-                          int ldvr)
-  {
-    char jobvl    = 'N';
-    char jobvr    = 'V';
+  static void lapack_geev(T *A, T *eigenvalues_r, T *eigenvalues_i,
+                          T *eigenvectors_r, T *eigenvectors_i, int dim,
+                          int lda, int ldvr) {
+    char jobvl = 'N';
+    char jobvr = 'V';
     int work_size = 8 * dim;
-    int ldvl      = 1;
+    int ldvl = 1;
     std::vector<T> work(work_size);
     int info;
-    lapack_geev_dispatch(&jobvl,
-                         &jobvr,
-                         &dim,
-                         A,
-                         &lda,
-                         eigenvalues_r,
-                         eigenvalues_i,
-                         0,
-                         &ldvl,
-                         eigenvectors_r,
-                         &ldvr,
-                         work.data(),
-                         &work_size,
-                         &info);
+    lapack_geev_dispatch(&jobvl, &jobvr, &dim, A, &lda, eigenvalues_r,
+                         eigenvalues_i, 0, &ldvl, eigenvectors_r, &ldvr,
+                         work.data(), &work_size, &info);
     lapackCheckError(info);
   }
 };
 
 template <typename T>
-void Lapack<T>::check_lapack_enabled()
-{
+void Lapack<T>::check_lapack_enabled() {
 #ifndef USE_LAPACK
   RAFT_FAIL("Error: LAPACK not enabled.");
 #endif
 }
 
 template <typename T>
-void Lapack<T>::gemm(bool transa,
-                     bool transb,
-                     int m,
-                     int n,
-                     int k,
-                     T alpha,
-                     const T* A,
-                     int lda,
-                     const T* B,
-                     int ldb,
-                     T beta,
-                     T* C,
-                     int ldc)
-{
+void Lapack<T>::gemm(bool transa, bool transb, int m, int n, int k, T alpha,
+                     const T *A, int lda, const T *B, int ldb, T beta, T *C,
+                     int ldc) {
   // check_lapack_enabled();
   //#ifdef NVGRAPH_USE_LAPACK
   const char transA_char = transa ? 'T' : 'N';
   const char transB_char = transb ? 'T' : 'N';
-  lapack_gemm(transA_char, transB_char, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
+  lapack_gemm(transA_char, transB_char, m, n, k, alpha, A, lda, B, ldb, beta, C,
+              ldc);
   //#endif
 }
 
 template <typename T>
-void Lapack<T>::sterf(int n, T* d, T* e)
-{
+void Lapack<T>::sterf(int n, T *d, T *e) {
   //    check_lapack_enabled();
   //#ifdef NVGRAPH_USE_LAPACK
   int info;
@@ -492,8 +276,7 @@ void Lapack<T>::sterf(int n, T* d, T* e)
 }
 
 template <typename T>
-void Lapack<T>::steqr(char compz, int n, T* d, T* e, T* z, int ldz, T* work)
-{
+void Lapack<T>::steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work) {
   //    check_lapack_enabled();
   //#ifdef NVGRAPH_USE_LAPACK
   int info;
@@ -503,8 +286,8 @@ void Lapack<T>::steqr(char compz, int n, T* d, T* e, T* z, int ldz, T* work)
 }
 
 template <typename T>
-void Lapack<T>::geqrf(int m, int n, T* a, int lda, T* tau, T* work, int* lwork)
-{
+void Lapack<T>::geqrf(int m, int n, T *a, int lda, T *tau, T *work,
+                      int *lwork) {
   check_lapack_enabled();
 #ifdef USE_LAPACK
   int info;
@@ -513,22 +296,11 @@ void Lapack<T>::geqrf(int m, int n, T* a, int lda, T* tau, T* work, int* lwork)
 #endif
 }
 template <typename T>
-void Lapack<T>::ormqr(bool right_side,
-                      bool transq,
-                      int m,
-                      int n,
-                      int k,
-                      T* a,
-                      int lda,
-                      T* tau,
-                      T* c,
-                      int ldc,
-                      T* work,
-                      int* lwork)
-{
+void Lapack<T>::ormqr(bool right_side, bool transq, int m, int n, int k, T *a,
+                      int lda, T *tau, T *c, int ldc, T *work, int *lwork) {
   check_lapack_enabled();
 #ifdef USE_LAPACK
-  char side  = right_side ? 'R' : 'L';
+  char side = right_side ? 'R' : 'L';
   char trans = transq ? 'T' : 'N';
   int info;
   lapack_ormqr(side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, &info);
@@ -538,8 +310,7 @@ void Lapack<T>::ormqr(bool right_side,
 
 // real eigenvalues
 template <typename T>
-void Lapack<T>::geev(T* A, T* eigenvalues, int dim, int lda)
-{
+void Lapack<T>::geev(T *A, T *eigenvalues, int dim, int lda) {
   check_lapack_enabled();
 #ifdef USE_LAPACK
   lapack_geev(A, eigenvalues, dim, lda);
@@ -547,8 +318,8 @@ void Lapack<T>::geev(T* A, T* eigenvalues, int dim, int lda)
 }
 // real eigenpairs
 template <typename T>
-void Lapack<T>::geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr)
-{
+void Lapack<T>::geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda,
+                     int ldvr) {
   check_lapack_enabled();
 #ifdef USE_LAPACK
   lapack_geev(A, eigenvalues, eigenvectors, dim, lda, ldvr);
@@ -556,18 +327,13 @@ void Lapack<T>::geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, in
 }
 // complex eigenpairs
 template <typename T>
-void Lapack<T>::geev(T* A,
-                     T* eigenvalues_r,
-                     T* eigenvalues_i,
-                     T* eigenvectors_r,
-                     T* eigenvectors_i,
-                     int dim,
-                     int lda,
-                     int ldvr)
-{
+void Lapack<T>::geev(T *A, T *eigenvalues_r, T *eigenvalues_i,
+                     T *eigenvectors_r, T *eigenvectors_i, int dim, int lda,
+                     int ldvr) {
   check_lapack_enabled();
 #ifdef USE_LAPACK
-  lapack_geev(A, eigenvalues_r, eigenvalues_i, eigenvectors_r, eigenvectors_i, dim, lda, ldvr);
+  lapack_geev(A, eigenvalues_r, eigenvalues_i, eigenvectors_r, eigenvectors_i,
+              dim, lda, ldvr);
 #endif
 }
 
diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp
index 89d2b7e8ec..c43154d17a 100644
--- a/cpp/include/raft/spectral/matrix_wrappers.hpp
+++ b/cpp/include/raft/spectral/matrix_wrappers.hpp
@@ -40,12 +40,10 @@ using size_type = int;  // for now; TODO: move it in appropriate header
 // Apply diagonal matrix to vector:
 //
 template <typename IndexType_, typename ValueType_>
-static __global__ void diagmv(IndexType_ n,
-                              ValueType_ alpha,
+static __global__ void diagmv(IndexType_ n, ValueType_ alpha,
                               const ValueType_* __restrict__ D,
                               const ValueType_* __restrict__ x,
-                              ValueType_* __restrict__ y)
-{
+                              ValueType_* __restrict__ y) {
   IndexType_ i = threadIdx.x + blockIdx.x * blockDim.x;
   while (i < n) {
     y[i] += alpha * D[i] * x[i];
@@ -60,7 +58,7 @@ enum struct sparse_mv_alg_t : int {
   SPARSE_MV_UNDEFINED = -1,
   SPARSE_MV_ALG_DEFAULT,  // generic, for any sparse matrix
   SPARSE_MV_ALG1,         // typical for CSR
-  SPARSE_MV_ALG2          // may provide better performamce for irregular sparse matrices
+  SPARSE_MV_ALG2  // may provide better performance for irregular sparse matrices
 };
 
 // Vector "view"-like aggregate for linear algebra purposes
@@ -70,21 +68,21 @@ struct vector_view_t {
   value_type* buffer_;
   size_type size_;
 
-  vector_view_t(value_type* buffer, size_type sz) : buffer_(buffer), size_(sz) {}
+  vector_view_t(value_type* buffer, size_type sz)
+    : buffer_(buffer), size_(sz) {}
 
-  vector_view_t(vector_view_t&& other) : buffer_(other.buffer_), size_(other.size_)
-  {
+  vector_view_t(vector_view_t&& other)
+    : buffer_(other.buffer_), size_(other.size_) {
     other.buffer_ = nullptr;
-    other.size_   = 0;
+    other.size_ = 0;
   }
 
-  vector_view_t& operator=(vector_view_t&& other)
-  {
+  vector_view_t& operator=(vector_view_t&& other) {
     buffer_ = other.buffer_;
-    size_   = other.size_;
+    size_ = other.size_;
 
     other.buffer_ = nullptr;
-    other.size_   = 0;
+    other.size_ = 0;
   }
 };
 
@@ -100,16 +98,15 @@ class vector_t {
  public:
   vector_t(handle_t const& raft_handle, size_type sz)
     : handle_(raft_handle),
-      buffer_(static_cast<value_type*>(raft_handle.get_device_allocator()->allocate(
-        sz * sizeof(value_type), raft_handle.get_stream()))),
+      buffer_(
+        static_cast<value_type*>(raft_handle.get_device_allocator()->allocate(
+          sz * sizeof(value_type), raft_handle.get_stream()))),
       size_(sz),
-      stream_(raft_handle.get_stream())
-  {
-  }
+      stream_(raft_handle.get_stream()) {}
 
-  ~vector_t(void)
-  {
-    handle_.get_device_allocator()->deallocate(buffer_, size_ * sizeof(value_type), stream_);
+  ~vector_t(void) {
+    handle_.get_device_allocator()->deallocate(
+      buffer_, size_ * sizeof(value_type), stream_);
   }
 
   size_type size(void) const { return size_; }
@@ -119,31 +116,26 @@ class vector_t {
   value_type const* raw(void) const { return buffer_; }
 
   template <typename ThrustExecPolicy>
-  value_type nrm1(ThrustExecPolicy t_exe_pol) const
-  {
-    return thrust::reduce(
-      t_exe_pol, buffer_, buffer_ + size_, value_type{0}, [] __device__(auto left, auto right) {
-        auto abs_left  = left > 0 ? left : -left;
-        auto abs_right = right > 0 ? right : -right;
-        return abs_left + abs_right;
-      });
+  value_type nrm1(ThrustExecPolicy t_exe_pol) const {
+    return thrust::reduce(t_exe_pol, buffer_, buffer_ + size_, value_type{0},
+                          [] __device__(auto left, auto right) {
+                            auto abs_left = left > 0 ? left : -left;
+                            auto abs_right = right > 0 ? right : -right;
+                            return abs_left + abs_right;
+                          });
   }
 
   template <typename ThrustExecPolicy>
-  void fill(ThrustExecPolicy t_exe_pol, value_type value)
-  {
+  void fill(ThrustExecPolicy t_exe_pol, value_type value) {
     thrust::fill_n(t_exe_pol, buffer_, size_, value);
   }
 };
 
 template <typename index_type, typename value_type>
 struct sparse_matrix_t {
-  sparse_matrix_t(handle_t const& raft_handle,
-                  index_type const* row_offsets,
-                  index_type const* col_indices,
-                  value_type const* values,
-                  index_type const nrows,
-                  index_type const ncols,
+  sparse_matrix_t(handle_t const& raft_handle, index_type const* row_offsets,
+                  index_type const* col_indices, value_type const* values,
+                  index_type const nrows, index_type const ncols,
                   index_type const nnz)
     : handle_(raft_handle),
       row_offsets_(row_offsets),
@@ -151,25 +143,18 @@ struct sparse_matrix_t {
       values_(values),
       nrows_(nrows),
       ncols_(ncols),
-      nnz_(nnz)
-  {
-  }
+      nnz_(nnz) {}
 
-  sparse_matrix_t(handle_t const& raft_handle,
-                  index_type const* row_offsets,
-                  index_type const* col_indices,
-                  value_type const* values,
-                  index_type const nrows,
-                  index_type const nnz)
+  sparse_matrix_t(handle_t const& raft_handle, index_type const* row_offsets,
+                  index_type const* col_indices, value_type const* values,
+                  index_type const nrows, index_type const nnz)
     : handle_(raft_handle),
       row_offsets_(row_offsets),
       col_indices_(col_indices),
       values_(values),
       nrows_(nrows),
       ncols_(nrows),
-      nnz_(nnz)
-  {
-  }
+      nnz_(nnz) {}
 
   template <typename CSRView>
   sparse_matrix_t(handle_t const& raft_handle, CSRView const& csr_view)
@@ -179,9 +164,7 @@ struct sparse_matrix_t {
       values_(csr_view.edge_data),
       nrows_(csr_view.number_of_vertices),
       ncols_(csr_view.number_of_vertices),
-      nnz_(csr_view.number_of_edges)
-  {
-  }
+      nnz_(csr_view.number_of_edges) {}
 
   virtual ~sparse_matrix_t(void) =
     default;  // virtual because used as base for following matrix types
@@ -191,24 +174,21 @@ struct sparse_matrix_t {
   // descriptor creation works with non-const, and const-casting
   // down is dangerous)
   //
-  virtual void mv(value_type alpha,
-                  value_type* __restrict__ x,
-                  value_type beta,
+  virtual void mv(value_type alpha, value_type* __restrict__ x, value_type beta,
                   value_type* __restrict__ y,
                   sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1,
-                  bool transpose      = false,
-                  bool symmetric      = false) const
-  {
+                  bool transpose = false, bool symmetric = false) const {
     using namespace sparse;
 
     RAFT_EXPECTS(x != nullptr, "Null x buffer.");
     RAFT_EXPECTS(y != nullptr, "Null y buffer.");
 
     auto cusparse_h = handle_.get_cusparse_handle();
-    auto stream     = handle_.get_stream();
+    auto stream = handle_.get_stream();
 
-    cusparseOperation_t trans = transpose ? CUSPARSE_OPERATION_TRANSPOSE :  // transpose
-                                  CUSPARSE_OPERATION_NON_TRANSPOSE;         // non-transpose
+    cusparseOperation_t trans =
+      transpose ? CUSPARSE_OPERATION_TRANSPOSE :  // transpose
+        CUSPARSE_OPERATION_NON_TRANSPOSE;         //non-transpose
 
 #if not defined CUDA_ENFORCE_LOWER and CUDA_VER_10_1_UP
     auto size_x = transpose ? nrows_ : ncols_;
@@ -216,19 +196,15 @@ struct sparse_matrix_t {
 
     cusparseSpMVAlg_t spmv_alg = translate_algorithm(alg);
 
-    // create descriptors:
+    //create descriptors:
     //(below casts are necessary, because
     // cusparseCreateCsr(...) takes non-const
     // void*; the casts should be harmless)
     //
     cusparseSpMatDescr_t matA;
-    CUSPARSE_CHECK(cusparsecreatecsr(&matA,
-                                     nrows_,
-                                     ncols_,
-                                     nnz_,
-                                     const_cast<index_type*>(row_offsets_),
-                                     const_cast<index_type*>(col_indices_),
-                                     const_cast<value_type*>(values_)));
+    CUSPARSE_CHECK(cusparsecreatecsr(
+      &matA, nrows_, ncols_, nnz_, const_cast<index_type*>(row_offsets_),
+      const_cast<index_type*>(col_indices_), const_cast<value_type*>(values_)));
 
     cusparseDnVecDescr_t vecX;
     CUSPARSE_CHECK(cusparsecreatednvec(&vecX, size_x, x));
@@ -236,29 +212,31 @@ struct sparse_matrix_t {
     cusparseDnVecDescr_t vecY;
     CUSPARSE_CHECK(cusparsecreatednvec(&vecY, size_y, y));
 
-    // get (scratch) external device buffer size:
+    //get (scratch) external device buffer size:
     //
     size_t bufferSize;
-    CUSPARSE_CHECK(cusparsespmv_buffersize(
-      cusparse_h, trans, &alpha, matA, vecX, &beta, vecY, spmv_alg, &bufferSize, stream));
+    CUSPARSE_CHECK(cusparsespmv_buffersize(cusparse_h, trans, &alpha, matA,
+                                           vecX, &beta, vecY, spmv_alg,
+                                           &bufferSize, stream));
 
-    // allocate external buffer:
+    //allocate external buffer:
     //
     vector_t<value_type> external_buffer(handle_, bufferSize);
 
-    // finally perform SpMV:
+    //finally perform SpMV:
     //
-    CUSPARSE_CHECK(cusparsespmv(
-      cusparse_h, trans, &alpha, matA, vecX, &beta, vecY, spmv_alg, external_buffer.raw(), stream));
+    CUSPARSE_CHECK(cusparsespmv(cusparse_h, trans, &alpha, matA, vecX, &beta,
+                                vecY, spmv_alg, external_buffer.raw(), stream));
 
-    // free descriptors:
+    //free descriptors:
     //(TODO: maybe wrap them in a RAII struct?)
     //
     CUSPARSE_CHECK(cusparseDestroyDnVec(vecY));
     CUSPARSE_CHECK(cusparseDestroyDnVec(vecX));
     CUSPARSE_CHECK(cusparseDestroySpMat(matA));
 #else
-    CUSPARSE_CHECK(cusparsesetpointermode(cusparse_h, CUSPARSE_POINTER_MODE_HOST, stream));
+    CUSPARSE_CHECK(
+      cusparsesetpointermode(cusparse_h, CUSPARSE_POINTER_MODE_HOST, stream));
     cusparseMatDescr_t descr = 0;
     CUSPARSE_CHECK(cusparseCreateMatDescr(&descr));
     if (symmetric) {
@@ -267,20 +245,9 @@ struct sparse_matrix_t {
       CUSPARSE_CHECK(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
     }
     CUSPARSE_CHECK(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
-    CUSPARSE_CHECK(cusparsecsrmv(cusparse_h,
-                                 trans,
-                                 nrows_,
-                                 ncols_,
-                                 nnz_,
-                                 &alpha,
-                                 descr,
-                                 values_,
-                                 row_offsets_,
-                                 col_indices_,
-                                 x,
-                                 &beta,
-                                 y,
-                                 stream));
+    CUSPARSE_CHECK(cusparsecsrmv(cusparse_h, trans, nrows_, ncols_, nnz_,
+                                 &alpha, descr, values_, row_offsets_,
+                                 col_indices_, x, &beta, y, stream));
     CUSPARSE_CHECK(cusparseDestroyMatDescr(descr));
 #endif
   }
@@ -288,18 +255,19 @@ struct sparse_matrix_t {
   handle_t const& get_handle(void) const { return handle_; }
 
 #if not defined CUDA_ENFORCE_LOWER and CUDA_VER_10_1_UP
-  cusparseSpMVAlg_t translate_algorithm(sparse_mv_alg_t alg) const
-  {
+  cusparseSpMVAlg_t translate_algorithm(sparse_mv_alg_t alg) const {
     switch (alg) {
-      case sparse_mv_alg_t::SPARSE_MV_ALG1: return CUSPARSE_CSRMV_ALG1;
-      case sparse_mv_alg_t::SPARSE_MV_ALG2: return CUSPARSE_CSRMV_ALG2;
-      default: return CUSPARSE_MV_ALG_DEFAULT;
+      case sparse_mv_alg_t::SPARSE_MV_ALG1:
+        return CUSPARSE_CSRMV_ALG1;
+      case sparse_mv_alg_t::SPARSE_MV_ALG2:
+        return CUSPARSE_CSRMV_ALG2;
+      default:
+        return CUSPARSE_MV_ALG_DEFAULT;
     }
   }
 #endif
 
-  // private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence,
-  // aggregate
+  //private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence, aggregate
 
   handle_t const& handle_;
   index_type const* row_offsets_;
@@ -316,51 +284,44 @@ struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type> {
   laplacian_matrix_t(handle_t const& raft_handle,
                      ThrustExePolicy thrust_exec_policy,
                      index_type const* row_offsets,
-                     index_type const* col_indices,
-                     value_type const* values,
-                     index_type const nrows,
-                     index_type const nnz)
-    : sparse_matrix_t<index_type, value_type>(
-        raft_handle, row_offsets, col_indices, values, nrows, nnz),
-      diagonal_(raft_handle, nrows)
-  {
+                     index_type const* col_indices, value_type const* values,
+                     index_type const nrows, index_type const nnz)
+    : sparse_matrix_t<index_type, value_type>(raft_handle, row_offsets,
+                                              col_indices, values, nrows, nnz),
+      diagonal_(raft_handle, nrows) {
     vector_t<value_type> ones{raft_handle, nrows};
     ones.fill(thrust_exec_policy, 1.0);
-    sparse_matrix_t<index_type, value_type>::mv(1, ones.raw(), 0, diagonal_.raw());
+    sparse_matrix_t<index_type, value_type>::mv(1, ones.raw(), 0,
+                                                diagonal_.raw());
   }
 
   template <typename ThrustExePolicy>
   laplacian_matrix_t(handle_t const& raft_handle,
                      ThrustExePolicy thrust_exec_policy,
                      sparse_matrix_t<index_type, value_type> const& csr_m)
-    : sparse_matrix_t<index_type, value_type>(raft_handle,
-                                              csr_m.row_offsets_,
-                                              csr_m.col_indices_,
-                                              csr_m.values_,
-                                              csr_m.nrows_,
-                                              csr_m.nnz_),
-      diagonal_(raft_handle, csr_m.nrows_)
-  {
+    : sparse_matrix_t<index_type, value_type>(raft_handle, csr_m.row_offsets_,
+                                              csr_m.col_indices_, csr_m.values_,
+                                              csr_m.nrows_, csr_m.nnz_),
+      diagonal_(raft_handle, csr_m.nrows_) {
     vector_t<value_type> ones{raft_handle, csr_m.nrows_};
     ones.fill(thrust_exec_policy, 1.0);
-    sparse_matrix_t<index_type, value_type>::mv(1, ones.raw(), 0, diagonal_.raw());
+    sparse_matrix_t<index_type, value_type>::mv(1, ones.raw(), 0,
+                                                diagonal_.raw());
   }
 
   // y = alpha*A*x + beta*y
   //
-  void mv(value_type alpha,
-          value_type* __restrict__ x,
-          value_type beta,
+  void mv(value_type alpha, value_type* __restrict__ x, value_type beta,
           value_type* __restrict__ y,
           sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1,
-          bool transpose      = false,
-          bool symmetric      = false) const override
-  {
+          bool transpose = false, bool symmetric = false) const override {
     constexpr int BLOCK_SIZE = 1024;
-    auto n                   = sparse_matrix_t<index_type, value_type>::nrows_;
+    auto n = sparse_matrix_t<index_type, value_type>::nrows_;
 
-    auto cublas_h = sparse_matrix_t<index_type, value_type>::get_handle().get_cublas_handle();
-    auto stream   = sparse_matrix_t<index_type, value_type>::get_handle().get_stream();
+    auto cublas_h =
+      sparse_matrix_t<index_type, value_type>::get_handle().get_cublas_handle();
+    auto stream =
+      sparse_matrix_t<index_type, value_type>::get_handle().get_stream();
 
     // scales y by beta:
     //
@@ -372,7 +333,8 @@ struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type> {
 
     // Apply diagonal matrix
     //
-    dim3 gridDim{std::min<unsigned int>((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535), 1, 1};
+    dim3 gridDim{
+      std::min<unsigned int>((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535), 1, 1};
 
     dim3 blockDim{BLOCK_SIZE, 1, 1};
     diagmv<<<gridDim, blockDim, 0, stream>>>(n, alpha, diagonal_.raw(), x, y);
@@ -380,7 +342,8 @@ struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type> {
 
     // Apply adjacency matrix
     //
-    sparse_matrix_t<index_type, value_type>::mv(-alpha, x, 1, y, alg, transpose, symmetric);
+    sparse_matrix_t<index_type, value_type>::mv(-alpha, x, 1, y, alg, transpose,
+                                                symmetric);
   }
 
   vector_t<value_type> diagonal_;
@@ -392,68 +355,58 @@ struct modularity_matrix_t : laplacian_matrix_t<index_type, value_type> {
   modularity_matrix_t(handle_t const& raft_handle,
                       ThrustExePolicy thrust_exec_policy,
                       index_type const* row_offsets,
-                      index_type const* col_indices,
-                      value_type const* values,
-                      index_type const nrows,
-                      index_type const nnz)
+                      index_type const* col_indices, value_type const* values,
+                      index_type const nrows, index_type const nnz)
     : laplacian_matrix_t<index_type, value_type>(
-        raft_handle, thrust_exec_policy, row_offsets, col_indices, values, nrows, nnz)
-  {
-    edge_sum_ = laplacian_matrix_t<index_type, value_type>::diagonal_.nrm1(thrust_exec_policy);
+        raft_handle, thrust_exec_policy, row_offsets, col_indices, values,
+        nrows, nnz) {
+    edge_sum_ = laplacian_matrix_t<index_type, value_type>::diagonal_.nrm1(
+      thrust_exec_policy);
   }
 
   template <typename ThrustExePolicy>
   modularity_matrix_t(handle_t const& raft_handle,
                       ThrustExePolicy thrust_exec_policy,
                       sparse_matrix_t<index_type, value_type> const& csr_m)
-    : laplacian_matrix_t<index_type, value_type>(raft_handle, thrust_exec_policy, csr_m)
-  {
-    edge_sum_ = laplacian_matrix_t<index_type, value_type>::diagonal_.nrm1(thrust_exec_policy);
+    : laplacian_matrix_t<index_type, value_type>(raft_handle,
+                                                 thrust_exec_policy, csr_m) {
+    edge_sum_ = laplacian_matrix_t<index_type, value_type>::diagonal_.nrm1(
+      thrust_exec_policy);
   }
 
   // y = alpha*A*x + beta*y
   //
-  void mv(value_type alpha,
-          value_type* __restrict__ x,
-          value_type beta,
+  void mv(value_type alpha, value_type* __restrict__ x, value_type beta,
           value_type* __restrict__ y,
           sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1,
-          bool transpose      = false,
-          bool symmetric      = false) const override
-  {
+          bool transpose = false, bool symmetric = false) const override {
     auto n = sparse_matrix_t<index_type, value_type>::nrows_;
 
-    auto cublas_h = sparse_matrix_t<index_type, value_type>::get_handle().get_cublas_handle();
-    auto stream   = sparse_matrix_t<index_type, value_type>::get_handle().get_stream();
+    auto cublas_h =
+      sparse_matrix_t<index_type, value_type>::get_handle().get_cublas_handle();
+    auto stream =
+      sparse_matrix_t<index_type, value_type>::get_handle().get_stream();
 
     // y = A*x
     //
-    sparse_matrix_t<index_type, value_type>::mv(alpha, x, 0, y, alg, transpose, symmetric);
+    sparse_matrix_t<index_type, value_type>::mv(alpha, x, 0, y, alg, transpose,
+                                                symmetric);
     value_type dot_res;
 
     // gamma = d'*x
     //
     // Cublas::dot(this->n, D.raw(), 1, x, 1, &dot_res);
-    CUBLAS_CHECK(linalg::cublasdot(cublas_h,
-                                   n,
-                                   laplacian_matrix_t<index_type, value_type>::diagonal_.raw(),
-                                   1,
-                                   x,
-                                   1,
-                                   &dot_res,
-                                   stream));
+    CUBLAS_CHECK(linalg::cublasdot(
+      cublas_h, n, laplacian_matrix_t<index_type, value_type>::diagonal_.raw(),
+      1, x, 1, &dot_res, stream));
 
     // y = y -(gamma/edge_sum)*d
     //
     value_type gamma_ = -dot_res / edge_sum_;
-    CUBLAS_CHECK(linalg::cublasaxpy(cublas_h,
-                                    n,
-                                    &gamma_,
-                                    laplacian_matrix_t<index_type, value_type>::diagonal_.raw(),
-                                    1,
-                                    y,
-                                    1,
-                                    stream));
+    CUBLAS_CHECK(linalg::cublasaxpy(
+      cublas_h, n, &gamma_,
+      laplacian_matrix_t<index_type, value_type>::diagonal_.raw(), 1, y, 1,
+      stream));
   }
 
   value_type edge_sum_;
diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp
index bb7087a3be..f8dfe5daa3 100644
--- a/cpp/include/raft/spectral/modularity_maximization.hpp
+++ b/cpp/include/raft/spectral/modularity_maximization.hpp
@@ -40,8 +40,7 @@
 #endif
 
 #ifdef COLLECT_TIME_STATISTICS
-static double timer(void)
-{
+static double timer(void) {
   struct timeval tv;
   cudaDeviceSynchronize();
   gettimeofday(&tv, NULL);
@@ -80,27 +79,19 @@ using namespace linalg;
  *    performed.
  *  @return error flag.
  */
-template <typename vertex_t,
-          typename weight_t,
-          typename ThrustExePolicy,
-          typename EigenSolver,
-          typename ClusterSolver>
+template <typename vertex_t, typename weight_t, typename ThrustExePolicy,
+          typename EigenSolver, typename ClusterSolver>
 std::tuple<vertex_t, weight_t, vertex_t> modularity_maximization(
-  handle_t const& handle,
-  ThrustExePolicy thrust_exec_policy,
-  sparse_matrix_t<vertex_t, weight_t> const& csr_m,
-  EigenSolver const& eigen_solver,
-  ClusterSolver const& cluster_solver,
-  vertex_t* __restrict__ clusters,
-  weight_t* eigVals,
-  weight_t* eigVecs)
-{
+  handle_t const &handle, ThrustExePolicy thrust_exec_policy,
+  sparse_matrix_t<vertex_t, weight_t> const &csr_m,
+  EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver,
+  vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) {
   RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer.");
   RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer.");
   RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer.");
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream   = handle.get_stream();
+  auto stream = handle.get_stream();
 
   std::tuple<vertex_t, weight_t, vertex_t>
     stats;  // # iters eigen solver, cluster solver residual, # iters cluster solver
@@ -113,10 +104,11 @@ std::tuple<vertex_t, weight_t, vertex_t> modularity_maximization(
   modularity_matrix_t<vertex_t, weight_t> B{handle, thrust_exec_policy, csr_m};
 
   auto eigen_config = eigen_solver.get_config();
-  auto nEigVecs     = eigen_config.n_eigVecs;
+  auto nEigVecs = eigen_config.n_eigVecs;
 
   // Compute eigenvectors corresponding to largest eigenvalues
-  std::get<0>(stats) = eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs);
+  std::get<0>(stats) =
+    eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs);
 
   // Whiten eigenvector matrix
   transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs);
@@ -127,8 +119,8 @@ std::tuple<vertex_t, weight_t, vertex_t> modularity_maximization(
   CHECK_CUDA(stream);
 
   // Find partition clustering
-  auto pair_cluster =
-    cluster_solver.solve(handle, thrust_exec_policy, n, nEigVecs, eigVecs, clusters);
+  auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n,
+                                           nEigVecs, eigVecs, clusters);
 
   std::get<1>(stats) = pair_cluster.first;
   std::get<2>(stats) = pair_cluster.second;
@@ -147,13 +139,12 @@ std::tuple<vertex_t, weight_t, vertex_t> modularity_maximization(
  *  @param modularity On exit, modularity
  */
 template <typename vertex_t, typename weight_t, typename ThrustExePolicy>
-void analyzeModularity(handle_t const& handle,
+void analyzeModularity(handle_t const &handle,
                        ThrustExePolicy thrust_exec_policy,
-                       sparse_matrix_t<vertex_t, weight_t> const& csr_m,
+                       sparse_matrix_t<vertex_t, weight_t> const &csr_m,
                        vertex_t nClusters,
-                       vertex_t const* __restrict__ clusters,
-                       weight_t& modularity)
-{
+                       vertex_t const *__restrict__ clusters,
+                       weight_t &modularity) {
   RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer.");
 
   vertex_t i;
@@ -161,14 +152,15 @@ void analyzeModularity(handle_t const& handle,
   weight_t partModularity, clustersize;
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream   = handle.get_stream();
+  auto stream = handle.get_stream();
 
   // Device memory
   vector_t<weight_t> part_i(handle, n);
   vector_t<weight_t> Bx(handle, n);
 
   // Initialize cuBLAS
-  CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+  CUBLAS_CHECK(
+    cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
 
   // Initialize Modularity
   modularity_matrix_t<vertex_t, weight_t> B{handle, thrust_exec_policy, csr_m};
@@ -178,8 +170,8 @@ void analyzeModularity(handle_t const& handle,
 
   // Iterate through partitions
   for (i = 0; i < nClusters; ++i) {
-    if (!construct_indicator(
-          handle, thrust_exec_policy, i, n, clustersize, partModularity, clusters, part_i, Bx, B)) {
+    if (!construct_indicator(handle, thrust_exec_policy, i, n, clustersize,
+                             partModularity, clusters, part_i, Bx, B)) {
       WARNING("empty partition");
       continue;
     }
diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp
index e2576c1d69..841fca04d9 100644
--- a/cpp/include/raft/spectral/partition.hpp
+++ b/cpp/include/raft/spectral/partition.hpp
@@ -62,30 +62,22 @@ using namespace linalg;
  *    performed.
  *  @return statistics: number of eigensolver iterations, .
  */
-template <typename vertex_t,
-          typename weight_t,
-          typename ThrustExePolicy,
-          typename EigenSolver,
-          typename ClusterSolver>
-std::tuple<vertex_t, weight_t, vertex_t> partition(handle_t const& handle,
-                                                   ThrustExePolicy thrust_exec_policy,
-                                                   sparse_matrix_t<vertex_t, weight_t> const& csr_m,
-                                                   EigenSolver const& eigen_solver,
-                                                   ClusterSolver const& cluster_solver,
-                                                   vertex_t* __restrict__ clusters,
-                                                   weight_t* eigVals,
-                                                   weight_t* eigVecs)
-{
+template <typename vertex_t, typename weight_t, typename ThrustExePolicy,
+          typename EigenSolver, typename ClusterSolver>
+std::tuple<vertex_t, weight_t, vertex_t> partition(
+  handle_t const &handle, ThrustExePolicy thrust_exec_policy,
+  sparse_matrix_t<vertex_t, weight_t> const &csr_m,
+  EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver,
+  vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) {
   RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer.");
   RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer.");
   RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer.");
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream   = handle.get_stream();
+  auto stream = handle.get_stream();
 
   std::tuple<vertex_t, weight_t, vertex_t>
-    stats;  //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver,
-            //cluster solver residual, # iters cluster solver
+    stats;  //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, cluster solver residual, # iters cluster solver
 
   vertex_t n = csr_m.nrows_;
 
@@ -96,21 +88,22 @@ std::tuple<vertex_t, weight_t, vertex_t> partition(handle_t const& handle,
   // Compute eigenvectors of Laplacian
 
   // Initialize Laplacian
-  /// sparse_matrix_t<vertex_t, weight_t> A{handle, graph};
+  ///sparse_matrix_t<vertex_t, weight_t> A{handle, graph};
   laplacian_matrix_t<vertex_t, weight_t> L{handle, thrust_exec_policy, csr_m};
 
   auto eigen_config = eigen_solver.get_config();
-  auto nEigVecs     = eigen_config.n_eigVecs;
+  auto nEigVecs = eigen_config.n_eigVecs;
 
   // Compute smallest eigenvalues and eigenvectors
-  std::get<0>(stats) = eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs);
+  std::get<0>(stats) =
+    eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs);
 
   // Whiten eigenvector matrix
   transform_eigen_matrix(handle, thrust_exec_policy, n, nEigVecs, eigVecs);
 
   // Find partition clustering
-  auto pair_cluster =
-    cluster_solver.solve(handle, thrust_exec_policy, n, nEigVecs, eigVecs, clusters);
+  auto pair_cluster = cluster_solver.solve(handle, thrust_exec_policy, n,
+                                           nEigVecs, eigVecs, clusters);
 
   std::get<1>(stats) = pair_cluster.first;
   std::get<2>(stats) = pair_cluster.second;
@@ -137,21 +130,18 @@ std::tuple<vertex_t, weight_t, vertex_t> partition(handle_t const& handle,
  *  @return error flag.
  */
 template <typename vertex_t, typename weight_t, typename ThrustExePolicy>
-void analyzePartition(handle_t const& handle,
+void analyzePartition(handle_t const &handle,
                       ThrustExePolicy thrust_exec_policy,
-                      sparse_matrix_t<vertex_t, weight_t> const& csr_m,
-                      vertex_t nClusters,
-                      const vertex_t* __restrict__ clusters,
-                      weight_t& edgeCut,
-                      weight_t& cost)
-{
+                      sparse_matrix_t<vertex_t, weight_t> const &csr_m,
+                      vertex_t nClusters, const vertex_t *__restrict__ clusters,
+                      weight_t &edgeCut, weight_t &cost) {
   RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer.");
 
   vertex_t i;
   vertex_t n = csr_m.nrows_;
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream   = handle.get_stream();
+  auto stream = handle.get_stream();
 
   weight_t partEdgesCut, clustersize;
 
@@ -160,21 +150,22 @@ void analyzePartition(handle_t const& handle,
   vector_t<weight_t> Lx(handle, n);
 
   // Initialize cuBLAS
-  CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+  CUBLAS_CHECK(
+    cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
 
   // Initialize Laplacian
-  /// sparse_matrix_t<vertex_t, weight_t> A{handle, graph};
+  ///sparse_matrix_t<vertex_t, weight_t> A{handle, graph};
   laplacian_matrix_t<vertex_t, weight_t> L{handle, thrust_exec_policy, csr_m};
 
   // Initialize output
-  cost    = 0;
+  cost = 0;
   edgeCut = 0;
 
   // Iterate through partitions
   for (i = 0; i < nClusters; ++i) {
     // Construct indicator vector for ith partition
-    if (!construct_indicator(
-          handle, thrust_exec_policy, i, n, clustersize, partEdgesCut, clusters, part_i, Lx, L)) {
+    if (!construct_indicator(handle, thrust_exec_policy, i, n, clustersize,
+                             partEdgesCut, clusters, part_i, Lx, L)) {
       WARNING("empty partition");
       continue;
     }
diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp
index 5349cb2810..40dde30a74 100644
--- a/cpp/include/raft/spectral/spectral_util.hpp
+++ b/cpp/include/raft/spectral/spectral_util.hpp
@@ -28,18 +28,20 @@ namespace raft {
 namespace spectral {
 
 template <typename index_type_t, typename value_type_t>
-static __global__ void scale_obs_kernel(index_type_t m, index_type_t n, value_type_t* obs)
-{
+static __global__ void scale_obs_kernel(index_type_t m, index_type_t n,
+                                        value_type_t* obs) {
   index_type_t i, j, k, index, mm;
   value_type_t alpha, v, last;
   bool valid;
   // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension
 
   // compute alpha
-  mm    = (((m + blockDim.x - 1) / blockDim.x) * blockDim.x);  // m in multiple of blockDim.x
+  mm = (((m + blockDim.x - 1) / blockDim.x) *
+        blockDim.x);  // m in multiple of blockDim.x
   alpha = 0.0;
 
-  for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) {
+  for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n;
+       j += blockDim.y * gridDim.y) {
     for (i = threadIdx.x; i < mm; i += blockDim.x) {
       // check if the thread is valid
       valid = i < m;
@@ -64,17 +66,17 @@ static __global__ void scale_obs_kernel(index_type_t m, index_type_t n, value_ty
   // scale by alpha
   alpha = __shfl_sync(warp_full_mask(), alpha, blockDim.x - 1, blockDim.x);
   alpha = std::sqrt(alpha);
-  for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) {
+  for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n;
+       j += blockDim.y * gridDim.y) {
     for (i = threadIdx.x; i < m; i += blockDim.x) {  // blockDim.x=32
-      index      = i + j * m;
+      index = i + j * m;
       obs[index] = obs[index] / alpha;
     }
   }
 }
 
 template <typename index_type_t>
-index_type_t next_pow2(index_type_t n)
-{
+index_type_t next_pow2(index_type_t n) {
   index_type_t v;
   // Reference:
   // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float
@@ -88,8 +90,7 @@ index_type_t next_pow2(index_type_t n)
 }
 
 template <typename index_type_t, typename value_type_t>
-cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs)
-{
+cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs) {
   index_type_t p2m;
 
   // find next power of 2
@@ -101,20 +102,19 @@ cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs)
   dim3 nblocks{1, (n + nthreads.y - 1) / nthreads.y, 1};
 
   // launch scaling kernel (scale each column of obs by its norm)
-  scale_obs_kernel<index_type_t, value_type_t><<<nblocks, nthreads>>>(m, n, obs);
+  scale_obs_kernel<index_type_t, value_type_t>
+    <<<nblocks, nthreads>>>(m, n, obs);
 
   return cudaSuccess;
 }
 
-template <typename vertex_t, typename edge_t, typename weight_t, typename ThrustExePolicy>
+template <typename vertex_t, typename edge_t, typename weight_t,
+          typename ThrustExePolicy>
 void transform_eigen_matrix(handle_t const& handle,
-                            ThrustExePolicy thrust_exec_policy,
-                            edge_t n,
-                            vertex_t nEigVecs,
-                            weight_t* eigVecs)
-{
+                            ThrustExePolicy thrust_exec_policy, edge_t n,
+                            vertex_t nEigVecs, weight_t* eigVecs) {
   auto cublas_h = handle.get_cublas_handle();
-  auto stream   = handle.get_stream();
+  auto stream = handle.get_stream();
 
   const weight_t zero{0.0};
   const weight_t one{1.0};
@@ -123,9 +123,9 @@ void transform_eigen_matrix(handle_t const& handle,
   for (auto i = 0; i < nEigVecs; ++i) {
     weight_t mean, std;
 
-    mean = thrust::reduce(thrust_exec_policy,
-                          thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
-                          thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)));
+    mean = thrust::reduce(
+      thrust_exec_policy, thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
+      thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)));
     CHECK_CUDA(stream);
     mean /= n;
     thrust::transform(thrust_exec_policy,
@@ -136,7 +136,8 @@ void transform_eigen_matrix(handle_t const& handle,
                       thrust::minus<weight_t>());
     CHECK_CUDA(stream);
 
-    CUBLAS_CHECK(cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream));
+    CUBLAS_CHECK(
+      cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream));
 
     std /= std::sqrt(static_cast<weight_t>(n));
 
@@ -153,25 +154,16 @@ void transform_eigen_matrix(handle_t const& handle,
   //   TODO: in-place transpose
   {
     vector_t<weight_t> work(handle, nEigVecs * n);
-    CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
-
-    CUBLAS_CHECK(cublasgeam(cublas_h,
-                            CUBLAS_OP_T,
-                            CUBLAS_OP_N,
-                            nEigVecs,
-                            n,
-                            &one,
-                            eigVecs,
-                            n,
-                            &zero,
-                            (weight_t*)NULL,
-                            nEigVecs,
-                            work.raw(),
-                            nEigVecs,
-                            stream));
-
-    CUDA_TRY(cudaMemcpyAsync(
-      eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), cudaMemcpyDeviceToDevice, stream));
+    CUBLAS_CHECK(
+      cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+
+    CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, nEigVecs, n,
+                            &one, eigVecs, n, &zero, (weight_t*)NULL, nEigVecs,
+                            work.raw(), nEigVecs, stream));
+
+    CUDA_TRY(cudaMemcpyAsync(eigVecs, work.raw(),
+                             nEigVecs * n * sizeof(weight_t),
+                             cudaMemcpyDeviceToDevice, stream));
   }
 }
 
@@ -186,48 +178,49 @@ struct equal_to_i_op {
  public:
   equal_to_i_op(index_type_t _i) : i(_i) {}
   template <typename Tuple_>
-  __host__ __device__ void operator()(Tuple_ t)
-  {
-    thrust::get<1>(t) = (thrust::get<0>(t) == i) ? (value_type_t)1.0 : (value_type_t)0.0;
+  __host__ __device__ void operator()(Tuple_ t) {
+    thrust::get<1>(t) =
+      (thrust::get<0>(t) == i) ? (value_type_t)1.0 : (value_type_t)0.0;
   }
 };
 }  // namespace
 
 // Construct indicator vector for ith partition
 //
-template <typename vertex_t, typename edge_t, typename weight_t, typename ThrustExePolicy>
+template <typename vertex_t, typename edge_t, typename weight_t,
+          typename ThrustExePolicy>
 bool construct_indicator(handle_t const& handle,
-                         ThrustExePolicy thrust_exec_policy,
-                         edge_t index,
-                         edge_t n,
-                         weight_t& clustersize,
-                         weight_t& partStats,
+                         ThrustExePolicy thrust_exec_policy, edge_t index,
+                         edge_t n, weight_t& clustersize, weight_t& partStats,
                          vertex_t const* __restrict__ clusters,
-                         vector_t<weight_t>& part_i,
-                         vector_t<weight_t>& Bx,
-                         laplacian_matrix_t<vertex_t, weight_t> const& B)
-{
+                         vector_t<weight_t>& part_i, vector_t<weight_t>& Bx,
+                         laplacian_matrix_t<vertex_t, weight_t> const& B) {
   auto cublas_h = handle.get_cublas_handle();
-  auto stream   = handle.get_stream();
-
-  thrust::for_each(
-    thrust_exec_policy,
-    thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(clusters),
-                                                 thrust::device_pointer_cast(part_i.raw()))),
-    thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(clusters + n),
-                                                 thrust::device_pointer_cast(part_i.raw() + n))),
-    equal_to_i_op<vertex_t, weight_t>(index));
+  auto stream = handle.get_stream();
+
+  thrust::for_each(thrust_exec_policy,
+                   thrust::make_zip_iterator(thrust::make_tuple(
+                     thrust::device_pointer_cast(clusters),
+                     thrust::device_pointer_cast(part_i.raw()))),
+                   thrust::make_zip_iterator(thrust::make_tuple(
+                     thrust::device_pointer_cast(clusters + n),
+                     thrust::device_pointer_cast(part_i.raw() + n))),
+                   equal_to_i_op<vertex_t, weight_t>(index));
   CHECK_CUDA(stream);
 
   // Compute size of ith partition
-  CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, &clustersize, stream));
+  CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1,
+                         &clustersize, stream));
 
   clustersize = round(clustersize);
-  if (clustersize < 0.5) { return false; }
+  if (clustersize < 0.5) {
+    return false;
+  }
 
   // Compute part stats
   B.mv(1, part_i.raw(), 0, Bx.raw());
-  CUBLAS_CHECK(cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream));
+  CUBLAS_CHECK(
+    cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream));
 
   return true;
 }
diff --git a/cpp/include/raft/spectral/warn_dbg.hpp b/cpp/include/raft/spectral/warn_dbg.hpp
index 08a4e6efb5..406f1b7c7e 100644
--- a/cpp/include/raft/spectral/warn_dbg.hpp
+++ b/cpp/include/raft/spectral/warn_dbg.hpp
@@ -4,13 +4,13 @@
 #include <string>
 
 #define STRINGIFY_DETAIL(x) #x
-#define RAFT_STRINGIFY(x)   STRINGIFY_DETAIL(x)
+#define RAFT_STRINGIFY(x) STRINGIFY_DETAIL(x)
 
 #ifdef DEBUG
 #define COUT() (std::cout)
 #define CERR() (std::cerr)
 
-// nope:
+//nope:
 //
 #define WARNING(message)                                                  \
   do {                                                                    \
diff --git a/cpp/include/raft/stats/mean.cuh b/cpp/include/raft/stats/mean.cuh
index 4d6724482c..8691cabc85 100644
--- a/cpp/include/raft/stats/mean.cuh
+++ b/cpp/include/raft/stats/mean.cuh
@@ -26,15 +26,15 @@ namespace stats {
 
 ///@todo: ColsPerBlk has been tested only for 32!
 template <typename Type, typename IdxType, int TPB, int ColsPerBlk = 32>
-__global__ void meanKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N)
-{
+__global__ void meanKernelRowMajor(Type *mu, const Type *data, IdxType D,
+                                   IdxType N) {
   const int RowsPerBlkPerIter = TPB / ColsPerBlk;
-  IdxType thisColId           = threadIdx.x % ColsPerBlk;
-  IdxType thisRowId           = threadIdx.x / ColsPerBlk;
-  IdxType colId               = thisColId + ((IdxType)blockIdx.y * ColsPerBlk);
-  IdxType rowId               = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter);
-  Type thread_data            = Type(0);
-  const IdxType stride        = RowsPerBlkPerIter * gridDim.x;
+  IdxType thisColId = threadIdx.x % ColsPerBlk;
+  IdxType thisRowId = threadIdx.x / ColsPerBlk;
+  IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk);
+  IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter);
+  Type thread_data = Type(0);
+  const IdxType stride = RowsPerBlkPerIter * gridDim.x;
   for (IdxType i = rowId; i < N; i += stride)
     thread_data += (colId < D) ? data[i * D + colId] : Type(0);
   __shared__ Type smu[ColsPerBlk];
@@ -46,8 +46,8 @@ __global__ void meanKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxTyp
 }
 
 template <typename Type, typename IdxType, int TPB>
-__global__ void meanKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType N)
-{
+__global__ void meanKernelColMajor(Type *mu, const Type *data, IdxType D,
+                                   IdxType N) {
   typedef cub::BlockReduce<Type, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   Type thread_data = Type(0);
@@ -57,7 +57,9 @@ __global__ void meanKernelColMajor(Type* mu, const Type* data, IdxType D, IdxTyp
     thread_data += data[idx];
   }
   Type acc = BlockReduce(temp_storage).Sum(thread_data);
-  if (threadIdx.x == 0) { mu[blockIdx.x] = acc / N; }
+  if (threadIdx.x == 0) {
+    mu[blockIdx.x] = acc / N;
+  }
 }
 
 /**
@@ -78,22 +80,24 @@ __global__ void meanKernelColMajor(Type* mu, const Type* data, IdxType D, IdxTyp
  * @param stream: cuda stream
  */
 template <typename Type, typename IdxType = int>
-void mean(
-  Type* mu, const Type* data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream)
-{
+void mean(Type *mu, const Type *data, IdxType D, IdxType N, bool sample,
+          bool rowMajor, cudaStream_t stream) {
   static const int TPB = 256;
   if (rowMajor) {
     static const int RowsPerThread = 4;
-    static const int ColsPerBlk    = 32;
-    static const int RowsPerBlk    = (TPB / ColsPerBlk) * RowsPerThread;
-    dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk));
+    static const int ColsPerBlk = 32;
+    static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread;
+    dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk),
+              raft::ceildiv(D, (IdxType)ColsPerBlk));
     CUDA_CHECK(cudaMemsetAsync(mu, 0, sizeof(Type) * D, stream));
-    meanKernelRowMajor<Type, IdxType, TPB, ColsPerBlk><<<grid, TPB, 0, stream>>>(mu, data, D, N);
+    meanKernelRowMajor<Type, IdxType, TPB, ColsPerBlk>
+      <<<grid, TPB, 0, stream>>>(mu, data, D, N);
     CUDA_CHECK(cudaPeekAtLastError());
     Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N));
     raft::linalg::scalarMultiply(mu, mu, ratio, D, stream);
   } else {
-    meanKernelColMajor<Type, IdxType, TPB><<<D, TPB, 0, stream>>>(mu, data, D, N);
+    meanKernelColMajor<Type, IdxType, TPB>
+      <<<D, TPB, 0, stream>>>(mu, data, D, N);
   }
   CUDA_CHECK(cudaPeekAtLastError());
 }
diff --git a/cpp/include/raft/stats/mean_center.cuh b/cpp/include/raft/stats/mean_center.cuh
index c0ba24312b..04934d4388 100644
--- a/cpp/include/raft/stats/mean_center.cuh
+++ b/cpp/include/raft/stats/mean_center.cuh
@@ -38,25 +38,12 @@ namespace stats {
  * @param stream cuda stream where to launch work
  */
 template <typename Type, typename IdxType = int, int TPB = 256>
-void meanCenter(Type* out,
-                const Type* data,
-                const Type* mu,
-                IdxType D,
-                IdxType N,
-                bool rowMajor,
-                bool bcastAlongRows,
-                cudaStream_t stream)
-{
+void meanCenter(Type *out, const Type *data, const Type *mu, IdxType D,
+                IdxType N, bool rowMajor, bool bcastAlongRows,
+                cudaStream_t stream) {
   raft::linalg::matrixVectorOp(
-    out,
-    data,
-    mu,
-    D,
-    N,
-    rowMajor,
-    bcastAlongRows,
-    [] __device__(Type a, Type b) { return a - b; },
-    stream);
+    out, data, mu, D, N, rowMajor, bcastAlongRows,
+    [] __device__(Type a, Type b) { return a - b; }, stream);
 }
 
 /**
@@ -74,25 +61,11 @@ void meanCenter(Type* out,
  * @param stream cuda stream where to launch work
  */
 template <typename Type, typename IdxType = int, int TPB = 256>
-void meanAdd(Type* out,
-             const Type* data,
-             const Type* mu,
-             IdxType D,
-             IdxType N,
-             bool rowMajor,
-             bool bcastAlongRows,
-             cudaStream_t stream)
-{
+void meanAdd(Type *out, const Type *data, const Type *mu, IdxType D, IdxType N,
+             bool rowMajor, bool bcastAlongRows, cudaStream_t stream) {
   raft::linalg::matrixVectorOp(
-    out,
-    data,
-    mu,
-    D,
-    N,
-    rowMajor,
-    bcastAlongRows,
-    [] __device__(Type a, Type b) { return a + b; },
-    stream);
+    out, data, mu, D, N, rowMajor, bcastAlongRows,
+    [] __device__(Type a, Type b) { return a + b; }, stream);
 }
 
 };  // end namespace stats
diff --git a/cpp/include/raft/stats/stddev.cuh b/cpp/include/raft/stats/stddev.cuh
index 1dd9cd56bc..f12c633829 100644
--- a/cpp/include/raft/stats/stddev.cuh
+++ b/cpp/include/raft/stats/stddev.cuh
@@ -26,15 +26,15 @@ namespace stats {
 
 ///@todo: ColPerBlk has been tested only for 32!
 template <typename Type, typename IdxType, int TPB, int ColsPerBlk = 32>
-__global__ void stddevKernelRowMajor(Type* std, const Type* data, IdxType D, IdxType N)
-{
+__global__ void stddevKernelRowMajor(Type *std, const Type *data, IdxType D,
+                                     IdxType N) {
   const int RowsPerBlkPerIter = TPB / ColsPerBlk;
-  IdxType thisColId           = threadIdx.x % ColsPerBlk;
-  IdxType thisRowId           = threadIdx.x / ColsPerBlk;
-  IdxType colId               = thisColId + ((IdxType)blockIdx.y * ColsPerBlk);
-  IdxType rowId               = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter);
-  Type thread_data            = Type(0);
-  const IdxType stride        = RowsPerBlkPerIter * gridDim.x;
+  IdxType thisColId = threadIdx.x % ColsPerBlk;
+  IdxType thisRowId = threadIdx.x / ColsPerBlk;
+  IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk);
+  IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter);
+  Type thread_data = Type(0);
+  const IdxType stride = RowsPerBlkPerIter * gridDim.x;
   for (IdxType i = rowId; i < N; i += stride) {
     Type val = (colId < D) ? data[i * D + colId] : Type(0);
     thread_data += val * val;
@@ -48,39 +48,41 @@ __global__ void stddevKernelRowMajor(Type* std, const Type* data, IdxType D, Idx
 }
 
 template <typename Type, typename IdxType, int TPB>
-__global__ void stddevKernelColMajor(
-  Type* std, const Type* data, const Type* mu, IdxType D, IdxType N)
-{
+__global__ void stddevKernelColMajor(Type *std, const Type *data,
+                                     const Type *mu, IdxType D, IdxType N) {
   typedef cub::BlockReduce<Type, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   Type thread_data = Type(0);
   IdxType colStart = N * blockIdx.x;
-  Type m           = mu[blockIdx.x];
+  Type m = mu[blockIdx.x];
   for (IdxType i = threadIdx.x; i < N; i += TPB) {
     IdxType idx = colStart + i;
-    Type diff   = data[idx] - m;
+    Type diff = data[idx] - m;
     thread_data += diff * diff;
   }
   Type acc = BlockReduce(temp_storage).Sum(thread_data);
-  if (threadIdx.x == 0) { std[blockIdx.x] = raft::mySqrt(acc / N); }
+  if (threadIdx.x == 0) {
+    std[blockIdx.x] = raft::mySqrt(acc / N);
+  }
 }
 
 template <typename Type, typename IdxType, int TPB>
-__global__ void varsKernelColMajor(
-  Type* var, const Type* data, const Type* mu, IdxType D, IdxType N)
-{
+__global__ void varsKernelColMajor(Type *var, const Type *data, const Type *mu,
+                                   IdxType D, IdxType N) {
   typedef cub::BlockReduce<Type, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   Type thread_data = Type(0);
   IdxType colStart = N * blockIdx.x;
-  Type m           = mu[blockIdx.x];
+  Type m = mu[blockIdx.x];
   for (IdxType i = threadIdx.x; i < N; i += TPB) {
     IdxType idx = colStart + i;
-    Type diff   = data[idx] - m;
+    Type diff = data[idx] - m;
     thread_data += diff * diff;
   }
   Type acc = BlockReduce(temp_storage).Sum(thread_data);
-  if (threadIdx.x == 0) { var[blockIdx.x] = acc / N; }
+  if (threadIdx.x == 0) {
+    var[blockIdx.x] = acc / N;
+  }
 }
 
 /**
@@ -102,33 +104,28 @@ __global__ void varsKernelColMajor(
  * @param stream cuda stream where to launch work
  */
 template <typename Type, typename IdxType = int>
-void stddev(Type* std,
-            const Type* data,
-            const Type* mu,
-            IdxType D,
-            IdxType N,
-            bool sample,
-            bool rowMajor,
-            cudaStream_t stream)
-{
+void stddev(Type *std, const Type *data, const Type *mu, IdxType D, IdxType N,
+            bool sample, bool rowMajor, cudaStream_t stream) {
   static const int TPB = 256;
   if (rowMajor) {
     static const int RowsPerThread = 4;
-    static const int ColsPerBlk    = 32;
-    static const int RowsPerBlk    = (TPB / ColsPerBlk) * RowsPerThread;
-    dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk));
+    static const int ColsPerBlk = 32;
+    static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread;
+    dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk),
+              raft::ceildiv(D, (IdxType)ColsPerBlk));
     CUDA_CHECK(cudaMemset(std, 0, sizeof(Type) * D));
-    stddevKernelRowMajor<Type, IdxType, TPB, ColsPerBlk><<<grid, TPB, 0, stream>>>(std, data, D, N);
+    stddevKernelRowMajor<Type, IdxType, TPB, ColsPerBlk>
+      <<<grid, TPB, 0, stream>>>(std, data, D, N);
     Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N));
     raft::linalg::binaryOp(
-      std,
-      std,
-      mu,
-      D,
-      [ratio] __device__(Type a, Type b) { return raft::mySqrt(a * ratio - b * b); },
+      std, std, mu, D,
+      [ratio] __device__(Type a, Type b) {
+        return raft::mySqrt(a * ratio - b * b);
+      },
       stream);
   } else {
-    stddevKernelColMajor<Type, IdxType, TPB><<<D, TPB, 0, stream>>>(std, data, mu, D, N);
+    stddevKernelColMajor<Type, IdxType, TPB>
+      <<<D, TPB, 0, stream>>>(std, data, mu, D, N);
   }
   CUDA_CHECK(cudaPeekAtLastError());
 }
@@ -152,28 +149,25 @@ void stddev(Type* std,
  * @param stream cuda stream where to launch work
  */
 template <typename Type, typename IdxType = int>
-void vars(Type* var,
-          const Type* data,
-          const Type* mu,
-          IdxType D,
-          IdxType N,
-          bool sample,
-          bool rowMajor,
-          cudaStream_t stream)
-{
+void vars(Type *var, const Type *data, const Type *mu, IdxType D, IdxType N,
+          bool sample, bool rowMajor, cudaStream_t stream) {
   static const int TPB = 256;
   if (rowMajor) {
     static const int RowsPerThread = 4;
-    static const int ColsPerBlk    = 32;
-    static const int RowsPerBlk    = (TPB / ColsPerBlk) * RowsPerThread;
-    dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk));
+    static const int ColsPerBlk = 32;
+    static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread;
+    dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk),
+              raft::ceildiv(D, (IdxType)ColsPerBlk));
     CUDA_CHECK(cudaMemset(var, 0, sizeof(Type) * D));
-    stddevKernelRowMajor<Type, IdxType, TPB, ColsPerBlk><<<grid, TPB, 0, stream>>>(var, data, D, N);
+    stddevKernelRowMajor<Type, IdxType, TPB, ColsPerBlk>
+      <<<grid, TPB, 0, stream>>>(var, data, D, N);
     Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N));
     raft::linalg::binaryOp(
-      var, var, mu, D, [ratio] __device__(Type a, Type b) { return a * ratio - b * b; }, stream);
+      var, var, mu, D,
+      [ratio] __device__(Type a, Type b) { return a * ratio - b * b; }, stream);
   } else {
-    varsKernelColMajor<Type, IdxType, TPB><<<D, TPB, 0, stream>>>(var, data, mu, D, N);
+    varsKernelColMajor<Type, IdxType, TPB>
+      <<<D, TPB, 0, stream>>>(var, data, mu, D, N);
   }
   CUDA_CHECK(cudaPeekAtLastError());
 }
diff --git a/cpp/include/raft/stats/sum.cuh b/cpp/include/raft/stats/sum.cuh
index c7b8ce12b6..5f8416c7e2 100644
--- a/cpp/include/raft/stats/sum.cuh
+++ b/cpp/include/raft/stats/sum.cuh
@@ -26,15 +26,15 @@ namespace stats {
 
 ///@todo: ColsPerBlk has been tested only for 32!
 template <typename Type, typename IdxType, int TPB, int ColsPerBlk = 32>
-__global__ void sumKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N)
-{
+__global__ void sumKernelRowMajor(Type *mu, const Type *data, IdxType D,
+                                  IdxType N) {
   const int RowsPerBlkPerIter = TPB / ColsPerBlk;
-  IdxType thisColId           = threadIdx.x % ColsPerBlk;
-  IdxType thisRowId           = threadIdx.x / ColsPerBlk;
-  IdxType colId               = thisColId + ((IdxType)blockIdx.y * ColsPerBlk);
-  IdxType rowId               = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter);
-  Type thread_data            = Type(0);
-  const IdxType stride        = RowsPerBlkPerIter * gridDim.x;
+  IdxType thisColId = threadIdx.x % ColsPerBlk;
+  IdxType thisRowId = threadIdx.x / ColsPerBlk;
+  IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk);
+  IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter);
+  Type thread_data = Type(0);
+  const IdxType stride = RowsPerBlkPerIter * gridDim.x;
   for (IdxType i = rowId; i < N; i += stride)
     thread_data += (colId < D) ? data[i * D + colId] : Type(0);
   __shared__ Type smu[ColsPerBlk];
@@ -46,8 +46,8 @@ __global__ void sumKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType
 }
 
 template <typename Type, typename IdxType, int TPB>
-__global__ void sumKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType N)
-{
+__global__ void sumKernelColMajor(Type *mu, const Type *data, IdxType D,
+                                  IdxType N) {
   typedef cub::BlockReduce<Type, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   Type thread_data = Type(0);
@@ -57,7 +57,9 @@ __global__ void sumKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType
     thread_data += data[idx];
   }
   Type acc = BlockReduce(temp_storage).Sum(thread_data);
-  if (threadIdx.x == 0) { mu[blockIdx.x] = acc; }
+  if (threadIdx.x == 0) {
+    mu[blockIdx.x] = acc;
+  }
 }
 
 /**
@@ -75,19 +77,21 @@ __global__ void sumKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType
  * @param stream cuda stream where to launch work
  */
 template <typename Type, typename IdxType = int>
-void sum(Type* output, const Type* input, IdxType D, IdxType N, bool rowMajor, cudaStream_t stream)
-{
+void sum(Type *output, const Type *input, IdxType D, IdxType N, bool rowMajor,
+         cudaStream_t stream) {
   static const int TPB = 256;
   if (rowMajor) {
     static const int RowsPerThread = 4;
-    static const int ColsPerBlk    = 32;
-    static const int RowsPerBlk    = (TPB / ColsPerBlk) * RowsPerThread;
-    dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk));
+    static const int ColsPerBlk = 32;
+    static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread;
+    dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk),
+              raft::ceildiv(D, (IdxType)ColsPerBlk));
     CUDA_CHECK(cudaMemset(output, 0, sizeof(Type) * D));
     sumKernelRowMajor<Type, IdxType, TPB, ColsPerBlk>
       <<<grid, TPB, 0, stream>>>(output, input, D, N);
   } else {
-    sumKernelColMajor<Type, IdxType, TPB><<<D, TPB, 0, stream>>>(output, input, D, N);
+    sumKernelColMajor<Type, IdxType, TPB>
+      <<<D, TPB, 0, stream>>>(output, input, D, N);
   }
   CUDA_CHECK(cudaPeekAtLastError());
 }
diff --git a/cpp/include/raft/vectorized.cuh b/cpp/include/raft/vectorized.cuh
index 1e0885fb99..1829fc0351 100644
--- a/cpp/include/raft/vectorized.cuh
+++ b/cpp/include/raft/vectorized.cuh
@@ -22,11 +22,11 @@
 namespace raft {
 
 template <typename math_, int VecLen>
-struct IOType {
-};
+struct IOType {};
 template <>
 struct IOType<bool, 1> {
-  static_assert(sizeof(bool) == sizeof(int8_t), "IOType bool size assumption failed");
+  static_assert(sizeof(bool) == sizeof(int8_t),
+                "IOType bool size assumption failed");
   typedef int8_t Type;
 };
 template <>
@@ -215,42 +215,42 @@ struct IOType<double, 2> {
 };
 
 /**
- * @struct TxN_t
- *
- * @brief Internal data structure that is used to define a facade for vectorized
- * loads/stores across the most common POD types. The goal of his file is to
- * provide with CUDA programmers, an easy way to have compiler issue vectorized
- * load or store instructions to memory (either global or shared). Vectorized
- * accesses to memory are important as they'll utilize its resources
- * efficiently,
- * when compared to their non-vectorized counterparts. Obviously, for whatever
- * reasons if one is unable to issue such vectorized operations, one can always
- * fallback to using POD types.
- *
- * Example demonstrating the use of load operations, performing math on such
- * loaded data and finally storing it back.
- * @code{.cu}
- * TxN_t<uint8_t,8> mydata1, mydata2;
- * int idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * mydata1.Ratio;
- * mydata1.load(ptr1, idx);
- * mydata2.load(ptr2, idx);
- * #pragma unroll
- * for(int i=0;i<mydata1.Ratio;++i) {
- *     mydata1.val.data[i] += mydata2.val.data[i];
- * }
- * mydata1.store(ptr1, idx);
- * @endcode
- *
- * By doing as above, the interesting thing is that the code effectively remains
- * almost the same, in case one wants to upgrade to TxN_t<uint16_t,16> type.
- * Only change required is to replace variable declaration appropriately.
- *
- * Obviously, it's caller's responsibility to take care of pointer alignment!
- *
- * @tparam math_ the data-type in which the compute/math needs to happen
- * @tparam veclen_ the number of 'math_' types to be loaded/stored per
- * instruction
- */
+     * @struct TxN_t
+     *
+     * @brief Internal data structure that is used to define a facade for vectorized
+     * loads/stores across the most common POD types. The goal of this file is to
+     * provide CUDA programmers with an easy way to have the compiler issue vectorized
+     * load or store instructions to memory (either global or shared). Vectorized
+     * accesses to memory are important as they'll utilize its resources
+     * efficiently,
+     * when compared to their non-vectorized counterparts. Obviously, if for whatever
+     * reason one is unable to issue such vectorized operations, one can always
+     * fall back to using POD types.
+     *
+     * Example demonstrating the use of load operations, performing math on such
+     * loaded data and finally storing it back.
+     * @code{.cu}
+     * TxN_t<uint8_t,8> mydata1, mydata2;
+     * int idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * mydata1.Ratio;
+     * mydata1.load(ptr1, idx);
+     * mydata2.load(ptr2, idx);
+     * #pragma unroll
+     * for(int i=0;i<mydata1.Ratio;++i) {
+     *     mydata1.val.data[i] += mydata2.val.data[i];
+     * }
+     * mydata1.store(ptr1, idx);
+     * @endcode
+     *
+     * By doing as above, the interesting thing is that the code effectively remains
+     * almost the same, in case one wants to upgrade to TxN_t<uint16_t,16> type.
+     * The only change required is to replace the variable declaration appropriately.
+     *
+     * Obviously, it's the caller's responsibility to take care of pointer alignment!
+     *
+     * @tparam math_ the data-type in which the compute/math needs to happen
+     * @tparam veclen_ the number of 'math_' types to be loaded/stored per
+     * instruction
+     */
 template <typename math_, int veclen_>
 struct TxN_t {
   /** underlying math data type */
@@ -274,8 +274,7 @@ struct TxN_t {
    * @brief Fill the contents of this structure with a constant value
    * @param _val the constant to be filled
    */
-  DI void fill(math_t _val)
-  {
+  DI void fill(math_t _val) {
 #pragma unroll
     for (int i = 0; i < Ratio; ++i) {
       val.data[i] = _val;
@@ -300,24 +299,21 @@ struct TxN_t {
    * @{
    */
   template <typename idx_t = int>
-  DI void load(const math_t* ptr, idx_t idx)
-  {
-    const io_t* bptr = reinterpret_cast<const io_t*>(&ptr[idx]);
-    val.internal     = __ldg(bptr);
+  DI void load(const math_t *ptr, idx_t idx) {
+    const io_t *bptr = reinterpret_cast<const io_t *>(&ptr[idx]);
+    val.internal = __ldg(bptr);
   }
 
   template <typename idx_t = int>
-  DI void load(math_t* ptr, idx_t idx)
-  {
-    io_t* bptr   = reinterpret_cast<io_t*>(&ptr[idx]);
+  DI void load(math_t *ptr, idx_t idx) {
+    io_t *bptr = reinterpret_cast<io_t *>(&ptr[idx]);
     val.internal = *bptr;
   }
 
   template <typename idx_t = int>
-  DI void store(math_t* ptr, idx_t idx)
-  {
-    io_t* bptr = reinterpret_cast<io_t*>(&ptr[idx]);
-    *bptr      = val.internal;
+  DI void store(math_t *ptr, idx_t idx) {
+    io_t *bptr = reinterpret_cast<io_t *>(&ptr[idx]);
+    *bptr = val.internal;
   }
   /** @} */
 };
@@ -334,17 +330,11 @@ struct TxN_t<math_, 0> {
 
   DI void fill(math_t _val) {}
   template <typename idx_t = int>
-  DI void load(const math_t* ptr, idx_t idx)
-  {
-  }
+  DI void load(const math_t *ptr, idx_t idx) {}
   template <typename idx_t = int>
-  DI void load(math_t* ptr, idx_t idx)
-  {
-  }
+  DI void load(math_t *ptr, idx_t idx) {}
   template <typename idx_t = int>
-  DI void store(math_t* ptr, idx_t idx)
-  {
-  }
+  DI void store(math_t *ptr, idx_t idx) {}
 };
 
 }  // namespace raft
diff --git a/cpp/test/cluster_solvers.cu b/cpp/test/cluster_solvers.cu
index 284a873dec..4ff6cdf5fa 100644
--- a/cpp/test/cluster_solvers.cu
+++ b/cpp/test/cluster_solvers.cu
@@ -23,8 +23,7 @@
 
 namespace raft {
 
-TEST(Raft, ClusterSolvers)
-{
+TEST(Raft, ClusterSolvers) {
   using namespace matrix;
   using index_type = int;
   using value_type = double;
@@ -41,7 +40,7 @@ TEST(Raft, ClusterSolvers)
   index_type d{10};
   index_type k{5};
 
-  // nullptr expected to trigger exceptions:
+  //nullptr expected to trigger exceptions:
   //
   value_type* eigvecs{nullptr};
   index_type* codes{nullptr};
@@ -50,11 +49,11 @@ TEST(Raft, ClusterSolvers)
 
   kmeans_solver_t<index_type, value_type> cluster_solver{cfg};
 
-  EXPECT_ANY_THROW(cluster_solver.solve(h, thrust::cuda::par.on(stream), n, d, eigvecs, codes));
+  EXPECT_ANY_THROW(cluster_solver.solve(h, thrust::cuda::par.on(stream), n, d,
+                                        eigvecs, codes));
 }
 
-TEST(Raft, ModularitySolvers)
-{
+TEST(Raft, ModularitySolvers) {
   using namespace matrix;
   using index_type = int;
   using value_type = double;
@@ -69,7 +68,7 @@ TEST(Raft, ModularitySolvers)
   value_type tol{1.0e-10};
   bool reorthog{true};
 
-  // nullptr expected to trigger exceptions:
+  //nullptr expected to trigger exceptions:
   //
   index_type* clusters{nullptr};
   value_type* eigvals{nullptr};
@@ -83,18 +82,21 @@ TEST(Raft, ModularitySolvers)
 
   index_type k{5};
 
-  cluster_solver_config_t<index_type, value_type> clust_cfg{k, maxiter, tol, seed};
+  cluster_solver_config_t<index_type, value_type> clust_cfg{k, maxiter, tol,
+                                                            seed};
   kmeans_solver_t<index_type, value_type> cluster_solver{clust_cfg};
 
   auto stream = h.get_stream();
-  sparse_matrix_t<index_type, value_type> sm{h, nullptr, nullptr, nullptr, 0, 0};
+  sparse_matrix_t<index_type, value_type> sm{h,       nullptr, nullptr,
+                                             nullptr, 0,       0};
   auto t_exe_p = thrust::cuda::par.on(stream);
 
   EXPECT_ANY_THROW(spectral::modularity_maximization(
     h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs));
 
   value_type modularity{0};
-  EXPECT_ANY_THROW(spectral::analyzeModularity(h, t_exe_p, sm, k, clusters, modularity));
+  EXPECT_ANY_THROW(
+    spectral::analyzeModularity(h, t_exe_p, sm, k, clusters, modularity));
 }
 
 }  // namespace raft
diff --git a/cpp/test/cudart_utils.cpp b/cpp/test/cudart_utils.cpp
index 150767992f..c14d880efd 100644
--- a/cpp/test/cudart_utils.cpp
+++ b/cpp/test/cudart_utils.cpp
@@ -20,8 +20,7 @@
 
 namespace raft {
 
-TEST(Raft, Utils)
-{
+TEST(Raft, Utils) {
   ASSERT_NO_THROW(ASSERT(1 == 1, "Should not assert!"));
   ASSERT_THROW(ASSERT(1 != 1, "Should assert!"), exception);
   ASSERT_THROW(THROW("Should throw!"), exception);
diff --git a/cpp/test/distance/dist_adj.cu b/cpp/test/distance/dist_adj.cu
index 9ed32b80ef..e2ed2c01dc 100644
--- a/cpp/test/distance/dist_adj.cu
+++ b/cpp/test/distance/dist_adj.cu
@@ -25,42 +25,30 @@ namespace raft {
 namespace distance {
 
 template <typename DataType>
-__global__ void naiveDistanceAdjKernel(bool* dist,
-                                       const DataType* x,
-                                       const DataType* y,
-                                       int m,
-                                       int n,
-                                       int k,
-                                       DataType eps,
-                                       bool isRowMajor)
-{
+__global__ void naiveDistanceAdjKernel(bool *dist, const DataType *x,
+                                       const DataType *y, int m, int n, int k,
+                                       DataType eps, bool isRowMajor) {
   int midx = threadIdx.x + blockIdx.x * blockDim.x;
   int nidx = threadIdx.y + blockIdx.y * blockDim.y;
   if (midx >= m || nidx >= n) return;
   DataType acc = DataType(0);
   for (int i = 0; i < k; ++i) {
-    int xidx  = isRowMajor ? i + midx * k : i * m + midx;
-    int yidx  = isRowMajor ? i + nidx * k : i * n + nidx;
+    int xidx = isRowMajor ? i + midx * k : i * m + midx;
+    int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
     auto diff = x[xidx] - y[yidx];
     acc += diff * diff;
   }
-  int outidx   = isRowMajor ? midx * n + nidx : midx + m * nidx;
+  int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
   dist[outidx] = acc <= eps;
 }
 
 template <typename DataType>
-void naiveDistanceAdj(bool* dist,
-                      const DataType* x,
-                      const DataType* y,
-                      int m,
-                      int n,
-                      int k,
-                      DataType eps,
-                      bool isRowMajor)
-{
+void naiveDistanceAdj(bool *dist, const DataType *x, const DataType *y, int m,
+                      int n, int k, DataType eps, bool isRowMajor) {
   static const dim3 TPB(16, 32, 1);
   dim3 nblks(raft::ceildiv(m, (int)TPB.x), raft::ceildiv(n, (int)TPB.y), 1);
-  naiveDistanceAdjKernel<DataType><<<nblks, TPB>>>(dist, x, y, m, n, k, eps, isRowMajor);
+  naiveDistanceAdjKernel<DataType>
+    <<<nblks, TPB>>>(dist, x, y, m, n, k, eps, isRowMajor);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -73,21 +61,21 @@ struct DistanceAdjInputs {
 };
 
 template <typename DataType>
-::std::ostream& operator<<(::std::ostream& os, const DistanceAdjInputs<DataType>& dims)
-{
+::std::ostream &operator<<(::std::ostream &os,
+                           const DistanceAdjInputs<DataType> &dims) {
   return os;
 }
 
 template <typename DataType>
-class DistanceAdjTest : public ::testing::TestWithParam<DistanceAdjInputs<DataType>> {
+class DistanceAdjTest
+  : public ::testing::TestWithParam<DistanceAdjInputs<DataType>> {
  public:
-  void SetUp() override
-  {
+  void SetUp() override {
     params = ::testing::TestWithParam<DistanceAdjInputs<DataType>>::GetParam();
     raft::random::Rng r(params.seed);
-    int m           = params.m;
-    int n           = params.n;
-    int k           = params.k;
+    int m = params.m;
+    int n = params.n;
+    int k = params.k;
     bool isRowMajor = params.isRowMajor;
     cudaStream_t stream;
     CUDA_CHECK(cudaStreamCreate(&stream));
@@ -101,23 +89,25 @@ class DistanceAdjTest : public ::testing::TestWithParam<DistanceAdjInputs<DataTy
     DataType threshold = params.eps;
 
     naiveDistanceAdj(dist_ref, x, y, m, n, k, threshold, isRowMajor);
-    char* workspace = nullptr;
-    size_t worksize = raft::distance::
-      getWorkspaceSize<raft::distance::DistanceType::L2Expanded, DataType, DataType, bool>(
-        x, y, m, n, k);
-    if (worksize != 0) { raft::allocate(workspace, worksize); }
+    char *workspace = nullptr;
+    size_t worksize =
+      raft::distance::getWorkspaceSize<raft::distance::DistanceType::L2Expanded,
+                                       DataType, DataType, bool>(x, y, m, n, k);
+    if (worksize != 0) {
+      raft::allocate(workspace, worksize);
+    }
 
     auto fin_op = [threshold] __device__(DataType d_val, int g_d_idx) {
       return d_val <= threshold;
     };
-    raft::distance::distance<raft::distance::DistanceType::L2Expanded, DataType, DataType, bool>(
+    raft::distance::distance<raft::distance::DistanceType::L2Expanded, DataType,
+                             DataType, bool>(
       x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor);
     CUDA_CHECK(cudaStreamDestroy(stream));
     CUDA_CHECK(cudaFree(workspace));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(x));
     CUDA_CHECK(cudaFree(y));
     CUDA_CHECK(cudaFree(dist_ref));
@@ -141,13 +131,13 @@ const std::vector<DistanceAdjInputs<float>> inputsf = {
   {10.0f, 1024, 1024, 32, false, 1234ULL},
 };
 typedef DistanceAdjTest<float> DistanceAdjTestF;
-TEST_P(DistanceAdjTestF, Result)
-{
+TEST_P(DistanceAdjTestF, Result) {
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
   ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::Compare<bool>()));
 }
-INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestF, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestF,
+                        ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceAdjInputs<double>> inputsd = {
   {0.01, 1024, 1024, 32, true, 1234ULL},
@@ -160,13 +150,13 @@ const std::vector<DistanceAdjInputs<double>> inputsd = {
   {10.0, 1024, 1024, 32, false, 1234ULL},
 };
 typedef DistanceAdjTest<double> DistanceAdjTestD;
-TEST_P(DistanceAdjTestD, Result)
-{
+TEST_P(DistanceAdjTestD, Result) {
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
   ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::Compare<bool>()));
 }
-INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestD, ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestD,
+                        ::testing::ValuesIn(inputsd));
 
 }  // namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/dist_canberra.cu b/cpp/test/distance/dist_canberra.cu
index c812a1985d..10bc4d1899 100644
--- a/cpp/test/distance/dist_canberra.cu
+++ b/cpp/test/distance/dist_canberra.cu
@@ -21,8 +21,8 @@ namespace raft {
 namespace distance {
 
 template <typename DataType>
-class DistanceCanberra : public DistanceTest<raft::distance::DistanceType::Canberra, DataType> {
-};
+class DistanceCanberra
+  : public DistanceTest<raft::distance::DistanceType::Canberra, DataType> {};
 
 const std::vector<DistanceInputs<float>> inputsf = {
   {0.001f, 1024, 1024, 32, true, 1234ULL},
@@ -35,14 +35,14 @@ const std::vector<DistanceInputs<float>> inputsf = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceCanberra<float> DistanceCanberraF;
-TEST_P(DistanceCanberraF, Result)
-{
+TEST_P(DistanceCanberraF, Result) {
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(
-    raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n,
+                                raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraF, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraF,
+                        ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceInputs<double>> inputsd = {
   {0.001, 1024, 1024, 32, true, 1234ULL},
@@ -55,14 +55,14 @@ const std::vector<DistanceInputs<double>> inputsd = {
   {0.003, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceCanberra<double> DistanceCanberraD;
-TEST_P(DistanceCanberraD, Result)
-{
+TEST_P(DistanceCanberraD, Result) {
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(
-    raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n,
+                                raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraD, ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraD,
+                        ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/dist_chebyshev.cu b/cpp/test/distance/dist_chebyshev.cu
index 0a4a69f059..6a2b02863a 100644
--- a/cpp/test/distance/dist_chebyshev.cu
+++ b/cpp/test/distance/dist_chebyshev.cu
@@ -21,8 +21,8 @@ namespace raft {
 namespace distance {
 
 template <typename DataType>
-class DistanceLinf : public DistanceTest<raft::distance::DistanceType::Linf, DataType> {
-};
+class DistanceLinf
+  : public DistanceTest<raft::distance::DistanceType::Linf, DataType> {};
 
 const std::vector<DistanceInputs<float>> inputsf = {
   {0.001f, 1024, 1024, 32, true, 1234ULL},
@@ -35,14 +35,14 @@ const std::vector<DistanceInputs<float>> inputsf = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceLinf<float> DistanceLinfF;
-TEST_P(DistanceLinfF, Result)
-{
+TEST_P(DistanceLinfF, Result) {
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(
-    raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n,
+                                raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfF, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfF,
+                        ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceInputs<double>> inputsd = {
   {0.001, 1024, 1024, 32, true, 1234ULL},
@@ -55,14 +55,14 @@ const std::vector<DistanceInputs<double>> inputsd = {
   {0.003, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceLinf<double> DistanceLinfD;
-TEST_P(DistanceLinfD, Result)
-{
+TEST_P(DistanceLinfD, Result) {
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(
-    raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n,
+                                raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfD, ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfD,
+                        ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/dist_cos.cu b/cpp/test/distance/dist_cos.cu
index f7510c17b1..291c4196f9 100644
--- a/cpp/test/distance/dist_cos.cu
+++ b/cpp/test/distance/dist_cos.cu
@@ -21,8 +21,9 @@ namespace raft {
 namespace distance {
 
 template <typename DataType>
-class DistanceExpCos : public DistanceTest<raft::distance::DistanceType::CosineExpanded, DataType> {
-};
+class DistanceExpCos
+  : public DistanceTest<raft::distance::DistanceType::CosineExpanded,
+                        DataType> {};
 
 const std::vector<DistanceInputs<float>> inputsf = {
   {0.001f, 1024, 1024, 32, true, 1234ULL},
@@ -35,13 +36,14 @@ const std::vector<DistanceInputs<float>> inputsf = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceExpCos<float> DistanceExpCosF;
-TEST_P(DistanceExpCosF, Result)
-{
+TEST_P(DistanceExpCosF, Result) {
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n,
+                          raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosF, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosF,
+                        ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceInputs<double>> inputsd = {
   {0.001, 1024, 1024, 32, true, 1234ULL},
@@ -54,13 +56,14 @@ const std::vector<DistanceInputs<double>> inputsd = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceExpCos<double> DistanceExpCosD;
-TEST_P(DistanceExpCosD, Result)
-{
+TEST_P(DistanceExpCosD, Result) {
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n,
+                          raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosD, ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosD,
+                        ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/dist_euc_exp.cu b/cpp/test/distance/dist_euc_exp.cu
index e90d0e83dc..46e7ded0ec 100644
--- a/cpp/test/distance/dist_euc_exp.cu
+++ b/cpp/test/distance/dist_euc_exp.cu
@@ -21,8 +21,8 @@ namespace raft {
 namespace distance {
 
 template <typename DataType>
-class DistanceEucExpTest : public DistanceTest<raft::distance::DistanceType::L2Expanded, DataType> {
-};
+class DistanceEucExpTest
+  : public DistanceTest<raft::distance::DistanceType::L2Expanded, DataType> {};
 
 const std::vector<DistanceInputs<float>> inputsf = {
   {0.001f, 1024, 1024, 32, true, 1234ULL},
@@ -35,13 +35,14 @@ const std::vector<DistanceInputs<float>> inputsf = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceEucExpTest<float> DistanceEucExpTestF;
-TEST_P(DistanceEucExpTestF, Result)
-{
+TEST_P(DistanceEucExpTestF, Result) {
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n,
+                          raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestF, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestF,
+                        ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceInputs<double>> inputsd = {
   {0.001, 1024, 1024, 32, true, 1234ULL},
@@ -54,13 +55,14 @@ const std::vector<DistanceInputs<double>> inputsd = {
   {0.003, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceEucExpTest<double> DistanceEucExpTestD;
-TEST_P(DistanceEucExpTestD, Result)
-{
+TEST_P(DistanceEucExpTestD, Result) {
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n,
+                          raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestD, ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestD,
+                        ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/dist_euc_unexp.cu b/cpp/test/distance/dist_euc_unexp.cu
index 90412a9cb2..92f424647d 100644
--- a/cpp/test/distance/dist_euc_unexp.cu
+++ b/cpp/test/distance/dist_euc_unexp.cu
@@ -36,13 +36,14 @@ const std::vector<DistanceInputs<float>> inputsf = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceEucUnexpTest<float> DistanceEucUnexpTestF;
-TEST_P(DistanceEucUnexpTestF, Result)
-{
+TEST_P(DistanceEucUnexpTestF, Result) {
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n,
+                          raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestF, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestF,
+                        ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceInputs<double>> inputsd = {
   {0.001, 1024, 1024, 32, true, 1234ULL},
@@ -55,13 +56,14 @@ const std::vector<DistanceInputs<double>> inputsd = {
   {0.003, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceEucUnexpTest<double> DistanceEucUnexpTestD;
-TEST_P(DistanceEucUnexpTestD, Result)
-{
+TEST_P(DistanceEucUnexpTestD, Result) {
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(dist_ref, dist, m, n,
+                          raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestD, ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestD,
+                        ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/dist_hellinger.cu b/cpp/test/distance/dist_hellinger.cu
index 95b1908dc1..39dc7aaeff 100644
--- a/cpp/test/distance/dist_hellinger.cu
+++ b/cpp/test/distance/dist_hellinger.cu
@@ -22,8 +22,8 @@ namespace distance {
 
 template <typename DataType>
 class DistanceHellingerExp
-  : public DistanceTest<raft::distance::DistanceType::HellingerExpanded, DataType> {
-};
+  : public DistanceTest<raft::distance::DistanceType::HellingerExpanded,
+                        DataType> {};
 
 const std::vector<DistanceInputs<float>> inputsf = {
   {0.001f, 1024, 1024, 32, true, 1234ULL},
@@ -36,14 +36,14 @@ const std::vector<DistanceInputs<float>> inputsf = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceHellingerExp<float> DistanceHellingerExpF;
-TEST_P(DistanceHellingerExpF, Result)
-{
+TEST_P(DistanceHellingerExpF, Result) {
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(
-    raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n,
+                                raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpF, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpF,
+                        ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceInputs<double>> inputsd = {
   {0.001, 1024, 1024, 32, true, 1234ULL},
@@ -56,14 +56,14 @@ const std::vector<DistanceInputs<double>> inputsd = {
   {0.003, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceHellingerExp<double> DistanceHellingerExpD;
-TEST_P(DistanceHellingerExpD, Result)
-{
+TEST_P(DistanceHellingerExpD, Result) {
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(
-    raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n,
+                                raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpD, ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpD,
+                        ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/dist_l1.cu b/cpp/test/distance/dist_l1.cu
index d14f8d8a0b..bd32837e45 100644
--- a/cpp/test/distance/dist_l1.cu
+++ b/cpp/test/distance/dist_l1.cu
@@ -21,8 +21,8 @@ namespace raft {
 namespace distance {
 
 template <typename DataType>
-class DistanceUnexpL1 : public DistanceTest<raft::distance::DistanceType::L1, DataType> {
-};
+class DistanceUnexpL1
+  : public DistanceTest<raft::distance::DistanceType::L1, DataType> {};
 
 const std::vector<DistanceInputs<float>> inputsf = {
   {0.001f, 1024, 1024, 32, true, 1234ULL},
@@ -35,14 +35,14 @@ const std::vector<DistanceInputs<float>> inputsf = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceUnexpL1<float> DistanceUnexpL1F;
-TEST_P(DistanceUnexpL1F, Result)
-{
+TEST_P(DistanceUnexpL1F, Result) {
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(
-    raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n,
+                                raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1F, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1F,
+                        ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceInputs<double>> inputsd = {
   {0.001, 1024, 1024, 32, true, 1234ULL},
@@ -55,14 +55,14 @@ const std::vector<DistanceInputs<double>> inputsd = {
   {0.003, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceUnexpL1<double> DistanceUnexpL1D;
-TEST_P(DistanceUnexpL1D, Result)
-{
+TEST_P(DistanceUnexpL1D, Result) {
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(
-    raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n,
+                                raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1D, ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1D,
+                        ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/dist_minkowski.cu b/cpp/test/distance/dist_minkowski.cu
index cc6a5f60de..42b8e294ac 100644
--- a/cpp/test/distance/dist_minkowski.cu
+++ b/cpp/test/distance/dist_minkowski.cu
@@ -21,7 +21,8 @@ namespace raft {
 namespace distance {
 
 template <typename DataType>
-class DistanceLpUnexp : public DistanceTest<raft::distance::DistanceType::LpUnexpanded, DataType> {
+class DistanceLpUnexp
+  : public DistanceTest<raft::distance::DistanceType::LpUnexpanded, DataType> {
 };
 
 const std::vector<DistanceInputs<float>> inputsf = {
@@ -35,14 +36,14 @@ const std::vector<DistanceInputs<float>> inputsf = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL, 3.0f},
 };
 typedef DistanceLpUnexp<float> DistanceLpUnexpF;
-TEST_P(DistanceLpUnexpF, Result)
-{
+TEST_P(DistanceLpUnexpF, Result) {
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(
-    raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n,
+                                raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpF, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpF,
+                        ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceInputs<double>> inputsd = {
   {0.001, 1024, 1024, 32, true, 1234ULL, 4.0},
@@ -55,14 +56,14 @@ const std::vector<DistanceInputs<double>> inputsd = {
   {0.003, 1024, 1024, 1024, false, 1234ULL, 3.0},
 };
 typedef DistanceLpUnexp<double> DistanceLpUnexpD;
-TEST_P(DistanceLpUnexpD, Result)
-{
+TEST_P(DistanceLpUnexpD, Result) {
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(
-    raft::devArrMatch(dist_ref, dist, m, n, raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(dist_ref, dist, m, n,
+                                raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpD, ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpD,
+                        ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/distance_base.cuh b/cpp/test/distance/distance_base.cuh
index a99d307abb..fc7b064205 100644
--- a/cpp/test/distance/distance_base.cuh
+++ b/cpp/test/distance/distance_base.cuh
@@ -25,52 +25,43 @@ namespace raft {
 namespace distance {
 
 template <typename DataType>
-__global__ void naiveDistanceKernel(DataType* dist,
-                                    const DataType* x,
-                                    const DataType* y,
-                                    int m,
-                                    int n,
-                                    int k,
+__global__ void naiveDistanceKernel(DataType *dist, const DataType *x,
+                                    const DataType *y, int m, int n, int k,
                                     raft::distance::DistanceType type,
-                                    bool isRowMajor)
-{
+                                    bool isRowMajor) {
   int midx = threadIdx.x + blockIdx.x * blockDim.x;
   int nidx = threadIdx.y + blockIdx.y * blockDim.y;
   if (midx >= m || nidx >= n) return;
   DataType acc = DataType(0);
   for (int i = 0; i < k; ++i) {
-    int xidx  = isRowMajor ? i + midx * k : i * m + midx;
-    int yidx  = isRowMajor ? i + nidx * k : i * n + nidx;
+    int xidx = isRowMajor ? i + midx * k : i * m + midx;
+    int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
     auto diff = x[xidx] - y[yidx];
     acc += diff * diff;
   }
   if (type == raft::distance::DistanceType::L2SqrtExpanded ||
       type == raft::distance::DistanceType::L2SqrtUnexpanded)
     acc = raft::mySqrt(acc);
-  int outidx   = isRowMajor ? midx * n + nidx : midx + m * nidx;
+  int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
   dist[outidx] = acc;
 }
 
 template <typename DataType>
-__global__ void naiveL1_Linf_CanberraDistanceKernel(DataType* dist,
-                                                    const DataType* x,
-                                                    const DataType* y,
-                                                    int m,
-                                                    int n,
-                                                    int k,
-                                                    raft::distance::DistanceType type,
-                                                    bool isRowMajor)
-{
+__global__ void naiveL1_Linf_CanberraDistanceKernel(
+  DataType *dist, const DataType *x, const DataType *y, int m, int n, int k,
+  raft::distance::DistanceType type, bool isRowMajor) {
   int midx = threadIdx.x + blockIdx.x * blockDim.x;
   int nidx = threadIdx.y + blockIdx.y * blockDim.y;
-  if (midx >= m || nidx >= n) { return; }
+  if (midx >= m || nidx >= n) {
+    return;
+  }
 
   DataType acc = DataType(0);
   for (int i = 0; i < k; ++i) {
-    int xidx  = isRowMajor ? i + midx * k : i * m + midx;
-    int yidx  = isRowMajor ? i + nidx * k : i * n + nidx;
-    auto a    = x[xidx];
-    auto b    = y[yidx];
+    int xidx = isRowMajor ? i + midx * k : i * m + midx;
+    int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
+    auto a = x[xidx];
+    auto b = y[yidx];
     auto diff = (a > b) ? (a - b) : (b - a);
     if (type == raft::distance::DistanceType::Linf) {
       acc = raft::myMax(acc, diff);
@@ -84,27 +75,29 @@ __global__ void naiveL1_Linf_CanberraDistanceKernel(DataType* dist,
     }
   }
 
-  int outidx   = isRowMajor ? midx * n + nidx : midx + m * nidx;
+  int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
   dist[outidx] = acc;
 }
 
 template <typename DataType>
-__global__ void naiveCosineDistanceKernel(
-  DataType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor)
-{
+__global__ void naiveCosineDistanceKernel(DataType *dist, const DataType *x,
+                                          const DataType *y, int m, int n,
+                                          int k, bool isRowMajor) {
   int midx = threadIdx.x + blockIdx.x * blockDim.x;
   int nidx = threadIdx.y + blockIdx.y * blockDim.y;
-  if (midx >= m || nidx >= n) { return; }
+  if (midx >= m || nidx >= n) {
+    return;
+  }
 
-  DataType acc_a  = DataType(0);
-  DataType acc_b  = DataType(0);
+  DataType acc_a = DataType(0);
+  DataType acc_b = DataType(0);
   DataType acc_ab = DataType(0);
 
   for (int i = 0; i < k; ++i) {
     int xidx = isRowMajor ? i + midx * k : i * m + midx;
     int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
-    auto a   = x[xidx];
-    auto b   = y[yidx];
+    auto a = x[xidx];
+    auto b = y[yidx];
     acc_a += a * a;
     acc_b += b * b;
     acc_ab += a * b;
@@ -113,74 +106,64 @@ __global__ void naiveCosineDistanceKernel(
   int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
 
   // Use 1.0 - (cosine similarity) to calc the distance
-  dist[outidx] = (DataType)1.0 - acc_ab / (raft::mySqrt(acc_a) * raft::mySqrt(acc_b));
+  dist[outidx] =
+    (DataType)1.0 - acc_ab / (raft::mySqrt(acc_a) * raft::mySqrt(acc_b));
 }
 
 template <typename DataType>
-__global__ void naiveHellingerDistanceKernel(
-  DataType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor)
-{
+__global__ void naiveHellingerDistanceKernel(DataType *dist, const DataType *x,
+                                             const DataType *y, int m, int n,
+                                             int k, bool isRowMajor) {
   int midx = threadIdx.x + blockIdx.x * blockDim.x;
   int nidx = threadIdx.y + blockIdx.y * blockDim.y;
-  if (midx >= m || nidx >= n) { return; }
+  if (midx >= m || nidx >= n) {
+    return;
+  }
 
   DataType acc_ab = DataType(0);
 
   for (int i = 0; i < k; ++i) {
     int xidx = isRowMajor ? i + midx * k : i * m + midx;
     int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
-    auto a   = x[xidx];
-    auto b   = y[yidx];
+    auto a = x[xidx];
+    auto b = y[yidx];
     acc_ab += raft::mySqrt(a) * raft::mySqrt(b);
   }
 
   int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
 
   // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative
-  acc_ab         = 1 - acc_ab;
+  acc_ab = 1 - acc_ab;
   auto rectifier = (!signbit(acc_ab));
-  dist[outidx]   = raft::mySqrt(rectifier * acc_ab);
+  dist[outidx] = raft::mySqrt(rectifier * acc_ab);
 }
 
 template <typename DataType>
-__global__ void naiveLpUnexpDistanceKernel(DataType* dist,
-                                           const DataType* x,
-                                           const DataType* y,
-                                           int m,
-                                           int n,
-                                           int k,
-                                           bool isRowMajor,
-                                           DataType p)
-{
+__global__ void naiveLpUnexpDistanceKernel(DataType *dist, const DataType *x,
+                                           const DataType *y, int m, int n,
+                                           int k, bool isRowMajor, DataType p) {
   int midx = threadIdx.x + blockIdx.x * blockDim.x;
   int nidx = threadIdx.y + blockIdx.y * blockDim.y;
   if (midx >= m || nidx >= n) return;
   DataType acc = DataType(0);
   for (int i = 0; i < k; ++i) {
-    int xidx  = isRowMajor ? i + midx * k : i * m + midx;
-    int yidx  = isRowMajor ? i + nidx * k : i * n + nidx;
-    auto a    = x[xidx];
-    auto b    = y[yidx];
+    int xidx = isRowMajor ? i + midx * k : i * m + midx;
+    int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
+    auto a = x[xidx];
+    auto b = y[yidx];
     auto diff = raft::L1Op<DataType>()(a - b);
     acc += raft::myPow(diff, p);
   }
   auto one_over_p = 1 / p;
-  acc             = raft::myPow(acc, one_over_p);
-  int outidx      = isRowMajor ? midx * n + nidx : midx + m * nidx;
-  dist[outidx]    = acc;
+  acc = raft::myPow(acc, one_over_p);
+  int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
+  dist[outidx] = acc;
 }
 
 template <typename DataType>
-void naiveDistance(DataType* dist,
-                   const DataType* x,
-                   const DataType* y,
-                   int m,
-                   int n,
-                   int k,
-                   raft::distance::DistanceType type,
-                   bool isRowMajor,
-                   DataType metric_arg = 2.0f)
-{
+void naiveDistance(DataType *dist, const DataType *x, const DataType *y, int m,
+                   int n, int k, raft::distance::DistanceType type,
+                   bool isRowMajor, DataType metric_arg = 2.0f) {
   static const dim3 TPB(16, 32, 1);
   dim3 nblks(raft::ceildiv(m, (int)TPB.x), raft::ceildiv(n, (int)TPB.y), 1);
 
@@ -195,19 +178,23 @@ void naiveDistance(DataType* dist,
     case raft::distance::DistanceType::L2Unexpanded:
     case raft::distance::DistanceType::L2SqrtExpanded:
     case raft::distance::DistanceType::L2Expanded:
-      naiveDistanceKernel<DataType><<<nblks, TPB>>>(dist, x, y, m, n, k, type, isRowMajor);
+      naiveDistanceKernel<DataType>
+        <<<nblks, TPB>>>(dist, x, y, m, n, k, type, isRowMajor);
       break;
     case raft::distance::DistanceType::CosineExpanded:
-      naiveCosineDistanceKernel<DataType><<<nblks, TPB>>>(dist, x, y, m, n, k, isRowMajor);
+      naiveCosineDistanceKernel<DataType>
+        <<<nblks, TPB>>>(dist, x, y, m, n, k, isRowMajor);
       break;
     case raft::distance::DistanceType::HellingerExpanded:
-      naiveHellingerDistanceKernel<DataType><<<nblks, TPB>>>(dist, x, y, m, n, k, isRowMajor);
+      naiveHellingerDistanceKernel<DataType>
+        <<<nblks, TPB>>>(dist, x, y, m, n, k, isRowMajor);
       break;
     case raft::distance::DistanceType::LpUnexpanded:
       naiveLpUnexpDistanceKernel<DataType>
         <<<nblks, TPB>>>(dist, x, y, m, n, k, isRowMajor, metric_arg);
       break;
-    default: FAIL() << "should be here\n";
+    default:
+      FAIL() << "should be here\n";
   }
   CUDA_CHECK(cudaPeekAtLastError());
 }
@@ -222,47 +209,37 @@ struct DistanceInputs {
 };
 
 template <typename DataType>
-::std::ostream& operator<<(::std::ostream& os, const DistanceInputs<DataType>& dims)
-{
+::std::ostream &operator<<(::std::ostream &os,
+                           const DistanceInputs<DataType> &dims) {
   return os;
 }
 
 template <raft::distance::DistanceType distanceType, typename DataType>
-void distanceLauncher(DataType* x,
-                      DataType* y,
-                      DataType* dist,
-                      DataType* dist2,
-                      int m,
-                      int n,
-                      int k,
-                      DistanceInputs<DataType>& params,
-                      DataType threshold,
-                      char* workspace,
-                      size_t worksize,
-                      cudaStream_t stream,
-                      bool isRowMajor,
-                      DataType metric_arg = 2.0f)
-{
+void distanceLauncher(DataType *x, DataType *y, DataType *dist, DataType *dist2,
+                      int m, int n, int k, DistanceInputs<DataType> &params,
+                      DataType threshold, char *workspace, size_t worksize,
+                      cudaStream_t stream, bool isRowMajor,
+                      DataType metric_arg = 2.0f) {
   auto fin_op = [dist2, threshold] __device__(DataType d_val, int g_d_idx) {
     dist2[g_d_idx] = (d_val < threshold) ? 0.f : d_val;
     return d_val;
   };
   raft::distance::distance<distanceType, DataType, DataType, DataType>(
-    x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, metric_arg);
+    x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor,
+    metric_arg);
 }
 
 template <raft::distance::DistanceType distanceType, typename DataType>
 class DistanceTest : public ::testing::TestWithParam<DistanceInputs<DataType>> {
  public:
-  void SetUp() override
-  {
+  void SetUp() override {
     params = ::testing::TestWithParam<DistanceInputs<DataType>>::GetParam();
     raft::random::Rng r(params.seed);
-    int m               = params.m;
-    int n               = params.n;
-    int k               = params.k;
+    int m = params.m;
+    int n = params.n;
+    int k = params.k;
     DataType metric_arg = params.metric_arg;
-    bool isRowMajor     = params.isRowMajor;
+    bool isRowMajor = params.isRowMajor;
     cudaStream_t stream;
     CUDA_CHECK(cudaStreamCreate(&stream));
     raft::allocate(x, m * k);
@@ -279,33 +256,25 @@ class DistanceTest : public ::testing::TestWithParam<DistanceInputs<DataType>> {
       r.uniform(y, n * k, DataType(-1.0), DataType(1.0), stream);
     }
 
-    naiveDistance(dist_ref, x, y, m, n, k, distanceType, isRowMajor, metric_arg);
-    char* workspace = nullptr;
+    naiveDistance(dist_ref, x, y, m, n, k, distanceType, isRowMajor,
+                  metric_arg);
+    char *workspace = nullptr;
     size_t worksize =
-      raft::distance::getWorkspaceSize<distanceType, DataType, DataType, DataType>(x, y, m, n, k);
-    if (worksize != 0) { raft::allocate(workspace, worksize); }
+      raft::distance::getWorkspaceSize<distanceType, DataType, DataType,
+                                       DataType>(x, y, m, n, k);
+    if (worksize != 0) {
+      raft::allocate(workspace, worksize);
+    }
 
     DataType threshold = -10000.f;
-    distanceLauncher<distanceType, DataType>(x,
-                                             y,
-                                             dist,
-                                             dist2,
-                                             m,
-                                             n,
-                                             k,
-                                             params,
-                                             threshold,
-                                             workspace,
-                                             worksize,
-                                             stream,
-                                             isRowMajor,
-                                             metric_arg);
+    distanceLauncher<distanceType, DataType>(x, y, dist, dist2, m, n, k, params,
+                                             threshold, workspace, worksize,
+                                             stream, isRowMajor, metric_arg);
     CUDA_CHECK(cudaStreamDestroy(stream));
     CUDA_CHECK(cudaFree(workspace));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(x));
     CUDA_CHECK(cudaFree(y));
     CUDA_CHECK(cudaFree(dist_ref));
diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu
index a7b763a2bc..4573a070b6 100644
--- a/cpp/test/distance/fused_l2_nn.cu
+++ b/cpp/test/distance/fused_l2_nn.cu
@@ -29,40 +29,40 @@ template <typename LabelT, typename DataT>
 struct CubKVPMinReduce {
   typedef cub::KeyValuePair<LabelT, DataT> KVP;
 
-  DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { return b.value < a.value ? b : a; }
+  DI KVP operator()(LabelT rit, const KVP &a, const KVP &b) {
+    return b.value < a.value ? b : a;
+  }
 
-  DI KVP operator()(const KVP& a, const KVP& b) { return b.value < a.value ? b : a; }
+  DI KVP operator()(const KVP &a, const KVP &b) {
+    return b.value < a.value ? b : a;
+  }
 
 };  // KVPMinReduce
 
 template <typename DataT, bool Sqrt, typename ReduceOpT, int NWARPS>
-__global__ void naiveKernel(cub::KeyValuePair<int, DataT>* min,
-                            DataT* x,
-                            DataT* y,
-                            int m,
-                            int n,
-                            int k,
-                            int* workspace,
-                            DataT maxVal)
-{
-  int midx  = threadIdx.y + blockIdx.y * blockDim.y;
-  int nidx  = threadIdx.x + blockIdx.x * blockDim.x;
+__global__ void naiveKernel(cub::KeyValuePair<int, DataT> *min, DataT *x,
+                            DataT *y, int m, int n, int k, int *workspace,
+                            DataT maxVal) {
+  int midx = threadIdx.y + blockIdx.y * blockDim.y;
+  int nidx = threadIdx.x + blockIdx.x * blockDim.x;
   DataT acc = DataT(0);
   for (int i = 0; i < k; ++i) {
-    int xidx  = i + midx * k;
-    int yidx  = i + nidx * k;
+    int xidx = i + midx * k;
+    int yidx = i + nidx * k;
     auto diff = midx >= m || nidx >= n ? DataT(0) : x[xidx] - y[yidx];
     acc += diff * diff;
   }
-  if (Sqrt) { acc = raft::mySqrt(acc); }
+  if (Sqrt) {
+    acc = raft::mySqrt(acc);
+  }
   ReduceOpT redOp;
   typedef cub::WarpReduce<cub::KeyValuePair<int, DataT>> WarpReduce;
   __shared__ typename WarpReduce::TempStorage temp[NWARPS];
   int warpId = threadIdx.x / raft::WarpSize;
   cub::KeyValuePair<int, DataT> tmp;
-  tmp.key   = nidx;
+  tmp.key = nidx;
   tmp.value = midx >= m || nidx >= n ? maxVal : acc;
-  tmp       = WarpReduce(temp[warpId]).Reduce(tmp, CubKVPMinReduce<int, DataT>());
+  tmp = WarpReduce(temp[warpId]).Reduce(tmp, CubKVPMinReduce<int, DataT>());
   if (threadIdx.x % raft::WarpSize == 0 && midx < m) {
     while (atomicCAS(workspace + midx, 0, 1) == 1)
       ;
@@ -74,15 +74,8 @@ __global__ void naiveKernel(cub::KeyValuePair<int, DataT>* min,
 }
 
 template <typename DataT, bool Sqrt>
-void naive(cub::KeyValuePair<int, DataT>* min,
-           DataT* x,
-           DataT* y,
-           int m,
-           int n,
-           int k,
-           int* workspace,
-           cudaStream_t stream)
-{
+void naive(cub::KeyValuePair<int, DataT> *min, DataT *x, DataT *y, int m, int n,
+           int k, int *workspace, cudaStream_t stream) {
   static const dim3 TPB(32, 16, 1);
   dim3 nblks(raft::ceildiv(n, (int)TPB.x), raft::ceildiv(m, (int)TPB.y), 1);
   CUDA_CHECK(cudaMemsetAsync(workspace, 0, sizeof(int) * m, stream));
@@ -92,7 +85,8 @@ void naive(cub::KeyValuePair<int, DataT>* min,
     <<<blks, 256, 0, stream>>>(min, m, std::numeric_limits<DataT>::max(), op);
   CUDA_CHECK(cudaGetLastError());
   naiveKernel<DataT, Sqrt, MinAndDistanceReduceOp<int, DataT>, 16>
-    <<<nblks, TPB, 0, stream>>>(min, x, y, m, n, k, workspace, std::numeric_limits<DataT>::max());
+    <<<nblks, TPB, 0, stream>>>(min, x, y, m, n, k, workspace,
+                                std::numeric_limits<DataT>::max());
   CUDA_CHECK(cudaGetLastError());
 }
 
@@ -106,8 +100,7 @@ struct Inputs {
 template <typename DataT, bool Sqrt>
 class FusedL2NNTest : public ::testing::TestWithParam<Inputs<DataT>> {
  public:
-  void SetUp() override
-  {
+  void SetUp() override {
     params = ::testing::TestWithParam<Inputs<DataT>>::GetParam();
     raft::random::Rng r(params.seed);
     int m = params.m;
@@ -128,8 +121,7 @@ class FusedL2NNTest : public ::testing::TestWithParam<Inputs<DataT>> {
     raft::linalg::rowNorm(yn, y, k, n, raft::linalg::L2Norm, true, stream);
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaStreamDestroy(stream));
     CUDA_CHECK(cudaFree(x));
@@ -144,38 +136,25 @@ class FusedL2NNTest : public ::testing::TestWithParam<Inputs<DataT>> {
  protected:
   Inputs<DataT> params;
   DataT *x, *y, *xn, *yn;
-  char* workspace;
-  cub::KeyValuePair<int, DataT>*min, *min_ref;
+  char *workspace;
+  cub::KeyValuePair<int, DataT> *min, *min_ref;
   cudaStream_t stream;
 
-  virtual void generateGoldenResult()
-  {
+  virtual void generateGoldenResult() {
     int m = params.m;
     int n = params.n;
     int k = params.k;
-    naive<DataT, Sqrt>(min_ref, x, y, m, n, k, (int*)workspace, stream);
+    naive<DataT, Sqrt>(min_ref, x, y, m, n, k, (int *)workspace, stream);
   }
 
-  void runTest(cub::KeyValuePair<int, DataT>* out)
-  {
+  void runTest(cub::KeyValuePair<int, DataT> *out) {
     int m = params.m;
     int n = params.n;
     int k = params.k;
     MinAndDistanceReduceOp<int, DataT> redOp;
-    fusedL2NN<DataT, cub::KeyValuePair<int, DataT>, int>(out,
-                                                         x,
-                                                         y,
-                                                         xn,
-                                                         yn,
-                                                         m,
-                                                         n,
-                                                         k,
-                                                         (void*)workspace,
-                                                         redOp,
-                                                         raft::distance::KVPMinReduce<int, DataT>(),
-                                                         Sqrt,
-                                                         true,
-                                                         stream);
+    fusedL2NN<DataT, cub::KeyValuePair<int, DataT>, int>(
+      out, x, y, xn, yn, m, n, k, (void *)workspace, redOp,
+      raft::distance::KVPMinReduce<int, DataT>(), Sqrt, true, stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 };
@@ -184,10 +163,9 @@ template <typename T>
 struct CompareApproxAbsKVP {
   typedef typename cub::KeyValuePair<int, T> KVP;
   CompareApproxAbsKVP(T eps_) : eps(eps_) {}
-  bool operator()(const KVP& a, const KVP& b) const
-  {
-    T diff  = raft::abs(raft::abs(a.value) - raft::abs(b.value));
-    T m     = std::max(raft::abs(a.value), raft::abs(b.value));
+  bool operator()(const KVP &a, const KVP &b) const {
+    T diff = raft::abs(raft::abs(a.value) - raft::abs(b.value));
+    T m = std::max(raft::abs(a.value), raft::abs(b.value));
     T ratio = m >= eps ? diff / m : diff;
     return (ratio <= eps);
   }
@@ -199,20 +177,17 @@ struct CompareApproxAbsKVP {
 template <typename T>
 struct CompareExactKVP {
   typedef typename cub::KeyValuePair<int, T> KVP;
-  bool operator()(const KVP& a, const KVP& b) const
-  {
+  bool operator()(const KVP &a, const KVP &b) const {
     if (a.value != b.value) return false;
     return true;
   }
 };
 
 template <typename K, typename V, typename L>
-::testing::AssertionResult devArrMatch(const cub::KeyValuePair<K, V>* expected,
-                                       const cub::KeyValuePair<K, V>* actual,
-                                       size_t size,
-                                       L eq_compare,
-                                       cudaStream_t stream = 0)
-{
+::testing::AssertionResult devArrMatch(const cub::KeyValuePair<K, V> *expected,
+                                       const cub::KeyValuePair<K, V> *actual,
+                                       size_t size, L eq_compare,
+                                       cudaStream_t stream = 0) {
   typedef typename cub::KeyValuePair<K, V> KVP;
   std::shared_ptr<KVP> exp_h(new KVP[size]);
   std::shared_ptr<KVP> act_h(new KVP[size]);
@@ -224,42 +199,47 @@ template <typename K, typename V, typename L>
     auto act = act_h.get()[i];
     if (!eq_compare(exp, act)) {
       return ::testing::AssertionFailure()
-             << "actual=" << act.key << "," << act.value << " != expected=" << exp.key << ","
-             << exp.value << " @" << i;
+             << "actual=" << act.key << "," << act.value
+             << " != expected=" << exp.key << "," << exp.value << " @" << i;
     }
   }
   return ::testing::AssertionSuccess();
 }
 
 const std::vector<Inputs<float>> inputsf = {
-  {0.001f, 32, 32, 32, 1234ULL},   {0.001f, 32, 64, 32, 1234ULL},   {0.001f, 64, 32, 32, 1234ULL},
-  {0.001f, 64, 64, 32, 1234ULL},   {0.001f, 128, 32, 32, 1234ULL},  {0.001f, 128, 64, 32, 1234ULL},
+  {0.001f, 32, 32, 32, 1234ULL},   {0.001f, 32, 64, 32, 1234ULL},
+  {0.001f, 64, 32, 32, 1234ULL},   {0.001f, 64, 64, 32, 1234ULL},
+  {0.001f, 128, 32, 32, 1234ULL},  {0.001f, 128, 64, 32, 1234ULL},
   {0.001f, 128, 128, 64, 1234ULL}, {0.001f, 64, 128, 128, 1234ULL},
 
-  {0.001f, 32, 32, 34, 1234ULL},   {0.001f, 32, 64, 34, 1234ULL},   {0.001f, 64, 32, 34, 1234ULL},
-  {0.001f, 64, 64, 34, 1234ULL},   {0.001f, 128, 32, 34, 1234ULL},  {0.001f, 128, 64, 34, 1234ULL},
+  {0.001f, 32, 32, 34, 1234ULL},   {0.001f, 32, 64, 34, 1234ULL},
+  {0.001f, 64, 32, 34, 1234ULL},   {0.001f, 64, 64, 34, 1234ULL},
+  {0.001f, 128, 32, 34, 1234ULL},  {0.001f, 128, 64, 34, 1234ULL},
   {0.001f, 128, 128, 66, 1234ULL}, {0.001f, 64, 128, 130, 1234ULL},
 
-  {0.001f, 32, 32, 33, 1234ULL},   {0.001f, 32, 64, 33, 1234ULL},   {0.001f, 64, 32, 33, 1234ULL},
-  {0.001f, 64, 64, 33, 1234ULL},   {0.001f, 128, 32, 33, 1234ULL},  {0.001f, 128, 64, 33, 1234ULL},
+  {0.001f, 32, 32, 33, 1234ULL},   {0.001f, 32, 64, 33, 1234ULL},
+  {0.001f, 64, 32, 33, 1234ULL},   {0.001f, 64, 64, 33, 1234ULL},
+  {0.001f, 128, 32, 33, 1234ULL},  {0.001f, 128, 64, 33, 1234ULL},
   {0.001f, 128, 128, 65, 1234ULL}, {0.001f, 64, 128, 129, 1234ULL},
 
   {0.006f, 1805, 134, 2, 1234ULL},
 };
 typedef FusedL2NNTest<float, false> FusedL2NNTestF_Sq;
-TEST_P(FusedL2NNTestF_Sq, Result)
-{
+TEST_P(FusedL2NNTestF_Sq, Result) {
   runTest(min);
-  ASSERT_TRUE(devArrMatch(min_ref, min, params.m, CompareApproxAbsKVP<float>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(min_ref, min, params.m,
+                          CompareApproxAbsKVP<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sq, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sq,
+                        ::testing::ValuesIn(inputsf));
 typedef FusedL2NNTest<float, true> FusedL2NNTestF_Sqrt;
-TEST_P(FusedL2NNTestF_Sqrt, Result)
-{
+TEST_P(FusedL2NNTestF_Sqrt, Result) {
   runTest(min);
-  ASSERT_TRUE(devArrMatch(min_ref, min, params.m, CompareApproxAbsKVP<float>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(min_ref, min, params.m,
+                          CompareApproxAbsKVP<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sqrt, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sqrt,
+                        ::testing::ValuesIn(inputsf));
 
 const std::vector<Inputs<double>> inputsd = {
   {0.00001, 32, 32, 32, 1234ULL},   {0.00001, 32, 64, 32, 1234ULL},
@@ -280,38 +260,38 @@ const std::vector<Inputs<double>> inputsd = {
   {0.00001, 1805, 134, 2, 1234ULL},
 };
 typedef FusedL2NNTest<double, false> FusedL2NNTestD_Sq;
-TEST_P(FusedL2NNTestD_Sq, Result)
-{
+TEST_P(FusedL2NNTestD_Sq, Result) {
   runTest(min);
-  ASSERT_TRUE(devArrMatch(min_ref, min, params.m, CompareApproxAbsKVP<double>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(min_ref, min, params.m,
+                          CompareApproxAbsKVP<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sq, ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sq,
+                        ::testing::ValuesIn(inputsd));
 typedef FusedL2NNTest<double, true> FusedL2NNTestD_Sqrt;
-TEST_P(FusedL2NNTestD_Sqrt, Result)
-{
+TEST_P(FusedL2NNTestD_Sqrt, Result) {
   runTest(min);
-  ASSERT_TRUE(devArrMatch(min_ref, min, params.m, CompareApproxAbsKVP<double>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(min_ref, min, params.m,
+                          CompareApproxAbsKVP<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sqrt, ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sqrt,
+                        ::testing::ValuesIn(inputsd));
 
 /// This is to test output determinism of the prim
 template <typename DataT, bool Sqrt>
 class FusedL2NNDetTest : public FusedL2NNTest<DataT, Sqrt> {
-  void SetUp() override
-  {
+  void SetUp() override {
     FusedL2NNTest<DataT, Sqrt>::SetUp();
     int m = this->params.m;
     raft::allocate(min1, m);
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     FusedL2NNTest<DataT, Sqrt>::TearDown();
     CUDA_CHECK(cudaFree(min1));
   }
 
  protected:
-  cub::KeyValuePair<int, DataT>* min1;
+  cub::KeyValuePair<int, DataT> *min1;
 
   static const int NumRepeats = 100;
 
@@ -319,46 +299,46 @@ class FusedL2NNDetTest : public FusedL2NNTest<DataT, Sqrt> {
 };
 
 typedef FusedL2NNDetTest<float, false> FusedL2NNDetTestF_Sq;
-TEST_P(FusedL2NNDetTestF_Sq, Result)
-{
+TEST_P(FusedL2NNDetTestF_Sq, Result) {
   runTest(min);  // assumed to be golden
   for (int i = 0; i < NumRepeats; ++i) {
     runTest(min1);
     ASSERT_TRUE(devArrMatch(min, min1, params.m, CompareExactKVP<float>()));
   }
 }
-INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sq, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sq,
+                        ::testing::ValuesIn(inputsf));
 typedef FusedL2NNDetTest<float, true> FusedL2NNDetTestF_Sqrt;
-TEST_P(FusedL2NNDetTestF_Sqrt, Result)
-{
+TEST_P(FusedL2NNDetTestF_Sqrt, Result) {
   runTest(min);  // assumed to be golden
   for (int i = 0; i < NumRepeats; ++i) {
     runTest(min1);
     ASSERT_TRUE(devArrMatch(min, min1, params.m, CompareExactKVP<float>()));
   }
 }
-INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sqrt, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sqrt,
+                        ::testing::ValuesIn(inputsf));
 
 typedef FusedL2NNDetTest<double, false> FusedL2NNDetTestD_Sq;
-TEST_P(FusedL2NNDetTestD_Sq, Result)
-{
+TEST_P(FusedL2NNDetTestD_Sq, Result) {
   runTest(min);  // assumed to be golden
   for (int i = 0; i < NumRepeats; ++i) {
     runTest(min1);
     ASSERT_TRUE(devArrMatch(min, min1, params.m, CompareExactKVP<double>()));
   }
 }
-INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sq, ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sq,
+                        ::testing::ValuesIn(inputsd));
 typedef FusedL2NNDetTest<double, true> FusedL2NNDetTestD_Sqrt;
-TEST_P(FusedL2NNDetTestD_Sqrt, Result)
-{
+TEST_P(FusedL2NNDetTestD_Sqrt, Result) {
   runTest(min);  // assumed to be golden
   for (int i = 0; i < NumRepeats; ++i) {
     runTest(min1);
     ASSERT_TRUE(devArrMatch(min, min1, params.m, CompareExactKVP<double>()));
   }
 }
-INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sqrt, ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sqrt,
+                        ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu
index e14841eb54..e6ee09262e 100644
--- a/cpp/test/eigen_solvers.cu
+++ b/cpp/test/eigen_solvers.cu
@@ -23,8 +23,7 @@
 
 namespace raft {
 
-TEST(Raft, EigenSolvers)
-{
+TEST(Raft, EigenSolvers) {
   using namespace matrix;
   using index_type = int;
   using value_type = double;
@@ -36,10 +35,10 @@ TEST(Raft, EigenSolvers)
   index_type* ro{nullptr};
   index_type* ci{nullptr};
   value_type* vs{nullptr};
-  index_type nnz   = 0;
+  index_type nnz = 0;
   index_type nrows = 0;
-  auto stream      = h.get_stream();
-  auto t_exe_pol   = thrust::cuda::par.on(stream);
+  auto stream = h.get_stream();
+  auto t_exe_pol = thrust::cuda::par.on(stream);
 
   sparse_matrix_t<index_type, value_type> sm1{h, ro, ci, vs, nrows, nnz};
   ASSERT_EQ(nullptr, sm1.row_offsets_);
@@ -50,7 +49,7 @@ TEST(Raft, EigenSolvers)
   value_type tol{1.0e-10};
   bool reorthog{true};
 
-  // nullptr expected to trigger exceptions:
+  //nullptr expected to trigger exceptions:
   //
   value_type* eigvals{nullptr};
   value_type* eigvecs{nullptr};
@@ -61,13 +60,14 @@ TEST(Raft, EigenSolvers)
 
   lanczos_solver_t<index_type, value_type> eig_solver{cfg};
 
-  EXPECT_ANY_THROW(eig_solver.solve_smallest_eigenvectors(h, sm1, eigvals, eigvecs));
+  EXPECT_ANY_THROW(
+    eig_solver.solve_smallest_eigenvectors(h, sm1, eigvals, eigvecs));
 
-  EXPECT_ANY_THROW(eig_solver.solve_largest_eigenvectors(h, sm1, eigvals, eigvecs));
+  EXPECT_ANY_THROW(
+    eig_solver.solve_largest_eigenvectors(h, sm1, eigvals, eigvecs));
 }
 
-TEST(Raft, SpectralSolvers)
-{
+TEST(Raft, SpectralSolvers) {
   using namespace matrix;
   using index_type = int;
   using value_type = double;
@@ -82,7 +82,7 @@ TEST(Raft, SpectralSolvers)
   value_type tol{1.0e-10};
   bool reorthog{true};
 
-  // nullptr expected to trigger exceptions:
+  //nullptr expected to trigger exceptions:
   //
   index_type* clusters{nullptr};
   value_type* eigvals{nullptr};
@@ -96,19 +96,22 @@ TEST(Raft, SpectralSolvers)
 
   index_type k{5};
 
-  cluster_solver_config_t<index_type, value_type> clust_cfg{k, maxiter, tol, seed};
+  cluster_solver_config_t<index_type, value_type> clust_cfg{k, maxiter, tol,
+                                                            seed};
   kmeans_solver_t<index_type, value_type> cluster_solver{clust_cfg};
 
   auto stream = h.get_stream();
 
   auto t_exe_p = thrust::cuda::par.on(stream);
-  sparse_matrix_t<index_type, value_type> sm{h, nullptr, nullptr, nullptr, 0, 0};
-  EXPECT_ANY_THROW(
-    spectral::partition(h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs));
+  sparse_matrix_t<index_type, value_type> sm{h,       nullptr, nullptr,
+                                             nullptr, 0,       0};
+  EXPECT_ANY_THROW(spectral::partition(
+    h, t_exe_p, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs));
 
   value_type edgeCut{0};
   value_type cost{0};
-  EXPECT_ANY_THROW(spectral::analyzePartition(h, t_exe_p, sm, k, clusters, edgeCut, cost));
+  EXPECT_ANY_THROW(
+    spectral::analyzePartition(h, t_exe_p, sm, k, clusters, edgeCut, cost));
 }
 
 }  // namespace raft
diff --git a/cpp/test/handle.cpp b/cpp/test/handle.cpp
index 8023fca319..4cb9809844 100644
--- a/cpp/test/handle.cpp
+++ b/cpp/test/handle.cpp
@@ -22,8 +22,7 @@
 
 namespace raft {
 
-TEST(Raft, HandleDefault)
-{
+TEST(Raft, HandleDefault) {
   handle_t h;
   ASSERT_EQ(0, h.get_num_internal_streams());
   ASSERT_EQ(0, h.get_device());
@@ -34,8 +33,7 @@ TEST(Raft, HandleDefault)
   ASSERT_NE(nullptr, h.get_cusparse_handle());
 }
 
-TEST(Raft, Handle)
-{
+TEST(Raft, Handle) {
   handle_t h(4);
   ASSERT_EQ(4, h.get_num_internal_streams());
   cudaStream_t stream;
@@ -46,15 +44,13 @@ TEST(Raft, Handle)
   CUDA_CHECK(cudaStreamDestroy(stream));
 }
 
-TEST(Raft, GetInternalStreams)
-{
+TEST(Raft, GetInternalStreams) {
   handle_t h(4);
   auto streams = h.get_internal_streams();
   ASSERT_EQ(4U, streams.size());
 }
 
-TEST(Raft, GetHandleFromPool)
-{
+TEST(Raft, GetHandleFromPool) {
   handle_t parent(4);
 
   handle_t child(parent, 2);
@@ -68,8 +64,7 @@ TEST(Raft, GetHandleFromPool)
   ASSERT_EQ(parent.get_device(), child.get_device());
 }
 
-TEST(Raft, GetHandleFromPoolPerf)
-{
+TEST(Raft, GetHandleFromPoolPerf) {
   handle_t parent(100);
   auto start = curTimeMillis();
   for (int i = 0; i < parent.get_num_internal_streams(); i++) {
@@ -81,13 +76,13 @@ TEST(Raft, GetHandleFromPoolPerf)
   ASSERT_LE(curTimeMillis() - start, 10);
 }
 
-TEST(Raft, GetHandleStreamViews)
-{
+TEST(Raft, GetHandleStreamViews) {
   handle_t parent(4);
 
   handle_t child(parent, 2);
   ASSERT_EQ(parent.get_internal_stream_view(2), child.get_stream_view());
-  ASSERT_EQ(parent.get_internal_stream_view(2).value(), child.get_stream_view().value());
+  ASSERT_EQ(parent.get_internal_stream_view(2).value(),
+            child.get_stream_view().value());
   EXPECT_FALSE(child.get_stream_view().is_default());
 }
 }  // namespace raft
diff --git a/cpp/test/integer_utils.cpp b/cpp/test/integer_utils.cpp
index d883de59fe..830d085a40 100644
--- a/cpp/test/integer_utils.cpp
+++ b/cpp/test/integer_utils.cpp
@@ -20,8 +20,7 @@
 
 namespace raft {
 
-TEST(Raft, rounding_up)
-{
+TEST(Raft, rounding_up) {
   ASSERT_EQ(raft::div_rounding_up_safe(5, 3), 2);
   ASSERT_EQ(raft::div_rounding_up_safe(0, 3), 0);
   ASSERT_EQ(raft::div_rounding_up_safe(7, 8), 1);
@@ -30,8 +29,7 @@ TEST(Raft, rounding_up)
   ASSERT_EQ(raft::div_rounding_up_unsafe(7, 8), 1);
 }
 
-TEST(Raft, is_a_power_of_two)
-{
+TEST(Raft, is_a_power_of_two) {
   ASSERT_EQ(raft::is_a_power_of_two(1 << 5), true);
   ASSERT_EQ(raft::is_a_power_of_two((1 << 5) + 1), false);
 }
diff --git a/cpp/test/label/label.cu b/cpp/test/label/label.cu
index 209bb0355a..dc2846fdba 100644
--- a/cpp/test/label/label.cu
+++ b/cpp/test/label/label.cu
@@ -36,8 +36,7 @@ class labelTest : public ::testing::Test {
 };
 
 typedef labelTest MakeMonotonicTest;
-TEST_F(MakeMonotonicTest, Result)
-{
+TEST_F(MakeMonotonicTest, Result) {
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
 
@@ -49,14 +48,17 @@ TEST_F(MakeMonotonicTest, Result)
   raft::allocate(actual, m, true);
   raft::allocate(expected, m, true);
 
-  float* data_h = new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 8.0, 7.0, 8.0, 8.0, 25.0, 80.0};
+  float *data_h =
+    new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 8.0, 7.0, 8.0, 8.0, 25.0, 80.0};
 
-  float* expected_h = new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 5.0, 4.0, 5.0, 5.0, 6.0, 7.0};
+  float *expected_h =
+    new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 5.0, 4.0, 5.0, 5.0, 6.0, 7.0};
 
   raft::update_device(data, data_h, m, stream);
   raft::update_device(expected, expected_h, m, stream);
 
-  std::shared_ptr<raft::mr::device::allocator> allocator(new raft::mr::device::default_allocator);
+  std::shared_ptr<raft::mr::device::allocator> allocator(
+    new raft::mr::device::default_allocator);
   make_monotonic(actual, data, m, stream, allocator);
 
   CUDA_CHECK(cudaStreamSynchronize(stream));
@@ -71,36 +73,37 @@ TEST_F(MakeMonotonicTest, Result)
   delete expected_h;
 }
 
-TEST(labelTest, Classlabels)
-{
+TEST(labelTest, Classlabels) {
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  std::shared_ptr<raft::mr::device::allocator> allocator(new raft::mr::device::default_allocator);
+  std::shared_ptr<raft::mr::device::allocator> allocator(
+    new raft::mr::device::default_allocator);
 
   int n_rows = 6;
-  float* y_d;
+  float *y_d;
   raft::allocate(y_d, n_rows);
 
   float y_h[] = {2, -1, 1, 2, 1, 1};
   raft::update_device(y_d, y_h, n_rows, stream);
 
   int n_classes;
-  float* y_unique_d;
+  float *y_unique_d;
   getUniquelabels(y_d, n_rows, &y_unique_d, &n_classes, stream, allocator);
 
   ASSERT_EQ(n_classes, 3);
 
   float y_unique_exp[] = {-1, 1, 2};
-  EXPECT_TRUE(devArrMatchHost(y_unique_exp, y_unique_d, n_classes, raft::Compare<float>(), stream));
+  EXPECT_TRUE(devArrMatchHost(y_unique_exp, y_unique_d, n_classes,
+                              raft::Compare<float>(), stream));
 
-  float* y_relabeled_d;
+  float *y_relabeled_d;
   raft::allocate(y_relabeled_d, n_rows);
 
   getOvrlabels(y_d, n_rows, y_unique_d, n_classes, y_relabeled_d, 2, stream);
 
   float y_relabeled_exp[] = {1, -1, -1, 1, -1, -1};
-  EXPECT_TRUE(
-    devArrMatchHost(y_relabeled_exp, y_relabeled_d, n_rows, raft::Compare<float>(), stream));
+  EXPECT_TRUE(devArrMatchHost(y_relabeled_exp, y_relabeled_d, n_rows,
+                              raft::Compare<float>(), stream));
 
   CUDA_CHECK(cudaStreamDestroy(stream));
   CUDA_CHECK(cudaFree(y_d));
diff --git a/cpp/test/label/merge_labels.cu b/cpp/test/label/merge_labels.cu
index 3d930ff22e..a2f14a8dbc 100644
--- a/cpp/test/label/merge_labels.cu
+++ b/cpp/test/label/merge_labels.cu
@@ -39,7 +39,8 @@ struct MergeLabelsInputs {
 };
 
 template <typename Index_>
-class MergeLabelsTest : public ::testing::TestWithParam<MergeLabelsInputs<Index_>> {
+class MergeLabelsTest
+  : public ::testing::TestWithParam<MergeLabelsInputs<Index_>> {
  protected:
   MergeLabelsTest()
     : params(::testing::TestWithParam<MergeLabelsInputs<Index_>>::GetParam()),
@@ -49,23 +50,25 @@ class MergeLabelsTest : public ::testing::TestWithParam<MergeLabelsInputs<Index_
       expected(params.N, stream),
       R(params.N, stream),
       mask(params.N, stream),
-      m(1, stream)
-  {
-  }
-
-  void Run()
-  {
-    raft::update_device(labels_a.data(), params.labels_a.data(), params.N, stream);
-    raft::update_device(labels_b.data(), params.labels_b.data(), params.N, stream);
-    raft::update_device(expected.data(), params.expected.data(), params.N, stream);
-    raft::update_device(mask.data(), reinterpret_cast<bool*>(params.mask.data()), params.N, stream);
-
-    merge_labels(
-      labels_a.data(), labels_b.data(), mask.data(), R.data(), m.data(), params.N, stream);
+      m(1, stream) {}
+
+  void Run() {
+    raft::update_device(labels_a.data(), params.labels_a.data(), params.N,
+                        stream);
+    raft::update_device(labels_b.data(), params.labels_b.data(), params.N,
+                        stream);
+    raft::update_device(expected.data(), params.expected.data(), params.N,
+                        stream);
+    raft::update_device(mask.data(),
+                        reinterpret_cast<bool *>(params.mask.data()), params.N,
+                        stream);
+
+    merge_labels(labels_a.data(), labels_b.data(), mask.data(), R.data(),
+                 m.data(), params.N, stream);
 
     cudaStreamSynchronize(stream);
-    ASSERT_TRUE(raft::devArrMatch<Index_>(
-      expected.data(), labels_a.data(), params.N, raft::Compare<Index_>()));
+    ASSERT_TRUE(raft::devArrMatch<Index_>(expected.data(), labels_a.data(),
+                                          params.N, raft::Compare<Index_>()));
   }
 
  protected:
@@ -82,14 +85,22 @@ TEST_P(MergeLabelsTestI, Result) { Run(); }
 using MergeLabelsTestL = MergeLabelsTest<int64_t>;
 TEST_P(MergeLabelsTestL, Result) { Run(); }
 
-constexpr int MAX32     = std::numeric_limits<int>::max();
+constexpr int MAX32 = std::numeric_limits<int>::max();
 constexpr int64_t MAX64 = std::numeric_limits<int64_t>::max();
 
 const std::vector<MergeLabelsInputs<int>> merge_inputs_32 = {
   {4, {1, 1, 3, MAX32}, {1, 3, 3, 1}, {1, 0, 1, 0}, {1, 1, 3, 1}},
   {5, {1, 2, 2, 2, 1}, {4, 2, 4, 4, 4}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
-  {6, {1, 2, 1, 4, 5, MAX32}, {1, 2, MAX32, 4, 5, 4}, {1, 1, 0, 1, 1, 0}, {1, 2, 1, 4, 5, 4}},
-  {6, {1, 2, 2, 2, 2, 6}, {1, 1, 1, 5, 5, 5}, {1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1}},
+  {6,
+   {1, 2, 1, 4, 5, MAX32},
+   {1, 2, MAX32, 4, 5, 4},
+   {1, 1, 0, 1, 1, 0},
+   {1, 2, 1, 4, 5, 4}},
+  {6,
+   {1, 2, 2, 2, 2, 6},
+   {1, 1, 1, 5, 5, 5},
+   {1, 1, 1, 1, 1, 1},
+   {1, 1, 1, 1, 1, 1}},
   {8,
    {1, 1, 3, 3, MAX32, 1, 3, MAX32},
    {1, 2, 3, 2, MAX32, 2, 2, 2},
@@ -105,8 +116,16 @@ const std::vector<MergeLabelsInputs<int>> merge_inputs_32 = {
 const std::vector<MergeLabelsInputs<int64_t>> merge_inputs_64 = {
   {4, {1, 1, 3, MAX64}, {1, 3, 3, 1}, {1, 0, 1, 0}, {1, 1, 3, 1}},
   {5, {1, 2, 2, 2, 1}, {4, 2, 4, 4, 4}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
-  {6, {1, 2, 1, 4, 5, MAX64}, {1, 2, MAX64, 4, 5, 4}, {1, 1, 0, 1, 1, 0}, {1, 2, 1, 4, 5, 4}},
-  {6, {1, 2, 2, 2, 2, 6}, {1, 1, 1, 5, 5, 5}, {1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1}},
+  {6,
+   {1, 2, 1, 4, 5, MAX64},
+   {1, 2, MAX64, 4, 5, 4},
+   {1, 1, 0, 1, 1, 0},
+   {1, 2, 1, 4, 5, 4}},
+  {6,
+   {1, 2, 2, 2, 2, 6},
+   {1, 1, 1, 5, 5, 5},
+   {1, 1, 1, 1, 1, 1},
+   {1, 1, 1, 1, 1, 1}},
   {8,
    {1, 1, 3, 3, MAX64, 1, 3, MAX64},
    {1, 2, 3, 2, MAX64, 2, 2, 2},
@@ -119,8 +138,10 @@ const std::vector<MergeLabelsInputs<int64_t>> merge_inputs_64 = {
    {1, 1, 1, 1, 1, 7, 7, 7}},
 };
 
-INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestI, ::testing::ValuesIn(merge_inputs_32));
-INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestL, ::testing::ValuesIn(merge_inputs_64));
+INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestI,
+                        ::testing::ValuesIn(merge_inputs_32));
+INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestL,
+                        ::testing::ValuesIn(merge_inputs_64));
 
 }  // namespace label
 }  // namespace raft
diff --git a/cpp/test/lap/lap.cu b/cpp/test/lap/lap.cu
index 61c7182c72..04f473f836 100644
--- a/cpp/test/lap/lap.cu
+++ b/cpp/test/lap/lap.cu
@@ -29,11 +29,11 @@
 #include <raft/lap/lap.cuh>
 #include <random>
 
-#define PROBLEMSIZE  1000  // Number of rows/columns
-#define BATCHSIZE    10    // Number of problems in the batch
-#define COSTRANGE    1000
+#define PROBLEMSIZE 1000  // Number of rows/columns
+#define BATCHSIZE 10      // Number of problems in the batch
+#define COSTRANGE 1000
 #define PROBLEMCOUNT 1
-#define REPETITIONS  1
+#define REPETITIONS 1
 
 #define SEED 01010001
 
@@ -43,43 +43,41 @@ namespace raft {
 
 // Function for generating problem with uniformly distributed integer costs between [0, COSTRANGE].
 template <typename weight_t>
-void generateProblem(weight_t* cost_matrix, int SP, int N, int costrange)
-{
+void generateProblem(weight_t *cost_matrix, int SP, int N, int costrange) {
   long N2 = SP * N * N;
 
   std::uniform_int_distribution<int> distribution(0, costrange);
 
   for (long i = 0; i < N2; i++) {
-    int val        = distribution(generator);
+    int val = distribution(generator);
     cost_matrix[i] = (weight_t)val;
   }
 }
 
 template <typename vertex_t, typename weight_t>
-void hungarian_test(int problemsize,
-                    int costrange,
-                    int problemcount,
-                    int repetitions,
-                    int batchsize,
-                    weight_t epsilon,
-                    bool verbose = false)
-{
+void hungarian_test(int problemsize, int costrange, int problemcount,
+                    int repetitions, int batchsize, weight_t epsilon,
+                    bool verbose = false) {
   raft::handle_t handle;
 
-  weight_t* h_cost = new weight_t[batchsize * problemsize * problemsize];
+  weight_t *h_cost = new weight_t[batchsize * problemsize * problemsize];
 
   for (int j = 0; j < problemcount; j++) {
     generateProblem(h_cost, batchsize, problemsize, costrange);
 
     raft::mr::device::buffer<weight_t> elements_v(
-      handle.get_device_allocator(), handle.get_stream(), batchsize * problemsize * problemsize);
+      handle.get_device_allocator(), handle.get_stream(),
+      batchsize * problemsize * problemsize);
     raft::mr::device::buffer<vertex_t> row_assignment_v(
-      handle.get_device_allocator(), handle.get_stream(), batchsize * problemsize);
+      handle.get_device_allocator(), handle.get_stream(),
+      batchsize * problemsize);
     raft::mr::device::buffer<vertex_t> col_assignment_v(
-      handle.get_device_allocator(), handle.get_stream(), batchsize * problemsize);
+      handle.get_device_allocator(), handle.get_stream(),
+      batchsize * problemsize);
 
-    raft::update_device(
-      elements_v.data(), h_cost, batchsize * problemsize * problemsize, handle.get_stream());
+    raft::update_device(elements_v.data(), h_cost,
+                        batchsize * problemsize * problemsize,
+                        handle.get_stream());
 
     for (int i = 0; i < repetitions; i++) {
       float start = omp_get_wtime();
@@ -89,18 +87,20 @@ void hungarian_test(int problemsize,
         handle, problemsize, batchsize, epsilon);
 
       // Solve LAP(s) for given cost matrix
-      lpx.solve(elements_v.data(), row_assignment_v.data(), col_assignment_v.data());
+      lpx.solve(elements_v.data(), row_assignment_v.data(),
+                col_assignment_v.data());
 
       float end = omp_get_wtime();
 
       float total_time = (end - start);
 
       if (verbose) {
-        // Use getPrimalObjectiveValue and getDualObjectiveValue APIs to get primal and dual
-        // objectives. At optimality both values should match.
+        // Use getPrimalObjectiveValue and getDualObjectiveValue APIs to get primal and dual objectives. At optimality both values should match.
         for (int k = 0; k < batchsize; k++) {
-          std::cout << j << ":" << i << ":" << k << ":" << lpx.getPrimalObjectiveValue(k) << ":"
-                    << lpx.getDualObjectiveValue(k) << ":" << total_time << std::endl;
+          std::cout << j << ":" << i << ":" << k << ":"
+                    << lpx.getPrimalObjectiveValue(k) << ":"
+                    << lpx.getDualObjectiveValue(k) << ":" << total_time
+                    << std::endl;
         }
       }
     }
@@ -109,38 +109,34 @@ void hungarian_test(int problemsize,
   delete[] h_cost;
 }
 
-TEST(Raft, HungarianIntFloat)
-{
-  hungarian_test<int, float>(
-    PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, float{1e-6});
+TEST(Raft, HungarianIntFloat) {
+  hungarian_test<int, float>(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS,
+                             BATCHSIZE, float{1e-6});
 }
 
-TEST(Raft, HungarianIntDouble)
-{
-  hungarian_test<int, double>(
-    PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, double{1e-6});
+TEST(Raft, HungarianIntDouble) {
+  hungarian_test<int, double>(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS,
+                              BATCHSIZE, double{1e-6});
 }
 
-TEST(Raft, HungarianIntLong)
-{
-  hungarian_test<int, long>(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, long{0});
+TEST(Raft, HungarianIntLong) {
+  hungarian_test<int, long>(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS,
+                            BATCHSIZE, long{0});
 }
 
-TEST(Raft, HungarianLongFloat)
-{
-  hungarian_test<long, float>(
-    PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, float{1e-6});
+TEST(Raft, HungarianLongFloat) {
+  hungarian_test<long, float>(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS,
+                              BATCHSIZE, float{1e-6});
 }
 
-TEST(Raft, HungarianLongDouble)
-{
-  hungarian_test<long, double>(
-    PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, double{1e-6});
+TEST(Raft, HungarianLongDouble) {
+  hungarian_test<long, double>(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT,
+                               REPETITIONS, BATCHSIZE, double{1e-6});
 }
 
-TEST(Raft, HungarianLongLong)
-{
-  hungarian_test<long, long>(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, long{0});
+TEST(Raft, HungarianLongLong) {
+  hungarian_test<long, long>(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS,
+                             BATCHSIZE, long{0});
 }
 
 }  // namespace raft
diff --git a/cpp/test/linalg/add.cu b/cpp/test/linalg/add.cu
index 38e189f27e..2fc9d4e30f 100644
--- a/cpp/test/linalg/add.cu
+++ b/cpp/test/linalg/add.cu
@@ -27,8 +27,7 @@ namespace linalg {
 template <typename InT, typename OutT = InT>
 class AddTest : public ::testing::TestWithParam<AddInputs<InT, OutT>> {
  protected:
-  void SetUp() override
-  {
+  void SetUp() override {
     params = ::testing::TestWithParam<AddInputs<InT, OutT>>::GetParam();
     raft::random::Rng r(params.seed);
     int len = params.len;
@@ -43,8 +42,7 @@ class AddTest : public ::testing::TestWithParam<AddInputs<InT, OutT>> {
     add<InT, OutT>(out, in1, in2, len, stream);
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(in1));
     CUDA_CHECK(cudaFree(in2));
@@ -53,10 +51,9 @@ class AddTest : public ::testing::TestWithParam<AddInputs<InT, OutT>> {
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void compare()
-  {
-    ASSERT_TRUE(
-      raft::devArrMatch(out_ref, out, params.len, raft::CompareApprox<OutT>(params.tolerance)));
+  void compare() {
+    ASSERT_TRUE(raft::devArrMatch(out_ref, out, params.len,
+                                  raft::CompareApprox<OutT>(params.tolerance)));
   }
 
  protected:
diff --git a/cpp/test/linalg/add.cuh b/cpp/test/linalg/add.cuh
index 1d9352bfc1..137419758f 100644
--- a/cpp/test/linalg/add.cuh
+++ b/cpp/test/linalg/add.cuh
@@ -23,17 +23,18 @@ namespace raft {
 namespace linalg {
 
 template <typename InT, typename OutT = InT>
-__global__ void naiveAddElemKernel(OutT* out, const InT* in1, const InT* in2, int len)
-{
+__global__ void naiveAddElemKernel(OutT *out, const InT *in1, const InT *in2,
+                                   int len) {
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < len) { out[idx] = OutT(in1[idx] + in2[idx]); }
+  if (idx < len) {
+    out[idx] = OutT(in1[idx] + in2[idx]);
+  }
 }
 
 template <typename InT, typename OutT = InT>
-void naiveAddElem(OutT* out, const InT* in1, const InT* in2, int len)
-{
+void naiveAddElem(OutT *out, const InT *in1, const InT *in2, int len) {
   static const int TPB = 64;
-  int nblks            = raft::ceildiv(len, TPB);
+  int nblks = raft::ceildiv(len, TPB);
   naiveAddElemKernel<InT, OutT><<<nblks, TPB>>>(out, in1, in2, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
@@ -46,8 +47,8 @@ struct AddInputs {
 };
 
 template <typename InT, typename OutT = InT>
-::std::ostream& operator<<(::std::ostream& os, const AddInputs<InT, OutT>& dims)
-{
+::std::ostream &operator<<(::std::ostream &os,
+                           const AddInputs<InT, OutT> &dims) {
   return os;
 }
 
diff --git a/cpp/test/linalg/binary_op.cu b/cpp/test/linalg/binary_op.cu
index 078c41356a..3ae4f86066 100644
--- a/cpp/test/linalg/binary_op.cu
+++ b/cpp/test/linalg/binary_op.cu
@@ -29,19 +29,20 @@ namespace linalg {
 // for an extended __device__ lambda cannot have private or protected access
 // within its class
 template <typename InType, typename IdxType, typename OutType>
-void binaryOpLaunch(
-  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
-{
+void binaryOpLaunch(OutType *out, const InType *in1, const InType *in2,
+                    IdxType len, cudaStream_t stream) {
   binaryOp(
-    out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, stream);
+    out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; },
+    stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
-class BinaryOpTest : public ::testing::TestWithParam<BinaryOpInputs<InType, IdxType, OutType>> {
+class BinaryOpTest
+  : public ::testing::TestWithParam<BinaryOpInputs<InType, IdxType, OutType>> {
  protected:
-  void SetUp() override
-  {
-    params = ::testing::TestWithParam<BinaryOpInputs<InType, IdxType, OutType>>::GetParam();
+  void SetUp() override {
+    params = ::testing::TestWithParam<
+      BinaryOpInputs<InType, IdxType, OutType>>::GetParam();
     raft::random::Rng r(params.seed);
 
     cudaStream_t stream;
@@ -58,8 +59,7 @@ class BinaryOpTest : public ::testing::TestWithParam<BinaryOpInputs<InType, IdxT
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(in1));
     CUDA_CHECK(cudaFree(in2));
     CUDA_CHECK(cudaFree(out_ref));
@@ -72,61 +72,67 @@ class BinaryOpTest : public ::testing::TestWithParam<BinaryOpInputs<InType, IdxT
   OutType *out_ref, *out;
 };
 
-const std::vector<BinaryOpInputs<float, int>> inputsf_i32 = {{0.000001f, 1024 * 1024, 1234ULL}};
+const std::vector<BinaryOpInputs<float, int>> inputsf_i32 = {
+  {0.000001f, 1024 * 1024, 1234ULL}};
 typedef BinaryOpTest<float, int> BinaryOpTestF_i32;
-TEST_P(BinaryOpTestF_i32, Result)
-{
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<float>(params.tolerance)));
+TEST_P(BinaryOpTestF_i32, Result) {
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
+                          CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32, ::testing::ValuesIn(inputsf_i32));
+INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32,
+                         ::testing::ValuesIn(inputsf_i32));
 
-const std::vector<BinaryOpInputs<float, size_t>> inputsf_i64 = {{0.000001f, 1024 * 1024, 1234ULL}};
+const std::vector<BinaryOpInputs<float, size_t>> inputsf_i64 = {
+  {0.000001f, 1024 * 1024, 1234ULL}};
 typedef BinaryOpTest<float, size_t> BinaryOpTestF_i64;
-TEST_P(BinaryOpTestF_i64, Result)
-{
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<float>(params.tolerance)));
+TEST_P(BinaryOpTestF_i64, Result) {
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
+                          CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i64, ::testing::ValuesIn(inputsf_i64));
+INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i64,
+                         ::testing::ValuesIn(inputsf_i64));
 
 const std::vector<BinaryOpInputs<float, int, double>> inputsf_i32_d = {
   {0.000001f, 1024 * 1024, 1234ULL}};
 typedef BinaryOpTest<float, int, double> BinaryOpTestF_i32_D;
-TEST_P(BinaryOpTestF_i32_D, Result)
-{
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<double>(params.tolerance)));
+TEST_P(BinaryOpTestF_i32_D, Result) {
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
+                          CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32_D, ::testing::ValuesIn(inputsf_i32_d));
+INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32_D,
+                         ::testing::ValuesIn(inputsf_i32_d));
 
-const std::vector<BinaryOpInputs<double, int>> inputsd_i32 = {{0.00000001, 1024 * 1024, 1234ULL}};
+const std::vector<BinaryOpInputs<double, int>> inputsd_i32 = {
+  {0.00000001, 1024 * 1024, 1234ULL}};
 typedef BinaryOpTest<double, int> BinaryOpTestD_i32;
-TEST_P(BinaryOpTestD_i32, Result)
-{
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<double>(params.tolerance)));
+TEST_P(BinaryOpTestD_i32, Result) {
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
+                          CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i32, ::testing::ValuesIn(inputsd_i32));
+INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i32,
+                         ::testing::ValuesIn(inputsd_i32));
 
 const std::vector<BinaryOpInputs<double, size_t>> inputsd_i64 = {
   {0.00000001, 1024 * 1024, 1234ULL}};
 typedef BinaryOpTest<double, size_t> BinaryOpTestD_i64;
-TEST_P(BinaryOpTestD_i64, Result)
-{
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<double>(params.tolerance)));
+TEST_P(BinaryOpTestD_i64, Result) {
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
+                          CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i64, ::testing::ValuesIn(inputsd_i64));
+INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i64,
+                         ::testing::ValuesIn(inputsd_i64));
 
 template <typename math_t>
 class BinaryOpAlignment : public ::testing::Test {
  protected:
-  BinaryOpAlignment()
-  {
+  BinaryOpAlignment() {
     CUDA_CHECK(cudaStreamCreate(&stream));
     handle.set_stream(stream);
   }
   void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); }
 
  public:
-  void Misaligned()
-  {
+  void Misaligned() {
     // Test to trigger cudaErrorMisalignedAddress if veclen is incorrectly
     // chosen.
     int n = 1024;
@@ -136,12 +142,8 @@ class BinaryOpAlignment : public ::testing::Test {
     CUDA_CHECK(cudaMemsetAsync(x.data(), 0, n * sizeof(math_t), stream));
     CUDA_CHECK(cudaMemsetAsync(y.data(), 0, n * sizeof(math_t), stream));
     raft::linalg::binaryOp(
-      z.data() + 9,
-      x.data() + 137,
-      y.data() + 19,
-      256,
-      [] __device__(math_t x, math_t y) { return x + y; },
-      stream);
+      z.data() + 9, x.data() + 137, y.data() + 19, 256,
+      [] __device__(math_t x, math_t y) { return x + y; }, stream);
   }
 
   raft::handle_t handle;
diff --git a/cpp/test/linalg/binary_op.cuh b/cpp/test/linalg/binary_op.cuh
index 97cb3ecb24..fd8ed6dd1e 100644
--- a/cpp/test/linalg/binary_op.cuh
+++ b/cpp/test/linalg/binary_op.cuh
@@ -24,17 +24,18 @@ namespace raft {
 namespace linalg {
 
 template <typename InType, typename OutType, typename IdxType>
-__global__ void naiveAddKernel(OutType* out, const InType* in1, const InType* in2, IdxType len)
-{
+__global__ void naiveAddKernel(OutType *out, const InType *in1,
+                               const InType *in2, IdxType len) {
   IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * (IdxType)blockDim.x);
-  if (idx < len) { out[idx] = static_cast<OutType>(in1[idx] + in2[idx]); }
+  if (idx < len) {
+    out[idx] = static_cast<OutType>(in1[idx] + in2[idx]);
+  }
 }
 
 template <typename InType, typename IdxType = int, typename OutType = InType>
-void naiveAdd(OutType* out, const InType* in1, const InType* in2, IdxType len)
-{
+void naiveAdd(OutType *out, const InType *in1, const InType *in2, IdxType len) {
   static const IdxType TPB = 64;
-  IdxType nblks            = raft::ceildiv(len, TPB);
+  IdxType nblks = raft::ceildiv(len, TPB);
   naiveAddKernel<InType, OutType, IdxType><<<nblks, TPB>>>(out, in1, in2, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
@@ -47,8 +48,8 @@ struct BinaryOpInputs {
 };
 
 template <typename InType, typename IdxType = int, typename OutType = InType>
-::std::ostream& operator<<(::std::ostream& os, const BinaryOpInputs<InType, IdxType, OutType>& d)
-{
+::std::ostream &operator<<(::std::ostream &os,
+                           const BinaryOpInputs<InType, IdxType, OutType> &d) {
   return os;
 }
 
diff --git a/cpp/test/linalg/cholesky_r1.cu b/cpp/test/linalg/cholesky_r1.cu
index 5bbe3166cf..00236d53fa 100644
--- a/cpp/test/linalg/cholesky_r1.cu
+++ b/cpp/test/linalg/cholesky_r1.cu
@@ -36,8 +36,7 @@ class CholeskyR1Test : public ::testing::Test {
       L(allocator, handle.get_stream(), n_rows * n_rows),
       L_exp(allocator, handle.get_stream(), n_rows * n_rows),
       devInfo(allocator, handle.get_stream(), 1),
-      workspace(allocator, handle.get_stream())
-  {
+      workspace(allocator, handle.get_stream()) {
     CUDA_CHECK(cudaStreamCreate(&stream));
     handle.set_stream(stream);
     raft::update_device(G.data(), G_host, n_rows * n_rows, stream);
@@ -49,58 +48,55 @@ class CholeskyR1Test : public ::testing::Test {
     int n_bytes = 0;
     // Initializing in CUBLAS_FILL_MODE_LOWER, because that has larger workspace
     // requirements.
-    raft::linalg::choleskyRank1Update(
-      handle, L.data(), n_rows, n_rows, nullptr, &n_bytes, CUBLAS_FILL_MODE_LOWER, stream);
+    raft::linalg::choleskyRank1Update(handle, L.data(), n_rows, n_rows, nullptr,
+                                      &n_bytes, CUBLAS_FILL_MODE_LOWER, stream);
     Lwork = std::max(Lwork * sizeof(math_t), (size_t)n_bytes);
     workspace.resize(Lwork, stream);
   }
 
   void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); }
 
-  void testR1Update()
-  {
+  void testR1Update() {
     int n = n_rows * n_rows;
-    std::vector<cublasFillMode_t> fillmode{CUBLAS_FILL_MODE_LOWER, CUBLAS_FILL_MODE_UPPER};
+    std::vector<cublasFillMode_t> fillmode{CUBLAS_FILL_MODE_LOWER,
+                                           CUBLAS_FILL_MODE_UPPER};
     for (auto uplo : fillmode) {
       raft::copy(L.data(), G.data(), n, stream);
       for (int rank = 1; rank <= n_rows; rank++) {
         std::stringstream ss;
-        ss << "Rank " << rank << ((uplo == CUBLAS_FILL_MODE_LOWER) ? ", lower" : ", upper");
+        ss << "Rank " << rank
+           << ((uplo == CUBLAS_FILL_MODE_LOWER) ? ", lower" : ", upper");
         SCOPED_TRACE(ss.str());
 
         // Expected solution using Cholesky factorization from scratch
         raft::copy(L_exp.data(), G.data(), n, stream);
-        CUSOLVER_CHECK(raft::linalg::cusolverDnpotrf(solver_handle,
-                                                     uplo,
-                                                     rank,
-                                                     L_exp.data(),
-                                                     n_rows,
-                                                     (math_t*)workspace.data(),
-                                                     Lwork,
-                                                     devInfo.data(),
-                                                     stream));
+        CUSOLVER_CHECK(raft::linalg::cusolverDnpotrf(
+          solver_handle, uplo, rank, L_exp.data(), n_rows,
+          (math_t*)workspace.data(), Lwork, devInfo.data(), stream));
 
         // Incremental Cholesky factorization using rank one updates.
-        raft::linalg::choleskyRank1Update(
-          handle, L.data(), rank, n_rows, workspace.data(), &Lwork, uplo, stream);
+        raft::linalg::choleskyRank1Update(handle, L.data(), rank, n_rows,
+                                          workspace.data(), &Lwork, uplo,
+                                          stream);
 
-        ASSERT_TRUE(raft::devArrMatch(
-          L_exp.data(), L.data(), n_rows * rank, raft::CompareApprox<math_t>(3e-3)));
+        ASSERT_TRUE(raft::devArrMatch(L_exp.data(), L.data(), n_rows * rank,
+                                      raft::CompareApprox<math_t>(3e-3)));
       }
     }
   }
 
-  void testR1Error()
-  {
+  void testR1Error() {
     raft::update_device(G.data(), G2_host, 4, stream);
-    std::vector<cublasFillMode_t> fillmode{CUBLAS_FILL_MODE_LOWER, CUBLAS_FILL_MODE_UPPER};
+    std::vector<cublasFillMode_t> fillmode{CUBLAS_FILL_MODE_LOWER,
+                                           CUBLAS_FILL_MODE_UPPER};
     for (auto uplo : fillmode) {
       raft::copy(L.data(), G.data(), 4, stream);
       ASSERT_NO_THROW(raft::linalg::choleskyRank1Update(
         handle, L.data(), 1, 2, workspace.data(), &Lwork, uplo, stream));
-      ASSERT_THROW(raft::linalg::choleskyRank1Update(
-                     handle, L.data(), 2, 2, workspace.data(), &Lwork, uplo, stream),
-                   raft::exception);
+      ASSERT_THROW(
+        raft::linalg::choleskyRank1Update(
+          handle, L.data(), 2, 2, workspace.data(), &Lwork, uplo, stream),
+        raft::exception);
 
       math_t eps = std::numeric_limits<math_t>::epsilon();
       ASSERT_NO_THROW(raft::linalg::choleskyRank1Update(
diff --git a/cpp/test/linalg/coalesced_reduction.cu b/cpp/test/linalg/coalesced_reduction.cu
index 2760d522bc..e45f5651b4 100644
--- a/cpp/test/linalg/coalesced_reduction.cu
+++ b/cpp/test/linalg/coalesced_reduction.cu
@@ -33,8 +33,8 @@ struct coalescedReductionInputs {
 };
 
 template <typename T>
-::std::ostream& operator<<(::std::ostream& os, const coalescedReductionInputs<T>& dims)
-{
+::std::ostream &operator<<(::std::ostream &os,
+                           const coalescedReductionInputs<T> &dims) {
   return os;
 }
 
@@ -42,18 +42,17 @@ template <typename T>
 // for an extended __device__ lambda cannot have private or protected access
 // within its class
 template <typename T>
-void coalescedReductionLaunch(
-  T* dots, const T* data, int cols, int rows, cudaStream_t stream, bool inplace = false)
-{
-  coalescedReduction(
-    dots, data, cols, rows, (T)0, stream, inplace, [] __device__(T in, int i) { return in * in; });
+void coalescedReductionLaunch(T *dots, const T *data, int cols, int rows,
+                              cudaStream_t stream, bool inplace = false) {
+  coalescedReduction(dots, data, cols, rows, (T)0, stream, inplace,
+                     [] __device__(T in, int i) { return in * in; });
 }
 
 template <typename T>
-class coalescedReductionTest : public ::testing::TestWithParam<coalescedReductionInputs<T>> {
+class coalescedReductionTest
+  : public ::testing::TestWithParam<coalescedReductionInputs<T>> {
  protected:
-  void SetUp() override
-  {
+  void SetUp() override {
     params = ::testing::TestWithParam<coalescedReductionInputs<T>>::GetParam();
     raft::random::Rng r(params.seed);
     int rows = params.rows, cols = params.cols;
@@ -74,8 +73,7 @@ class coalescedReductionTest : public ::testing::TestWithParam<coalescedReductio
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(data));
     CUDA_CHECK(cudaFree(dots_exp));
     CUDA_CHECK(cudaFree(dots_act));
@@ -86,36 +84,34 @@ class coalescedReductionTest : public ::testing::TestWithParam<coalescedReductio
   T *data, *dots_exp, *dots_act;
 };
 
-const std::vector<coalescedReductionInputs<float>> inputsf = {{0.000002f, 1024, 32, 1234ULL},
-                                                              {0.000002f, 1024, 64, 1234ULL},
-                                                              {0.000002f, 1024, 128, 1234ULL},
-                                                              {0.000002f, 1024, 256, 1234ULL}};
+const std::vector<coalescedReductionInputs<float>> inputsf = {
+  {0.000002f, 1024, 32, 1234ULL},
+  {0.000002f, 1024, 64, 1234ULL},
+  {0.000002f, 1024, 128, 1234ULL},
+  {0.000002f, 1024, 256, 1234ULL}};
 
-const std::vector<coalescedReductionInputs<double>> inputsd = {{0.000000001, 1024, 32, 1234ULL},
-                                                               {0.000000001, 1024, 64, 1234ULL},
-                                                               {0.000000001, 1024, 128, 1234ULL},
-                                                               {0.000000001, 1024, 256, 1234ULL}};
+const std::vector<coalescedReductionInputs<double>> inputsd = {
+  {0.000000001, 1024, 32, 1234ULL},
+  {0.000000001, 1024, 64, 1234ULL},
+  {0.000000001, 1024, 128, 1234ULL},
+  {0.000000001, 1024, 256, 1234ULL}};
 
 typedef coalescedReductionTest<float> coalescedReductionTestF;
-TEST_P(coalescedReductionTestF, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(
-    dots_exp, dots_act, params.rows, raft::CompareApprox<float>(params.tolerance)));
+TEST_P(coalescedReductionTestF, Result) {
+  ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.rows,
+                                raft::CompareApprox<float>(params.tolerance)));
 }
 
 typedef coalescedReductionTest<double> coalescedReductionTestD;
-TEST_P(coalescedReductionTestD, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(
-    dots_exp, dots_act, params.rows, raft::CompareApprox<double>(params.tolerance)));
+TEST_P(coalescedReductionTestD, Result) {
+  ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.rows,
+                                raft::CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_CASE_P(coalescedReductionTests,
-                        coalescedReductionTestF,
+INSTANTIATE_TEST_CASE_P(coalescedReductionTests, coalescedReductionTestF,
                         ::testing::ValuesIn(inputsf));
 
-INSTANTIATE_TEST_CASE_P(coalescedReductionTests,
-                        coalescedReductionTestD,
+INSTANTIATE_TEST_CASE_P(coalescedReductionTests, coalescedReductionTestD,
                         ::testing::ValuesIn(inputsd));
 
 }  // end namespace linalg
diff --git a/cpp/test/linalg/divide.cu b/cpp/test/linalg/divide.cu
index d8995ffa0a..2396558939 100644
--- a/cpp/test/linalg/divide.cu
+++ b/cpp/test/linalg/divide.cu
@@ -25,27 +25,30 @@ namespace raft {
 namespace linalg {
 
 template <typename Type>
-__global__ void naiveDivideKernel(Type* out, const Type* in, Type scalar, int len)
-{
+__global__ void naiveDivideKernel(Type *out, const Type *in, Type scalar,
+                                  int len) {
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < len) { out[idx] = in[idx] / scalar; }
+  if (idx < len) {
+    out[idx] = in[idx] / scalar;
+  }
 }
 
 template <typename Type>
-void naiveDivide(Type* out, const Type* in, Type scalar, int len, cudaStream_t stream)
-{
+void naiveDivide(Type *out, const Type *in, Type scalar, int len,
+                 cudaStream_t stream) {
   static const int TPB = 64;
-  int nblks            = raft::ceildiv(len, TPB);
+  int nblks = raft::ceildiv(len, TPB);
   naiveDivideKernel<Type><<<nblks, TPB, 0, stream>>>(out, in, scalar, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
 template <typename T>
-class DivideTest : public ::testing::TestWithParam<raft::linalg::UnaryOpInputs<T>> {
+class DivideTest
+  : public ::testing::TestWithParam<raft::linalg::UnaryOpInputs<T>> {
  protected:
-  void SetUp() override
-  {
-    params = ::testing::TestWithParam<raft::linalg::UnaryOpInputs<T>>::GetParam();
+  void SetUp() override {
+    params =
+      ::testing::TestWithParam<raft::linalg::UnaryOpInputs<T>>::GetParam();
     raft::random::Rng r(params.seed);
     int len = params.len;
     cudaStream_t stream;
@@ -60,8 +63,7 @@ class DivideTest : public ::testing::TestWithParam<raft::linalg::UnaryOpInputs<T
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(in));
     CUDA_CHECK(cudaFree(out_ref));
     CUDA_CHECK(cudaFree(out));
@@ -72,21 +74,25 @@ class DivideTest : public ::testing::TestWithParam<raft::linalg::UnaryOpInputs<T
   T *in, *out_ref, *out;
 };
 
-const std::vector<UnaryOpInputs<float>> inputsf = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}};
+const std::vector<UnaryOpInputs<float>> inputsf = {
+  {0.000001f, 1024 * 1024, 2.f, 1234ULL}};
 typedef DivideTest<float> DivideTestF;
-TEST_P(DivideTestF, Result)
-{
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, raft::CompareApprox<float>(params.tolerance)));
+TEST_P(DivideTestF, Result) {
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
+                          raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestF, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestF,
+                         ::testing::ValuesIn(inputsf));
 
 typedef DivideTest<double> DivideTestD;
-const std::vector<UnaryOpInputs<double>> inputsd = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}};
-TEST_P(DivideTestD, Result)
-{
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, raft::CompareApprox<double>(params.tolerance)));
+const std::vector<UnaryOpInputs<double>> inputsd = {
+  {0.000001f, 1024 * 1024, 2.f, 1234ULL}};
+TEST_P(DivideTestD, Result) {
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
+                          raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestD, ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestD,
+                         ::testing::ValuesIn(inputsd));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/eig.cu b/cpp/test/linalg/eig.cu
index 5cad657dab..159d288174 100644
--- a/cpp/test/linalg/eig.cu
+++ b/cpp/test/linalg/eig.cu
@@ -35,16 +35,14 @@ struct EigInputs {
 };
 
 template <typename T>
-::std::ostream& operator<<(::std::ostream& os, const EigInputs<T>& dims)
-{
+::std::ostream &operator<<(::std::ostream &os, const EigInputs<T> &dims) {
   return os;
 }
 
 template <typename T>
 class EigTest : public ::testing::TestWithParam<EigInputs<T>> {
  protected:
-  void SetUp() override
-  {
+  void SetUp() override {
     raft::handle_t handle;
     stream = handle.get_stream();
 
@@ -53,8 +51,8 @@ class EigTest : public ::testing::TestWithParam<EigInputs<T>> {
     int len = params.len;
 
     raft::allocate(cov_matrix, len);
-    T cov_matrix_h[] = {
-      1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0};
+    T cov_matrix_h[] = {1.0,  0.9, 0.81, 0.729, 0.9,   1.0,  0.9, 0.81,
+                        0.81, 0.9, 1.0,  0.9,   0.729, 0.81, 0.9, 1.0};
     ASSERT(len == 16, "This test only works with 4x4 matrices!");
     raft::update_device(cov_matrix, cov_matrix_h, len, stream);
 
@@ -63,23 +61,10 @@ class EigTest : public ::testing::TestWithParam<EigInputs<T>> {
     raft::allocate(eig_vectors_jacobi, len);
     raft::allocate(eig_vals_jacobi, params.n_col);
 
-    T eig_vectors_ref_h[] = {0.2790,
-                             -0.6498,
-                             0.6498,
-                             -0.2789,
-                             -0.5123,
-                             0.4874,
-                             0.4874,
-                             -0.5123,
-                             0.6498,
-                             0.2789,
-                             -0.2789,
-                             -0.6498,
-                             0.4874,
-                             0.5123,
-                             0.5123,
-                             0.4874};
-    T eig_vals_ref_h[]    = {0.0614, 0.1024, 0.3096, 3.5266};
+    T eig_vectors_ref_h[] = {0.2790, -0.6498, 0.6498, -0.2789, -0.5123, 0.4874,
+                             0.4874, -0.5123, 0.6498, 0.2789,  -0.2789, -0.6498,
+                             0.4874, 0.5123,  0.5123, 0.4874};
+    T eig_vals_ref_h[] = {0.0614, 0.1024, 0.3096, 3.5266};
 
     raft::allocate(eig_vectors_ref, len);
     raft::allocate(eig_vals_ref, params.n_col);
@@ -87,19 +72,13 @@ class EigTest : public ::testing::TestWithParam<EigInputs<T>> {
     raft::update_device(eig_vectors_ref, eig_vectors_ref_h, len, stream);
     raft::update_device(eig_vals_ref, eig_vals_ref_h, params.n_col, stream);
 
-    eigDC(handle, cov_matrix, params.n_row, params.n_col, eig_vectors, eig_vals, stream);
+    eigDC(handle, cov_matrix, params.n_row, params.n_col, eig_vectors, eig_vals,
+          stream);
 
-    T tol      = 1.e-7;
+    T tol = 1.e-7;
     int sweeps = 15;
-    eigJacobi(handle,
-              cov_matrix,
-              params.n_row,
-              params.n_col,
-              eig_vectors_jacobi,
-              eig_vals_jacobi,
-              stream,
-              tol,
-              sweeps);
+    eigJacobi(handle, cov_matrix, params.n_row, params.n_col,
+              eig_vectors_jacobi, eig_vals_jacobi, stream, tol, sweeps);
 
     // test code for comparing two methods
     len = params.n * params.n;
@@ -111,20 +90,14 @@ class EigTest : public ::testing::TestWithParam<EigInputs<T>> {
 
     r.uniform(cov_matrix_large, len, T(-1.0), T(1.0), stream);
 
-    eigDC(handle, cov_matrix_large, params.n, params.n, eig_vectors_large, eig_vals_large, stream);
-    eigJacobi(handle,
-              cov_matrix_large,
-              params.n,
-              params.n,
-              eig_vectors_jacobi_large,
-              eig_vals_jacobi_large,
-              stream,
-              tol,
+    eigDC(handle, cov_matrix_large, params.n, params.n, eig_vectors_large,
+          eig_vals_large, stream);
+    eigJacobi(handle, cov_matrix_large, params.n, params.n,
+              eig_vectors_jacobi_large, eig_vals_jacobi_large, stream, tol,
               sweeps);
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(cov_matrix));
     CUDA_CHECK(cudaFree(eig_vectors));
     CUDA_CHECK(cudaFree(eig_vectors_jacobi));
@@ -136,95 +109,89 @@ class EigTest : public ::testing::TestWithParam<EigInputs<T>> {
 
  protected:
   EigInputs<T> params;
-  T *cov_matrix, *eig_vectors, *eig_vectors_jacobi, *eig_vectors_ref, *eig_vals, *eig_vals_jacobi,
-    *eig_vals_ref;
+  T *cov_matrix, *eig_vectors, *eig_vectors_jacobi, *eig_vectors_ref, *eig_vals,
+    *eig_vals_jacobi, *eig_vals_ref;
 
-  T *cov_matrix_large, *eig_vectors_large, *eig_vectors_jacobi_large, *eig_vals_large,
-    *eig_vals_jacobi_large;
+  T *cov_matrix_large, *eig_vectors_large, *eig_vectors_jacobi_large,
+    *eig_vals_large, *eig_vals_jacobi_large;
 
   cudaStream_t stream;
 };
 
-const std::vector<EigInputs<float>> inputsf2 = {{0.001f, 4 * 4, 4, 4, 1234ULL, 256}};
+const std::vector<EigInputs<float>> inputsf2 = {
+  {0.001f, 4 * 4, 4, 4, 1234ULL, 256}};
 
-const std::vector<EigInputs<double>> inputsd2 = {{0.001, 4 * 4, 4, 4, 1234ULL, 256}};
+const std::vector<EigInputs<double>> inputsd2 = {
+  {0.001, 4 * 4, 4, 4, 1234ULL, 256}};
 
 typedef EigTest<float> EigTestValF;
-TEST_P(EigTestValF, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(
-    eig_vals_ref, eig_vals, params.n_col, raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(EigTestValF, Result) {
+  ASSERT_TRUE(
+    raft::devArrMatch(eig_vals_ref, eig_vals, params.n_col,
+                      raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef EigTest<double> EigTestValD;
-TEST_P(EigTestValD, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(
-    eig_vals_ref, eig_vals, params.n_col, raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(EigTestValD, Result) {
+  ASSERT_TRUE(
+    raft::devArrMatch(eig_vals_ref, eig_vals, params.n_col,
+                      raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
 typedef EigTest<float> EigTestVecF;
-TEST_P(EigTestVecF, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(
-    eig_vectors_ref, eig_vectors, params.len, raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(EigTestVecF, Result) {
+  ASSERT_TRUE(
+    raft::devArrMatch(eig_vectors_ref, eig_vectors, params.len,
+                      raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef EigTest<double> EigTestVecD;
-TEST_P(EigTestVecD, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(
-    eig_vectors_ref, eig_vectors, params.len, raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(EigTestVecD, Result) {
+  ASSERT_TRUE(
+    raft::devArrMatch(eig_vectors_ref, eig_vectors, params.len,
+                      raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
 typedef EigTest<float> EigTestValJacobiF;
-TEST_P(EigTestValJacobiF, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(
-    eig_vals_ref, eig_vals_jacobi, params.n_col, raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(EigTestValJacobiF, Result) {
+  ASSERT_TRUE(
+    raft::devArrMatch(eig_vals_ref, eig_vals_jacobi, params.n_col,
+                      raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef EigTest<double> EigTestValJacobiD;
-TEST_P(EigTestValJacobiD, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(
-    eig_vals_ref, eig_vals_jacobi, params.n_col, raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(EigTestValJacobiD, Result) {
+  ASSERT_TRUE(
+    raft::devArrMatch(eig_vals_ref, eig_vals_jacobi, params.n_col,
+                      raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
 typedef EigTest<float> EigTestVecJacobiF;
-TEST_P(EigTestVecJacobiF, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(eig_vectors_ref,
-                                eig_vectors_jacobi,
-                                params.len,
-                                raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(EigTestVecJacobiF, Result) {
+  ASSERT_TRUE(
+    raft::devArrMatch(eig_vectors_ref, eig_vectors_jacobi, params.len,
+                      raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef EigTest<double> EigTestVecJacobiD;
-TEST_P(EigTestVecJacobiD, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(eig_vectors_ref,
-                                eig_vectors_jacobi,
-                                params.len,
-                                raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(EigTestVecJacobiD, Result) {
+  ASSERT_TRUE(
+    raft::devArrMatch(eig_vectors_ref, eig_vectors_jacobi, params.len,
+                      raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
 typedef EigTest<float> EigTestVecCompareF;
-TEST_P(EigTestVecCompareF, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(eig_vectors_large,
-                                eig_vectors_jacobi_large,
-                                (params.n * params.n),
-                                raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(EigTestVecCompareF, Result) {
+  ASSERT_TRUE(raft::devArrMatch(
+    eig_vectors_large, eig_vectors_jacobi_large, (params.n * params.n),
+    raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef EigTest<double> EigTestVecCompareD;
-TEST_P(EigTestVecCompareD, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(eig_vectors_large,
-                                eig_vectors_jacobi_large,
-                                (params.n * params.n),
-                                raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(EigTestVecCompareD, Result) {
+  ASSERT_TRUE(raft::devArrMatch(
+    eig_vectors_large, eig_vectors_jacobi_large, (params.n * params.n),
+    raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
 INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValF, ::testing::ValuesIn(inputsf2));
@@ -235,13 +202,17 @@ INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecF, ::testing::ValuesIn(inputsf2));
 
 INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecD, ::testing::ValuesIn(inputsd2));
 
-INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiF, ::testing::ValuesIn(inputsf2));
+INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiF,
+                         ::testing::ValuesIn(inputsf2));
 
-INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiD, ::testing::ValuesIn(inputsd2));
+INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiD,
+                         ::testing::ValuesIn(inputsd2));
 
-INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiF, ::testing::ValuesIn(inputsf2));
+INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiF,
+                         ::testing::ValuesIn(inputsf2));
 
-INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiD, ::testing::ValuesIn(inputsd2));
+INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiD,
+                         ::testing::ValuesIn(inputsd2));
 
 }  // namespace linalg
 }  // namespace raft
diff --git a/cpp/test/linalg/eig_sel.cu b/cpp/test/linalg/eig_sel.cu
index b3cfb19174..b3980f281d 100644
--- a/cpp/test/linalg/eig_sel.cu
+++ b/cpp/test/linalg/eig_sel.cu
@@ -37,44 +37,32 @@ struct EigSelInputs {
 };
 
 template <typename T>
-::std::ostream& operator<<(::std::ostream& os, const EigSelInputs<T>& dims)
-{
+::std::ostream &operator<<(::std::ostream &os, const EigSelInputs<T> &dims) {
   return os;
 }
 
 template <typename T>
 class EigSelTest : public ::testing::TestWithParam<EigSelInputs<T>> {
  protected:
-  void SetUp() override
-  {
+  void SetUp() override {
     raft::handle_t handle;
     stream = handle.get_stream();
 
-    params  = ::testing::TestWithParam<EigSelInputs<T>>::GetParam();
+    params = ::testing::TestWithParam<EigSelInputs<T>>::GetParam();
     int len = params.len;
 
     raft::allocate(cov_matrix, len);
-    T cov_matrix_h[] = {
-      1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0};
+    T cov_matrix_h[] = {1.0,  0.9, 0.81, 0.729, 0.9,   1.0,  0.9, 0.81,
+                        0.81, 0.9, 1.0,  0.9,   0.729, 0.81, 0.9, 1.0};
     ASSERT(len == 16, "This test only works with 4x4 matrices!");
     raft::update_device(cov_matrix, cov_matrix_h, len, stream);
 
     raft::allocate(eig_vectors, 12);
     raft::allocate(eig_vals, params.n_col);
 
-    T eig_vectors_ref_h[] = {-0.5123,
-                             0.4874,
-                             0.4874,
-                             -0.5123,
-                             0.6498,
-                             0.2789,
-                             -0.2789,
-                             -0.6498,
-                             0.4874,
-                             0.5123,
-                             0.5123,
-                             0.4874};
-    T eig_vals_ref_h[]    = {0.1024, 0.3096, 3.5266, 3.5266};
+    T eig_vectors_ref_h[] = {-0.5123, 0.4874,  0.4874, -0.5123, 0.6498, 0.2789,
+                             -0.2789, -0.6498, 0.4874, 0.5123,  0.5123, 0.4874};
+    T eig_vals_ref_h[] = {0.1024, 0.3096, 3.5266, 3.5266};
 
     raft::allocate(eig_vectors_ref, 12);
     raft::allocate(eig_vals_ref, params.n_col);
@@ -82,19 +70,11 @@ class EigSelTest : public ::testing::TestWithParam<EigSelInputs<T>> {
     raft::update_device(eig_vectors_ref, eig_vectors_ref_h, 12, stream);
     raft::update_device(eig_vals_ref, eig_vals_ref_h, 4, stream);
 
-    eigSelDC(handle,
-             cov_matrix,
-             params.n_row,
-             params.n_col,
-             3,
-             eig_vectors,
-             eig_vals,
-             EigVecMemUsage::OVERWRITE_INPUT,
-             stream);
+    eigSelDC(handle, cov_matrix, params.n_row, params.n_col, 3, eig_vectors,
+             eig_vals, EigVecMemUsage::OVERWRITE_INPUT, stream);
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(cov_matrix));
     CUDA_CHECK(cudaFree(eig_vectors));
     CUDA_CHECK(cudaFree(eig_vals));
@@ -109,45 +89,51 @@ class EigSelTest : public ::testing::TestWithParam<EigSelInputs<T>> {
   cudaStream_t stream;
 };
 
-const std::vector<EigSelInputs<float>> inputsf2 = {{0.001f, 4 * 4, 4, 4, 1234ULL, 256}};
+const std::vector<EigSelInputs<float>> inputsf2 = {
+  {0.001f, 4 * 4, 4, 4, 1234ULL, 256}};
 
-const std::vector<EigSelInputs<double>> inputsd2 = {{0.001, 4 * 4, 4, 4, 1234ULL, 256}};
+const std::vector<EigSelInputs<double>> inputsd2 = {
+  {0.001, 4 * 4, 4, 4, 1234ULL, 256}};
 
 typedef EigSelTest<float> EigSelTestValF;
-TEST_P(EigSelTestValF, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(
-    eig_vals_ref, eig_vals, params.n_col, raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(EigSelTestValF, Result) {
+  ASSERT_TRUE(
+    raft::devArrMatch(eig_vals_ref, eig_vals, params.n_col,
+                      raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef EigSelTest<double> EigSelTestValD;
-TEST_P(EigSelTestValD, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(
-    eig_vals_ref, eig_vals, params.n_col, raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(EigSelTestValD, Result) {
+  ASSERT_TRUE(
+    raft::devArrMatch(eig_vals_ref, eig_vals, params.n_col,
+                      raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
 typedef EigSelTest<float> EigSelTestVecF;
-TEST_P(EigSelTestVecF, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(
-    eig_vectors_ref, eig_vectors, 12, raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(EigSelTestVecF, Result) {
+  ASSERT_TRUE(
+    raft::devArrMatch(eig_vectors_ref, eig_vectors, 12,
+                      raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef EigSelTest<double> EigSelTestVecD;
-TEST_P(EigSelTestVecD, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(
-    eig_vectors_ref, eig_vectors, 12, raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(EigSelTestVecD, Result) {
+  ASSERT_TRUE(
+    raft::devArrMatch(eig_vectors_ref, eig_vectors, 12,
+                      raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValF, ::testing::ValuesIn(inputsf2));
+INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValF,
+                         ::testing::ValuesIn(inputsf2));
 
-INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValD, ::testing::ValuesIn(inputsd2));
+INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValD,
+                         ::testing::ValuesIn(inputsd2));
 
-INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecF, ::testing::ValuesIn(inputsf2));
+INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecF,
+                         ::testing::ValuesIn(inputsf2));
 
-INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecD, ::testing::ValuesIn(inputsd2));
+INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecD,
+                         ::testing::ValuesIn(inputsd2));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/eltwise.cu b/cpp/test/linalg/eltwise.cu
index f0e04403e8..572951c557 100644
--- a/cpp/test/linalg/eltwise.cu
+++ b/cpp/test/linalg/eltwise.cu
@@ -26,17 +26,19 @@ namespace linalg {
 //// Testing unary ops
 
 template <typename Type>
-__global__ void naiveScaleKernel(Type* out, const Type* in, Type scalar, int len)
-{
+__global__ void naiveScaleKernel(Type *out, const Type *in, Type scalar,
+                                 int len) {
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < len) { out[idx] = scalar * in[idx]; }
+  if (idx < len) {
+    out[idx] = scalar * in[idx];
+  }
 }
 
 template <typename Type>
-void naiveScale(Type* out, const Type* in, Type scalar, int len, cudaStream_t stream)
-{
+void naiveScale(Type *out, const Type *in, Type scalar, int len,
+                cudaStream_t stream) {
   static const int TPB = 64;
-  int nblks            = raft::ceildiv(len, TPB);
+  int nblks = raft::ceildiv(len, TPB);
   naiveScaleKernel<Type><<<nblks, TPB, 0, stream>>>(out, in, scalar, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
@@ -50,19 +52,19 @@ struct ScalarMultiplyInputs {
 };
 
 template <typename T>
-::std::ostream& operator<<(::std::ostream& os, const ScalarMultiplyInputs<T>& dims)
-{
+::std::ostream &operator<<(::std::ostream &os,
+                           const ScalarMultiplyInputs<T> &dims) {
   return os;
 }
 
 template <typename T>
-class ScalarMultiplyTest : public ::testing::TestWithParam<ScalarMultiplyInputs<T>> {
+class ScalarMultiplyTest
+  : public ::testing::TestWithParam<ScalarMultiplyInputs<T>> {
  protected:
-  void SetUp() override
-  {
+  void SetUp() override {
     params = ::testing::TestWithParam<ScalarMultiplyInputs<T>>::GetParam();
     raft::random::Rng r(params.seed);
-    int len  = params.len;
+    int len = params.len;
     T scalar = params.scalar;
 
     cudaStream_t stream;
@@ -76,8 +78,7 @@ class ScalarMultiplyTest : public ::testing::TestWithParam<ScalarMultiplyInputs<
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(in));
     CUDA_CHECK(cudaFree(out_ref));
     CUDA_CHECK(cudaFree(out));
@@ -88,41 +89,46 @@ class ScalarMultiplyTest : public ::testing::TestWithParam<ScalarMultiplyInputs<
   T *in, *out_ref, *out;
 };
 
-const std::vector<ScalarMultiplyInputs<float>> inputsf1 = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}};
+const std::vector<ScalarMultiplyInputs<float>> inputsf1 = {
+  {0.000001f, 1024 * 1024, 2.f, 1234ULL}};
 
 const std::vector<ScalarMultiplyInputs<double>> inputsd1 = {
   {0.00000001, 1024 * 1024, 2.0, 1234ULL}};
 
 typedef ScalarMultiplyTest<float> ScalarMultiplyTestF;
-TEST_P(ScalarMultiplyTestF, Result)
-{
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<float>(params.tolerance)));
+TEST_P(ScalarMultiplyTestF, Result) {
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
+                          CompareApprox<float>(params.tolerance)));
 }
 
 typedef ScalarMultiplyTest<double> ScalarMultiplyTestD;
-TEST_P(ScalarMultiplyTestD, Result)
-{
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<double>(params.tolerance)));
+TEST_P(ScalarMultiplyTestD, Result) {
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
+                          CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestF, ::testing::ValuesIn(inputsf1));
+INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestF,
+                         ::testing::ValuesIn(inputsf1));
 
-INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestD, ::testing::ValuesIn(inputsd1));
+INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestD,
+                         ::testing::ValuesIn(inputsd1));
 
 //// Testing binary ops
 
 template <typename Type>
-__global__ void naiveAddKernel(Type* out, const Type* in1, const Type* in2, int len)
-{
+__global__ void naiveAddKernel(Type *out, const Type *in1, const Type *in2,
+                               int len) {
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < len) { out[idx] = in1[idx] + in2[idx]; }
+  if (idx < len) {
+    out[idx] = in1[idx] + in2[idx];
+  }
 }
 
 template <typename Type>
-void naiveAdd(Type* out, const Type* in1, const Type* in2, int len, cudaStream_t stream)
-{
+void naiveAdd(Type *out, const Type *in1, const Type *in2, int len,
+              cudaStream_t stream) {
   static const int TPB = 64;
-  int nblks            = raft::ceildiv(len, TPB);
+  int nblks = raft::ceildiv(len, TPB);
   naiveAddKernel<Type><<<nblks, TPB, 0, stream>>>(out, in1, in2, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
@@ -135,16 +141,15 @@ struct EltwiseAddInputs {
 };
 
 template <typename T>
-::std::ostream& operator<<(::std::ostream& os, const EltwiseAddInputs<T>& dims)
-{
+::std::ostream &operator<<(::std::ostream &os,
+                           const EltwiseAddInputs<T> &dims) {
   return os;
 }
 
 template <typename T>
 class EltwiseAddTest : public ::testing::TestWithParam<EltwiseAddInputs<T>> {
  protected:
-  void SetUp() override
-  {
+  void SetUp() override {
     params = ::testing::TestWithParam<EltwiseAddInputs<T>>::GetParam();
     raft::random::Rng r(params.seed);
 
@@ -162,8 +167,7 @@ class EltwiseAddTest : public ::testing::TestWithParam<EltwiseAddInputs<T>> {
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(in1));
     CUDA_CHECK(cudaFree(in2));
     CUDA_CHECK(cudaFree(out_ref));
@@ -175,25 +179,29 @@ class EltwiseAddTest : public ::testing::TestWithParam<EltwiseAddInputs<T>> {
   T *in1, *in2, *out_ref, *out;
 };
 
-const std::vector<EltwiseAddInputs<float>> inputsf2 = {{0.000001f, 1024 * 1024, 1234ULL}};
+const std::vector<EltwiseAddInputs<float>> inputsf2 = {
+  {0.000001f, 1024 * 1024, 1234ULL}};
 
-const std::vector<EltwiseAddInputs<double>> inputsd2 = {{0.00000001, 1024 * 1024, 1234ULL}};
+const std::vector<EltwiseAddInputs<double>> inputsd2 = {
+  {0.00000001, 1024 * 1024, 1234ULL}};
 
 typedef EltwiseAddTest<float> EltwiseAddTestF;
-TEST_P(EltwiseAddTestF, Result)
-{
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<float>(params.tolerance)));
+TEST_P(EltwiseAddTestF, Result) {
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
+                          CompareApprox<float>(params.tolerance)));
 }
 
 typedef EltwiseAddTest<double> EltwiseAddTestD;
-TEST_P(EltwiseAddTestD, Result)
-{
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<double>(params.tolerance)));
+TEST_P(EltwiseAddTestD, Result) {
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
+                          CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestF, ::testing::ValuesIn(inputsf2));
+INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestF,
+                         ::testing::ValuesIn(inputsf2));
 
-INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestD, ::testing::ValuesIn(inputsd2));
+INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestD,
+                         ::testing::ValuesIn(inputsd2));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/gemm_layout.cu b/cpp/test/linalg/gemm_layout.cu
index e95dbbc502..cecfc5eb8e 100644
--- a/cpp/test/linalg/gemm_layout.cu
+++ b/cpp/test/linalg/gemm_layout.cu
@@ -36,9 +36,9 @@ struct GemmLayoutInputs {
 
 // Reference GEMM implementation.
 template <typename T>
-__global__ void naiveGemm(
-  T* Z, T* X, T* Y, int M, int N, int K, bool isZColMajor, bool isXColMajor, bool isYColMajor)
-{
+__global__ void naiveGemm(T *Z, T *X, T *Y, int M, int N, int K,
+                          bool isZColMajor, bool isXColMajor,
+                          bool isYColMajor) {
   int tidx = blockIdx.x * blockDim.x + threadIdx.x;
   int tidy = blockIdx.y * blockDim.y + threadIdx.y;
 
@@ -51,7 +51,7 @@ __global__ void naiveGemm(
         temp += X[xIndex] * Y[yIndex];
       }
       int zIndex = isZColMajor ? m + n * M : m * N + n;
-      Z[zIndex]  = temp;
+      Z[zIndex] = temp;
     }
   }
 }
@@ -59,8 +59,7 @@ __global__ void naiveGemm(
 template <typename T>
 class GemmLayoutTest : public ::testing::TestWithParam<GemmLayoutInputs<T>> {
  protected:
-  void SetUp() override
-  {
+  void SetUp() override {
     params = ::testing::TestWithParam<GemmLayoutInputs<T>>::GetParam();
 
     raft::handle_t handle;
@@ -73,8 +72,8 @@ class GemmLayoutTest : public ::testing::TestWithParam<GemmLayoutInputs<T>> {
     // Dimensions of Y : K x N
     // Dimensions of Z : M x N
 
-    T* X = NULL;  // Argument X
-    T* Y = NULL;  // Argument Y
+    T *X = NULL;  // Argument X
+    T *Y = NULL;  // Argument Y
 
     size_t xElems = params.M * params.K;
     size_t yElems = params.K * params.N;
@@ -88,35 +87,27 @@ class GemmLayoutTest : public ::testing::TestWithParam<GemmLayoutInputs<T>> {
     r.uniform(X, xElems, T(-10.0), T(10.0), stream);
     r.uniform(Y, yElems, T(-10.0), T(10.0), stream);
 
-    dim3 blocks(raft::ceildiv<int>(params.M, 128), raft::ceildiv<int>(params.N, 4), 1);
+    dim3 blocks(raft::ceildiv<int>(params.M, 128),
+                raft::ceildiv<int>(params.N, 4), 1);
     dim3 threads(128, 4, 1);
 
-    naiveGemm<<<blocks, threads>>>(
-      refZ, X, Y, params.M, params.N, params.K, params.zLayout, params.xLayout, params.yLayout);
-
-    gemm(handle,
-         Z,
-         X,
-         Y,
-         params.M,
-         params.N,
-         params.K,
-         params.zLayout,
-         params.xLayout,
-         params.yLayout,
-         stream);
+    naiveGemm<<<blocks, threads>>>(refZ, X, Y, params.M, params.N, params.K,
+                                   params.zLayout, params.xLayout,
+                                   params.yLayout);
+
+    gemm(handle, Z, X, Y, params.M, params.N, params.K, params.zLayout,
+         params.xLayout, params.yLayout, stream);
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(refZ));
     CUDA_CHECK(cudaFree(Z));
   }
 
  protected:
   GemmLayoutInputs<T> params;
-  T* refZ = NULL;  // Reference result for comparison
-  T* Z    = NULL;  // Computed result
+  T *refZ = NULL;  // Reference result for comparison
+  T *Z = NULL;     // Computed result
 };
 
 const std::vector<GemmLayoutInputs<float>> inputsf = {
@@ -140,20 +131,22 @@ const std::vector<GemmLayoutInputs<double>> inputsd = {
   {50, 80, 60, false, false, false, 893038ULL}};
 
 typedef GemmLayoutTest<float> GemmLayoutTestF;
-TEST_P(GemmLayoutTestF, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, raft::CompareApprox<float>(1e-4)));
+TEST_P(GemmLayoutTestF, Result) {
+  ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N,
+                                raft::CompareApprox<float>(1e-4)));
 }
 
 typedef GemmLayoutTest<double> GemmLayoutTestD;
-TEST_P(GemmLayoutTestD, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, raft::CompareApprox<float>(1e-6)));
+TEST_P(GemmLayoutTestD, Result) {
+  ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N,
+                                raft::CompareApprox<float>(1e-6)));
 }
 
-INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestF, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestF,
+                         ::testing::ValuesIn(inputsf));
 
-INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestD, ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestD,
+                         ::testing::ValuesIn(inputsd));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/map.cu b/cpp/test/linalg/map.cu
index 0e33d9758f..227bce6a48 100644
--- a/cpp/test/linalg/map.cu
+++ b/cpp/test/linalg/map.cu
@@ -26,22 +26,13 @@ namespace raft {
 namespace linalg {
 
 template <typename InType, typename IdxType, typename OutType>
-void mapLaunch(OutType* out,
-               const InType* in1,
-               const InType* in2,
-               const InType* in3,
-               InType scalar,
-               IdxType len,
-               cudaStream_t stream)
-{
+void mapLaunch(OutType *out, const InType *in1, const InType *in2,
+               const InType *in3, InType scalar, IdxType len,
+               cudaStream_t stream) {
   map(
-    out,
-    len,
+    out, len,
     [=] __device__(InType a, InType b, InType c) { return a + b + c + scalar; },
-    stream,
-    in1,
-    in2,
-    in3);
+    stream, in1, in2, in3);
 }
 
 template <typename InType, typename IdxType = int, typename OutType = InType>
@@ -53,15 +44,10 @@ struct MapInputs {
 };
 
 template <typename InType, typename IdxType, typename OutType = InType>
-void create_ref(OutType* out_ref,
-                const InType* in1,
-                const InType* in2,
-                const InType* in3,
-                InType scalar,
-                IdxType len,
-                cudaStream_t stream)
-{
-  InType* tmp;
+void create_ref(OutType *out_ref, const InType *in1, const InType *in2,
+                const InType *in3, InType scalar, IdxType len,
+                cudaStream_t stream) {
+  InType *tmp;
   allocate(tmp, len);
   eltwiseAdd(tmp, in1, in2, len, stream);
   eltwiseAdd(out_ref, tmp, in3, len, stream);
@@ -70,11 +56,12 @@ void create_ref(OutType* out_ref,
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
-class MapTest : public ::testing::TestWithParam<MapInputs<InType, IdxType, OutType>> {
+class MapTest
+  : public ::testing::TestWithParam<MapInputs<InType, IdxType, OutType>> {
  protected:
-  void SetUp() override
-  {
-    params = ::testing::TestWithParam<MapInputs<InType, IdxType, OutType>>::GetParam();
+  void SetUp() override {
+    params =
+      ::testing::TestWithParam<MapInputs<InType, IdxType, OutType>>::GetParam();
     raft::random::Rng r(params.seed);
 
     cudaStream_t stream;
@@ -94,8 +81,7 @@ class MapTest : public ::testing::TestWithParam<MapInputs<InType, IdxType, OutTy
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(in1));
     CUDA_CHECK(cudaFree(in2));
     CUDA_CHECK(cudaFree(in3));
@@ -109,47 +95,55 @@ class MapTest : public ::testing::TestWithParam<MapInputs<InType, IdxType, OutTy
   OutType *out_ref, *out;
 };
 
-const std::vector<MapInputs<float, int>> inputsf_i32 = {{0.000001f, 1024 * 1024, 1234ULL, 3.2}};
+const std::vector<MapInputs<float, int>> inputsf_i32 = {
+  {0.000001f, 1024 * 1024, 1234ULL, 3.2}};
 typedef MapTest<float, int> MapTestF_i32;
-TEST_P(MapTestF_i32, Result)
-{
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<float>(params.tolerance)));
+TEST_P(MapTestF_i32, Result) {
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
+                          CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32, ::testing::ValuesIn(inputsf_i32));
+INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32,
+                         ::testing::ValuesIn(inputsf_i32));
 
-const std::vector<MapInputs<float, size_t>> inputsf_i64 = {{0.000001f, 1024 * 1024, 1234ULL, 9.4}};
+const std::vector<MapInputs<float, size_t>> inputsf_i64 = {
+  {0.000001f, 1024 * 1024, 1234ULL, 9.4}};
 typedef MapTest<float, size_t> MapTestF_i64;
-TEST_P(MapTestF_i64, Result)
-{
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<float>(params.tolerance)));
+TEST_P(MapTestF_i64, Result) {
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
+                          CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i64, ::testing::ValuesIn(inputsf_i64));
+INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i64,
+                         ::testing::ValuesIn(inputsf_i64));
 
 const std::vector<MapInputs<float, int, double>> inputsf_i32_d = {
   {0.000001f, 1024 * 1024, 1234ULL, 5.9}};
 typedef MapTest<float, int, double> MapTestF_i32_D;
-TEST_P(MapTestF_i32_D, Result)
-{
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<double>(params.tolerance)));
+TEST_P(MapTestF_i32_D, Result) {
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
+                          CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32_D, ::testing::ValuesIn(inputsf_i32_d));
+INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32_D,
+                         ::testing::ValuesIn(inputsf_i32_d));
 
-const std::vector<MapInputs<double, int>> inputsd_i32 = {{0.00000001, 1024 * 1024, 1234ULL, 7.5}};
+const std::vector<MapInputs<double, int>> inputsd_i32 = {
+  {0.00000001, 1024 * 1024, 1234ULL, 7.5}};
 typedef MapTest<double, int> MapTestD_i32;
-TEST_P(MapTestD_i32, Result)
-{
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<double>(params.tolerance)));
+TEST_P(MapTestD_i32, Result) {
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
+                          CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i32, ::testing::ValuesIn(inputsd_i32));
+INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i32,
+                         ::testing::ValuesIn(inputsd_i32));
 
 const std::vector<MapInputs<double, size_t>> inputsd_i64 = {
   {0.00000001, 1024 * 1024, 1234ULL, 5.2}};
 typedef MapTest<double, size_t> MapTestD_i64;
-TEST_P(MapTestD_i64, Result)
-{
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<double>(params.tolerance)));
+TEST_P(MapTestD_i64, Result) {
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
+                          CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i64, ::testing::ValuesIn(inputsd_i64));
+INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i64,
+                         ::testing::ValuesIn(inputsd_i64));
 
 }  // namespace linalg
 }  // namespace raft
diff --git a/cpp/test/linalg/map_then_reduce.cu b/cpp/test/linalg/map_then_reduce.cu
index a1b82e7644..6e146fa4bb 100644
--- a/cpp/test/linalg/map_then_reduce.cu
+++ b/cpp/test/linalg/map_then_reduce.cu
@@ -25,18 +25,21 @@ namespace raft {
 namespace linalg {
 
 template <typename InType, typename OutType, typename MapOp>
-__global__ void naiveMapReduceKernel(OutType* out, const InType* in, size_t len, MapOp map)
-{
+__global__ void naiveMapReduceKernel(OutType *out, const InType *in, size_t len,
+                                     MapOp map) {
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < len) { raft::myAtomicAdd(out, (OutType)map(in[idx])); }
+  if (idx < len) {
+    raft::myAtomicAdd(out, (OutType)map(in[idx]));
+  }
 }
 
 template <typename InType, typename OutType, typename MapOp>
-void naiveMapReduce(OutType* out, const InType* in, size_t len, MapOp map, cudaStream_t stream)
-{
+void naiveMapReduce(OutType *out, const InType *in, size_t len, MapOp map,
+                    cudaStream_t stream) {
   static const int TPB = 64;
-  int nblks            = raft::ceildiv(len, (size_t)TPB);
-  naiveMapReduceKernel<InType, OutType, MapOp><<<nblks, TPB, 0, stream>>>(out, in, len, map);
+  int nblks = raft::ceildiv(len, (size_t)TPB);
+  naiveMapReduceKernel<InType, OutType, MapOp>
+    <<<nblks, TPB, 0, stream>>>(out, in, len, map);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -48,8 +51,7 @@ struct MapReduceInputs {
 };
 
 template <typename T>
-::std::ostream& operator<<(::std::ostream& os, const MapReduceInputs<T>& dims)
-{
+::std::ostream &operator<<(::std::ostream &os, const MapReduceInputs<T> &dims) {
   return os;
 }
 
@@ -57,9 +59,8 @@ template <typename T>
 // for an extended __device__ lambda cannot have private or protected access
 // within its class
 template <typename InType, typename OutType>
-void mapReduceLaunch(
-  OutType* out_ref, OutType* out, const InType* in, size_t len, cudaStream_t stream)
-{
+void mapReduceLaunch(OutType *out_ref, OutType *out, const InType *in,
+                     size_t len, cudaStream_t stream) {
   auto op = [] __device__(InType in) { return in; };
   naiveMapReduce(out_ref, in, len, op, stream);
   mapThenSumReduce(out, len, op, 0, in);
@@ -68,8 +69,7 @@ void mapReduceLaunch(
 template <typename InType, typename OutType>
 class MapReduceTest : public ::testing::TestWithParam<MapReduceInputs<InType>> {
  protected:
-  void SetUp() override
-  {
+  void SetUp() override {
     params = ::testing::TestWithParam<MapReduceInputs<InType>>::GetParam();
     raft::random::Rng r(params.seed);
     auto len = params.len;
@@ -84,8 +84,7 @@ class MapReduceTest : public ::testing::TestWithParam<MapReduceInputs<InType>> {
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(in));
     CUDA_CHECK(cudaFree(out_ref));
     CUDA_CHECK(cudaFree(out));
@@ -93,44 +92,48 @@ class MapReduceTest : public ::testing::TestWithParam<MapReduceInputs<InType>> {
 
  protected:
   MapReduceInputs<InType> params;
-  InType* in;
+  InType *in;
   OutType *out_ref, *out;
 };
 
-const std::vector<MapReduceInputs<float>> inputsf = {{0.001f, 1024 * 1024, 1234ULL}};
+const std::vector<MapReduceInputs<float>> inputsf = {
+  {0.001f, 1024 * 1024, 1234ULL}};
 typedef MapReduceTest<float, float> MapReduceTestFF;
-TEST_P(MapReduceTestFF, Result)
-{
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<float>(params.tolerance)));
+TEST_P(MapReduceTestFF, Result) {
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
+                          CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFF, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFF,
+                         ::testing::ValuesIn(inputsf));
 
 typedef MapReduceTest<float, double> MapReduceTestFD;
-TEST_P(MapReduceTestFD, Result)
-{
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<double>(params.tolerance)));
+TEST_P(MapReduceTestFD, Result) {
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
+                          CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFD, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFD,
+                         ::testing::ValuesIn(inputsf));
 
-const std::vector<MapReduceInputs<double>> inputsd = {{0.000001, 1024 * 1024, 1234ULL}};
+const std::vector<MapReduceInputs<double>> inputsd = {
+  {0.000001, 1024 * 1024, 1234ULL}};
 typedef MapReduceTest<double, double> MapReduceTestDD;
-TEST_P(MapReduceTestDD, Result)
-{
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<double>(params.tolerance)));
+TEST_P(MapReduceTestDD, Result) {
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
+                          CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestDD, ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestDD,
+                         ::testing::ValuesIn(inputsd));
 
 template <typename T>
 class MapGenericReduceTest : public ::testing::Test {
-  using InType  = typename T::first_type;
+  using InType = typename T::first_type;
   using OutType = typename T::second_type;
 
  protected:
   MapGenericReduceTest()
     : allocator(handle.get_device_allocator()),
       input(allocator, handle.get_stream(), n),
-      output(allocator, handle.get_stream(), 1)
-  {
+      output(allocator, handle.get_stream(), 1) {
     CUDA_CHECK(cudaStreamCreate(&stream));
     handle.set_stream(stream);
     initInput(input.data(), input.size(), stream);
@@ -139,8 +142,7 @@ class MapGenericReduceTest : public ::testing::Test {
   void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); }
 
  public:
-  void initInput(InType* input, int n, cudaStream_t stream)
-  {
+  void initInput(InType *input, int n, cudaStream_t stream) {
     raft::random::Rng r(137);
     r.uniform(input, n, InType(2), InType(3), stream);
     InType val = 1;
@@ -149,19 +151,21 @@ class MapGenericReduceTest : public ::testing::Test {
     raft::update_device(input + 337, &val, 1, stream);
   }
 
-  void testMin()
-  {
-    auto op               = [] __device__(InType in) { return in; };
+  void testMin() {
+    auto op = [] __device__(InType in) { return in; };
     const OutType neutral = std::numeric_limits<InType>::max();
-    mapThenReduce(output.data(), input.size(), neutral, op, cub::Min(), stream, input.data());
-    EXPECT_TRUE(raft::devArrMatch(OutType(1), output.data(), 1, raft::Compare<OutType>()));
+    mapThenReduce(output.data(), input.size(), neutral, op, cub::Min(), stream,
+                  input.data());
+    EXPECT_TRUE(raft::devArrMatch(OutType(1), output.data(), 1,
+                                  raft::Compare<OutType>()));
   }
-  void testMax()
-  {
-    auto op               = [] __device__(InType in) { return in; };
+  void testMax() {
+    auto op = [] __device__(InType in) { return in; };
     const OutType neutral = std::numeric_limits<InType>::min();
-    mapThenReduce(output.data(), input.size(), neutral, op, cub::Max(), stream, input.data());
-    EXPECT_TRUE(raft::devArrMatch(OutType(5), output.data(), 1, raft::Compare<OutType>()));
+    mapThenReduce(output.data(), input.size(), neutral, op, cub::Max(), stream,
+                  input.data());
+    EXPECT_TRUE(raft::devArrMatch(OutType(5), output.data(), 1,
+                                  raft::Compare<OutType>()));
   }
 
  protected:
@@ -174,7 +178,8 @@ class MapGenericReduceTest : public ::testing::Test {
 };
 
 using IoTypePair =
-  ::testing::Types<std::pair<float, float>, std::pair<float, double>, std::pair<double, double>>;
+  ::testing::Types<std::pair<float, float>, std::pair<float, double>,
+                   std::pair<double, double>>;
 
 TYPED_TEST_CASE(MapGenericReduceTest, IoTypePair);
 TYPED_TEST(MapGenericReduceTest, min) { this->testMin(); }
diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu
index 6ad9bfba10..aa46c78b0f 100644
--- a/cpp/test/linalg/matrix_vector_op.cu
+++ b/cpp/test/linalg/matrix_vector_op.cu
@@ -32,8 +32,8 @@ struct MatVecOpInputs {
 };
 
 template <typename T, typename IdxType>
-::std::ostream& operator<<(::std::ostream& os, const MatVecOpInputs<T, IdxType>& dims)
-{
+::std::ostream &operator<<(::std::ostream &os,
+                           const MatVecOpInputs<T, IdxType> &dims) {
   return os;
 }
 
@@ -41,48 +41,26 @@ template <typename T, typename IdxType>
 // for an extended __device__ lambda cannot have private or protected access
 // within its class
 template <typename T, typename IdxType>
-void matrixVectorOpLaunch(T* out,
-                          const T* in,
-                          const T* vec1,
-                          const T* vec2,
-                          IdxType D,
-                          IdxType N,
-                          bool rowMajor,
-                          bool bcastAlongRows,
-                          bool useTwoVectors,
-                          cudaStream_t stream)
-{
+void matrixVectorOpLaunch(T *out, const T *in, const T *vec1, const T *vec2,
+                          IdxType D, IdxType N, bool rowMajor,
+                          bool bcastAlongRows, bool useTwoVectors,
+                          cudaStream_t stream) {
   if (useTwoVectors) {
     matrixVectorOp(
-      out,
-      in,
-      vec1,
-      vec2,
-      D,
-      N,
-      rowMajor,
-      bcastAlongRows,
-      [] __device__(T a, T b, T c) { return a + b + c; },
-      stream);
+      out, in, vec1, vec2, D, N, rowMajor, bcastAlongRows,
+      [] __device__(T a, T b, T c) { return a + b + c; }, stream);
   } else {
     matrixVectorOp(
-      out,
-      in,
-      vec1,
-      D,
-      N,
-      rowMajor,
-      bcastAlongRows,
-      [] __device__(T a, T b) { return a + b; },
-      stream);
+      out, in, vec1, D, N, rowMajor, bcastAlongRows,
+      [] __device__(T a, T b) { return a + b; }, stream);
   }
 }
 
 template <typename T, typename IdxType>
-class MatVecOpTest : public ::testing::TestWithParam<MatVecOpInputs<T, IdxType>> {
+class MatVecOpTest
+  : public ::testing::TestWithParam<MatVecOpInputs<T, IdxType>> {
  protected:
-  void SetUp() override
-  {
+  void SetUp() override {
     params = ::testing::TestWithParam<MatVecOpInputs<T, IdxType>>::GetParam();
     raft::random::Rng r(params.seed);
     IdxType N = params.rows, D = params.cols;
@@ -100,25 +78,18 @@ class MatVecOpTest : public ::testing::TestWithParam<MatVecOpInputs<T, IdxType>>
     r.uniform(vec1, vecLen, (T)-1.0, (T)1.0, stream);
     r.uniform(vec2, vecLen, (T)-1.0, (T)1.0, stream);
     if (params.useTwoVectors) {
-      naiveMatVec(out_ref, in, vec1, vec2, D, N, params.rowMajor, params.bcastAlongRows, (T)1.0);
+      naiveMatVec(out_ref, in, vec1, vec2, D, N, params.rowMajor,
+                  params.bcastAlongRows, (T)1.0);
     } else {
-      naiveMatVec(out_ref, in, vec1, D, N, params.rowMajor, params.bcastAlongRows, (T)1.0);
+      naiveMatVec(out_ref, in, vec1, D, N, params.rowMajor,
+                  params.bcastAlongRows, (T)1.0);
     }
-    matrixVectorOpLaunch(out,
-                         in,
-                         vec1,
-                         vec2,
-                         D,
-                         N,
-                         params.rowMajor,
-                         params.bcastAlongRows,
-                         params.useTwoVectors,
-                         stream);
+    matrixVectorOpLaunch(out, in, vec1, vec2, D, N, params.rowMajor,
+                         params.bcastAlongRows, params.useTwoVectors, stream);
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(vec1));
     CUDA_CHECK(cudaFree(vec2));
     CUDA_CHECK(cudaFree(out));
@@ -150,23 +121,23 @@ const std::vector<MatVecOpInputs<float, int>> inputsf_i32 = {
   {0.00001f, 1024, 32, false, false, true, 1234ULL},
   {0.00001f, 1024, 64, false, false, true, 1234ULL}};
 typedef MatVecOpTest<float, int> MatVecOpTestF_i32;
-TEST_P(MatVecOpTestF_i32, Result)
-{
-  ASSERT_TRUE(
-    devArrMatch(out_ref, out, params.rows * params.cols, CompareApprox<float>(params.tolerance)));
+TEST_P(MatVecOpTestF_i32, Result) {
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.rows * params.cols,
+                          CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i32, ::testing::ValuesIn(inputsf_i32));
+INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i32,
+                         ::testing::ValuesIn(inputsf_i32));
 
 const std::vector<MatVecOpInputs<float, size_t>> inputsf_i64 = {
   {0.00001f, 2500, 250, false, false, false, 1234ULL},
   {0.00001f, 2500, 250, false, false, true, 1234ULL}};
 typedef MatVecOpTest<float, size_t> MatVecOpTestF_i64;
-TEST_P(MatVecOpTestF_i64, Result)
-{
-  ASSERT_TRUE(
-    devArrMatch(out_ref, out, params.rows * params.cols, CompareApprox<float>(params.tolerance)));
+TEST_P(MatVecOpTestF_i64, Result) {
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.rows * params.cols,
+                          CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i64, ::testing::ValuesIn(inputsf_i64));
+INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i64,
+                         ::testing::ValuesIn(inputsf_i64));
 
 const std::vector<MatVecOpInputs<double, int>> inputsd_i32 = {
   {0.0000001, 1024, 32, true, true, false, 1234ULL},
@@ -187,23 +158,23 @@ const std::vector<MatVecOpInputs<double, int>> inputsd_i32 = {
   {0.0000001, 1024, 32, false, false, true, 1234ULL},
   {0.0000001, 1024, 64, false, false, true, 1234ULL}};
 typedef MatVecOpTest<double, int> MatVecOpTestD_i32;
-TEST_P(MatVecOpTestD_i32, Result)
-{
-  ASSERT_TRUE(
-    devArrMatch(out_ref, out, params.rows * params.cols, CompareApprox<double>(params.tolerance)));
+TEST_P(MatVecOpTestD_i32, Result) {
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.rows * params.cols,
+                          CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i32, ::testing::ValuesIn(inputsd_i32));
+INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i32,
+                         ::testing::ValuesIn(inputsd_i32));
 
 const std::vector<MatVecOpInputs<double, size_t>> inputsd_i64 = {
   {0.0000001, 2500, 250, false, false, false, 1234ULL},
   {0.0000001, 2500, 250, false, false, true, 1234ULL}};
 typedef MatVecOpTest<double, size_t> MatVecOpTestD_i64;
-TEST_P(MatVecOpTestD_i64, Result)
-{
-  ASSERT_TRUE(
-    devArrMatch(out_ref, out, params.rows * params.cols, CompareApprox<double>(params.tolerance)));
+TEST_P(MatVecOpTestD_i64, Result) {
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.rows * params.cols,
+                          CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i64, ::testing::ValuesIn(inputsd_i64));
+INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i64,
+                         ::testing::ValuesIn(inputsd_i64));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/matrix_vector_op.cuh b/cpp/test/linalg/matrix_vector_op.cuh
index 5f9c6f1ef3..69c45c9866 100644
--- a/cpp/test/linalg/matrix_vector_op.cuh
+++ b/cpp/test/linalg/matrix_vector_op.cuh
@@ -22,15 +22,9 @@ namespace raft {
 namespace linalg {
 
 template <typename Type, typename IdxType = int>
-__global__ void naiveMatVecKernel(Type* out,
-                                  const Type* mat,
-                                  const Type* vec,
-                                  IdxType D,
-                                  IdxType N,
-                                  bool rowMajor,
-                                  bool bcastAlongRows,
-                                  Type scalar)
-{
+__global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec,
+                                  IdxType D, IdxType N, bool rowMajor,
+                                  bool bcastAlongRows, Type scalar) {
   IdxType idx = threadIdx.x + blockIdx.x * blockDim.x;
   IdxType len = N * D;
   IdxType col;
@@ -43,37 +37,27 @@ __global__ void naiveMatVecKernel(Type* out,
   } else {
     col = idx / N;
   }
-  if (idx < len) { out[idx] = mat[idx] + scalar * vec[col]; }
+  if (idx < len) {
+    out[idx] = mat[idx] + scalar * vec[col];
+  }
 }
 
 template <typename Type, typename IdxType = int>
-void naiveMatVec(Type* out,
-                 const Type* mat,
-                 const Type* vec,
-                 IdxType D,
-                 IdxType N,
-                 bool rowMajor,
-                 bool bcastAlongRows,
-                 Type scalar)
-{
+void naiveMatVec(Type *out, const Type *mat, const Type *vec, IdxType D,
+                 IdxType N, bool rowMajor, bool bcastAlongRows, Type scalar) {
   static const IdxType TPB = 64;
-  IdxType len              = N * D;
-  IdxType nblks            = raft::ceildiv(len, TPB);
-  naiveMatVecKernel<Type><<<nblks, TPB>>>(out, mat, vec, D, N, rowMajor, bcastAlongRows, scalar);
+  IdxType len = N * D;
+  IdxType nblks = raft::ceildiv(len, TPB);
+  naiveMatVecKernel<Type>
+    <<<nblks, TPB>>>(out, mat, vec, D, N, rowMajor, bcastAlongRows, scalar);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
 template <typename Type, typename IdxType = int>
-__global__ void naiveMatVecKernel(Type* out,
-                                  const Type* mat,
-                                  const Type* vec1,
-                                  const Type* vec2,
-                                  IdxType D,
-                                  IdxType N,
-                                  bool rowMajor,
-                                  bool bcastAlongRows,
-                                  Type scalar)
-{
+__global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec1,
+                                  const Type *vec2, IdxType D, IdxType N,
+                                  bool rowMajor, bool bcastAlongRows,
+                                  Type scalar) {
   IdxType idx = threadIdx.x + blockIdx.x * blockDim.x;
   IdxType len = N * D;
   IdxType col;
@@ -86,25 +70,20 @@ __global__ void naiveMatVecKernel(Type* out,
   } else {
     col = idx / N;
   }
-  if (idx < len) { out[idx] = mat[idx] + scalar * vec1[col] + vec2[col]; }
+  if (idx < len) {
+    out[idx] = mat[idx] + scalar * vec1[col] + vec2[col];
+  }
 }
 
 template <typename Type, typename IdxType = int>
-void naiveMatVec(Type* out,
-                 const Type* mat,
-                 const Type* vec1,
-                 const Type* vec2,
-                 IdxType D,
-                 IdxType N,
-                 bool rowMajor,
-                 bool bcastAlongRows,
-                 Type scalar)
-{
+void naiveMatVec(Type *out, const Type *mat, const Type *vec1, const Type *vec2,
+                 IdxType D, IdxType N, bool rowMajor, bool bcastAlongRows,
+                 Type scalar) {
   static const IdxType TPB = 64;
-  IdxType len              = N * D;
-  IdxType nblks            = raft::ceildiv(len, TPB);
-  naiveMatVecKernel<Type>
-    <<<nblks, TPB>>>(out, mat, vec1, vec2, D, N, rowMajor, bcastAlongRows, scalar);
+  IdxType len = N * D;
+  IdxType nblks = raft::ceildiv(len, TPB);
+  naiveMatVecKernel<Type><<<nblks, TPB>>>(out, mat, vec1, vec2, D, N, rowMajor,
+                                          bcastAlongRows, scalar);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
diff --git a/cpp/test/linalg/multiply.cu b/cpp/test/linalg/multiply.cu
index 6c38d89891..1d3e753de3 100644
--- a/cpp/test/linalg/multiply.cu
+++ b/cpp/test/linalg/multiply.cu
@@ -27,8 +27,7 @@ namespace linalg {
 template <typename T>
 class MultiplyTest : public ::testing::TestWithParam<UnaryOpInputs<T>> {
  protected:
-  void SetUp() override
-  {
+  void SetUp() override {
     params = ::testing::TestWithParam<UnaryOpInputs<T>>::GetParam();
     raft::random::Rng r(params.seed);
     int len = params.len;
@@ -44,8 +43,7 @@ class MultiplyTest : public ::testing::TestWithParam<UnaryOpInputs<T>> {
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(in));
     CUDA_CHECK(cudaFree(out_ref));
     CUDA_CHECK(cudaFree(out));
@@ -56,21 +54,25 @@ class MultiplyTest : public ::testing::TestWithParam<UnaryOpInputs<T>> {
   T *in, *out_ref, *out;
 };
 
-const std::vector<UnaryOpInputs<float>> inputsf = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}};
+const std::vector<UnaryOpInputs<float>> inputsf = {
+  {0.000001f, 1024 * 1024, 2.f, 1234ULL}};
 typedef MultiplyTest<float> MultiplyTestF;
-TEST_P(MultiplyTestF, Result)
-{
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, raft::CompareApprox<float>(params.tolerance)));
+TEST_P(MultiplyTestF, Result) {
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
+                          raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestF, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestF,
+                         ::testing::ValuesIn(inputsf));
 
 typedef MultiplyTest<double> MultiplyTestD;
-const std::vector<UnaryOpInputs<double>> inputsd = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}};
-TEST_P(MultiplyTestD, Result)
-{
-  ASSERT_TRUE(devArrMatch(out_ref, out, params.len, raft::CompareApprox<double>(params.tolerance)));
+const std::vector<UnaryOpInputs<double>> inputsd = {
+  {0.000001f, 1024 * 1024, 2.f, 1234ULL}};
+TEST_P(MultiplyTestD, Result) {
+  ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
+                          raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestD, ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestD,
+                         ::testing::ValuesIn(inputsd));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/norm.cu b/cpp/test/linalg/norm.cu
index 35bc72dee4..acc25addd0 100644
--- a/cpp/test/linalg/norm.cu
+++ b/cpp/test/linalg/norm.cu
@@ -34,19 +34,17 @@ struct NormInputs {
 };
 
 template <typename T>
-::std::ostream& operator<<(::std::ostream& os, const NormInputs<T>& I)
-{
-  os << "{ " << I.tolerance << ", " << I.rows << ", " << I.cols << ", " << I.type << ", "
-     << I.do_sqrt << ", " << I.seed << '}' << std::endl;
+::std::ostream &operator<<(::std::ostream &os, const NormInputs<T> &I) {
+  os << "{ " << I.tolerance << ", " << I.rows << ", " << I.cols << ", "
+     << I.type << ", " << I.do_sqrt << ", " << I.seed << '}' << std::endl;
   return os;
 }
 
 ///// Row-wise norm test definitions
 template <typename Type>
-__global__ void naiveRowNormKernel(
-  Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt)
-{
-  Type acc     = (Type)0;
+__global__ void naiveRowNormKernel(Type *dots, const Type *data, int D, int N,
+                                   NormType type, bool do_sqrt) {
+  Type acc = (Type)0;
   int rowStart = threadIdx.x + blockIdx.x * blockDim.x;
   if (rowStart < N) {
     for (int i = 0; i < D; ++i) {
@@ -61,20 +59,19 @@ __global__ void naiveRowNormKernel(
 }
 
 template <typename Type>
-void naiveRowNorm(
-  Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt, cudaStream_t stream)
-{
+void naiveRowNorm(Type *dots, const Type *data, int D, int N, NormType type,
+                  bool do_sqrt, cudaStream_t stream) {
   static const int TPB = 64;
-  int nblks            = raft::ceildiv(N, TPB);
-  naiveRowNormKernel<Type><<<nblks, TPB, 0, stream>>>(dots, data, D, N, type, do_sqrt);
+  int nblks = raft::ceildiv(N, TPB);
+  naiveRowNormKernel<Type>
+    <<<nblks, TPB, 0, stream>>>(dots, data, D, N, type, do_sqrt);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
 template <typename T>
 class RowNormTest : public ::testing::TestWithParam<NormInputs<T>> {
  public:
-  void SetUp() override
-  {
+  void SetUp() override {
     CUDA_CHECK(cudaStreamCreate(&stream));
     params = ::testing::TestWithParam<NormInputs<T>>::GetParam();
     raft::random::Rng r(params.seed);
@@ -85,18 +82,19 @@ class RowNormTest : public ::testing::TestWithParam<NormInputs<T>> {
     raft::allocate(dots_exp, rows);
     raft::allocate(dots_act, rows);
     r.uniform(data, len, T(-1.0), T(1.0), stream);
-    naiveRowNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt, stream);
+    naiveRowNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt,
+                 stream);
     if (params.do_sqrt) {
       auto fin_op = [] __device__(T in) { return raft::mySqrt(in); };
-      rowNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream, fin_op);
+      rowNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream,
+              fin_op);
     } else {
       rowNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream);
     }
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(data));
     CUDA_CHECK(cudaFree(dots_exp));
     CUDA_CHECK(cudaFree(dots_act));
@@ -111,11 +109,10 @@ class RowNormTest : public ::testing::TestWithParam<NormInputs<T>> {
 
 ///// Column-wise norm test definitisons
 template <typename Type>
-__global__ void naiveColNormKernel(
-  Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt)
-{
+__global__ void naiveColNormKernel(Type *dots, const Type *data, int D, int N,
+                                   NormType type, bool do_sqrt) {
   int colID = threadIdx.x + blockIdx.x * blockDim.x;
-  if (colID > D) return;  // avoid out-of-bounds thread
+  if (colID > D) return;  //avoid out-of-bounds thread
 
   Type acc = 0;
   for (int i = 0; i < N; i++) {
@@ -127,20 +124,19 @@ __global__ void naiveColNormKernel(
 }
 
 template <typename Type>
-void naiveColNorm(
-  Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt, cudaStream_t stream)
-{
+void naiveColNorm(Type *dots, const Type *data, int D, int N, NormType type,
+                  bool do_sqrt, cudaStream_t stream) {
   static const int TPB = 64;
-  int nblks            = raft::ceildiv(D, TPB);
-  naiveColNormKernel<Type><<<nblks, TPB, 0, stream>>>(dots, data, D, N, type, do_sqrt);
+  int nblks = raft::ceildiv(D, TPB);
+  naiveColNormKernel<Type>
+    <<<nblks, TPB, 0, stream>>>(dots, data, D, N, type, do_sqrt);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
 template <typename T>
 class ColNormTest : public ::testing::TestWithParam<NormInputs<T>> {
  public:
-  void SetUp() override
-  {
+  void SetUp() override {
     CUDA_CHECK(cudaStreamCreate(&stream));
     params = ::testing::TestWithParam<NormInputs<T>>::GetParam();
     raft::random::Rng r(params.seed);
@@ -152,18 +148,19 @@ class ColNormTest : public ::testing::TestWithParam<NormInputs<T>> {
     raft::allocate(dots_exp, cols);
     raft::allocate(dots_act, cols);
 
-    naiveColNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt, stream);
+    naiveColNorm(dots_exp, data, cols, rows, params.type, params.do_sqrt,
+                 stream);
     if (params.do_sqrt) {
       auto fin_op = [] __device__(T in) { return raft::mySqrt(in); };
-      colNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream, fin_op);
+      colNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream,
+              fin_op);
     } else {
       colNorm(dots_act, data, cols, rows, params.type, params.rowMajor, stream);
     }
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(data));
     CUDA_CHECK(cudaFree(dots_exp));
     CUDA_CHECK(cudaFree(dots_act));
@@ -177,23 +174,24 @@ class ColNormTest : public ::testing::TestWithParam<NormInputs<T>> {
 };
 
 ///// Row- and column-wise tests
-const std::vector<NormInputs<float>> inputsf = {{0.00001f, 1024, 32, L1Norm, false, true, 1234ULL},
-                                                {0.00001f, 1024, 64, L1Norm, false, true, 1234ULL},
-                                                {0.00001f, 1024, 128, L1Norm, false, true, 1234ULL},
-                                                {0.00001f, 1024, 256, L1Norm, false, true, 1234ULL},
-                                                {0.00001f, 1024, 32, L2Norm, false, true, 1234ULL},
-                                                {0.00001f, 1024, 64, L2Norm, false, true, 1234ULL},
-                                                {0.00001f, 1024, 128, L2Norm, false, true, 1234ULL},
-                                                {0.00001f, 1024, 256, L2Norm, false, true, 1234ULL},
-
-                                                {0.00001f, 1024, 32, L1Norm, true, true, 1234ULL},
-                                                {0.00001f, 1024, 64, L1Norm, true, true, 1234ULL},
-                                                {0.00001f, 1024, 128, L1Norm, true, true, 1234ULL},
-                                                {0.00001f, 1024, 256, L1Norm, true, true, 1234ULL},
-                                                {0.00001f, 1024, 32, L2Norm, true, true, 1234ULL},
-                                                {0.00001f, 1024, 64, L2Norm, true, true, 1234ULL},
-                                                {0.00001f, 1024, 128, L2Norm, true, true, 1234ULL},
-                                                {0.00001f, 1024, 256, L2Norm, true, true, 1234ULL}};
+const std::vector<NormInputs<float>> inputsf = {
+  {0.00001f, 1024, 32, L1Norm, false, true, 1234ULL},
+  {0.00001f, 1024, 64, L1Norm, false, true, 1234ULL},
+  {0.00001f, 1024, 128, L1Norm, false, true, 1234ULL},
+  {0.00001f, 1024, 256, L1Norm, false, true, 1234ULL},
+  {0.00001f, 1024, 32, L2Norm, false, true, 1234ULL},
+  {0.00001f, 1024, 64, L2Norm, false, true, 1234ULL},
+  {0.00001f, 1024, 128, L2Norm, false, true, 1234ULL},
+  {0.00001f, 1024, 256, L2Norm, false, true, 1234ULL},
+
+  {0.00001f, 1024, 32, L1Norm, true, true, 1234ULL},
+  {0.00001f, 1024, 64, L1Norm, true, true, 1234ULL},
+  {0.00001f, 1024, 128, L1Norm, true, true, 1234ULL},
+  {0.00001f, 1024, 256, L1Norm, true, true, 1234ULL},
+  {0.00001f, 1024, 32, L2Norm, true, true, 1234ULL},
+  {0.00001f, 1024, 64, L2Norm, true, true, 1234ULL},
+  {0.00001f, 1024, 128, L2Norm, true, true, 1234ULL},
+  {0.00001f, 1024, 256, L2Norm, true, true, 1234ULL}};
 
 const std::vector<NormInputs<double>> inputsd = {
   {0.00000001, 1024, 32, L1Norm, false, true, 1234ULL},
@@ -215,22 +213,22 @@ const std::vector<NormInputs<double>> inputsd = {
   {0.00000001, 1024, 256, L2Norm, true, true, 1234ULL}};
 
 typedef RowNormTest<float> RowNormTestF;
-TEST_P(RowNormTestF, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(
-    dots_exp, dots_act, params.rows, raft::CompareApprox<float>(params.tolerance)));
+TEST_P(RowNormTestF, Result) {
+  ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.rows,
+                                raft::CompareApprox<float>(params.tolerance)));
 }
 
 typedef RowNormTest<double> RowNormTestD;
-TEST_P(RowNormTestD, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(
-    dots_exp, dots_act, params.rows, raft::CompareApprox<double>(params.tolerance)));
+TEST_P(RowNormTestD, Result) {
+  ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.rows,
+                                raft::CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestF, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestF,
+                        ::testing::ValuesIn(inputsf));
 
-INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestD, ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestD,
+                        ::testing::ValuesIn(inputsd));
 
 const std::vector<NormInputs<float>> inputscf = {
   {0.00001f, 32, 1024, L1Norm, false, true, 1234ULL},
@@ -271,22 +269,22 @@ const std::vector<NormInputs<double>> inputscd = {
   {0.00000001, 256, 1024, L2Norm, true, true, 1234ULL}};
 
 typedef ColNormTest<float> ColNormTestF;
-TEST_P(ColNormTestF, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(
-    dots_exp, dots_act, params.cols, raft::CompareApprox<float>(params.tolerance)));
+TEST_P(ColNormTestF, Result) {
+  ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.cols,
+                                raft::CompareApprox<float>(params.tolerance)));
 }
 
 typedef ColNormTest<double> ColNormTestD;
-TEST_P(ColNormTestD, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(
-    dots_exp, dots_act, params.cols, raft::CompareApprox<double>(params.tolerance)));
+TEST_P(ColNormTestD, Result) {
+  ASSERT_TRUE(raft::devArrMatch(dots_exp, dots_act, params.cols,
+                                raft::CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestF, ::testing::ValuesIn(inputscf));
+INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestF,
+                        ::testing::ValuesIn(inputscf));
 
-INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestD, ::testing::ValuesIn(inputscd));
+INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestD,
+                        ::testing::ValuesIn(inputscd));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/reduce.cu b/cpp/test/linalg/reduce.cu
index 85c84777e4..9082397265 100644
--- a/cpp/test/linalg/reduce.cu
+++ b/cpp/test/linalg/reduce.cu
@@ -34,8 +34,8 @@ struct ReduceInputs {
 };
 
 template <typename InType, typename OutType>
-::std::ostream& operator<<(::std::ostream& os, const ReduceInputs<InType, OutType>& dims)
-{
+::std::ostream &operator<<(::std::ostream &os,
+                           const ReduceInputs<InType, OutType> &dims) {
   return os;
 }
 
@@ -43,55 +43,45 @@ template <typename InType, typename OutType>
 // for an extended __device__ lambda cannot have private or protected access
 // within its class
 template <typename InType, typename OutType>
-void reduceLaunch(OutType* dots,
-                  const InType* data,
-                  int cols,
-                  int rows,
-                  bool rowMajor,
-                  bool alongRows,
-                  bool inplace,
-                  cudaStream_t stream)
-{
-  reduce(dots,
-         data,
-         cols,
-         rows,
-         (OutType)0,
-         rowMajor,
-         alongRows,
-         stream,
-         inplace,
-         [] __device__(InType in, int i) { return static_cast<OutType>(in * in); });
+void reduceLaunch(OutType *dots, const InType *data, int cols, int rows,
+                  bool rowMajor, bool alongRows, bool inplace,
+                  cudaStream_t stream) {
+  reduce(
+    dots, data, cols, rows, (OutType)0, rowMajor, alongRows, stream, inplace,
+    [] __device__(InType in, int i) { return static_cast<OutType>(in * in); });
 }
 
 template <typename InType, typename OutType>
-class ReduceTest : public ::testing::TestWithParam<ReduceInputs<InType, OutType>> {
+class ReduceTest
+  : public ::testing::TestWithParam<ReduceInputs<InType, OutType>> {
  protected:
-  void SetUp() override
-  {
+  void SetUp() override {
     CUDA_CHECK(cudaStreamCreate(&stream));
-    params = ::testing::TestWithParam<ReduceInputs<InType, OutType>>::GetParam();
+    params =
+      ::testing::TestWithParam<ReduceInputs<InType, OutType>>::GetParam();
     raft::random::Rng r(params.seed);
     int rows = params.rows, cols = params.cols;
     int len = rows * cols;
-    outlen  = params.alongRows ? rows : cols;
+    outlen = params.alongRows ? rows : cols;
     raft::allocate(data, len);
     raft::allocate(dots_exp, outlen);
     raft::allocate(dots_act, outlen);
     r.uniform(data, len, InType(-1.0), InType(1.0), stream);
-    naiveReduction(dots_exp, data, cols, rows, params.rowMajor, params.alongRows, stream);
+    naiveReduction(dots_exp, data, cols, rows, params.rowMajor,
+                   params.alongRows, stream);
 
     // Perform reduction with default inplace = false first
-    reduceLaunch(dots_act, data, cols, rows, params.rowMajor, params.alongRows, false, stream);
+    reduceLaunch(dots_act, data, cols, rows, params.rowMajor, params.alongRows,
+                 false, stream);
     // Add to result with inplace = true next, which shouldn't affect
     // in the case of coalescedReduction!
     if (!(params.rowMajor ^ params.alongRows)) {
-      reduceLaunch(dots_act, data, cols, rows, params.rowMajor, params.alongRows, true, stream);
+      reduceLaunch(dots_act, data, cols, rows, params.rowMajor,
+                   params.alongRows, true, stream);
     }
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(data));
     CUDA_CHECK(cudaFree(dots_exp));
     CUDA_CHECK(cudaFree(dots_act));
@@ -100,7 +90,7 @@ class ReduceTest : public ::testing::TestWithParam<ReduceInputs<InType, OutType>
 
  protected:
   ReduceInputs<InType, OutType> params;
-  InType* data;
+  InType *data;
   OutType *dots_exp, *dots_act;
   int outlen;
   cudaStream_t stream;
@@ -161,31 +151,31 @@ const std::vector<ReduceInputs<float, double>> inputsfd = {
   {0.000002f, 1024, 256, false, false, 1234ULL}};
 
 typedef ReduceTest<float, float> ReduceTestFF;
-TEST_P(ReduceTestFF, Result)
-{
-  ASSERT_TRUE(
-    devArrMatch(dots_exp, dots_act, outlen, raft::CompareApprox<float>(params.tolerance)));
+TEST_P(ReduceTestFF, Result) {
+  ASSERT_TRUE(devArrMatch(dots_exp, dots_act, outlen,
+                          raft::CompareApprox<float>(params.tolerance)));
 }
 
 typedef ReduceTest<double, double> ReduceTestDD;
-TEST_P(ReduceTestDD, Result)
-{
-  ASSERT_TRUE(
-    devArrMatch(dots_exp, dots_act, outlen, raft::CompareApprox<double>(params.tolerance)));
+TEST_P(ReduceTestDD, Result) {
+  ASSERT_TRUE(devArrMatch(dots_exp, dots_act, outlen,
+                          raft::CompareApprox<double>(params.tolerance)));
 }
 
 typedef ReduceTest<float, double> ReduceTestFD;
-TEST_P(ReduceTestFD, Result)
-{
-  ASSERT_TRUE(
-    devArrMatch(dots_exp, dots_act, outlen, raft::CompareApprox<double>(params.tolerance)));
+TEST_P(ReduceTestFD, Result) {
+  ASSERT_TRUE(devArrMatch(dots_exp, dots_act, outlen,
+                          raft::CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFF, ::testing::ValuesIn(inputsff));
+INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFF,
+                        ::testing::ValuesIn(inputsff));
 
-INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestDD, ::testing::ValuesIn(inputsdd));
+INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestDD,
+                        ::testing::ValuesIn(inputsdd));
 
-INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFD, ::testing::ValuesIn(inputsfd));
+INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFD,
+                        ::testing::ValuesIn(inputsfd));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/reduce.cuh b/cpp/test/linalg/reduce.cuh
index 86f9c2d8b8..30a9c2e271 100644
--- a/cpp/test/linalg/reduce.cuh
+++ b/cpp/test/linalg/reduce.cuh
@@ -26,69 +26,52 @@ namespace raft {
 namespace linalg {
 
 template <typename InType, typename OutType>
-__global__ void naiveCoalescedReductionKernel(OutType* dots, const InType* data, int D, int N)
-{
-  OutType acc  = (OutType)0;
+__global__ void naiveCoalescedReductionKernel(OutType *dots, const InType *data,
+                                              int D, int N) {
+  OutType acc = (OutType)0;
   int rowStart = threadIdx.x + blockIdx.x * blockDim.x;
   if (rowStart < N) {
     for (int i = 0; i < D; ++i) {
-      acc += static_cast<OutType>(data[rowStart * D + i] * data[rowStart * D + i]);
+      acc +=
+        static_cast<OutType>(data[rowStart * D + i] * data[rowStart * D + i]);
     }
     dots[rowStart] = 2 * acc;
   }
 }
 
 template <typename InType, typename OutType>
-void naiveCoalescedReduction(OutType* dots, const InType* data, int D, int N, cudaStream_t stream)
-{
+void naiveCoalescedReduction(OutType *dots, const InType *data, int D, int N,
+                             cudaStream_t stream) {
   static const int TPB = 64;
-  int nblks            = raft::ceildiv(N, TPB);
-  naiveCoalescedReductionKernel<InType, OutType><<<nblks, TPB, 0, stream>>>(dots, data, D, N);
+  int nblks = raft::ceildiv(N, TPB);
+  naiveCoalescedReductionKernel<InType, OutType>
+    <<<nblks, TPB, 0, stream>>>(dots, data, D, N);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
 template <typename InType, typename OutType>
-void unaryAndGemv(OutType* dots, const InType* data, int D, int N, cudaStream_t stream)
-{
-  // computes a MLCommon unary op on data (squares it), then computes Ax
+void unaryAndGemv(OutType *dots, const InType *data, int D, int N,
+                  cudaStream_t stream) {
+  //computes a MLCommon unary op on data (squares it), then computes Ax
   //(A input matrix and x column vector) to sum columns
   thrust::device_vector<OutType> sq(D * N);
   raft::linalg::unaryOp(
-    thrust::raw_pointer_cast(sq.data()),
-    data,
-    D * N,
-    [] __device__(InType v) { return static_cast<OutType>(v * v); },
-    stream);
+    thrust::raw_pointer_cast(sq.data()), data, D * N,
+    [] __device__(InType v) { return static_cast<OutType>(v * v); }, stream);
   cublasHandle_t handle;
   CUBLAS_CHECK(cublasCreate(&handle));
-  thrust::device_vector<OutType> ones(N, 1);  // column vector [1...1]
+  thrust::device_vector<OutType> ones(N, 1);  //column vector [1...1]
   OutType alpha = 1, beta = 0;
-  CUBLAS_CHECK(raft::linalg::cublasgemv(handle,
-                                        CUBLAS_OP_N,
-                                        D,
-                                        N,
-                                        &alpha,
-                                        thrust::raw_pointer_cast(sq.data()),
-                                        D,
-                                        thrust::raw_pointer_cast(ones.data()),
-                                        1,
-                                        &beta,
-                                        dots,
-                                        1,
-                                        stream));
+  CUBLAS_CHECK(raft::linalg::cublasgemv(
+    handle, CUBLAS_OP_N, D, N, &alpha, thrust::raw_pointer_cast(sq.data()), D,
+    thrust::raw_pointer_cast(ones.data()), 1, &beta, dots, 1, stream));
   CUDA_CHECK(cudaDeviceSynchronize());
   CUBLAS_CHECK(cublasDestroy(handle));
 }
 
 template <typename InType, typename OutType>
-void naiveReduction(OutType* dots,
-                    const InType* data,
-                    int D,
-                    int N,
-                    bool rowMajor,
-                    bool alongRows,
-                    cudaStream_t stream)
-{
+void naiveReduction(OutType *dots, const InType *data, int D, int N,
+                    bool rowMajor, bool alongRows, cudaStream_t stream) {
   if (rowMajor && alongRows) {
     naiveCoalescedReduction(dots, data, D, N, stream);
   } else if (rowMajor && !alongRows) {
diff --git a/cpp/test/linalg/strided_reduction.cu b/cpp/test/linalg/strided_reduction.cu
index 57699cb050..b27fa2ac1a 100644
--- a/cpp/test/linalg/strided_reduction.cu
+++ b/cpp/test/linalg/strided_reduction.cu
@@ -32,17 +32,17 @@ struct stridedReductionInputs {
 };
 
 template <typename T>
-void stridedReductionLaunch(T* dots, const T* data, int cols, int rows, cudaStream_t stream)
-{
-  stridedReduction(
-    dots, data, cols, rows, (T)0, stream, false, [] __device__(T in, int i) { return in * in; });
+void stridedReductionLaunch(T *dots, const T *data, int cols, int rows,
+                            cudaStream_t stream) {
+  stridedReduction(dots, data, cols, rows, (T)0, stream, false,
+                   [] __device__(T in, int i) { return in * in; });
 }
 
 template <typename T>
-class stridedReductionTest : public ::testing::TestWithParam<stridedReductionInputs<T>> {
+class stridedReductionTest
+  : public ::testing::TestWithParam<stridedReductionInputs<T>> {
  protected:
-  void SetUp() override
-  {
+  void SetUp() override {
     CUDA_CHECK(cudaStreamCreate(&stream));
     params = ::testing::TestWithParam<stridedReductionInputs<T>>::GetParam();
     raft::random::Rng r(params.seed);
@@ -50,17 +50,16 @@ class stridedReductionTest : public ::testing::TestWithParam<stridedReductionInp
     int len = rows * cols;
 
     raft::allocate(data, len);
-    raft::allocate(dots_exp, cols);  // expected dot products (from test)
-    raft::allocate(dots_act, cols);  // actual dot products (from prim)
+    raft::allocate(dots_exp, cols);  //expected dot products (from test)
+    raft::allocate(dots_act, cols);  //actual dot products (from prim)
     r.uniform(data, len, T(-1.0), T(1.0),
-              stream);  // initialize matrix to random
+              stream);  //initialize matrix to random
 
     unaryAndGemv(dots_exp, data, cols, rows, stream);
     stridedReductionLaunch(dots_act, data, cols, rows, stream);
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(data));
     CUDA_CHECK(cudaFree(dots_exp));
     CUDA_CHECK(cudaFree(dots_act));
@@ -73,33 +72,35 @@ class stridedReductionTest : public ::testing::TestWithParam<stridedReductionInp
   cudaStream_t stream;
 };
 
-const std::vector<stridedReductionInputs<float>> inputsf = {{0.00001f, 1024, 32, 1234ULL},
-                                                            {0.00001f, 1024, 64, 1234ULL},
-                                                            {0.00001f, 1024, 128, 1234ULL},
-                                                            {0.00001f, 1024, 256, 1234ULL}};
+const std::vector<stridedReductionInputs<float>> inputsf = {
+  {0.00001f, 1024, 32, 1234ULL},
+  {0.00001f, 1024, 64, 1234ULL},
+  {0.00001f, 1024, 128, 1234ULL},
+  {0.00001f, 1024, 256, 1234ULL}};
 
-const std::vector<stridedReductionInputs<double>> inputsd = {{0.000000001, 1024, 32, 1234ULL},
-                                                             {0.000000001, 1024, 64, 1234ULL},
-                                                             {0.000000001, 1024, 128, 1234ULL},
-                                                             {0.000000001, 1024, 256, 1234ULL}};
+const std::vector<stridedReductionInputs<double>> inputsd = {
+  {0.000000001, 1024, 32, 1234ULL},
+  {0.000000001, 1024, 64, 1234ULL},
+  {0.000000001, 1024, 128, 1234ULL},
+  {0.000000001, 1024, 256, 1234ULL}};
 
 typedef stridedReductionTest<float> stridedReductionTestF;
-TEST_P(stridedReductionTestF, Result)
-{
-  ASSERT_TRUE(
-    devArrMatch(dots_exp, dots_act, params.cols, raft::CompareApprox<float>(params.tolerance)));
+TEST_P(stridedReductionTestF, Result) {
+  ASSERT_TRUE(devArrMatch(dots_exp, dots_act, params.cols,
+                          raft::CompareApprox<float>(params.tolerance)));
 }
 
 typedef stridedReductionTest<double> stridedReductionTestD;
-TEST_P(stridedReductionTestD, Result)
-{
-  ASSERT_TRUE(
-    devArrMatch(dots_exp, dots_act, params.cols, raft::CompareApprox<double>(params.tolerance)));
+TEST_P(stridedReductionTestD, Result) {
+  ASSERT_TRUE(devArrMatch(dots_exp, dots_act, params.cols,
+                          raft::CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestF, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestF,
+                        ::testing::ValuesIn(inputsf));
 
-INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestD, ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestD,
+                        ::testing::ValuesIn(inputsd));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/subtract.cu b/cpp/test/linalg/subtract.cu
index 4295b91f3e..ced3f65fdd 100644
--- a/cpp/test/linalg/subtract.cu
+++ b/cpp/test/linalg/subtract.cu
@@ -24,34 +24,39 @@ namespace raft {
 namespace linalg {
 
 template <typename Type>
-__global__ void naiveSubtractElemKernel(Type* out, const Type* in1, const Type* in2, int len)
-{
+__global__ void naiveSubtractElemKernel(Type *out, const Type *in1,
+                                        const Type *in2, int len) {
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < len) { out[idx] = in1[idx] - in2[idx]; }
+  if (idx < len) {
+    out[idx] = in1[idx] - in2[idx];
+  }
 }
 
 template <typename Type>
-void naiveSubtractElem(Type* out, const Type* in1, const Type* in2, int len, cudaStream_t stream)
-{
+void naiveSubtractElem(Type *out, const Type *in1, const Type *in2, int len,
+                       cudaStream_t stream) {
   static const int TPB = 64;
-  int nblks            = raft::ceildiv(len, TPB);
+  int nblks = raft::ceildiv(len, TPB);
   naiveSubtractElemKernel<Type><<<nblks, TPB, 0, stream>>>(out, in1, in2, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
 template <typename Type>
-__global__ void naiveSubtractScalarKernel(Type* out, const Type* in1, const Type in2, int len)
-{
+__global__ void naiveSubtractScalarKernel(Type *out, const Type *in1,
+                                          const Type in2, int len) {
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < len) { out[idx] = in1[idx] - in2; }
+  if (idx < len) {
+    out[idx] = in1[idx] - in2;
+  }
 }
 
 template <typename Type>
-void naiveSubtractScalar(Type* out, const Type* in1, const Type in2, int len, cudaStream_t stream)
-{
+void naiveSubtractScalar(Type *out, const Type *in1, const Type in2, int len,
+                         cudaStream_t stream) {
   static const int TPB = 64;
-  int nblks            = raft::ceildiv(len, TPB);
-  naiveSubtractScalarKernel<Type><<<nblks, TPB, 0, stream>>>(out, in1, in2, len);
+  int nblks = raft::ceildiv(len, TPB);
+  naiveSubtractScalarKernel<Type>
+    <<<nblks, TPB, 0, stream>>>(out, in1, in2, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -63,16 +68,14 @@ struct SubtractInputs {
 };
 
 template <typename T>
-::std::ostream& operator<<(::std::ostream& os, const SubtractInputs<T>& dims)
-{
+::std::ostream &operator<<(::std::ostream &os, const SubtractInputs<T> &dims) {
   return os;
 }
 
 template <typename T>
 class SubtractTest : public ::testing::TestWithParam<SubtractInputs<T>> {
  protected:
-  void SetUp() override
-  {
+  void SetUp() override {
     params = ::testing::TestWithParam<SubtractInputs<T>>::GetParam();
     raft::random::Rng r(params.seed);
     int len = params.len;
@@ -95,8 +98,7 @@ class SubtractTest : public ::testing::TestWithParam<SubtractInputs<T>> {
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(in1));
     CUDA_CHECK(cudaFree(in2));
     CUDA_CHECK(cudaFree(out_ref));
@@ -108,33 +110,35 @@ class SubtractTest : public ::testing::TestWithParam<SubtractInputs<T>> {
   T *in1, *in2, *out_ref, *out;
 };
 
-const std::vector<SubtractInputs<float>> inputsf2 = {{0.000001f, 1024 * 1024, 1234ULL}};
+const std::vector<SubtractInputs<float>> inputsf2 = {
+  {0.000001f, 1024 * 1024, 1234ULL}};
 
-const std::vector<SubtractInputs<double>> inputsd2 = {{0.00000001, 1024 * 1024, 1234ULL}};
+const std::vector<SubtractInputs<double>> inputsd2 = {
+  {0.00000001, 1024 * 1024, 1234ULL}};
 
 typedef SubtractTest<float> SubtractTestF;
-TEST_P(SubtractTestF, Result)
-{
-  ASSERT_TRUE(
-    raft::devArrMatch(out_ref, out, params.len, raft::CompareApprox<float>(params.tolerance)));
+TEST_P(SubtractTestF, Result) {
+  ASSERT_TRUE(raft::devArrMatch(out_ref, out, params.len,
+                                raft::CompareApprox<float>(params.tolerance)));
 
-  ASSERT_TRUE(
-    raft::devArrMatch(out_ref, in1, params.len, raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(out_ref, in1, params.len,
+                                raft::CompareApprox<float>(params.tolerance)));
 }
 
 typedef SubtractTest<double> SubtractTestD;
-TEST_P(SubtractTestD, Result)
-{
-  ASSERT_TRUE(
-    raft::devArrMatch(out_ref, out, params.len, raft::CompareApprox<double>(params.tolerance)));
+TEST_P(SubtractTestD, Result) {
+  ASSERT_TRUE(raft::devArrMatch(out_ref, out, params.len,
+                                raft::CompareApprox<double>(params.tolerance)));
 
-  ASSERT_TRUE(
-    raft::devArrMatch(out_ref, in1, params.len, raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(out_ref, in1, params.len,
+                                raft::CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestF, ::testing::ValuesIn(inputsf2));
+INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestF,
+                         ::testing::ValuesIn(inputsf2));
 
-INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestD, ::testing::ValuesIn(inputsd2));
+INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestD,
+                         ::testing::ValuesIn(inputsd2));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/svd.cu b/cpp/test/linalg/svd.cu
index e9e1a6dc02..fff321768f 100644
--- a/cpp/test/linalg/svd.cu
+++ b/cpp/test/linalg/svd.cu
@@ -35,21 +35,19 @@ struct SvdInputs {
 };
 
 template <typename T>
-::std::ostream& operator<<(::std::ostream& os, const SvdInputs<T>& dims)
-{
+::std::ostream &operator<<(::std::ostream &os, const SvdInputs<T> &dims) {
   return os;
 }
 
 template <typename T>
 class SvdTest : public ::testing::TestWithParam<SvdInputs<T>> {
  protected:
-  void SetUp() override
-  {
+  void SetUp() override {
     raft::handle_t handle;
 
     params = ::testing::TestWithParam<SvdInputs<T>>::GetParam();
     raft::random::Rng r(params.seed);
-    int len             = params.len;
+    int len = params.len;
     cudaStream_t stream = handle.get_stream();
     raft::allocate(data, len);
 
@@ -58,7 +56,7 @@ class SvdTest : public ::testing::TestWithParam<SvdInputs<T>> {
     T data_h[] = {1.0, 4.0, 2.0, 2.0, 5.0, 1.0};
     raft::update_device(data, data_h, len, stream);
 
-    int left_evl  = params.n_row * params.n_col;
+    int left_evl = params.n_row * params.n_col;
     int right_evl = params.n_col * params.n_col;
 
     raft::allocate(left_eig_vectors_qr, left_evl);
@@ -69,7 +67,8 @@ class SvdTest : public ::testing::TestWithParam<SvdInputs<T>> {
     // allocate(right_eig_vectors_trans_jacobi, right_evl);
     // allocate(sing_vals_jacobi, params.n_col);
 
-    T left_eig_vectors_ref_h[] = {-0.308219, -0.906133, -0.289695, 0.488195, 0.110706, -0.865685};
+    T left_eig_vectors_ref_h[] = {-0.308219, -0.906133, -0.289695,
+                                  0.488195,  0.110706,  -0.865685};
 
     T right_eig_vectors_ref_h[] = {-0.638636, -0.769509, -0.769509, 0.638636};
 
@@ -79,25 +78,18 @@ class SvdTest : public ::testing::TestWithParam<SvdInputs<T>> {
     raft::allocate(right_eig_vectors_ref, right_evl);
     raft::allocate(sing_vals_ref, params.n_col);
 
-    raft::update_device(left_eig_vectors_ref, left_eig_vectors_ref_h, left_evl, stream);
-    raft::update_device(right_eig_vectors_ref, right_eig_vectors_ref_h, right_evl, stream);
+    raft::update_device(left_eig_vectors_ref, left_eig_vectors_ref_h, left_evl,
+                        stream);
+    raft::update_device(right_eig_vectors_ref, right_eig_vectors_ref_h,
+                        right_evl, stream);
     raft::update_device(sing_vals_ref, sing_vals_ref_h, params.n_col, stream);
 
-    svdQR(handle,
-          data,
-          params.n_row,
-          params.n_col,
-          sing_vals_qr,
-          left_eig_vectors_qr,
-          right_eig_vectors_trans_qr,
-          true,
-          true,
-          true,
+    svdQR(handle, data, params.n_row, params.n_col, sing_vals_qr,
+          left_eig_vectors_qr, right_eig_vectors_trans_qr, true, true, true,
           stream);
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(data));
     CUDA_CHECK(cudaFree(left_eig_vectors_qr));
     CUDA_CHECK(cudaFree(right_eig_vectors_trans_qr));
@@ -109,71 +101,69 @@ class SvdTest : public ::testing::TestWithParam<SvdInputs<T>> {
 
  protected:
   SvdInputs<T> params;
-  T *data, *left_eig_vectors_qr, *right_eig_vectors_trans_qr, *sing_vals_qr, *left_eig_vectors_ref,
-    *right_eig_vectors_ref, *sing_vals_ref;
+  T *data, *left_eig_vectors_qr, *right_eig_vectors_trans_qr, *sing_vals_qr,
+    *left_eig_vectors_ref, *right_eig_vectors_ref, *sing_vals_ref;
 };
 
-const std::vector<SvdInputs<float>> inputsf2 = {{0.00001f, 3 * 2, 3, 2, 1234ULL}};
+const std::vector<SvdInputs<float>> inputsf2 = {
+  {0.00001f, 3 * 2, 3, 2, 1234ULL}};
 
-const std::vector<SvdInputs<double>> inputsd2 = {{0.00001, 3 * 2, 3, 2, 1234ULL}};
+const std::vector<SvdInputs<double>> inputsd2 = {
+  {0.00001, 3 * 2, 3, 2, 1234ULL}};
 
 typedef SvdTest<float> SvdTestValF;
-TEST_P(SvdTestValF, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(
-    sing_vals_ref, sing_vals_qr, params.n_col, raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(SvdTestValF, Result) {
+  ASSERT_TRUE(
+    raft::devArrMatch(sing_vals_ref, sing_vals_qr, params.n_col,
+                      raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef SvdTest<double> SvdTestValD;
-TEST_P(SvdTestValD, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(
-    sing_vals_ref, sing_vals_qr, params.n_col, raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(SvdTestValD, Result) {
+  ASSERT_TRUE(
+    raft::devArrMatch(sing_vals_ref, sing_vals_qr, params.n_col,
+                      raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
 typedef SvdTest<float> SvdTestLeftVecF;
-TEST_P(SvdTestLeftVecF, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(left_eig_vectors_ref,
-                                left_eig_vectors_qr,
-                                params.n_row * params.n_col,
-                                raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(SvdTestLeftVecF, Result) {
+  ASSERT_TRUE(raft::devArrMatch(
+    left_eig_vectors_ref, left_eig_vectors_qr, params.n_row * params.n_col,
+    raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef SvdTest<double> SvdTestLeftVecD;
-TEST_P(SvdTestLeftVecD, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(left_eig_vectors_ref,
-                                left_eig_vectors_qr,
-                                params.n_row * params.n_col,
-                                raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(SvdTestLeftVecD, Result) {
+  ASSERT_TRUE(raft::devArrMatch(
+    left_eig_vectors_ref, left_eig_vectors_qr, params.n_row * params.n_col,
+    raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
 typedef SvdTest<float> SvdTestRightVecF;
-TEST_P(SvdTestRightVecF, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(right_eig_vectors_ref,
-                                right_eig_vectors_trans_qr,
-                                params.n_col * params.n_col,
-                                raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(SvdTestRightVecF, Result) {
+  ASSERT_TRUE(
+    raft::devArrMatch(right_eig_vectors_ref, right_eig_vectors_trans_qr,
+                      params.n_col * params.n_col,
+                      raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef SvdTest<double> SvdTestRightVecD;
-TEST_P(SvdTestRightVecD, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(right_eig_vectors_ref,
-                                right_eig_vectors_trans_qr,
-                                params.n_col * params.n_col,
-                                raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(SvdTestRightVecD, Result) {
+  ASSERT_TRUE(
+    raft::devArrMatch(right_eig_vectors_ref, right_eig_vectors_trans_qr,
+                      params.n_col * params.n_col,
+                      raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
 INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestValF, ::testing::ValuesIn(inputsf2));
 
 INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestValD, ::testing::ValuesIn(inputsd2));
 
-INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecF, ::testing::ValuesIn(inputsf2));
+INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecF,
+                         ::testing::ValuesIn(inputsf2));
 
-INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecD, ::testing::ValuesIn(inputsd2));
+INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecD,
+                         ::testing::ValuesIn(inputsd2));
 
 // INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestRightVecF,
 // ::testing::ValuesIn(inputsf2));
diff --git a/cpp/test/linalg/transpose.cu b/cpp/test/linalg/transpose.cu
index 659bed04c6..f10b029962 100644
--- a/cpp/test/linalg/transpose.cu
+++ b/cpp/test/linalg/transpose.cu
@@ -34,16 +34,14 @@ struct TranposeInputs {
 };
 
 template <typename T>
-::std::ostream& operator<<(::std::ostream& os, const TranposeInputs<T>& dims)
-{
+::std::ostream &operator<<(::std::ostream &os, const TranposeInputs<T> &dims) {
   return os;
 }
 
 template <typename T>
 class TransposeTest : public ::testing::TestWithParam<TranposeInputs<T>> {
  protected:
-  void SetUp() override
-  {
+  void SetUp() override {
     params = ::testing::TestWithParam<TranposeInputs<T>>::GetParam();
 
     stream = handle.get_stream();
@@ -65,8 +63,7 @@ class TransposeTest : public ::testing::TestWithParam<TranposeInputs<T>> {
     transpose(data, params.n_row, stream);
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(data));
     CUDA_CHECK(cudaFree(data_trans));
     CUDA_CHECK(cudaFree(data_trans_ref));
@@ -79,33 +76,39 @@ class TransposeTest : public ::testing::TestWithParam<TranposeInputs<T>> {
   cudaStream_t stream;
 };
 
-const std::vector<TranposeInputs<float>> inputsf2 = {{0.1f, 3 * 3, 3, 3, 1234ULL}};
+const std::vector<TranposeInputs<float>> inputsf2 = {
+  {0.1f, 3 * 3, 3, 3, 1234ULL}};
 
-const std::vector<TranposeInputs<double>> inputsd2 = {{0.1, 3 * 3, 3, 3, 1234ULL}};
+const std::vector<TranposeInputs<double>> inputsd2 = {
+  {0.1, 3 * 3, 3, 3, 1234ULL}};
 
 typedef TransposeTest<float> TransposeTestValF;
-TEST_P(TransposeTestValF, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(
-    data_trans_ref, data_trans, params.len, raft::CompareApproxAbs<float>(params.tolerance)));
-
-  ASSERT_TRUE(raft::devArrMatch(
-    data_trans_ref, data, params.len, raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(TransposeTestValF, Result) {
+  ASSERT_TRUE(
+    raft::devArrMatch(data_trans_ref, data_trans, params.len,
+                      raft::CompareApproxAbs<float>(params.tolerance)));
+
+  ASSERT_TRUE(
+    raft::devArrMatch(data_trans_ref, data, params.len,
+                      raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef TransposeTest<double> TransposeTestValD;
-TEST_P(TransposeTestValD, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(
-    data_trans_ref, data_trans, params.len, raft::CompareApproxAbs<double>(params.tolerance)));
-
-  ASSERT_TRUE(raft::devArrMatch(
-    data_trans_ref, data, params.len, raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(TransposeTestValD, Result) {
+  ASSERT_TRUE(
+    raft::devArrMatch(data_trans_ref, data_trans, params.len,
+                      raft::CompareApproxAbs<double>(params.tolerance)));
+
+  ASSERT_TRUE(
+    raft::devArrMatch(data_trans_ref, data, params.len,
+                      raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValF, ::testing::ValuesIn(inputsf2));
+INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValF,
+                         ::testing::ValuesIn(inputsf2));
 
-INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValD, ::testing::ValuesIn(inputsd2));
+INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValD,
+                         ::testing::ValuesIn(inputsd2));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/unary_op.cu b/cpp/test/linalg/unary_op.cu
index 6349a1907a..666ab8619d 100644
--- a/cpp/test/linalg/unary_op.cu
+++ b/cpp/test/linalg/unary_op.cu
@@ -28,25 +28,28 @@ namespace linalg {
 // for an extended __device__ lambda cannot have private or protected access
 // within its class
 template <typename InType, typename IdxType = int, typename OutType = InType>
-void unaryOpLaunch(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream)
-{
+void unaryOpLaunch(OutType *out, const InType *in, InType scalar, IdxType len,
+                   cudaStream_t stream) {
   if (in == nullptr) {
     auto op = [scalar] __device__(OutType * ptr, IdxType idx) {
       *ptr = static_cast<OutType>(scalar * idx);
     };
     writeOnlyUnaryOp<OutType, decltype(op), IdxType>(out, len, op, stream);
   } else {
-    auto op = [scalar] __device__(InType in) { return static_cast<OutType>(in * scalar); };
+    auto op = [scalar] __device__(InType in) {
+      return static_cast<OutType>(in * scalar);
+    };
     unaryOp<InType, decltype(op), IdxType, OutType>(out, in, len, op, stream);
   }
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
-class UnaryOpTest : public ::testing::TestWithParam<UnaryOpInputs<InType, IdxType, OutType>> {
+class UnaryOpTest
+  : public ::testing::TestWithParam<UnaryOpInputs<InType, IdxType, OutType>> {
  protected:
-  void SetUp() override
-  {
-    params = ::testing::TestWithParam<UnaryOpInputs<InType, IdxType, OutType>>::GetParam();
+  void SetUp() override {
+    params = ::testing::TestWithParam<
+      UnaryOpInputs<InType, IdxType, OutType>>::GetParam();
     raft::random::Rng r(params.seed);
     CUDA_CHECK(cudaStreamCreate(&stream));
     auto len = params.len;
@@ -56,8 +59,7 @@ class UnaryOpTest : public ::testing::TestWithParam<UnaryOpInputs<InType, IdxTyp
     r.uniform(in, len, InType(-1.0), InType(1.0), stream);
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaStreamDestroy(stream));
     CUDA_CHECK(cudaFree(in));
@@ -65,18 +67,18 @@ class UnaryOpTest : public ::testing::TestWithParam<UnaryOpInputs<InType, IdxTyp
     CUDA_CHECK(cudaFree(out));
   }
 
-  virtual void DoTest()
-  {
-    auto len    = params.len;
+  virtual void DoTest() {
+    auto len = params.len;
     auto scalar = params.scalar;
     naiveScale(out_ref, in, scalar, len, stream);
     unaryOpLaunch(out, in, scalar, len, stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
-    ASSERT_TRUE(devArrMatch(out_ref, out, params.len, CompareApprox<OutType>(params.tolerance)));
+    ASSERT_TRUE(devArrMatch(out_ref, out, params.len,
+                            CompareApprox<OutType>(params.tolerance)));
   }
 
   UnaryOpInputs<InType, IdxType, OutType> params;
-  InType* in;
+  InType *in;
   OutType *out_ref, *out;
   cudaStream_t stream;
 };
@@ -84,15 +86,14 @@ class UnaryOpTest : public ::testing::TestWithParam<UnaryOpInputs<InType, IdxTyp
 template <typename OutType, typename IdxType>
 class WriteOnlyUnaryOpTest : public UnaryOpTest<OutType, IdxType, OutType> {
  protected:
-  void DoTest() override
-  {
-    auto len    = this->params.len;
+  void DoTest() override {
+    auto len = this->params.len;
     auto scalar = this->params.scalar;
-    naiveScale(this->out_ref, (OutType*)nullptr, scalar, len, this->stream);
-    unaryOpLaunch(this->out, (OutType*)nullptr, scalar, len, this->stream);
+    naiveScale(this->out_ref, (OutType *)nullptr, scalar, len, this->stream);
+    unaryOpLaunch(this->out, (OutType *)nullptr, scalar, len, this->stream);
     CUDA_CHECK(cudaStreamSynchronize(this->stream));
-    ASSERT_TRUE(devArrMatch(
-      this->out_ref, this->out, this->params.len, CompareApprox<OutType>(this->params.tolerance)));
+    ASSERT_TRUE(devArrMatch(this->out_ref, this->out, this->params.len,
+                            CompareApprox<OutType>(this->params.tolerance)));
   }
 };
 
@@ -100,7 +101,8 @@ class WriteOnlyUnaryOpTest : public UnaryOpTest<OutType, IdxType, OutType> {
   TEST_P(Name, Result) { DoTest(); } \
   INSTANTIATE_TEST_SUITE_P(UnaryOpTests, Name, ::testing::ValuesIn(inputs))
 
-const std::vector<UnaryOpInputs<float, int>> inputsf_i32 = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}};
+const std::vector<UnaryOpInputs<float, int>> inputsf_i32 = {
+  {0.000001f, 1024 * 1024, 2.f, 1234ULL}};
 typedef UnaryOpTest<float, int> UnaryOpTestF_i32;
 UNARY_OP_TEST(UnaryOpTestF_i32, inputsf_i32);
 typedef WriteOnlyUnaryOpTest<float, int> WriteOnlyUnaryOpTestF_i32;
diff --git a/cpp/test/linalg/unary_op.cuh b/cpp/test/linalg/unary_op.cuh
index 3343389af8..be3f1124c5 100644
--- a/cpp/test/linalg/unary_op.cuh
+++ b/cpp/test/linalg/unary_op.cuh
@@ -24,8 +24,8 @@ namespace raft {
 namespace linalg {
 
 template <typename InType, typename OutType, typename IdxType>
-__global__ void naiveScaleKernel(OutType* out, const InType* in, InType scalar, IdxType len)
-{
+__global__ void naiveScaleKernel(OutType *out, const InType *in, InType scalar,
+                                 IdxType len) {
   IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * (IdxType)blockDim.x);
   if (idx < len) {
     if (in == nullptr) {
@@ -38,11 +38,12 @@ __global__ void naiveScaleKernel(OutType* out, const InType* in, InType scalar,
 }
 
 template <typename InType, typename IdxType = int, typename OutType = InType>
-void naiveScale(OutType* out, const InType* in, InType scalar, int len, cudaStream_t stream)
-{
+void naiveScale(OutType *out, const InType *in, InType scalar, int len,
+                cudaStream_t stream) {
   static const int TPB = 64;
-  int nblks            = raft::ceildiv(len, TPB);
-  naiveScaleKernel<InType, OutType, IdxType><<<nblks, TPB, 0, stream>>>(out, in, scalar, len);
+  int nblks = raft::ceildiv(len, TPB);
+  naiveScaleKernel<InType, OutType, IdxType>
+    <<<nblks, TPB, 0, stream>>>(out, in, scalar, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -55,8 +56,8 @@ struct UnaryOpInputs {
 };
 
 template <typename InType, typename IdxType = int, typename OutType = InType>
-::std::ostream& operator<<(::std::ostream& os, const UnaryOpInputs<InType, IdxType, OutType>& d)
-{
+::std::ostream &operator<<(::std::ostream &os,
+                           const UnaryOpInputs<InType, IdxType, OutType> &d) {
   return os;
 }
 
diff --git a/cpp/test/matrix/math.cu b/cpp/test/matrix/math.cu
index 9cdd36b252..578139623a 100644
--- a/cpp/test/matrix/math.cu
+++ b/cpp/test/matrix/math.cu
@@ -24,51 +24,53 @@ namespace raft {
 namespace matrix {
 
 template <typename Type>
-__global__ void nativePowerKernel(Type* in, Type* out, int len)
-{
+__global__ void nativePowerKernel(Type *in, Type *out, int len) {
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < len) { out[idx] = in[idx] * in[idx]; }
+  if (idx < len) {
+    out[idx] = in[idx] * in[idx];
+  }
 }
 
 template <typename Type>
-void naivePower(Type* in, Type* out, int len, cudaStream_t stream)
-{
+void naivePower(Type *in, Type *out, int len, cudaStream_t stream) {
   static const int TPB = 64;
-  int nblks            = raft::ceildiv(len, TPB);
+  int nblks = raft::ceildiv(len, TPB);
   nativePowerKernel<Type><<<nblks, TPB, 0, stream>>>(in, out, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
 template <typename Type>
-__global__ void nativeSqrtKernel(Type* in, Type* out, int len)
-{
+__global__ void nativeSqrtKernel(Type *in, Type *out, int len) {
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < len) { out[idx] = sqrt(in[idx]); }
+  if (idx < len) {
+    out[idx] = sqrt(in[idx]);
+  }
 }
 
 template <typename Type>
-void naiveSqrt(Type* in, Type* out, int len)
-{
+void naiveSqrt(Type *in, Type *out, int len) {
   static const int TPB = 64;
-  int nblks            = raft::ceildiv(len, TPB);
+  int nblks = raft::ceildiv(len, TPB);
   nativeSqrtKernel<Type><<<nblks, TPB>>>(in, out, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
 template <typename Type>
-__global__ void naiveSignFlipKernel(Type* in, Type* out, int rowCount, int colCount)
-{
+__global__ void naiveSignFlipKernel(Type *in, Type *out, int rowCount,
+                                    int colCount) {
   int d_i = blockIdx.x * rowCount;
   int end = d_i + rowCount;
 
   if (blockIdx.x < colCount) {
-    Type max      = 0.0;
+    Type max = 0.0;
     int max_index = 0;
     for (int i = d_i; i < end; i++) {
       Type val = in[i];
-      if (val < 0.0) { val = -val; }
+      if (val < 0.0) {
+        val = -val;
+      }
       if (val > max) {
-        max       = val;
+        max = val;
         max_index = i;
       }
     }
@@ -86,8 +88,7 @@ __global__ void naiveSignFlipKernel(Type* in, Type* out, int rowCount, int colCo
 }
 
 template <typename Type>
-void naiveSignFlip(Type* in, Type* out, int rowCount, int colCount)
-{
+void naiveSignFlip(Type *in, Type *out, int rowCount, int colCount) {
   naiveSignFlipKernel<Type><<<colCount, 1>>>(in, out, rowCount, colCount);
   CUDA_CHECK(cudaPeekAtLastError());
 }
@@ -102,16 +103,14 @@ struct MathInputs {
 };
 
 template <typename T>
-::std::ostream& operator<<(::std::ostream& os, const MathInputs<T>& dims)
-{
+::std::ostream &operator<<(::std::ostream &os, const MathInputs<T> &dims) {
   return os;
 }
 
 template <typename T>
 class MathTest : public ::testing::TestWithParam<MathInputs<T>> {
  protected:
-  void SetUp() override
-  {
+  void SetUp() override {
     params = ::testing::TestWithParam<MathInputs<T>>::GetParam();
     random::Rng r(params.seed);
     int len = params.len;
@@ -155,7 +154,7 @@ class MathTest : public ::testing::TestWithParam<MathInputs<T>> {
     allocate(in_recip_ref, 4);
     allocate(out_recip, 4);
     // default threshold is 1e-15
-    std::vector<T> in_recip_h     = {0.1, 0.01, -0.01, 0.1e-16};
+    std::vector<T> in_recip_h = {0.1, 0.01, -0.01, 0.1e-16};
     std::vector<T> in_recip_ref_h = {10.0, 100.0, -100.0, 0.0};
     update_device(in_recip, in_recip_h.data(), 4, stream);
     update_device(in_recip_ref, in_recip_ref_h.data(), 4, stream);
@@ -166,7 +165,7 @@ class MathTest : public ::testing::TestWithParam<MathInputs<T>> {
 
     reciprocal(in_recip, recip_scalar, 4, stream, true);
 
-    std::vector<T> in_small_val_zero_h     = {0.1, 1e-16, -1e-16, -0.1};
+    std::vector<T> in_small_val_zero_h = {0.1, 1e-16, -1e-16, -0.1};
     std::vector<T> in_small_val_zero_ref_h = {0.1, 0.0, 0.0, -0.1};
     allocate(in_smallzero, 4);
     allocate(out_smallzero, 4);
@@ -178,8 +177,7 @@ class MathTest : public ::testing::TestWithParam<MathInputs<T>> {
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(in_power));
     CUDA_CHECK(cudaFree(out_power_ref));
     CUDA_CHECK(cudaFree(in_sqrt));
@@ -198,129 +196,137 @@ class MathTest : public ::testing::TestWithParam<MathInputs<T>> {
 
  protected:
   MathInputs<T> params;
-  T *in_power, *out_power_ref, *in_sqrt, *out_sqrt_ref, *in_ratio, *out_ratio_ref, *in_sign_flip,
-    *out_sign_flip_ref, *in_recip, *in_recip_ref, *out_recip, *in_smallzero, *out_smallzero,
-    *out_smallzero_ref;
+  T *in_power, *out_power_ref, *in_sqrt, *out_sqrt_ref, *in_ratio,
+    *out_ratio_ref, *in_sign_flip, *out_sign_flip_ref, *in_recip, *in_recip_ref,
+    *out_recip, *in_smallzero, *out_smallzero, *out_smallzero_ref;
 };
 
-const std::vector<MathInputs<float>> inputsf = {{0.00001f, 1024, 1024, 1024 * 1024, 1234ULL}};
+const std::vector<MathInputs<float>> inputsf = {
+  {0.00001f, 1024, 1024, 1024 * 1024, 1234ULL}};
 
-const std::vector<MathInputs<double>> inputsd = {{0.00001, 1024, 1024, 1024 * 1024, 1234ULL}};
+const std::vector<MathInputs<double>> inputsd = {
+  {0.00001, 1024, 1024, 1024 * 1024, 1234ULL}};
 
 typedef MathTest<float> MathPowerTestF;
-TEST_P(MathPowerTestF, Result)
-{
-  ASSERT_TRUE(
-    devArrMatch(in_power, out_power_ref, params.len, CompareApprox<float>(params.tolerance)));
+TEST_P(MathPowerTestF, Result) {
+  ASSERT_TRUE(devArrMatch(in_power, out_power_ref, params.len,
+                          CompareApprox<float>(params.tolerance)));
 }
 
 typedef MathTest<double> MathPowerTestD;
-TEST_P(MathPowerTestD, Result)
-{
-  ASSERT_TRUE(
-    devArrMatch(in_power, out_power_ref, params.len, CompareApprox<double>(params.tolerance)));
+TEST_P(MathPowerTestD, Result) {
+  ASSERT_TRUE(devArrMatch(in_power, out_power_ref, params.len,
+                          CompareApprox<double>(params.tolerance)));
 }
 
 typedef MathTest<float> MathSqrtTestF;
-TEST_P(MathSqrtTestF, Result)
-{
-  ASSERT_TRUE(
-    devArrMatch(in_sqrt, out_sqrt_ref, params.len, CompareApprox<float>(params.tolerance)));
+TEST_P(MathSqrtTestF, Result) {
+  ASSERT_TRUE(devArrMatch(in_sqrt, out_sqrt_ref, params.len,
+                          CompareApprox<float>(params.tolerance)));
 }
 
 typedef MathTest<double> MathSqrtTestD;
-TEST_P(MathSqrtTestD, Result)
-{
-  ASSERT_TRUE(
-    devArrMatch(in_sqrt, out_sqrt_ref, params.len, CompareApprox<double>(params.tolerance)));
+TEST_P(MathSqrtTestD, Result) {
+  ASSERT_TRUE(devArrMatch(in_sqrt, out_sqrt_ref, params.len,
+                          CompareApprox<double>(params.tolerance)));
 }
 
 typedef MathTest<float> MathRatioTestF;
-TEST_P(MathRatioTestF, Result)
-{
-  ASSERT_TRUE(devArrMatch(in_ratio, out_ratio_ref, 4, CompareApprox<float>(params.tolerance)));
+TEST_P(MathRatioTestF, Result) {
+  ASSERT_TRUE(devArrMatch(in_ratio, out_ratio_ref, 4,
+                          CompareApprox<float>(params.tolerance)));
 }
 
 typedef MathTest<double> MathRatioTestD;
-TEST_P(MathRatioTestD, Result)
-{
-  ASSERT_TRUE(devArrMatch(in_ratio, out_ratio_ref, 4, CompareApprox<double>(params.tolerance)));
+TEST_P(MathRatioTestD, Result) {
+  ASSERT_TRUE(devArrMatch(in_ratio, out_ratio_ref, 4,
+                          CompareApprox<double>(params.tolerance)));
 }
 
 typedef MathTest<float> MathSignFlipTestF;
-TEST_P(MathSignFlipTestF, Result)
-{
-  ASSERT_TRUE(devArrMatch(
-    in_sign_flip, out_sign_flip_ref, params.len, CompareApprox<float>(params.tolerance)));
+TEST_P(MathSignFlipTestF, Result) {
+  ASSERT_TRUE(devArrMatch(in_sign_flip, out_sign_flip_ref, params.len,
+                          CompareApprox<float>(params.tolerance)));
 }
 
 typedef MathTest<double> MathSignFlipTestD;
-TEST_P(MathSignFlipTestD, Result)
-{
-  ASSERT_TRUE(devArrMatch(
-    in_sign_flip, out_sign_flip_ref, params.len, CompareApprox<double>(params.tolerance)));
+TEST_P(MathSignFlipTestD, Result) {
+  ASSERT_TRUE(devArrMatch(in_sign_flip, out_sign_flip_ref, params.len,
+                          CompareApprox<double>(params.tolerance)));
 }
 
 typedef MathTest<float> MathReciprocalTestF;
-TEST_P(MathReciprocalTestF, Result)
-{
-  ASSERT_TRUE(devArrMatch(in_recip, in_recip_ref, 4, CompareApprox<float>(params.tolerance)));
+TEST_P(MathReciprocalTestF, Result) {
+  ASSERT_TRUE(devArrMatch(in_recip, in_recip_ref, 4,
+                          CompareApprox<float>(params.tolerance)));
 
   // 4-th term tests `setzero=true` functionality, not present in this version of `reciprocal`.
-  ASSERT_TRUE(devArrMatch(out_recip, in_recip_ref, 3, CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(out_recip, in_recip_ref, 3,
+                          CompareApprox<float>(params.tolerance)));
 }
 
 typedef MathTest<double> MathReciprocalTestD;
-TEST_P(MathReciprocalTestD, Result)
-{
-  ASSERT_TRUE(devArrMatch(in_recip, in_recip_ref, 4, CompareApprox<double>(params.tolerance)));
+TEST_P(MathReciprocalTestD, Result) {
+  ASSERT_TRUE(devArrMatch(in_recip, in_recip_ref, 4,
+                          CompareApprox<double>(params.tolerance)));
 
   // 4-th term tests `setzero=true` functionality, not present in this version of `reciprocal`.
-  ASSERT_TRUE(devArrMatch(out_recip, in_recip_ref, 3, CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(out_recip, in_recip_ref, 3,
+                          CompareApprox<double>(params.tolerance)));
 }
 
 typedef MathTest<float> MathSetSmallZeroTestF;
-TEST_P(MathSetSmallZeroTestF, Result)
-{
-  ASSERT_TRUE(
-    devArrMatch(in_smallzero, out_smallzero_ref, 4, CompareApprox<float>(params.tolerance)));
+TEST_P(MathSetSmallZeroTestF, Result) {
+  ASSERT_TRUE(devArrMatch(in_smallzero, out_smallzero_ref, 4,
+                          CompareApprox<float>(params.tolerance)));
 
-  ASSERT_TRUE(
-    devArrMatch(out_smallzero, out_smallzero_ref, 4, CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(out_smallzero, out_smallzero_ref, 4,
+                          CompareApprox<float>(params.tolerance)));
 }
 
 typedef MathTest<double> MathSetSmallZeroTestD;
-TEST_P(MathSetSmallZeroTestD, Result)
-{
-  ASSERT_TRUE(
-    devArrMatch(in_smallzero, out_smallzero_ref, 4, CompareApprox<double>(params.tolerance)));
+TEST_P(MathSetSmallZeroTestD, Result) {
+  ASSERT_TRUE(devArrMatch(in_smallzero, out_smallzero_ref, 4,
+                          CompareApprox<double>(params.tolerance)));
 
-  ASSERT_TRUE(
-    devArrMatch(out_smallzero, out_smallzero_ref, 4, CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(out_smallzero, out_smallzero_ref, 4,
+                          CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestF, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestF,
+                         ::testing::ValuesIn(inputsf));
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestD, ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestD,
+                         ::testing::ValuesIn(inputsd));
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestF, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestF,
+                         ::testing::ValuesIn(inputsf));
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestD, ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestD,
+                         ::testing::ValuesIn(inputsd));
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestF, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestF,
+                         ::testing::ValuesIn(inputsf));
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestD, ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestD,
+                         ::testing::ValuesIn(inputsd));
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestF, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestF,
+                         ::testing::ValuesIn(inputsf));
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestD, ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestD,
+                         ::testing::ValuesIn(inputsd));
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestF, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestF,
+                         ::testing::ValuesIn(inputsf));
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestD, ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestD,
+                         ::testing::ValuesIn(inputsd));
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestF, ::testing::ValuesIn(inputsf));
-INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestD, ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestF,
+                         ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestD,
+                         ::testing::ValuesIn(inputsd));
 
 }  // namespace matrix
 }  // namespace raft
diff --git a/cpp/test/matrix/matrix.cu b/cpp/test/matrix/matrix.cu
index fc5a418bda..28222c0697 100644
--- a/cpp/test/matrix/matrix.cu
+++ b/cpp/test/matrix/matrix.cu
@@ -32,16 +32,14 @@ struct MatrixInputs {
 };
 
 template <typename T>
-::std::ostream& operator<<(::std::ostream& os, const MatrixInputs<T>& dims)
-{
+::std::ostream &operator<<(::std::ostream &os, const MatrixInputs<T> &dims) {
   return os;
 }
 
 template <typename T>
 class MatrixTest : public ::testing::TestWithParam<MatrixInputs<T>> {
  protected:
-  void SetUp() override
-  {
+  void SetUp() override {
     params = ::testing::TestWithParam<MatrixInputs<T>>::GetParam();
     raft::random::Rng r(params.seed);
     int len = params.n_row * params.n_col;
@@ -56,14 +54,13 @@ class MatrixTest : public ::testing::TestWithParam<MatrixInputs<T>> {
     // copy(in1, in1_revr, params.n_row, params.n_col);
     // colReverse(in1_revr, params.n_row, params.n_col);
 
-    T* outTrunc;
+    T *outTrunc;
     raft::allocate(outTrunc, 6);
     truncZeroOrigin(in1, params.n_row, outTrunc, 3, 2, stream);
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(in1));
     CUDA_CHECK(cudaFree(in2));
     // CUDA_CHECK(cudaFree(in1_revr));
@@ -76,30 +73,31 @@ class MatrixTest : public ::testing::TestWithParam<MatrixInputs<T>> {
 
 const std::vector<MatrixInputs<float>> inputsf2 = {{0.000001f, 4, 4, 1234ULL}};
 
-const std::vector<MatrixInputs<double>> inputsd2 = {{0.00000001, 4, 4, 1234ULL}};
+const std::vector<MatrixInputs<double>> inputsd2 = {
+  {0.00000001, 4, 4, 1234ULL}};
 
 typedef MatrixTest<float> MatrixTestF;
-TEST_P(MatrixTestF, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(
-    in1, in2, params.n_row * params.n_col, raft::CompareApprox<float>(params.tolerance)));
+TEST_P(MatrixTestF, Result) {
+  ASSERT_TRUE(raft::devArrMatch(in1, in2, params.n_row * params.n_col,
+                                raft::CompareApprox<float>(params.tolerance)));
 }
 
 typedef MatrixTest<double> MatrixTestD;
-TEST_P(MatrixTestD, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(
-    in1, in2, params.n_row * params.n_col, raft::CompareApprox<double>(params.tolerance)));
+TEST_P(MatrixTestD, Result) {
+  ASSERT_TRUE(raft::devArrMatch(in1, in2, params.n_row * params.n_col,
+                                raft::CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestF, ::testing::ValuesIn(inputsf2));
+INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestF,
+                         ::testing::ValuesIn(inputsf2));
 
-INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestD, ::testing::ValuesIn(inputsd2));
+INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestD,
+                         ::testing::ValuesIn(inputsd2));
 
 template <typename T>
 class MatrixCopyRowsTest : public ::testing::Test {
-  using math_t      = typename std::tuple_element<0, T>::type;
-  using idx_t       = typename std::tuple_element<1, T>::type;
+  using math_t = typename std::tuple_element<0, T>::type;
+  using idx_t = typename std::tuple_element<1, T>::type;
   using idx_array_t = typename std::tuple_element<2, T>::type;
 
  protected:
@@ -107,38 +105,42 @@ class MatrixCopyRowsTest : public ::testing::Test {
     : allocator(handle.get_device_allocator()),
       input(allocator, handle.get_stream(), n_cols * n_rows),
       indices(allocator, handle.get_stream(), n_selected),
-      output(allocator, handle.get_stream(), n_cols * n_selected)
-  {
+      output(allocator, handle.get_stream(), n_cols * n_selected) {
     CUDA_CHECK(cudaStreamCreate(&stream));
     handle.set_stream(stream);
     raft::update_device(indices.data(), indices_host, n_selected, stream);
     // Init input array
     thrust::counting_iterator<idx_t> first(0);
     thrust::device_ptr<math_t> ptr(input.data());
-    thrust::copy(thrust::cuda::par.on(stream), first, first + n_cols * n_rows, ptr);
+    thrust::copy(thrust::cuda::par.on(stream), first, first + n_cols * n_rows,
+                 ptr);
   }
 
   void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); }
 
-  void testCopyRows()
-  {
-    copyRows(
-      input.data(), n_rows, n_cols, output.data(), indices.data(), n_selected, stream, false);
-    EXPECT_TRUE(raft::devArrMatchHost(
-      output_exp_colmajor, output.data(), n_selected * n_cols, raft::Compare<math_t>()));
-    copyRows(input.data(), n_rows, n_cols, output.data(), indices.data(), n_selected, stream, true);
-    EXPECT_TRUE(raft::devArrMatchHost(
-      output_exp_rowmajor, output.data(), n_selected * n_cols, raft::Compare<math_t>()));
+  void testCopyRows() {
+    copyRows(input.data(), n_rows, n_cols, output.data(), indices.data(),
+             n_selected, stream, false);
+    EXPECT_TRUE(raft::devArrMatchHost(output_exp_colmajor, output.data(),
+                                      n_selected * n_cols,
+                                      raft::Compare<math_t>()));
+    copyRows(input.data(), n_rows, n_cols, output.data(), indices.data(),
+             n_selected, stream, true);
+    EXPECT_TRUE(raft::devArrMatchHost(output_exp_rowmajor, output.data(),
+                                      n_selected * n_cols,
+                                      raft::Compare<math_t>()));
   }
 
  protected:
-  int n_rows     = 10;
-  int n_cols     = 3;
+  int n_rows = 10;
+  int n_cols = 3;
   int n_selected = 5;
 
-  idx_array_t indices_host[5]    = {0, 3, 4, 7, 9};
-  math_t output_exp_colmajor[15] = {0, 3, 4, 7, 9, 10, 13, 14, 17, 19, 20, 23, 24, 27, 29};
-  math_t output_exp_rowmajor[15] = {0, 1, 2, 9, 10, 11, 12, 13, 14, 21, 22, 23, 27, 28, 29};
+  idx_array_t indices_host[5] = {0, 3, 4, 7, 9};
+  math_t output_exp_colmajor[15] = {0,  3,  4,  7,  9,  10, 13, 14,
+                                    17, 19, 20, 23, 24, 27, 29};
+  math_t output_exp_rowmajor[15] = {0,  1,  2,  9,  10, 11, 12, 13,
+                                    14, 21, 22, 23, 27, 28, 29};
   raft::handle_t handle;
   cudaStream_t stream;
   std::shared_ptr<raft::mr::device::allocator> allocator;
@@ -147,10 +149,10 @@ class MatrixCopyRowsTest : public ::testing::Test {
   raft::mr::device::buffer<idx_array_t> indices;
 };
 
-using TypeTuple = ::testing::Types<std::tuple<float, int, int>,
-                                   std::tuple<float, int64_t, int>,
-                                   std::tuple<double, int, int>,
-                                   std::tuple<double, int64_t, int>>;
+using TypeTuple =
+  ::testing::Types<std::tuple<float, int, int>, std::tuple<float, int64_t, int>,
+                   std::tuple<double, int, int>,
+                   std::tuple<double, int64_t, int>>;
 
 TYPED_TEST_CASE(MatrixCopyRowsTest, TypeTuple);
 TYPED_TEST(MatrixCopyRowsTest, CopyRows) { this->testCopyRows(); }
diff --git a/cpp/test/mr/device/buffer.cpp b/cpp/test/mr/device/buffer.cpp
index 9ba2c3332b..223efdbfe8 100644
--- a/cpp/test/mr/device/buffer.cpp
+++ b/cpp/test/mr/device/buffer.cpp
@@ -25,8 +25,7 @@ namespace raft {
 namespace mr {
 namespace device {
 
-TEST(Raft, DeviceBufferAlloc)
-{
+TEST(Raft, DeviceBufferAlloc) {
   auto alloc = std::make_shared<default_allocator>();
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
@@ -53,14 +52,13 @@ TEST(Raft, DeviceBufferAlloc)
   CUDA_CHECK(cudaStreamDestroy(stream));
 }
 
-TEST(Raft, DeviceBufferZeroResize)
-{
+TEST(Raft, DeviceBufferZeroResize) {
   // Create a limiting_resource_adaptor to track allocations
-  auto curr_mr =
-    dynamic_cast<rmm::mr::cuda_memory_resource*>(rmm::mr::get_current_device_resource());
-  auto limit_mr =
-    std::make_shared<rmm::mr::limiting_resource_adaptor<rmm::mr::cuda_memory_resource>>(curr_mr,
-                                                                                        1000);
+  auto curr_mr = dynamic_cast<rmm::mr::cuda_memory_resource*>(
+    rmm::mr::get_current_device_resource());
+  auto limit_mr = std::make_shared<
+    rmm::mr::limiting_resource_adaptor<rmm::mr::cuda_memory_resource>>(curr_mr,
+                                                                       1000);
 
   rmm::mr::set_current_device_resource(limit_mr.get());
 
diff --git a/cpp/test/mr/host/buffer.cpp b/cpp/test/mr/host/buffer.cpp
index aadf05285c..953f65ddfb 100644
--- a/cpp/test/mr/host/buffer.cpp
+++ b/cpp/test/mr/host/buffer.cpp
@@ -24,8 +24,7 @@ namespace raft {
 namespace mr {
 namespace host {
 
-TEST(Raft, HostBuffer)
-{
+TEST(Raft, HostBuffer) {
   auto alloc = std::make_shared<default_allocator>();
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
@@ -52,14 +51,14 @@ TEST(Raft, HostBuffer)
   CUDA_CHECK(cudaStreamDestroy(stream));
 }
 
-TEST(Raft, DeviceToHostBuffer)
-{
+TEST(Raft, DeviceToHostBuffer) {
   auto d_alloc = std::make_shared<device::default_allocator>();
   auto h_alloc = std::make_shared<default_allocator>();
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
   device::buffer<char> d_buff(d_alloc, stream, 32);
-  CUDA_CHECK(cudaMemsetAsync(d_buff.data(), 0, sizeof(char) * d_buff.size(), stream));
+  CUDA_CHECK(
+    cudaMemsetAsync(d_buff.data(), 0, sizeof(char) * d_buff.size(), stream));
   buffer<char> h_buff(h_alloc, d_buff);
   ASSERT_EQ(d_buff.size(), h_buff.size());
   CUDA_CHECK(cudaStreamSynchronize(stream));
diff --git a/cpp/test/mst.cu b/cpp/test/mst.cu
index 5560c61c73..d7aa76500b 100644
--- a/cpp/test/mst.cu
+++ b/cpp/test/mst.cu
@@ -54,8 +54,7 @@ namespace mst {
 // Sequential prims function
 // Returns total weight of MST
 template <typename vertex_t, typename edge_t, typename weight_t>
-weight_t prims(CSRHost<vertex_t, edge_t, weight_t>& csr_h)
-{
+weight_t prims(CSRHost<vertex_t, edge_t, weight_t> &csr_h) {
   auto n_vertices = csr_h.offsets.size() - 1;
 
   bool active_vertex[n_vertices];
@@ -64,18 +63,19 @@ weight_t prims(CSRHost<vertex_t, edge_t, weight_t>& csr_h)
 
   for (auto i = 0; i < n_vertices; i++) {
     active_vertex[i] = false;
-    curr_edge[i]     = INT_MAX;
+    curr_edge[i] = INT_MAX;
   }
   curr_edge[0] = 0;
 
   // function to pick next min vertex-edge
-  auto min_vertex_edge = [](auto* curr_edge, auto* active_vertex, auto n_vertices) {
+  auto min_vertex_edge = [](auto *curr_edge, auto *active_vertex,
+                            auto n_vertices) {
     weight_t min = INT_MAX;
     vertex_t min_vertex;
 
     for (auto v = 0; v < n_vertices; v++) {
       if (!active_vertex[v] && curr_edge[v] < min) {
-        min        = curr_edge[v];
+        min = curr_edge[v];
         min_vertex = v;
       }
     }
@@ -91,13 +91,14 @@ weight_t prims(CSRHost<vertex_t, edge_t, weight_t>& csr_h)
     active_vertex[curr_v] = true;  // set to active
 
     // iterate through edges of current active vertex
-    auto edge_st  = csr_h.offsets[curr_v];
+    auto edge_st = csr_h.offsets[curr_v];
     auto edge_end = csr_h.offsets[curr_v + 1];
 
     for (auto e = edge_st; e < edge_end; e++) {
       // put edges to be considered for next iteration
       auto neighbor_idx = csr_h.indices[e];
-      if (!active_vertex[neighbor_idx] && csr_h.weights[e] < curr_edge[neighbor_idx]) {
+      if (!active_vertex[neighbor_idx] &&
+          csr_h.weights[e] < curr_edge[neighbor_idx]) {
         curr_edge[neighbor_idx] = csr_h.weights[e];
       }
     }
@@ -113,101 +114,99 @@ weight_t prims(CSRHost<vertex_t, edge_t, weight_t>& csr_h)
 }
 
 template <typename vertex_t, typename edge_t, typename weight_t>
-class MSTTest : public ::testing::TestWithParam<MSTTestInput<vertex_t, edge_t, weight_t>> {
+class MSTTest
+  : public ::testing::TestWithParam<MSTTestInput<vertex_t, edge_t, weight_t>> {
  protected:
   std::pair<raft::Graph_COO<vertex_t, edge_t, weight_t>,
             raft::Graph_COO<vertex_t, edge_t, weight_t>>
-  mst_gpu()
-  {
-    edge_t* offsets   = static_cast<edge_t*>(csr_d.offsets.data());
-    vertex_t* indices = static_cast<vertex_t*>(csr_d.indices.data());
-    weight_t* weights = static_cast<weight_t*>(csr_d.weights.data());
+  mst_gpu() {
+    edge_t *offsets = static_cast<edge_t *>(csr_d.offsets.data());
+    vertex_t *indices = static_cast<vertex_t *>(csr_d.indices.data());
+    weight_t *weights = static_cast<weight_t *>(csr_d.weights.data());
 
     v = static_cast<vertex_t>((csr_d.offsets.size() / sizeof(vertex_t)) - 1);
     e = static_cast<edge_t>(csr_d.indices.size() / sizeof(edge_t));
 
-    rmm::device_vector<vertex_t> mst_src(2 * v - 2, std::numeric_limits<vertex_t>::max());
-    rmm::device_vector<vertex_t> mst_dst(2 * v - 2, std::numeric_limits<vertex_t>::max());
+    rmm::device_vector<vertex_t> mst_src(2 * v - 2,
+                                         std::numeric_limits<vertex_t>::max());
+    rmm::device_vector<vertex_t> mst_dst(2 * v - 2,
+                                         std::numeric_limits<vertex_t>::max());
     rmm::device_vector<vertex_t> color(v, 0);
 
-    vertex_t* color_ptr = thrust::raw_pointer_cast(color.data());
+    vertex_t *color_ptr = thrust::raw_pointer_cast(color.data());
 
     if (iterations == 0) {
       MST_solver<vertex_t, edge_t, weight_t, float> symmetric_solver(
-        handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), true, true, 0);
+        handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(),
+        true, true, 0);
       auto symmetric_result = symmetric_solver.solve();
 
       MST_solver<vertex_t, edge_t, weight_t, float> non_symmetric_solver(
-        handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), false, true, 0);
+        handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(),
+        false, true, 0);
       auto non_symmetric_result = non_symmetric_solver.solve();
 
       EXPECT_LE(symmetric_result.n_edges, 2 * v - 2);
       EXPECT_LE(non_symmetric_result.n_edges, v - 1);
 
-      return std::make_pair(std::move(symmetric_result), std::move(non_symmetric_result));
+      return std::make_pair(std::move(symmetric_result),
+                            std::move(non_symmetric_result));
     } else {
-      MST_solver<vertex_t, edge_t, weight_t, float> intermediate_solver(handle,
-                                                                        offsets,
-                                                                        indices,
-                                                                        weights,
-                                                                        v,
-                                                                        e,
-                                                                        color_ptr,
-                                                                        handle.get_stream(),
-                                                                        true,
-                                                                        true,
-                                                                        iterations);
+      MST_solver<vertex_t, edge_t, weight_t, float> intermediate_solver(
+        handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(),
+        true, true, iterations);
       auto intermediate_result = intermediate_solver.solve();
 
       MST_solver<vertex_t, edge_t, weight_t, float> symmetric_solver(
-        handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), true, false, 0);
+        handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(),
+        true, false, 0);
       auto symmetric_result = symmetric_solver.solve();
 
       // symmetric_result.n_edges += intermediate_result.n_edges;
-      auto total_edge_size = symmetric_result.n_edges + intermediate_result.n_edges;
+      auto total_edge_size =
+        symmetric_result.n_edges + intermediate_result.n_edges;
       symmetric_result.src.resize(total_edge_size, handle.get_stream());
       symmetric_result.dst.resize(total_edge_size, handle.get_stream());
       symmetric_result.weights.resize(total_edge_size, handle.get_stream());
 
       raft::copy(symmetric_result.src.data() + symmetric_result.n_edges,
-                 intermediate_result.src.data(),
-                 intermediate_result.n_edges,
+                 intermediate_result.src.data(), intermediate_result.n_edges,
                  handle.get_stream());
       raft::copy(symmetric_result.dst.data() + symmetric_result.n_edges,
-                 intermediate_result.dst.data(),
-                 intermediate_result.n_edges,
+                 intermediate_result.dst.data(), intermediate_result.n_edges,
                  handle.get_stream());
       raft::copy(symmetric_result.weights.data() + symmetric_result.n_edges,
                  intermediate_result.weights.data(),
-                 intermediate_result.n_edges,
-                 handle.get_stream());
+                 intermediate_result.n_edges, handle.get_stream());
       symmetric_result.n_edges = total_edge_size;
 
       MST_solver<vertex_t, edge_t, weight_t, float> non_symmetric_solver(
-        handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), false, true, 0);
+        handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(),
+        false, true, 0);
       auto non_symmetric_result = non_symmetric_solver.solve();
 
       EXPECT_LE(symmetric_result.n_edges, 2 * v - 2);
       EXPECT_LE(non_symmetric_result.n_edges, v - 1);
 
-      return std::make_pair(std::move(symmetric_result), std::move(non_symmetric_result));
+      return std::make_pair(std::move(symmetric_result),
+                            std::move(non_symmetric_result));
     }
   }
 
-  void SetUp() override
-  {
-    mst_input  = ::testing::TestWithParam<MSTTestInput<vertex_t, edge_t, weight_t>>::GetParam();
+  void SetUp() override {
+    mst_input = ::testing::TestWithParam<
+      MSTTestInput<vertex_t, edge_t, weight_t>>::GetParam();
     iterations = mst_input.iterations;
 
-    csr_d.offsets = rmm::device_buffer(mst_input.csr_h.offsets.data(),
-                                       mst_input.csr_h.offsets.size() * sizeof(edge_t),
-                                       handle.get_stream());
-    csr_d.indices = rmm::device_buffer(mst_input.csr_h.indices.data(),
-                                       mst_input.csr_h.indices.size() * sizeof(vertex_t),
-                                       handle.get_stream());
-    csr_d.weights = rmm::device_buffer(mst_input.csr_h.weights.data(),
-                                       mst_input.csr_h.weights.size() * sizeof(weight_t),
-                                       handle.get_stream());
+    csr_d.offsets = rmm::device_buffer(
+      mst_input.csr_h.offsets.data(),
+      mst_input.csr_h.offsets.size() * sizeof(edge_t), handle.get_stream());
+    csr_d.indices = rmm::device_buffer(
+      mst_input.csr_h.indices.data(),
+      mst_input.csr_h.indices.size() * sizeof(vertex_t), handle.get_stream());
+    csr_d.weights = rmm::device_buffer(
+      mst_input.csr_h.weights.data(),
+      mst_input.csr_h.weights.size() * sizeof(weight_t), handle.get_stream());
   }
 
   void TearDown() override {}
@@ -260,68 +259,41 @@ const std::vector<MSTTestInput<int, int, float>> csr_in_h = {
 const std::vector<CSRHost<int, int, float>> csr_in4_h = {
   {{0, 3, 5, 8, 10, 12, 14, 16},
    {2, 4, 5, 3, 6, 0, 4, 5, 1, 6, 0, 2, 0, 2, 1, 3},
-   {5.0f,
-    9.0f,
-    1.0f,
-    8.0f,
-    7.0f,
-    5.0f,
-    2.0f,
-    6.0f,
-    8.0f,
-    10.0f,
-    9.0f,
-    2.0f,
-    1.0f,
-    6.0f,
-    7.0f,
-    10.0f}}};
+   {5.0f, 9.0f, 1.0f, 8.0f, 7.0f, 5.0f, 2.0f, 6.0f, 8.0f, 10.0f, 9.0f, 2.0f,
+    1.0f, 6.0f, 7.0f, 10.0f}}};
 
 //  singletons
 const std::vector<CSRHost<int, int, float>> csr_in5_h = {
   {{0, 3, 5, 8, 10, 10, 10, 12, 14, 16, 16},
    {2, 8, 7, 3, 8, 0, 8, 7, 1, 8, 0, 2, 0, 2, 1, 3},
-   {5.0f,
-    9.0f,
-    1.0f,
-    8.0f,
-    7.0f,
-    5.0f,
-    2.0f,
-    6.0f,
-    8.0f,
-    10.0f,
-    9.0f,
-    2.0f,
-    1.0f,
-    6.0f,
-    7.0f,
-    10.0f}}};
+   {5.0f, 9.0f, 1.0f, 8.0f, 7.0f, 5.0f, 2.0f, 6.0f, 8.0f, 10.0f, 9.0f, 2.0f,
+    1.0f, 6.0f, 7.0f, 10.0f}}};
 
 typedef MSTTest<int, int, float> MSTTestSequential;
-TEST_P(MSTTestSequential, Sequential)
-{
-  auto results_pair          = mst_gpu();
-  auto& symmetric_result     = results_pair.first;
-  auto& non_symmetric_result = results_pair.second;
+TEST_P(MSTTestSequential, Sequential) {
+  auto results_pair = mst_gpu();
+  auto &symmetric_result = results_pair.first;
+  auto &non_symmetric_result = results_pair.second;
 
   // do assertions here
   // in this case, running sequential MST
   auto prims_result = prims(mst_input.csr_h);
 
-  auto symmetric_sum = thrust::reduce(thrust::device,
-                                      symmetric_result.weights.data(),
-                                      symmetric_result.weights.data() + symmetric_result.n_edges);
-  auto non_symmetric_sum =
-    thrust::reduce(thrust::device,
-                   non_symmetric_result.weights.data(),
-                   non_symmetric_result.weights.data() + non_symmetric_result.n_edges);
-
-  ASSERT_TRUE(raft::match(2 * prims_result, symmetric_sum, raft::CompareApprox<float>(0.1)));
-  ASSERT_TRUE(raft::match(prims_result, non_symmetric_sum, raft::CompareApprox<float>(0.1)));
+  auto symmetric_sum =
+    thrust::reduce(thrust::device, symmetric_result.weights.data(),
+                   symmetric_result.weights.data() + symmetric_result.n_edges);
+  auto non_symmetric_sum = thrust::reduce(
+    thrust::device, non_symmetric_result.weights.data(),
+    non_symmetric_result.weights.data() + non_symmetric_result.n_edges);
+
+  ASSERT_TRUE(raft::match(2 * prims_result, symmetric_sum,
+                          raft::CompareApprox<float>(0.1)));
+  ASSERT_TRUE(raft::match(prims_result, non_symmetric_sum,
+                          raft::CompareApprox<float>(0.1)));
 }
 
-INSTANTIATE_TEST_SUITE_P(MSTTests, MSTTestSequential, ::testing::ValuesIn(csr_in_h));
+INSTANTIATE_TEST_SUITE_P(MSTTests, MSTTestSequential,
+                         ::testing::ValuesIn(csr_in_h));
 
 }  // namespace mst
 }  // namespace raft
diff --git a/cpp/test/random/rng.cu b/cpp/test/random/rng.cu
index 25c8fe5084..af10dcab30 100644
--- a/cpp/test/random/rng.cu
+++ b/cpp/test/random/rng.cu
@@ -38,13 +38,12 @@ enum RandomType {
 };
 
 template <typename T, int TPB>
-__global__ void meanKernel(T* out, const T* data, int len)
-{
+__global__ void meanKernel(T* out, const T* data, int len) {
   typedef cub::BlockReduce<T, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  T val   = tid < len ? data[tid] : T(0);
-  T x     = BlockReduce(temp_storage).Sum(val);
+  T val = tid < len ? data[tid] : T(0);
+  T x = BlockReduce(temp_storage).Sum(val);
   __syncthreads();
   T xx = BlockReduce(temp_storage).Sum(val * val);
   __syncthreads();
@@ -71,8 +70,7 @@ struct RngInputs {
 };
 
 template <typename T>
-::std::ostream& operator<<(::std::ostream& os, const RngInputs<T>& dims)
-{
+::std::ostream& operator<<(::std::ostream& os, const RngInputs<T>& dims) {
   return os;
 }
 
@@ -82,30 +80,46 @@ template <typename T>
 template <typename T>
 class RngTest : public ::testing::TestWithParam<RngInputs<T>> {
  protected:
-  void SetUp() override
-  {
+  void SetUp() override {
     // Tests are configured with their expected test-values sigma. For example,
     // 4 x sigma indicates the test shouldn't fail 99.9% of the time.
     num_sigma = 10;
-    params    = ::testing::TestWithParam<RngInputs<T>>::GetParam();
+    params = ::testing::TestWithParam<RngInputs<T>>::GetParam();
     cudaStream_t stream;
     CUDA_CHECK(cudaStreamCreate(&stream));
     Rng r(params.seed, params.gtype);
     allocate(data, params.len);
     allocate(stats, 2, true);
     switch (params.type) {
-      case RNG_Normal: r.normal(data, params.len, params.start, params.end, stream); break;
-      case RNG_LogNormal: r.lognormal(data, params.len, params.start, params.end, stream); break;
-      case RNG_Uniform: r.uniform(data, params.len, params.start, params.end, stream); break;
-      case RNG_Gumbel: r.gumbel(data, params.len, params.start, params.end, stream); break;
-      case RNG_Logistic: r.logistic(data, params.len, params.start, params.end, stream); break;
-      case RNG_Exp: r.exponential(data, params.len, params.start, stream); break;
-      case RNG_Rayleigh: r.rayleigh(data, params.len, params.start, stream); break;
-      case RNG_Laplace: r.laplace(data, params.len, params.start, params.end, stream); break;
+      case RNG_Normal:
+        r.normal(data, params.len, params.start, params.end, stream);
+        break;
+      case RNG_LogNormal:
+        r.lognormal(data, params.len, params.start, params.end, stream);
+        break;
+      case RNG_Uniform:
+        r.uniform(data, params.len, params.start, params.end, stream);
+        break;
+      case RNG_Gumbel:
+        r.gumbel(data, params.len, params.start, params.end, stream);
+        break;
+      case RNG_Logistic:
+        r.logistic(data, params.len, params.start, params.end, stream);
+        break;
+      case RNG_Exp:
+        r.exponential(data, params.len, params.start, stream);
+        break;
+      case RNG_Rayleigh:
+        r.rayleigh(data, params.len, params.start, stream);
+        break;
+      case RNG_Laplace:
+        r.laplace(data, params.len, params.start, params.end, stream);
+        break;
     };
     static const int threads = 128;
     meanKernel<T, threads>
-      <<<raft::ceildiv(params.len, threads), threads, 0, stream>>>(stats, data, params.len);
+      <<<raft::ceildiv(params.len, threads), threads, 0, stream>>>(stats, data,
+                                                                   params.len);
     update_host<T>(h_stats, stats, 2, stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
     h_stats[0] /= params.len;
@@ -113,24 +127,23 @@ class RngTest : public ::testing::TestWithParam<RngInputs<T>> {
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(data));
     CUDA_CHECK(cudaFree(stats));
   }
 
-  void getExpectedMeanVar(T meanvar[2])
-  {
+  void getExpectedMeanVar(T meanvar[2]) {
     switch (params.type) {
       case RNG_Normal:
         meanvar[0] = params.start;
         meanvar[1] = params.end * params.end;
         break;
       case RNG_LogNormal: {
-        auto var   = params.end * params.end;
-        auto mu    = params.start;
+        auto var = params.end * params.end;
+        auto mu = params.start;
         meanvar[0] = raft::myExp(mu + var * T(0.5));
-        meanvar[1] = (raft::myExp(var) - T(1.0)) * raft::myExp(T(2.0) * mu + var);
+        meanvar[1] =
+          (raft::myExp(var) - T(1.0)) * raft::myExp(T(2.0) * mu + var);
         break;
       }
       case RNG_Uniform:
@@ -154,7 +167,8 @@ class RngTest : public ::testing::TestWithParam<RngInputs<T>> {
         break;
       case RNG_Rayleigh:
         meanvar[0] = params.start * raft::mySqrt(T(3.1415 / 2.0));
-        meanvar[1] = ((T(4.0) - T(3.1415)) / T(2.0)) * params.start * params.start;
+        meanvar[1] =
+          ((T(4.0) - T(3.1415)) / T(2.0)) * params.start * params.start;
         break;
       case RNG_Laplace:
         meanvar[0] = params.start;
@@ -245,12 +259,13 @@ const std::vector<RngInputs<float>> inputsf = {
   {0.02, 32 * 1024, 1.f, 1.f, RNG_Laplace, GenKiss99, 1234ULL},
   {0.04, 8 * 1024, 1.f, 1.f, RNG_Laplace, GenKiss99, 1234ULL}};
 
-TEST_P(RngTestF, Result)
-{
+TEST_P(RngTestF, Result) {
   float meanvar[2];
   getExpectedMeanVar(meanvar);
-  ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox<float>(num_sigma * params.tolerance)));
-  ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox<float>(num_sigma * params.tolerance)));
+  ASSERT_TRUE(match(meanvar[0], h_stats[0],
+                    CompareApprox<float>(num_sigma * params.tolerance)));
+  ASSERT_TRUE(match(meanvar[1], h_stats[1],
+                    CompareApprox<float>(num_sigma * params.tolerance)));
 }
 INSTANTIATE_TEST_SUITE_P(RngTests, RngTestF, ::testing::ValuesIn(inputsf));
 
@@ -306,12 +321,13 @@ const std::vector<RngInputs<double>> inputsd = {
   {0.025, 8 * 1024, 1.0, 1.0, RNG_Rayleigh, GenKiss99, 1234ULL},
   {0.02, 32 * 1024, 1.0, 1.0, RNG_Laplace, GenKiss99, 1234ULL},
   {0.04, 8 * 1024, 1.0, 1.0, RNG_Laplace, GenKiss99, 1234ULL}};
-TEST_P(RngTestD, Result)
-{
+TEST_P(RngTestD, Result) {
   double meanvar[2];
   getExpectedMeanVar(meanvar);
-  ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox<double>(num_sigma * params.tolerance)));
-  ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox<double>(num_sigma * params.tolerance)));
+  ASSERT_TRUE(match(meanvar[0], h_stats[0],
+                    CompareApprox<double>(num_sigma * params.tolerance)));
+  ASSERT_TRUE(match(meanvar[1], h_stats[1],
+                    CompareApprox<double>(num_sigma * params.tolerance)));
 }
 INSTANTIATE_TEST_SUITE_P(RngTests, RngTestD, ::testing::ValuesIn(inputsd));
 
@@ -319,8 +335,7 @@ INSTANTIATE_TEST_SUITE_P(RngTests, RngTestD, ::testing::ValuesIn(inputsd));
 // Test for expected variance in mean calculations
 
 template <typename T>
-T quick_mean(const std::vector<T>& d)
-{
+T quick_mean(const std::vector<T>& d) {
   T acc = T(0);
   for (const auto& di : d) {
     acc += di;
@@ -329,9 +344,8 @@ T quick_mean(const std::vector<T>& d)
 }
 
 template <typename T>
-T quick_std(const std::vector<T>& d)
-{
-  T acc    = T(0);
+T quick_std(const std::vector<T>& d) {
+  T acc = T(0);
   T d_mean = quick_mean(d);
   for (const auto& di : d) {
     acc += ((di - d_mean) * (di - d_mean));
@@ -340,8 +354,7 @@ T quick_std(const std::vector<T>& d)
 }
 
 template <typename T>
-std::ostream& operator<<(std::ostream& out, const std::vector<T>& v)
-{
+std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
   if (!v.empty()) {
     out << '[';
     std::copy(v.begin(), v.end(), std::ostream_iterator<T>(out, ", "));
@@ -356,12 +369,11 @@ std::ostream& operator<<(std::ostream& out, const std::vector<T>& v)
 // experiments computing the mean, giving us a distribution of the mean
 // itself. The mean error is simply the standard deviation of this
 // distribution (the standard deviation of the mean).
-TEST(Rng, MeanError)
-{
+TEST(Rng, MeanError) {
   timeb time_struct;
   ftime(&time_struct);
-  int seed            = time_struct.millitm;
-  int num_samples     = 1024;
+  int seed = time_struct.millitm;
+  int num_samples = 1024;
   int num_experiments = 1024;
   float* data;
   float* mean_result;
@@ -379,9 +391,10 @@ TEST(Rng, MeanError)
     Rng r(seed, rtype);
     r.normal(data, len, 3.3f, 0.23f, stream);
     // r.uniform(data, len, -1.0, 2.0);
-    raft::stats::mean(mean_result, data, num_samples, num_experiments, false, false, stream);
-    raft::stats::stddev(
-      std_result, data, mean_result, num_samples, num_experiments, false, false, stream);
+    raft::stats::mean(mean_result, data, num_samples, num_experiments, false,
+                      false, stream);
+    raft::stats::stddev(std_result, data, mean_result, num_samples,
+                        num_experiments, false, false, stream);
     std::vector<float> h_mean_result(num_experiments);
     std::vector<float> h_std_result(num_experiments);
     update_host(h_mean_result.data(), mean_result, num_experiments, stream);
@@ -390,8 +403,8 @@ TEST(Rng, MeanError)
     auto d_mean = quick_mean(h_mean_result);
 
     // std-dev of mean; also known as mean error
-    auto d_std_of_mean            = quick_std(h_mean_result);
-    auto d_std                    = quick_mean(h_std_result);
+    auto d_std_of_mean = quick_std(h_mean_result);
+    auto d_std = quick_mean(h_std_result);
     auto d_std_of_mean_analytical = d_std / std::sqrt(num_samples);
 
     // std::cout << "measured mean error: " << d_std_of_mean << "\n";
@@ -400,7 +413,8 @@ TEST(Rng, MeanError)
     auto diff_expected_vs_measured_mean_error =
       std::abs(d_std_of_mean - d_std / std::sqrt(num_samples));
 
-    ASSERT_TRUE((diff_expected_vs_measured_mean_error / d_std_of_mean_analytical < 0.5));
+    ASSERT_TRUE(
+      (diff_expected_vs_measured_mean_error / d_std_of_mean_analytical < 0.5));
   }
   CUDA_CHECK(cudaStreamDestroy(stream));
   CUDA_CHECK(cudaFree(data));
@@ -413,8 +427,7 @@ TEST(Rng, MeanError)
 template <typename T, int len, int scale>
 class ScaledBernoulliTest : public ::testing::Test {
  protected:
-  void SetUp() override
-  {
+  void SetUp() override {
     CUDA_CHECK(cudaStreamCreate(&stream));
 
     Rng r(42);
@@ -425,12 +438,12 @@ class ScaledBernoulliTest : public ::testing::Test {
 
   void TearDown() override { CUDA_CHECK(cudaFree(data)); }
 
-  void rangeCheck()
-  {
+  void rangeCheck() {
     T* h_data = new T[len];
     update_host(h_data, data, len, stream);
-    ASSERT_TRUE(
-      std::none_of(h_data, h_data + len, [](const T& a) { return a < -scale || a > scale; }));
+    ASSERT_TRUE(std::none_of(h_data, h_data + len, [](const T& a) {
+      return a < -scale || a > scale;
+    }));
     delete[] h_data;
   }
 
@@ -447,8 +460,7 @@ TEST_F(ScaledBernoulliTest2, RangeCheck) { rangeCheck(); }
 template <typename T, int len>
 class BernoulliTest : public ::testing::Test {
  protected:
-  void SetUp() override
-  {
+  void SetUp() override {
     CUDA_CHECK(cudaStreamCreate(&stream));
     Rng r(42);
     allocate(data, len * sizeof(bool), stream);
@@ -457,8 +469,7 @@ class BernoulliTest : public ::testing::Test {
 
   void TearDown() override { CUDA_CHECK(cudaFree(data)); }
 
-  void trueFalseCheck()
-  {
+  void trueFalseCheck() {
     // both true and false values must be present
     bool* h_data = new bool[len];
     update_host(h_data, data, len, stream);
@@ -488,21 +499,21 @@ struct RngNormalTableInputs {
 };
 
 template <typename T>
-::std::ostream& operator<<(::std::ostream& os, const RngNormalTableInputs<T>& dims)
-{
+::std::ostream& operator<<(::std::ostream& os,
+                           const RngNormalTableInputs<T>& dims) {
   return os;
 }
 
 template <typename T>
-class RngNormalTableTest : public ::testing::TestWithParam<RngNormalTableInputs<T>> {
+class RngNormalTableTest
+  : public ::testing::TestWithParam<RngNormalTableInputs<T>> {
  protected:
-  void SetUp() override
-  {
+  void SetUp() override {
     // Tests are configured with their expected test-values sigma. For example,
     // 4 x sigma indicates the test shouldn't fail 99.9% of the time.
     num_sigma = 10;
-    params    = ::testing::TestWithParam<RngNormalTableInputs<T>>::GetParam();
-    int len   = params.rows * params.cols;
+    params = ::testing::TestWithParam<RngNormalTableInputs<T>>::GetParam();
+    int len = params.rows * params.cols;
 
     cudaStream_t stream;
     CUDA_CHECK(cudaStreamCreate(&stream));
@@ -512,9 +523,11 @@ class RngNormalTableTest : public ::testing::TestWithParam<RngNormalTableInputs<
     allocate(mu_vec, params.cols);
     r.fill(mu_vec, params.cols, params.mu, stream);
     T* sigma_vec = nullptr;
-    r.normalTable(data, params.rows, params.cols, mu_vec, sigma_vec, params.sigma, stream);
+    r.normalTable(data, params.rows, params.cols, mu_vec, sigma_vec,
+                  params.sigma, stream);
     static const int threads = 128;
-    meanKernel<T, threads><<<raft::ceildiv(len, threads), threads, 0, stream>>>(stats, data, len);
+    meanKernel<T, threads>
+      <<<raft::ceildiv(len, threads), threads, 0, stream>>>(stats, data, len);
     update_host<T>(h_stats, stats, 2, stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
     h_stats[0] /= len;
@@ -522,15 +535,13 @@ class RngNormalTableTest : public ::testing::TestWithParam<RngNormalTableInputs<
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(data));
     CUDA_CHECK(cudaFree(stats));
     CUDA_CHECK(cudaFree(mu_vec));
   }
 
-  void getExpectedMeanVar(T meanvar[2])
-  {
+  void getExpectedMeanVar(T meanvar[2]) {
     meanvar[0] = params.mu;
     meanvar[1] = params.sigma * params.sigma;
   }
@@ -551,14 +562,16 @@ const std::vector<RngNormalTableInputs<float>> inputsf_t = {
   {0.0055, 32, 1024, 1.f, 1.f, GenKiss99, 1234ULL},
   {0.011, 8, 1024, 1.f, 1.f, GenKiss99, 1234ULL}};
 
-TEST_P(RngNormalTableTestF, Result)
-{
+TEST_P(RngNormalTableTestF, Result) {
   float meanvar[2];
   getExpectedMeanVar(meanvar);
-  ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox<float>(num_sigma * params.tolerance)));
-  ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox<float>(num_sigma * params.tolerance)));
+  ASSERT_TRUE(match(meanvar[0], h_stats[0],
+                    CompareApprox<float>(num_sigma * params.tolerance)));
+  ASSERT_TRUE(match(meanvar[1], h_stats[1],
+                    CompareApprox<float>(num_sigma * params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestF, ::testing::ValuesIn(inputsf_t));
+INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestF,
+                         ::testing::ValuesIn(inputsf_t));
 
 typedef RngNormalTableTest<double> RngNormalTableTestD;
 const std::vector<RngNormalTableInputs<double>> inputsd_t = {
@@ -568,14 +581,16 @@ const std::vector<RngNormalTableInputs<double>> inputsd_t = {
   {0.011, 8, 1024, 1.0, 1.0, GenTaps, 1234ULL},
   {0.0055, 32, 1024, 1.0, 1.0, GenKiss99, 1234ULL},
   {0.011, 8, 1024, 1.0, 1.0, GenKiss99, 1234ULL}};
-TEST_P(RngNormalTableTestD, Result)
-{
+TEST_P(RngNormalTableTestD, Result) {
   double meanvar[2];
   getExpectedMeanVar(meanvar);
-  ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox<double>(num_sigma * params.tolerance)));
-  ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox<double>(num_sigma * params.tolerance)));
+  ASSERT_TRUE(match(meanvar[0], h_stats[0],
+                    CompareApprox<double>(num_sigma * params.tolerance)));
+  ASSERT_TRUE(match(meanvar[1], h_stats[1],
+                    CompareApprox<double>(num_sigma * params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestD, ::testing::ValuesIn(inputsd_t));
+INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestD,
+                         ::testing::ValuesIn(inputsd_t));
 
 struct RngAffineInputs {
   int n;
@@ -584,15 +599,13 @@ struct RngAffineInputs {
 
 class RngAffineTest : public ::testing::TestWithParam<RngAffineInputs> {
  protected:
-  void SetUp() override
-  {
+  void SetUp() override {
     params = ::testing::TestWithParam<RngAffineInputs>::GetParam();
     Rng r(params.seed);
     r.affine_transform_params(params.n, a, b);
   }
 
-  void check()
-  {
+  void check() {
     ASSERT_TRUE(gcd(a, params.n) == 1);
     ASSERT_TRUE(0 <= b && b < params.n);
   }
@@ -603,17 +616,13 @@ class RngAffineTest : public ::testing::TestWithParam<RngAffineInputs> {
 };  // RngAffineTest
 
 const std::vector<RngAffineInputs> inputs_affine = {
-  {100, 123456ULL},
-  {100, 1234567890ULL},
-  {101, 123456ULL},
-  {101, 1234567890ULL},
-  {7, 123456ULL},
-  {7, 1234567890ULL},
-  {2568, 123456ULL},
-  {2568, 1234567890ULL},
+  {100, 123456ULL},     {100, 1234567890ULL},  {101, 123456ULL},
+  {101, 1234567890ULL}, {7, 123456ULL},        {7, 1234567890ULL},
+  {2568, 123456ULL},    {2568, 1234567890ULL},
 };
 TEST_P(RngAffineTest, Result) { check(); }
-INSTANTIATE_TEST_SUITE_P(RngAffineTests, RngAffineTest, ::testing::ValuesIn(inputs_affine));
+INSTANTIATE_TEST_SUITE_P(RngAffineTests, RngAffineTest,
+                         ::testing::ValuesIn(inputs_affine));
 
 }  // namespace random
 }  // namespace raft
diff --git a/cpp/test/random/rng_int.cu b/cpp/test/random/rng_int.cu
index c77c3df526..92f12206e8 100644
--- a/cpp/test/random/rng_int.cu
+++ b/cpp/test/random/rng_int.cu
@@ -27,13 +27,12 @@ namespace random {
 enum RandomType { RNG_Uniform };
 
 template <typename T, int TPB>
-__global__ void meanKernel(float* out, const T* data, int len)
-{
+__global__ void meanKernel(float *out, const T *data, int len) {
   typedef cub::BlockReduce<float, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
-  int tid   = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
   float val = tid < len ? data[tid] : T(0);
-  float x   = BlockReduce(temp_storage).Sum(val);
+  float x = BlockReduce(temp_storage).Sum(val);
   __syncthreads();
   float xx = BlockReduce(temp_storage).Sum(val * val);
   __syncthreads();
@@ -60,16 +59,14 @@ struct RngInputs {
 };
 
 template <typename T>
-::std::ostream& operator<<(::std::ostream& os, const RngInputs<T>& dims)
-{
+::std::ostream &operator<<(::std::ostream &os, const RngInputs<T> &dims) {
   return os;
 }
 
 template <typename T>
 class RngTest : public ::testing::TestWithParam<RngInputs<T>> {
  protected:
-  void SetUp() override
-  {
+  void SetUp() override {
     params = ::testing::TestWithParam<RngInputs<T>>::GetParam();
     Rng r(params.seed, params.gtype);
 
@@ -78,11 +75,14 @@ class RngTest : public ::testing::TestWithParam<RngInputs<T>> {
     allocate(data, params.len);
     allocate(stats, 2, true);
     switch (params.type) {
-      case RNG_Uniform: r.uniformInt(data, params.len, params.start, params.end, stream); break;
+      case RNG_Uniform:
+        r.uniformInt(data, params.len, params.start, params.end, stream);
+        break;
     };
     static const int threads = 128;
     meanKernel<T, threads>
-      <<<raft::ceildiv(params.len, threads), threads, 0, stream>>>(stats, data, params.len);
+      <<<raft::ceildiv(params.len, threads), threads, 0, stream>>>(stats, data,
+                                                                   params.len);
     update_host<float>(h_stats, stats, 2, stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
     h_stats[0] /= params.len;
@@ -90,14 +90,12 @@ class RngTest : public ::testing::TestWithParam<RngInputs<T>> {
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(data));
     CUDA_CHECK(cudaFree(stats));
   }
 
-  void getExpectedMeanVar(float meanvar[2])
-  {
+  void getExpectedMeanVar(float meanvar[2]) {
     switch (params.type) {
       case RNG_Uniform:
         meanvar[0] = (params.start + params.end) * 0.5f;
@@ -109,8 +107,8 @@ class RngTest : public ::testing::TestWithParam<RngInputs<T>> {
 
  protected:
   RngInputs<T> params;
-  T* data;
-  float* stats;
+  T *data;
+  float *stats;
   float h_stats[2];  // mean, var
 };
 
@@ -122,12 +120,13 @@ const std::vector<RngInputs<uint32_t>> inputs_u32 = {
   {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL},
   {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL},
   {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}};
-TEST_P(RngTestU32, Result)
-{
+TEST_P(RngTestU32, Result) {
   float meanvar[2];
   getExpectedMeanVar(meanvar);
-  ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox<float>(params.tolerance)));
-  ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(
+    match(meanvar[0], h_stats[0], CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(
+    match(meanvar[1], h_stats[1], CompareApprox<float>(params.tolerance)));
 }
 INSTANTIATE_TEST_SUITE_P(RngTests, RngTestU32, ::testing::ValuesIn(inputs_u32));
 
@@ -139,12 +138,13 @@ const std::vector<RngInputs<uint64_t>> inputs_u64 = {
   {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL},
   {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL},
   {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}};
-TEST_P(RngTestU64, Result)
-{
+TEST_P(RngTestU64, Result) {
   float meanvar[2];
   getExpectedMeanVar(meanvar);
-  ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox<float>(params.tolerance)));
-  ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(
+    match(meanvar[0], h_stats[0], CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(
+    match(meanvar[1], h_stats[1], CompareApprox<float>(params.tolerance)));
 }
 INSTANTIATE_TEST_SUITE_P(RngTests, RngTestU64, ::testing::ValuesIn(inputs_u64));
 
@@ -156,12 +156,13 @@ const std::vector<RngInputs<int32_t>> inputs_s32 = {
   {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL},
   {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL},
   {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}};
-TEST_P(RngTestS32, Result)
-{
+TEST_P(RngTestS32, Result) {
   float meanvar[2];
   getExpectedMeanVar(meanvar);
-  ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox<float>(params.tolerance)));
-  ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(
+    match(meanvar[0], h_stats[0], CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(
+    match(meanvar[1], h_stats[1], CompareApprox<float>(params.tolerance)));
 }
 INSTANTIATE_TEST_SUITE_P(RngTests, RngTestS32, ::testing::ValuesIn(inputs_s32));
 
@@ -173,12 +174,13 @@ const std::vector<RngInputs<int64_t>> inputs_s64 = {
   {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL},
   {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL},
   {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}};
-TEST_P(RngTestS64, Result)
-{
+TEST_P(RngTestS64, Result) {
   float meanvar[2];
   getExpectedMeanVar(meanvar);
-  ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox<float>(params.tolerance)));
-  ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(
+    match(meanvar[0], h_stats[0], CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(
+    match(meanvar[1], h_stats[1], CompareApprox<float>(params.tolerance)));
 }
 INSTANTIATE_TEST_SUITE_P(RngTests, RngTestS64, ::testing::ValuesIn(inputs_s64));
 
diff --git a/cpp/test/random/sample_without_replacement.cu b/cpp/test/random/sample_without_replacement.cu
index c258841c3e..d7e52a8958 100644
--- a/cpp/test/random/sample_without_replacement.cu
+++ b/cpp/test/random/sample_without_replacement.cu
@@ -38,16 +38,14 @@ struct SWoRInputs {
 };
 
 template <typename T>
-::std::ostream& operator<<(::std::ostream& os, const SWoRInputs<T>& dims)
-{
+::std::ostream& operator<<(::std::ostream& os, const SWoRInputs<T>& dims) {
   return os;
 }
 
 template <typename T>
 class SWoRTest : public ::testing::TestWithParam<SWoRInputs<T>> {
  protected:
-  void SetUp() override
-  {
+  void SetUp() override {
     params = ::testing::TestWithParam<SWoRInputs<T>>::GetParam();
     CUDA_CHECK(cudaStreamCreate(&stream));
 
@@ -60,14 +58,15 @@ class SWoRTest : public ::testing::TestWithParam<SWoRInputs<T>> {
     r.uniform(in, params.len, T(-1.0), T(1.0), stream);
     r.uniform(wts, params.len, T(1.0), T(2.0), stream);
     if (params.largeWeightIndex >= 0) {
-      update_device(wts + params.largeWeightIndex, &params.largeWeight, 1, stream);
+      update_device(wts + params.largeWeightIndex, &params.largeWeight, 1,
+                    stream);
     }
-    r.sampleWithoutReplacement(handle, out, outIdx, in, wts, params.sampledLen, params.len, stream);
+    r.sampleWithoutReplacement(handle, out, outIdx, in, wts, params.sampledLen,
+                               params.len, stream);
     update_host(&(h_outIdx[0]), outIdx, params.sampledLen, stream);
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaStreamDestroy(stream));
     CUDA_CHECK(cudaFree(in));
@@ -148,14 +147,14 @@ const std::vector<SWoRInputs<float>> inputsf = {
   {1024, 512, 10, 100000.f, GenKiss99, 1234ULL},
 };
 
-TEST_P(SWoRTestF, Result)
-{
+TEST_P(SWoRTestF, Result) {
   std::set<int> occurence;
   for (int i = 0; i < params.sampledLen; ++i) {
     auto val = h_outIdx[i];
     // indices must be in the given range
     ASSERT_TRUE(0 <= val && val < params.len)
-      << "out-of-range index @i=" << i << " val=" << val << " sampledLen=" << params.sampledLen;
+      << "out-of-range index @i=" << i << " val=" << val
+      << " sampledLen=" << params.sampledLen;
     // indices should not repeat
     ASSERT_TRUE(occurence.find(val) == occurence.end())
       << "repeated index @i=" << i << " idx=" << val;
@@ -163,7 +162,9 @@ TEST_P(SWoRTestF, Result)
   }
   // if there's a skewed distribution, the top index should correspond to the
   // particular item with a large weight
-  if (params.largeWeightIndex >= 0) { ASSERT_EQ(h_outIdx[0], params.largeWeightIndex); }
+  if (params.largeWeightIndex >= 0) {
+    ASSERT_EQ(h_outIdx[0], params.largeWeightIndex);
+  }
 }
 INSTANTIATE_TEST_SUITE_P(SWoRTests, SWoRTestF, ::testing::ValuesIn(inputsf));
 
@@ -230,14 +231,14 @@ const std::vector<SWoRInputs<double>> inputsd = {
   {1024, 512, 10, 100000.0, GenKiss99, 1234ULL},
 };
 
-TEST_P(SWoRTestD, Result)
-{
+TEST_P(SWoRTestD, Result) {
   std::set<int> occurence;
   for (int i = 0; i < params.sampledLen; ++i) {
     auto val = h_outIdx[i];
     // indices must be in the given range
     ASSERT_TRUE(0 <= val && val < params.len)
-      << "out-of-range index @i=" << i << " val=" << val << " sampledLen=" << params.sampledLen;
+      << "out-of-range index @i=" << i << " val=" << val
+      << " sampledLen=" << params.sampledLen;
     // indices should not repeat
     ASSERT_TRUE(occurence.find(val) == occurence.end())
       << "repeated index @i=" << i << " idx=" << val;
@@ -245,7 +246,9 @@ TEST_P(SWoRTestD, Result)
   }
   // if there's a skewed distribution, the top index should correspond to the
   // particular item with a large weight
-  if (params.largeWeightIndex >= 0) { ASSERT_EQ(h_outIdx[0], params.largeWeightIndex); }
+  if (params.largeWeightIndex >= 0) {
+    ASSERT_EQ(h_outIdx[0], params.largeWeightIndex);
+  }
 }
 INSTANTIATE_TEST_SUITE_P(SWoRTests, SWoRTestD, ::testing::ValuesIn(inputsd));
 
diff --git a/cpp/test/sparse/add.cu b/cpp/test/sparse/add.cu
index e1f814a5b6..713708d4cd 100644
--- a/cpp/test/sparse/add.cu
+++ b/cpp/test/sparse/add.cu
@@ -44,14 +44,14 @@ struct CSRAddInputs {
 };
 
 template <typename Type_f, typename Index_>
-class CSRAddTest : public ::testing::TestWithParam<CSRAddInputs<Type_f, Index_>> {
+class CSRAddTest
+  : public ::testing::TestWithParam<CSRAddInputs<Type_f, Index_>> {
  protected:
-  void SetUp() override
-  {
-    params     = ::testing::TestWithParam<CSRAddInputs<Type_f, Index_>>::GetParam();
-    n_rows     = params.matrix_a.row_ind.size();
-    nnz_a      = params.matrix_a.row_ind_ptr.size();
-    nnz_b      = params.matrix_b.row_ind_ptr.size();
+  void SetUp() override {
+    params = ::testing::TestWithParam<CSRAddInputs<Type_f, Index_>>::GetParam();
+    n_rows = params.matrix_a.row_ind.size();
+    nnz_a = params.matrix_a.row_ind_ptr.size();
+    nnz_b = params.matrix_b.row_ind_ptr.size();
     nnz_result = params.matrix_verify.row_ind_ptr.size();
 
     cudaStreamCreate(&stream);
@@ -73,61 +73,46 @@ class CSRAddTest : public ::testing::TestWithParam<CSRAddInputs<Type_f, Index_>>
     raft::allocate(values_result, nnz_result);
   }
 
-  void Run()
-  {
-    std::shared_ptr<raft::mr::device::allocator> alloc(new raft::mr::device::default_allocator);
+  void Run() {
+    std::shared_ptr<raft::mr::device::allocator> alloc(
+      new raft::mr::device::default_allocator);
 
     raft::update_device(ind_a, params.matrix_a.row_ind.data(), n_rows, stream);
-    raft::update_device(ind_ptr_a, params.matrix_a.row_ind_ptr.data(), nnz_a, stream);
+    raft::update_device(ind_ptr_a, params.matrix_a.row_ind_ptr.data(), nnz_a,
+                        stream);
     raft::update_device(values_a, params.matrix_a.values.data(), nnz_a, stream);
 
     raft::update_device(ind_b, params.matrix_b.row_ind.data(), n_rows, stream);
-    raft::update_device(ind_ptr_b, params.matrix_b.row_ind_ptr.data(), nnz_b, stream);
+    raft::update_device(ind_ptr_b, params.matrix_b.row_ind_ptr.data(), nnz_b,
+                        stream);
     raft::update_device(values_b, params.matrix_b.values.data(), nnz_b, stream);
 
-    raft::update_device(ind_verify, params.matrix_verify.row_ind.data(), n_rows, stream);
-    raft::update_device(
-      ind_ptr_verify, params.matrix_verify.row_ind_ptr.data(), nnz_result, stream);
-    raft::update_device(values_verify, params.matrix_verify.values.data(), nnz_result, stream);
-
-    Index_ nnz = linalg::csr_add_calc_inds<Type_f, 32>(ind_a,
-                                                       ind_ptr_a,
-                                                       values_a,
-                                                       nnz_a,
-                                                       ind_b,
-                                                       ind_ptr_b,
-                                                       values_b,
-                                                       nnz_b,
-                                                       n_rows,
-                                                       ind_result,
-                                                       alloc,
-                                                       stream);
+    raft::update_device(ind_verify, params.matrix_verify.row_ind.data(), n_rows,
+                        stream);
+    raft::update_device(ind_ptr_verify, params.matrix_verify.row_ind_ptr.data(),
+                        nnz_result, stream);
+    raft::update_device(values_verify, params.matrix_verify.values.data(),
+                        nnz_result, stream);
+
+    Index_ nnz = linalg::csr_add_calc_inds<Type_f, 32>(
+      ind_a, ind_ptr_a, values_a, nnz_a, ind_b, ind_ptr_b, values_b, nnz_b,
+      n_rows, ind_result, alloc, stream);
 
     ASSERT_TRUE(nnz == nnz_result);
-    ASSERT_TRUE(raft::devArrMatch<Index_>(ind_verify, ind_result, n_rows, raft::Compare<Index_>()));
-
-    linalg::csr_add_finalize<Type_f, 32>(ind_a,
-                                         ind_ptr_a,
-                                         values_a,
-                                         nnz_a,
-                                         ind_b,
-                                         ind_ptr_b,
-                                         values_b,
-                                         nnz_b,
-                                         n_rows,
-                                         ind_result,
-                                         ind_ptr_result,
-                                         values_result,
-                                         stream);
-
-    ASSERT_TRUE(
-      raft::devArrMatch<Index_>(ind_ptr_verify, ind_ptr_result, nnz, raft::Compare<Index_>()));
-    ASSERT_TRUE(
-      raft::devArrMatch<Type_f>(values_verify, values_result, nnz, raft::Compare<Type_f>()));
+    ASSERT_TRUE(raft::devArrMatch<Index_>(ind_verify, ind_result, n_rows,
+                                          raft::Compare<Index_>()));
+
+    linalg::csr_add_finalize<Type_f, 32>(
+      ind_a, ind_ptr_a, values_a, nnz_a, ind_b, ind_ptr_b, values_b, nnz_b,
+      n_rows, ind_result, ind_ptr_result, values_result, stream);
+
+    ASSERT_TRUE(raft::devArrMatch<Index_>(ind_ptr_verify, ind_ptr_result, nnz,
+                                          raft::Compare<Index_>()));
+    ASSERT_TRUE(raft::devArrMatch<Type_f>(values_verify, values_result, nnz,
+                                          raft::Compare<Type_f>()));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(ind_a));
     CUDA_CHECK(cudaFree(ind_b));
     CUDA_CHECK(cudaFree(ind_result));
@@ -146,8 +131,8 @@ class CSRAddTest : public ::testing::TestWithParam<CSRAddInputs<Type_f, Index_>>
   CSRAddInputs<Type_f, Index_> params;
   cudaStream_t stream;
   Index_ n_rows, nnz_a, nnz_b, nnz_result;
-  Index_ *ind_a, *ind_b, *ind_verify, *ind_result, *ind_ptr_a, *ind_ptr_b, *ind_ptr_verify,
-    *ind_ptr_result;
+  Index_ *ind_a, *ind_b, *ind_verify, *ind_result, *ind_ptr_a, *ind_ptr_b,
+    *ind_ptr_verify, *ind_ptr_result;
   Type_f *values_a, *values_b, *values_verify, *values_result;
 };
 
@@ -180,8 +165,10 @@ const std::vector<CSRAddInputs<double, int>> csradd_inputs_d = {
     {2.0, 2.0, 0.5, 1.0, 0.5, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}}},
 };
 
-INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestF, ::testing::ValuesIn(csradd_inputs_f));
-INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestD, ::testing::ValuesIn(csradd_inputs_d));
+INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestF,
+                        ::testing::ValuesIn(csradd_inputs_f));
+INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestD,
+                        ::testing::ValuesIn(csradd_inputs_d));
 
 }  // namespace sparse
 }  // namespace raft
diff --git a/cpp/test/sparse/connect_components.cu b/cpp/test/sparse/connect_components.cu
index 3678d34bbe..d98f9de9c3 100644
--- a/cpp/test/sparse/connect_components.cu
+++ b/cpp/test/sparse/connect_components.cu
@@ -51,24 +51,26 @@ struct ConnectComponentsInputs {
 };
 
 template <typename value_idx, typename value_t>
-class ConnectComponentsTest
-  : public ::testing::TestWithParam<ConnectComponentsInputs<value_t, value_idx>> {
+class ConnectComponentsTest : public ::testing::TestWithParam<
+                                ConnectComponentsInputs<value_t, value_idx>> {
  protected:
-  void basicTest()
-  {
+  void basicTest() {
     raft::handle_t handle;
 
     auto d_alloc = handle.get_device_allocator();
-    auto stream  = handle.get_stream();
+    auto stream = handle.get_stream();
 
-    params = ::testing::TestWithParam<ConnectComponentsInputs<value_t, value_idx>>::GetParam();
+    params = ::testing::TestWithParam<
+      ConnectComponentsInputs<value_t, value_idx>>::GetParam();
 
-    raft::sparse::COO<value_t, value_idx> out_edges(handle.get_device_allocator(),
-                                                    handle.get_stream());
+    raft::sparse::COO<value_t, value_idx> out_edges(
+      handle.get_device_allocator(), handle.get_stream());
 
-    rmm::device_uvector<value_t> data(params.n_row * params.n_col, handle.get_stream());
+    rmm::device_uvector<value_t> data(params.n_row * params.n_col,
+                                      handle.get_stream());
 
-    raft::copy(data.data(), params.data.data(), data.size(), handle.get_stream());
+    raft::copy(data.data(), params.data.data(), data.size(),
+               handle.get_stream());
 
     rmm::device_uvector<value_idx> indptr(params.n_row + 1, stream);
 
@@ -77,58 +79,44 @@ class ConnectComponentsTest
      */
     raft::sparse::COO<value_t, value_idx> knn_graph_coo(d_alloc, stream);
 
-    raft::sparse::selection::knn_graph(handle,
-                                       data.data(),
-                                       params.n_row,
-                                       params.n_col,
-                                       raft::distance::DistanceType::L2SqrtExpanded,
-                                       knn_graph_coo,
-                                       params.c);
+    raft::sparse::selection::knn_graph(
+      handle, data.data(), params.n_row, params.n_col,
+      raft::distance::DistanceType::L2SqrtExpanded, knn_graph_coo, params.c);
 
-    raft::sparse::convert::sorted_coo_to_csr(
-      knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), params.n_row + 1, d_alloc, stream);
+    raft::sparse::convert::sorted_coo_to_csr(knn_graph_coo.rows(),
+                                             knn_graph_coo.nnz, indptr.data(),
+                                             params.n_row + 1, d_alloc, stream);
 
     /**
      * 2. Construct MST, sorted by weights
      */
     rmm::device_uvector<value_idx> colors(params.n_row, stream);
 
-    auto mst_coo = raft::mst::mst<value_idx, value_idx, value_t, double>(handle,
-                                                                         indptr.data(),
-                                                                         knn_graph_coo.cols(),
-                                                                         knn_graph_coo.vals(),
-                                                                         params.n_row,
-                                                                         knn_graph_coo.nnz,
-                                                                         colors.data(),
-                                                                         stream,
-                                                                         false,
-                                                                         true);
+    auto mst_coo = raft::mst::mst<value_idx, value_idx, value_t, double>(
+      handle, indptr.data(), knn_graph_coo.cols(), knn_graph_coo.vals(),
+      params.n_row, knn_graph_coo.nnz, colors.data(), stream, false, true);
 
     /**
      * 3. connect_components to fix connectivities
      */
-    raft::linkage::FixConnectivitiesRedOp<value_idx, value_t> red_op(colors.data(), params.n_row);
+    raft::linkage::FixConnectivitiesRedOp<value_idx, value_t> red_op(
+      colors.data(), params.n_row);
     raft::linkage::connect_components<value_idx, value_t>(
-      handle, out_edges, data.data(), colors.data(), params.n_row, params.n_col, red_op);
+      handle, out_edges, data.data(), colors.data(), params.n_row, params.n_col,
+      red_op);
 
     /**
      * Construct final edge list
      */
     rmm::device_uvector<value_idx> indptr2(params.n_row + 1, stream);
 
-    raft::sparse::convert::sorted_coo_to_csr(
-      out_edges.rows(), out_edges.nnz, indptr2.data(), params.n_row + 1, d_alloc, stream);
+    raft::sparse::convert::sorted_coo_to_csr(out_edges.rows(), out_edges.nnz,
+                                             indptr2.data(), params.n_row + 1,
+                                             d_alloc, stream);
 
-    auto output_mst = raft::mst::mst<value_idx, value_idx, value_t>(handle,
-                                                                    indptr2.data(),
-                                                                    out_edges.cols(),
-                                                                    out_edges.vals(),
-                                                                    params.n_row,
-                                                                    out_edges.nnz,
-                                                                    colors.data(),
-                                                                    stream,
-                                                                    false,
-                                                                    false);
+    auto output_mst = raft::mst::mst<value_idx, value_idx, value_t>(
+      handle, indptr2.data(), out_edges.cols(), out_edges.vals(), params.n_row,
+      out_edges.nnz, colors.data(), stream, false, false);
 
     CUDA_CHECK(cudaStreamSynchronize(stream));
 
@@ -150,199 +138,366 @@ const std::vector<ConnectComponentsInputs<float, int>> fix_conn_inputsf2 = {
   // Test n_clusters == n_points
   {10,
    5,
-   {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, 0.77782677, 0.43772379,
-    0.4035871,  0.3282796,  0.47544681, 0.59862974, 0.12319357, 0.06239463, 0.28200272, 0.1345717,
-    0.50498218, 0.5113505,  0.16233086, 0.62165332, 0.42281548, 0.933117,   0.41386077, 0.23264562,
-    0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674,  0.84854131, 0.28890216,
-    0.85267903, 0.74703138, 0.83842071, 0.34942792, 0.27864171, 0.70911132, 0.21338564, 0.32035554,
-    0.73788331, 0.46926692, 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396,
+   {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392,
+    0.77782677, 0.43772379, 0.4035871,  0.3282796,  0.47544681, 0.59862974,
+    0.12319357, 0.06239463, 0.28200272, 0.1345717,  0.50498218, 0.5113505,
+    0.16233086, 0.62165332, 0.42281548, 0.933117,   0.41386077, 0.23264562,
+    0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674,
+    0.84854131, 0.28890216, 0.85267903, 0.74703138, 0.83842071, 0.34942792,
+    0.27864171, 0.70911132, 0.21338564, 0.32035554, 0.73788331, 0.46926692,
+    0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396,
     0.76166195, 0.66613745},
    -1},
   // Test n_points == 100
   {100,
    10,
-   {6.26168372e-01, 9.30437651e-01, 6.02450208e-01, 2.73025296e-01, 9.53050619e-01, 3.32164396e-01,
-    6.88942598e-01, 5.79163537e-01, 6.70341547e-01, 2.70140602e-02, 9.30429671e-01, 7.17721157e-01,
-    9.89948537e-01, 7.75253347e-01, 1.34491522e-02, 2.48522428e-02, 3.51413378e-01, 7.64405834e-01,
-    7.86373507e-01, 7.18748577e-01, 8.66998621e-01, 6.80316582e-01, 2.51288712e-01, 4.91078420e-01,
-    3.76246281e-01, 4.86828710e-01, 5.67464772e-01, 5.30734742e-01, 8.99478296e-01, 7.66699088e-01,
-    9.49339111e-01, 3.55248484e-01, 9.06046929e-01, 4.48407772e-01, 6.96395305e-01, 2.44277335e-01,
-    7.74840000e-01, 5.21046603e-01, 4.66423971e-02, 5.12019638e-02, 8.95019614e-01, 5.28956953e-01,
-    4.31536306e-01, 5.83857744e-01, 4.41787364e-01, 4.68656523e-01, 5.73971433e-01, 6.79989654e-01,
-    3.19650588e-01, 6.12579596e-01, 6.49126442e-02, 8.39131142e-01, 2.85252117e-01, 5.84848929e-01,
-    9.46507115e-01, 8.58440748e-01, 3.61528940e-01, 2.44215959e-01, 3.80101125e-01, 4.57128957e-02,
-    8.82216988e-01, 8.31498633e-01, 7.23474381e-01, 7.75788607e-01, 1.40864146e-01, 6.62092382e-01,
-    5.13985168e-01, 3.00686418e-01, 8.70109949e-01, 2.43187753e-01, 2.89391938e-01, 2.84214238e-01,
-    8.70985521e-01, 8.77491176e-01, 6.72537226e-01, 3.30929686e-01, 1.85934324e-01, 9.16222614e-01,
-    6.18239142e-01, 2.64768597e-01, 5.76145451e-01, 8.62961369e-01, 6.84757925e-01, 7.60549082e-01,
-    1.27645356e-01, 4.51004673e-01, 3.92292980e-01, 4.63170803e-01, 4.35449330e-02, 2.17583404e-01,
-    5.71832605e-02, 2.06763039e-01, 3.70116249e-01, 2.09750028e-01, 6.17283019e-01, 8.62549231e-01,
-    9.84156240e-02, 2.66249156e-01, 3.87635103e-01, 2.85591012e-02, 4.24826068e-01, 4.45795088e-01,
-    6.86227676e-01, 1.08848960e-01, 5.96731841e-02, 3.71770228e-01, 1.91548833e-01, 6.95136078e-01,
-    9.00700636e-01, 8.76363105e-01, 2.67334632e-01, 1.80619709e-01, 7.94060419e-01, 1.42854171e-02,
-    1.09372387e-01, 8.74028108e-01, 6.46403232e-01, 4.86588834e-01, 5.93446175e-02, 6.11886291e-01,
-    8.83865057e-01, 3.15879821e-01, 2.27043992e-01, 9.76764951e-01, 6.15620336e-01, 9.76199360e-01,
-    2.40548962e-01, 3.21795663e-01, 8.75087904e-02, 8.11234663e-01, 6.96070480e-01, 8.12062321e-01,
-    1.21958818e-01, 3.44348628e-02, 8.72630414e-01, 3.06162776e-01, 1.76043529e-02, 9.45894971e-01,
-    5.33896401e-01, 6.21642973e-01, 4.93062535e-01, 4.48984262e-01, 2.24560379e-01, 4.24052195e-02,
-    4.43447610e-01, 8.95646149e-01, 6.05220676e-01, 1.81840491e-01, 9.70831206e-01, 2.12563586e-02,
-    6.92582693e-01, 7.55946922e-01, 7.95086143e-01, 6.05328941e-01, 3.99350764e-01, 4.32846636e-01,
-    9.81114529e-01, 4.98266428e-01, 6.37127930e-03, 1.59085889e-01, 6.34682067e-05, 5.59429440e-01,
-    7.38827633e-01, 8.93214770e-01, 2.16494306e-01, 9.35430573e-02, 4.75665868e-02, 7.80503518e-01,
-    7.86240041e-01, 7.06854594e-01, 2.13725879e-02, 7.68246091e-01, 4.50234808e-01, 5.21231104e-01,
-    5.01989826e-03, 4.22081572e-02, 1.65337732e-01, 8.54134740e-01, 4.99430262e-01, 8.94525601e-01,
-    1.14028379e-01, 3.69739861e-01, 1.32955599e-01, 2.65563824e-01, 2.52811151e-01, 1.44792843e-01,
-    6.88449594e-01, 4.44921417e-01, 8.23296587e-01, 1.93266317e-01, 1.19033309e-01, 1.36368966e-01,
-    3.42600285e-01, 5.64505195e-01, 5.57594559e-01, 7.44257892e-01, 8.38231569e-02, 4.11548847e-01,
-    3.21010077e-01, 8.55081359e-01, 4.30105779e-01, 1.16229135e-01, 9.87731964e-02, 3.14712335e-01,
-    4.50880592e-01, 2.72289598e-01, 6.31615256e-01, 8.97432958e-01, 4.44764250e-01, 8.03776440e-01,
-    2.68767748e-02, 2.43374608e-01, 4.02141103e-01, 4.98881209e-01, 5.33173003e-01, 8.82890436e-01,
-    7.16149148e-01, 4.19664401e-01, 2.29335357e-01, 2.88637806e-01, 3.44696803e-01, 6.78171906e-01,
-    5.69849716e-01, 5.86454477e-01, 3.54474989e-01, 9.03876540e-01, 6.45980000e-01, 6.34887593e-01,
-    7.88039746e-02, 2.04814126e-01, 7.82251754e-01, 2.43147074e-01, 7.50951808e-01, 1.72799092e-02,
-    2.95349590e-01, 6.57991826e-01, 8.81214312e-01, 5.73970708e-01, 2.77610881e-01, 1.82155097e-01,
-    7.69797417e-02, 6.44792402e-01, 9.46950998e-01, 7.73064845e-01, 6.04733624e-01, 5.80094567e-01,
-    1.67498426e-01, 2.66514296e-01, 6.50140368e-01, 1.91170299e-01, 2.08752199e-01, 3.01664091e-01,
-    9.85033484e-01, 2.92909152e-01, 8.65816607e-01, 1.85222119e-01, 2.28814559e-01, 1.34286382e-02,
-    2.89234322e-01, 8.18668708e-01, 4.71706924e-01, 9.23199803e-01, 2.80879188e-01, 1.47319284e-01,
-    4.13915748e-01, 9.31274932e-02, 6.66322195e-01, 9.66953974e-01, 3.19405786e-01, 6.69486551e-01,
-    5.03096313e-02, 6.95225201e-01, 5.78469859e-01, 6.29481655e-01, 1.39252534e-01, 1.22564968e-01,
-    6.80663678e-01, 6.34607157e-01, 6.42765834e-01, 1.57127410e-02, 2.92132086e-01, 5.24423878e-01,
-    4.68676824e-01, 2.86003928e-01, 7.18608322e-01, 8.95617933e-01, 5.48844309e-01, 1.74517278e-01,
-    5.24379196e-01, 2.13526524e-01, 5.88375435e-01, 9.88560185e-01, 4.17435771e-01, 6.14438688e-01,
-    9.53760881e-01, 5.27151288e-01, 7.03017278e-01, 3.44448559e-01, 4.47059676e-01, 2.83414901e-01,
-    1.98979011e-01, 4.24917361e-01, 5.73172761e-01, 2.32398853e-02, 1.65887230e-01, 4.05552785e-01,
-    9.29665524e-01, 2.26135696e-01, 9.20563384e-01, 7.65259963e-01, 4.54820075e-01, 8.97710267e-01,
-    3.78559302e-03, 9.15219382e-01, 3.55705698e-01, 6.94905124e-01, 8.58540202e-01, 3.89790666e-01,
-    2.49478206e-01, 7.93679304e-01, 4.75830027e-01, 4.40425353e-01, 3.70579459e-01, 1.40578049e-01,
-    1.70386675e-01, 7.04056121e-01, 4.85963102e-01, 9.68450060e-01, 6.77178001e-01, 2.65934654e-01,
-    2.58915007e-01, 6.70052890e-01, 2.61945109e-01, 8.46207759e-01, 1.01928951e-01, 2.85611334e-01,
-    2.45776933e-01, 2.66658783e-01, 3.71724077e-01, 4.34319025e-01, 4.24407347e-01, 7.15417683e-01,
-    8.07997684e-01, 1.64296275e-01, 6.01638065e-01, 8.60606804e-02, 2.68719187e-01, 5.11764101e-01,
-    9.75844338e-01, 7.81226782e-01, 2.20925515e-01, 7.18135040e-01, 9.82395577e-01, 8.39160243e-01,
-    9.08058083e-01, 6.88010677e-01, 8.14271847e-01, 5.12460821e-01, 1.17311345e-01, 5.96075228e-01,
-    9.17455497e-01, 2.12052706e-01, 7.04074603e-01, 8.72872565e-02, 8.76047818e-01, 6.96235046e-01,
-    8.54801557e-01, 2.49729159e-01, 9.76594604e-01, 2.87386363e-01, 2.36461559e-02, 9.94075254e-01,
-    4.25193986e-01, 7.61869994e-01, 5.13334255e-01, 6.44711165e-02, 8.92156689e-01, 3.55235167e-01,
-    1.08154647e-01, 8.78446825e-01, 2.43833016e-01, 9.23071293e-01, 2.72724115e-01, 9.46631338e-01,
-    3.74510294e-01, 4.08451278e-02, 9.78392777e-01, 3.65079221e-01, 6.37199516e-01, 5.51144906e-01,
-    5.25978080e-01, 1.42803678e-01, 4.05451674e-01, 7.79788219e-01, 6.26009784e-01, 3.35249497e-01,
-    1.43159543e-02, 1.80363779e-01, 5.05096904e-01, 2.82619947e-01, 5.83561392e-01, 3.10951324e-01,
-    8.73223968e-01, 4.38545619e-01, 4.81348800e-01, 6.68497085e-01, 3.79345401e-01, 9.58832501e-01,
-    1.89869550e-01, 2.34083070e-01, 2.94066207e-01, 5.74892667e-02, 6.92106828e-02, 9.61127686e-02,
-    6.72650672e-02, 8.47345378e-01, 2.80916761e-01, 7.32177357e-03, 9.80785961e-01, 5.73192225e-02,
-    8.48781331e-01, 8.83225408e-01, 7.34398275e-01, 7.70381941e-01, 6.20778343e-01, 8.96822048e-01,
-    5.40732486e-01, 3.69704071e-01, 5.77305837e-01, 2.08221827e-01, 7.34275341e-01, 1.06110900e-01,
-    3.49496706e-01, 8.34948910e-01, 1.56403291e-02, 6.78576376e-01, 8.96141268e-01, 5.94835119e-01,
-    1.43943153e-01, 3.49618530e-01, 2.10440392e-01, 3.46585620e-01, 1.05153093e-01, 3.45446174e-01,
-    2.72177079e-01, 7.07946300e-01, 4.33717726e-02, 3.31232203e-01, 3.91874320e-01, 4.76338141e-01,
-    6.22777789e-01, 2.95989228e-02, 4.32855769e-01, 7.61049310e-01, 3.63279149e-01, 9.47210350e-01,
-    6.43721247e-01, 6.58025802e-01, 1.05247633e-02, 5.29974442e-01, 7.30675767e-01, 4.30041079e-01,
-    6.62634841e-01, 8.25936616e-01, 9.91253704e-01, 6.79399281e-01, 5.44177006e-01, 7.52876048e-01,
-    3.32139049e-01, 7.98732398e-01, 7.38865223e-01, 9.16055132e-01, 6.11736493e-01, 9.63672879e-01,
-    1.83778839e-01, 7.27558919e-02, 5.91602822e-01, 3.25235484e-01, 2.34741217e-01, 9.52346277e-01,
-    9.18556407e-01, 9.35373324e-01, 6.89209070e-01, 2.56049054e-01, 6.17975395e-01, 7.82285691e-01,
-    9.84983432e-01, 6.62322741e-01, 2.04144457e-01, 3.98446577e-01, 1.38918297e-01, 3.05919921e-01,
-    3.14043787e-01, 5.91072666e-01, 7.44703771e-01, 8.92272567e-01, 9.78017873e-01, 9.01203161e-01,
-    1.41526372e-01, 4.14878484e-01, 6.80683651e-01, 5.01733152e-02, 8.14635389e-01, 2.27926375e-01,
-    9.03269815e-01, 8.68443745e-01, 9.86939190e-01, 7.40779486e-01, 2.61005311e-01, 3.19276232e-01,
-    9.69509248e-01, 1.11908818e-01, 4.49198556e-01, 1.27056715e-01, 3.84064823e-01, 5.14591811e-01,
-    2.10747488e-01, 9.53884090e-01, 8.43167950e-01, 4.51187972e-01, 3.75331782e-01, 6.23566461e-01,
-    3.55290379e-01, 2.95705968e-01, 1.69622690e-01, 1.42981830e-01, 2.72180991e-01, 9.46468040e-01,
-    3.70932500e-01, 9.94292830e-01, 4.62587505e-01, 7.14817405e-01, 2.45370540e-02, 3.00906377e-01,
-    5.75768304e-01, 9.71448393e-01, 6.95574827e-02, 3.93693854e-01, 5.29306116e-01, 5.04694554e-01,
-    6.73797120e-02, 6.76596969e-01, 5.50948898e-01, 3.24909641e-01, 7.70337719e-01, 6.51842631e-03,
-    3.03264879e-01, 7.61037886e-03, 2.72289601e-01, 1.50502041e-01, 6.71103888e-02, 7.41503703e-01,
-    1.92088941e-01, 2.19043977e-01, 9.09320161e-01, 2.37993569e-01, 6.18107973e-02, 8.31447852e-01,
-    2.23355609e-01, 1.84789435e-01, 4.16104518e-01, 4.21573859e-01, 8.72446305e-02, 2.97294197e-01,
-    4.50328256e-01, 8.72199917e-01, 2.51279916e-01, 4.86219272e-01, 7.57071329e-01, 4.85655942e-01,
-    1.06187277e-01, 4.92341327e-01, 1.46017513e-01, 5.25421017e-01, 4.22637906e-01, 2.24685018e-01,
-    8.72648431e-01, 5.54051490e-01, 1.80745062e-01, 2.12756336e-01, 5.20883169e-01, 7.60363654e-01,
-    8.30254678e-01, 5.00003328e-01, 4.69017439e-01, 6.38105527e-01, 3.50638261e-02, 5.22217353e-02,
-    9.06516882e-02, 8.52975842e-01, 1.19985883e-01, 3.74926753e-01, 6.50302066e-01, 1.98875727e-01,
-    6.28362507e-02, 4.32693501e-01, 3.10500685e-01, 6.20732833e-01, 4.58503272e-01, 3.20790034e-01,
-    7.91284868e-01, 7.93054570e-01, 2.93406765e-01, 8.95399023e-01, 1.06441034e-01, 7.53085241e-02,
-    8.67523104e-01, 1.47963482e-01, 1.25584706e-01, 3.81545040e-02, 6.34338619e-01, 1.76368938e-02,
-    5.75553531e-02, 5.31607516e-01, 2.63869588e-01, 9.41945823e-01, 9.24028838e-02, 5.21496463e-01,
-    7.74866558e-01, 5.65210610e-01, 7.28015327e-02, 6.51963790e-01, 8.94727453e-01, 4.49571590e-01,
-    1.29932405e-01, 8.64026259e-01, 9.92599934e-01, 7.43721560e-01, 8.87300215e-01, 1.06369925e-01,
-    8.11335531e-01, 7.87734900e-01, 9.87344678e-01, 5.32502820e-01, 4.42612382e-01, 9.64041183e-01,
-    1.66085871e-01, 1.12937664e-01, 5.24423470e-01, 6.54689333e-01, 4.59119726e-01, 5.22774091e-01,
-    3.08722276e-02, 6.26979315e-01, 4.49754105e-01, 8.07495757e-01, 2.34199499e-01, 1.67765675e-01,
-    9.22168418e-01, 3.73210378e-01, 8.04432575e-01, 5.61890354e-01, 4.47025593e-01, 6.43155678e-01,
-    2.40407640e-01, 5.91631279e-01, 1.59369206e-01, 7.75799090e-01, 8.32067212e-01, 5.59791576e-02,
-    6.39105224e-01, 4.85274738e-01, 2.12630838e-01, 2.81431312e-02, 7.16205363e-01, 6.83885011e-01,
-    5.23869697e-01, 9.99418314e-01, 8.35331599e-01, 4.69877463e-02, 6.74712562e-01, 7.99273684e-01,
-    2.77001890e-02, 5.75809742e-01, 2.78513031e-01, 8.36209905e-01, 7.25472379e-01, 4.87173943e-01,
-    7.88311357e-01, 9.64676177e-01, 1.75752651e-01, 4.98112580e-01, 8.08850418e-02, 6.40981131e-01,
-    4.06647450e-01, 8.46539387e-01, 2.12620694e-01, 9.11012851e-01, 8.25041445e-01, 8.90065575e-01,
-    9.63626055e-01, 5.96689242e-01, 1.63372670e-01, 4.51640148e-01, 3.43026542e-01, 5.80658851e-01,
-    2.82327625e-01, 4.75535418e-01, 6.27760926e-01, 8.46314115e-01, 9.61961932e-01, 3.19806094e-01,
-    5.05508062e-01, 5.28102944e-01, 6.13045057e-01, 7.44714938e-01, 1.50586073e-01, 7.91878033e-01,
-    4.89839179e-01, 3.10496849e-01, 8.82309038e-01, 2.86922314e-01, 4.84687559e-01, 5.20838630e-01,
-    4.62955493e-01, 2.38185305e-01, 5.47259907e-02, 7.10916137e-01, 7.31887202e-01, 6.25602317e-01,
-    8.77741168e-01, 4.19881322e-01, 4.81222328e-01, 1.28224501e-01, 2.46034010e-01, 3.34971854e-01,
-    7.37216484e-01, 5.62134821e-02, 7.14089724e-01, 9.85549393e-01, 4.66295827e-01, 3.08722434e-03,
-    4.70237690e-01, 2.66524167e-01, 7.93875484e-01, 4.54795911e-02, 8.09702944e-01, 1.47709735e-02,
-    1.70082405e-01, 6.35905179e-01, 3.75379109e-01, 4.30315011e-01, 3.15788760e-01, 5.58065230e-01,
-    2.24643800e-01, 2.42142981e-01, 6.57283636e-01, 3.34921891e-01, 1.26588975e-01, 7.68064155e-01,
-    9.43856291e-01, 4.47518596e-01, 5.44453573e-01, 9.95764932e-01, 7.16444391e-01, 8.51019765e-01,
-    1.01179183e-01, 4.45473958e-01, 4.60327322e-01, 4.96895844e-02, 4.72907738e-01, 5.58987444e-01,
-    3.41027487e-01, 1.56175026e-01, 7.58283148e-01, 6.83600909e-01, 2.14623396e-01, 3.27348880e-01,
-    3.92517893e-01, 6.70418431e-01, 5.16440832e-01, 8.63140348e-01, 5.73277464e-01, 3.46608058e-01,
-    7.39396341e-01, 7.20852434e-01, 2.35653246e-02, 3.89935659e-01, 7.53783745e-01, 6.34563528e-01,
-    8.79339335e-01, 7.41599159e-02, 5.62433904e-01, 6.15553852e-01, 4.56956324e-01, 5.20047447e-01,
-    5.26845015e-02, 5.58471266e-01, 1.63632233e-01, 5.38936665e-02, 6.49593683e-01, 2.56838748e-01,
-    8.99035326e-01, 7.20847756e-01, 5.68954684e-01, 7.43684755e-01, 5.70924238e-01, 3.82318724e-01,
-    4.89328290e-01, 5.62208561e-01, 4.97540804e-02, 4.18011085e-01, 6.88041565e-01, 2.16234653e-01,
-    7.89548214e-01, 8.46136387e-01, 8.46816189e-01, 1.73842353e-01, 6.11627842e-02, 8.44440559e-01,
-    4.50646654e-01, 3.74785037e-01, 4.87196697e-01, 4.56276448e-01, 9.13284391e-01, 4.15715464e-01,
-    7.13597697e-01, 1.23641270e-02, 5.10031271e-01, 4.74601930e-02, 2.55731159e-01, 3.22090006e-01,
-    1.91165703e-01, 4.51170940e-01, 7.50843157e-01, 4.42420576e-01, 4.25380660e-01, 4.50667257e-01,
-    6.55689206e-01, 9.68257670e-02, 1.96528793e-01, 8.97343028e-01, 4.99940904e-01, 6.65504083e-01,
-    9.41828079e-01, 4.54397338e-01, 5.61893331e-01, 5.09839880e-01, 4.53117514e-01, 8.96804127e-02,
-    1.74888861e-01, 6.65641378e-01, 2.81668336e-01, 1.89532742e-01, 5.61668382e-01, 8.68330157e-02,
-    8.25092797e-01, 5.18106324e-01, 1.71904024e-01, 3.68385523e-01, 1.62005436e-01, 7.48507399e-01,
-    9.30274827e-01, 2.38198517e-01, 9.52222901e-01, 5.23587800e-01, 6.94384557e-01, 1.09338652e-01,
-    4.83356794e-01, 2.73050402e-01, 3.68027050e-01, 5.92366466e-01, 1.83192289e-01, 8.60376029e-01,
-    7.13926203e-01, 8.16750052e-01, 1.57890291e-01, 6.25691951e-01, 5.24831646e-01, 1.73873797e-01,
-    1.02429784e-01, 9.17488471e-01, 4.03584434e-01, 9.31170884e-01, 2.79386137e-01, 8.77745206e-01,
-    2.45200576e-01, 1.28896951e-01, 3.15713052e-01, 5.27874291e-01, 2.16444335e-01, 7.03883817e-01,
-    7.74738919e-02, 8.42422142e-01, 3.75598924e-01, 3.51002411e-01, 6.22752776e-01, 4.82407943e-01,
-    7.43107867e-01, 9.46182666e-01, 9.44344819e-01, 3.28124763e-01, 1.06147431e-01, 1.65102684e-01,
-    3.84060507e-01, 2.91057722e-01, 7.68173662e-02, 1.03543651e-01, 6.76698940e-01, 1.43141994e-01,
-    7.21342202e-01, 6.69471294e-03, 9.07298311e-01, 5.57080171e-01, 8.10954489e-01, 4.11120526e-01,
-    2.06407453e-01, 2.59590556e-01, 7.58512718e-01, 5.79873897e-01, 2.92875650e-01, 2.83686529e-01,
-    2.42829343e-01, 9.19323719e-01, 3.46832864e-01, 3.58238858e-01, 7.42827585e-01, 2.05760059e-01,
-    9.58438860e-01, 5.66326411e-01, 6.60292846e-01, 5.61095078e-02, 6.79465531e-01, 7.05118513e-01,
-    4.44713264e-01, 2.09732933e-01, 5.22732436e-01, 1.74396512e-01, 5.29356748e-01, 4.38475687e-01,
-    4.94036404e-01, 4.09785794e-01, 6.40025507e-01, 5.79371821e-01, 1.57726118e-01, 6.04572263e-01,
-    5.41072639e-01, 5.18847173e-01, 1.97093284e-01, 8.91767002e-01, 4.29050835e-01, 8.25490570e-01,
-    3.87699807e-01, 4.50705808e-01, 2.49371643e-01, 3.36074898e-01, 9.29925118e-01, 6.65393649e-01,
-    9.07275994e-01, 3.73075859e-01, 4.14044139e-03, 2.37463702e-01, 2.25893784e-01, 2.46900245e-01,
-    4.50350196e-01, 3.48618117e-01, 5.07193932e-01, 5.23435142e-01, 8.13611417e-01, 8.92715622e-01,
-    1.02623450e-01, 3.06088345e-01, 7.80461650e-01, 2.21453645e-01, 2.01419652e-01, 2.84254457e-01,
-    3.68286735e-01, 7.39358243e-01, 8.97879394e-01, 9.81599566e-01, 7.56526442e-01, 7.37645545e-01,
-    4.23976657e-02, 8.25922012e-01, 2.60956996e-01, 2.90702065e-01, 8.98388344e-01, 3.03733299e-01,
-    8.49071471e-01, 3.45835425e-01, 7.65458276e-01, 5.68094872e-01, 8.93770930e-01, 9.93161641e-01,
-    5.63368667e-02, 4.26548945e-01, 5.46745780e-01, 5.75674571e-01, 7.94599487e-01, 7.18935553e-02,
-    4.46492976e-01, 6.40240123e-01, 2.73246969e-01, 2.00465968e-01, 1.30718835e-01, 1.92492005e-01,
-    1.96617189e-01, 6.61271644e-01, 8.12687657e-01, 8.66342445e-01
+   {6.26168372e-01, 9.30437651e-01, 6.02450208e-01,
+    2.73025296e-01, 9.53050619e-01, 3.32164396e-01,
+    6.88942598e-01, 5.79163537e-01, 6.70341547e-01,
+    2.70140602e-02, 9.30429671e-01, 7.17721157e-01,
+    9.89948537e-01, 7.75253347e-01, 1.34491522e-02,
+    2.48522428e-02, 3.51413378e-01, 7.64405834e-01,
+    7.86373507e-01, 7.18748577e-01, 8.66998621e-01,
+    6.80316582e-01, 2.51288712e-01, 4.91078420e-01,
+    3.76246281e-01, 4.86828710e-01, 5.67464772e-01,
+    5.30734742e-01, 8.99478296e-01, 7.66699088e-01,
+    9.49339111e-01, 3.55248484e-01, 9.06046929e-01,
+    4.48407772e-01, 6.96395305e-01, 2.44277335e-01,
+    7.74840000e-01, 5.21046603e-01, 4.66423971e-02,
+    5.12019638e-02, 8.95019614e-01, 5.28956953e-01,
+    4.31536306e-01, 5.83857744e-01, 4.41787364e-01,
+    4.68656523e-01, 5.73971433e-01, 6.79989654e-01,
+    3.19650588e-01, 6.12579596e-01, 6.49126442e-02,
+    8.39131142e-01, 2.85252117e-01, 5.84848929e-01,
+    9.46507115e-01, 8.58440748e-01, 3.61528940e-01,
+    2.44215959e-01, 3.80101125e-01, 4.57128957e-02,
+    8.82216988e-01, 8.31498633e-01, 7.23474381e-01,
+    7.75788607e-01, 1.40864146e-01, 6.62092382e-01,
+    5.13985168e-01, 3.00686418e-01, 8.70109949e-01,
+    2.43187753e-01, 2.89391938e-01, 2.84214238e-01,
+    8.70985521e-01, 8.77491176e-01, 6.72537226e-01,
+    3.30929686e-01, 1.85934324e-01, 9.16222614e-01,
+    6.18239142e-01, 2.64768597e-01, 5.76145451e-01,
+    8.62961369e-01, 6.84757925e-01, 7.60549082e-01,
+    1.27645356e-01, 4.51004673e-01, 3.92292980e-01,
+    4.63170803e-01, 4.35449330e-02, 2.17583404e-01,
+    5.71832605e-02, 2.06763039e-01, 3.70116249e-01,
+    2.09750028e-01, 6.17283019e-01, 8.62549231e-01,
+    9.84156240e-02, 2.66249156e-01, 3.87635103e-01,
+    2.85591012e-02, 4.24826068e-01, 4.45795088e-01,
+    6.86227676e-01, 1.08848960e-01, 5.96731841e-02,
+    3.71770228e-01, 1.91548833e-01, 6.95136078e-01,
+    9.00700636e-01, 8.76363105e-01, 2.67334632e-01,
+    1.80619709e-01, 7.94060419e-01, 1.42854171e-02,
+    1.09372387e-01, 8.74028108e-01, 6.46403232e-01,
+    4.86588834e-01, 5.93446175e-02, 6.11886291e-01,
+    8.83865057e-01, 3.15879821e-01, 2.27043992e-01,
+    9.76764951e-01, 6.15620336e-01, 9.76199360e-01,
+    2.40548962e-01, 3.21795663e-01, 8.75087904e-02,
+    8.11234663e-01, 6.96070480e-01, 8.12062321e-01,
+    1.21958818e-01, 3.44348628e-02, 8.72630414e-01,
+    3.06162776e-01, 1.76043529e-02, 9.45894971e-01,
+    5.33896401e-01, 6.21642973e-01, 4.93062535e-01,
+    4.48984262e-01, 2.24560379e-01, 4.24052195e-02,
+    4.43447610e-01, 8.95646149e-01, 6.05220676e-01,
+    1.81840491e-01, 9.70831206e-01, 2.12563586e-02,
+    6.92582693e-01, 7.55946922e-01, 7.95086143e-01,
+    6.05328941e-01, 3.99350764e-01, 4.32846636e-01,
+    9.81114529e-01, 4.98266428e-01, 6.37127930e-03,
+    1.59085889e-01, 6.34682067e-05, 5.59429440e-01,
+    7.38827633e-01, 8.93214770e-01, 2.16494306e-01,
+    9.35430573e-02, 4.75665868e-02, 7.80503518e-01,
+    7.86240041e-01, 7.06854594e-01, 2.13725879e-02,
+    7.68246091e-01, 4.50234808e-01, 5.21231104e-01,
+    5.01989826e-03, 4.22081572e-02, 1.65337732e-01,
+    8.54134740e-01, 4.99430262e-01, 8.94525601e-01,
+    1.14028379e-01, 3.69739861e-01, 1.32955599e-01,
+    2.65563824e-01, 2.52811151e-01, 1.44792843e-01,
+    6.88449594e-01, 4.44921417e-01, 8.23296587e-01,
+    1.93266317e-01, 1.19033309e-01, 1.36368966e-01,
+    3.42600285e-01, 5.64505195e-01, 5.57594559e-01,
+    7.44257892e-01, 8.38231569e-02, 4.11548847e-01,
+    3.21010077e-01, 8.55081359e-01, 4.30105779e-01,
+    1.16229135e-01, 9.87731964e-02, 3.14712335e-01,
+    4.50880592e-01, 2.72289598e-01, 6.31615256e-01,
+    8.97432958e-01, 4.44764250e-01, 8.03776440e-01,
+    2.68767748e-02, 2.43374608e-01, 4.02141103e-01,
+    4.98881209e-01, 5.33173003e-01, 8.82890436e-01,
+    7.16149148e-01, 4.19664401e-01, 2.29335357e-01,
+    2.88637806e-01, 3.44696803e-01, 6.78171906e-01,
+    5.69849716e-01, 5.86454477e-01, 3.54474989e-01,
+    9.03876540e-01, 6.45980000e-01, 6.34887593e-01,
+    7.88039746e-02, 2.04814126e-01, 7.82251754e-01,
+    2.43147074e-01, 7.50951808e-01, 1.72799092e-02,
+    2.95349590e-01, 6.57991826e-01, 8.81214312e-01,
+    5.73970708e-01, 2.77610881e-01, 1.82155097e-01,
+    7.69797417e-02, 6.44792402e-01, 9.46950998e-01,
+    7.73064845e-01, 6.04733624e-01, 5.80094567e-01,
+    1.67498426e-01, 2.66514296e-01, 6.50140368e-01,
+    1.91170299e-01, 2.08752199e-01, 3.01664091e-01,
+    9.85033484e-01, 2.92909152e-01, 8.65816607e-01,
+    1.85222119e-01, 2.28814559e-01, 1.34286382e-02,
+    2.89234322e-01, 8.18668708e-01, 4.71706924e-01,
+    9.23199803e-01, 2.80879188e-01, 1.47319284e-01,
+    4.13915748e-01, 9.31274932e-02, 6.66322195e-01,
+    9.66953974e-01, 3.19405786e-01, 6.69486551e-01,
+    5.03096313e-02, 6.95225201e-01, 5.78469859e-01,
+    6.29481655e-01, 1.39252534e-01, 1.22564968e-01,
+    6.80663678e-01, 6.34607157e-01, 6.42765834e-01,
+    1.57127410e-02, 2.92132086e-01, 5.24423878e-01,
+    4.68676824e-01, 2.86003928e-01, 7.18608322e-01,
+    8.95617933e-01, 5.48844309e-01, 1.74517278e-01,
+    5.24379196e-01, 2.13526524e-01, 5.88375435e-01,
+    9.88560185e-01, 4.17435771e-01, 6.14438688e-01,
+    9.53760881e-01, 5.27151288e-01, 7.03017278e-01,
+    3.44448559e-01, 4.47059676e-01, 2.83414901e-01,
+    1.98979011e-01, 4.24917361e-01, 5.73172761e-01,
+    2.32398853e-02, 1.65887230e-01, 4.05552785e-01,
+    9.29665524e-01, 2.26135696e-01, 9.20563384e-01,
+    7.65259963e-01, 4.54820075e-01, 8.97710267e-01,
+    3.78559302e-03, 9.15219382e-01, 3.55705698e-01,
+    6.94905124e-01, 8.58540202e-01, 3.89790666e-01,
+    2.49478206e-01, 7.93679304e-01, 4.75830027e-01,
+    4.40425353e-01, 3.70579459e-01, 1.40578049e-01,
+    1.70386675e-01, 7.04056121e-01, 4.85963102e-01,
+    9.68450060e-01, 6.77178001e-01, 2.65934654e-01,
+    2.58915007e-01, 6.70052890e-01, 2.61945109e-01,
+    8.46207759e-01, 1.01928951e-01, 2.85611334e-01,
+    2.45776933e-01, 2.66658783e-01, 3.71724077e-01,
+    4.34319025e-01, 4.24407347e-01, 7.15417683e-01,
+    8.07997684e-01, 1.64296275e-01, 6.01638065e-01,
+    8.60606804e-02, 2.68719187e-01, 5.11764101e-01,
+    9.75844338e-01, 7.81226782e-01, 2.20925515e-01,
+    7.18135040e-01, 9.82395577e-01, 8.39160243e-01,
+    9.08058083e-01, 6.88010677e-01, 8.14271847e-01,
+    5.12460821e-01, 1.17311345e-01, 5.96075228e-01,
+    9.17455497e-01, 2.12052706e-01, 7.04074603e-01,
+    8.72872565e-02, 8.76047818e-01, 6.96235046e-01,
+    8.54801557e-01, 2.49729159e-01, 9.76594604e-01,
+    2.87386363e-01, 2.36461559e-02, 9.94075254e-01,
+    4.25193986e-01, 7.61869994e-01, 5.13334255e-01,
+    6.44711165e-02, 8.92156689e-01, 3.55235167e-01,
+    1.08154647e-01, 8.78446825e-01, 2.43833016e-01,
+    9.23071293e-01, 2.72724115e-01, 9.46631338e-01,
+    3.74510294e-01, 4.08451278e-02, 9.78392777e-01,
+    3.65079221e-01, 6.37199516e-01, 5.51144906e-01,
+    5.25978080e-01, 1.42803678e-01, 4.05451674e-01,
+    7.79788219e-01, 6.26009784e-01, 3.35249497e-01,
+    1.43159543e-02, 1.80363779e-01, 5.05096904e-01,
+    2.82619947e-01, 5.83561392e-01, 3.10951324e-01,
+    8.73223968e-01, 4.38545619e-01, 4.81348800e-01,
+    6.68497085e-01, 3.79345401e-01, 9.58832501e-01,
+    1.89869550e-01, 2.34083070e-01, 2.94066207e-01,
+    5.74892667e-02, 6.92106828e-02, 9.61127686e-02,
+    6.72650672e-02, 8.47345378e-01, 2.80916761e-01,
+    7.32177357e-03, 9.80785961e-01, 5.73192225e-02,
+    8.48781331e-01, 8.83225408e-01, 7.34398275e-01,
+    7.70381941e-01, 6.20778343e-01, 8.96822048e-01,
+    5.40732486e-01, 3.69704071e-01, 5.77305837e-01,
+    2.08221827e-01, 7.34275341e-01, 1.06110900e-01,
+    3.49496706e-01, 8.34948910e-01, 1.56403291e-02,
+    6.78576376e-01, 8.96141268e-01, 5.94835119e-01,
+    1.43943153e-01, 3.49618530e-01, 2.10440392e-01,
+    3.46585620e-01, 1.05153093e-01, 3.45446174e-01,
+    2.72177079e-01, 7.07946300e-01, 4.33717726e-02,
+    3.31232203e-01, 3.91874320e-01, 4.76338141e-01,
+    6.22777789e-01, 2.95989228e-02, 4.32855769e-01,
+    7.61049310e-01, 3.63279149e-01, 9.47210350e-01,
+    6.43721247e-01, 6.58025802e-01, 1.05247633e-02,
+    5.29974442e-01, 7.30675767e-01, 4.30041079e-01,
+    6.62634841e-01, 8.25936616e-01, 9.91253704e-01,
+    6.79399281e-01, 5.44177006e-01, 7.52876048e-01,
+    3.32139049e-01, 7.98732398e-01, 7.38865223e-01,
+    9.16055132e-01, 6.11736493e-01, 9.63672879e-01,
+    1.83778839e-01, 7.27558919e-02, 5.91602822e-01,
+    3.25235484e-01, 2.34741217e-01, 9.52346277e-01,
+    9.18556407e-01, 9.35373324e-01, 6.89209070e-01,
+    2.56049054e-01, 6.17975395e-01, 7.82285691e-01,
+    9.84983432e-01, 6.62322741e-01, 2.04144457e-01,
+    3.98446577e-01, 1.38918297e-01, 3.05919921e-01,
+    3.14043787e-01, 5.91072666e-01, 7.44703771e-01,
+    8.92272567e-01, 9.78017873e-01, 9.01203161e-01,
+    1.41526372e-01, 4.14878484e-01, 6.80683651e-01,
+    5.01733152e-02, 8.14635389e-01, 2.27926375e-01,
+    9.03269815e-01, 8.68443745e-01, 9.86939190e-01,
+    7.40779486e-01, 2.61005311e-01, 3.19276232e-01,
+    9.69509248e-01, 1.11908818e-01, 4.49198556e-01,
+    1.27056715e-01, 3.84064823e-01, 5.14591811e-01,
+    2.10747488e-01, 9.53884090e-01, 8.43167950e-01,
+    4.51187972e-01, 3.75331782e-01, 6.23566461e-01,
+    3.55290379e-01, 2.95705968e-01, 1.69622690e-01,
+    1.42981830e-01, 2.72180991e-01, 9.46468040e-01,
+    3.70932500e-01, 9.94292830e-01, 4.62587505e-01,
+    7.14817405e-01, 2.45370540e-02, 3.00906377e-01,
+    5.75768304e-01, 9.71448393e-01, 6.95574827e-02,
+    3.93693854e-01, 5.29306116e-01, 5.04694554e-01,
+    6.73797120e-02, 6.76596969e-01, 5.50948898e-01,
+    3.24909641e-01, 7.70337719e-01, 6.51842631e-03,
+    3.03264879e-01, 7.61037886e-03, 2.72289601e-01,
+    1.50502041e-01, 6.71103888e-02, 7.41503703e-01,
+    1.92088941e-01, 2.19043977e-01, 9.09320161e-01,
+    2.37993569e-01, 6.18107973e-02, 8.31447852e-01,
+    2.23355609e-01, 1.84789435e-01, 4.16104518e-01,
+    4.21573859e-01, 8.72446305e-02, 2.97294197e-01,
+    4.50328256e-01, 8.72199917e-01, 2.51279916e-01,
+    4.86219272e-01, 7.57071329e-01, 4.85655942e-01,
+    1.06187277e-01, 4.92341327e-01, 1.46017513e-01,
+    5.25421017e-01, 4.22637906e-01, 2.24685018e-01,
+    8.72648431e-01, 5.54051490e-01, 1.80745062e-01,
+    2.12756336e-01, 5.20883169e-01, 7.60363654e-01,
+    8.30254678e-01, 5.00003328e-01, 4.69017439e-01,
+    6.38105527e-01, 3.50638261e-02, 5.22217353e-02,
+    9.06516882e-02, 8.52975842e-01, 1.19985883e-01,
+    3.74926753e-01, 6.50302066e-01, 1.98875727e-01,
+    6.28362507e-02, 4.32693501e-01, 3.10500685e-01,
+    6.20732833e-01, 4.58503272e-01, 3.20790034e-01,
+    7.91284868e-01, 7.93054570e-01, 2.93406765e-01,
+    8.95399023e-01, 1.06441034e-01, 7.53085241e-02,
+    8.67523104e-01, 1.47963482e-01, 1.25584706e-01,
+    3.81545040e-02, 6.34338619e-01, 1.76368938e-02,
+    5.75553531e-02, 5.31607516e-01, 2.63869588e-01,
+    9.41945823e-01, 9.24028838e-02, 5.21496463e-01,
+    7.74866558e-01, 5.65210610e-01, 7.28015327e-02,
+    6.51963790e-01, 8.94727453e-01, 4.49571590e-01,
+    1.29932405e-01, 8.64026259e-01, 9.92599934e-01,
+    7.43721560e-01, 8.87300215e-01, 1.06369925e-01,
+    8.11335531e-01, 7.87734900e-01, 9.87344678e-01,
+    5.32502820e-01, 4.42612382e-01, 9.64041183e-01,
+    1.66085871e-01, 1.12937664e-01, 5.24423470e-01,
+    6.54689333e-01, 4.59119726e-01, 5.22774091e-01,
+    3.08722276e-02, 6.26979315e-01, 4.49754105e-01,
+    8.07495757e-01, 2.34199499e-01, 1.67765675e-01,
+    9.22168418e-01, 3.73210378e-01, 8.04432575e-01,
+    5.61890354e-01, 4.47025593e-01, 6.43155678e-01,
+    2.40407640e-01, 5.91631279e-01, 1.59369206e-01,
+    7.75799090e-01, 8.32067212e-01, 5.59791576e-02,
+    6.39105224e-01, 4.85274738e-01, 2.12630838e-01,
+    2.81431312e-02, 7.16205363e-01, 6.83885011e-01,
+    5.23869697e-01, 9.99418314e-01, 8.35331599e-01,
+    4.69877463e-02, 6.74712562e-01, 7.99273684e-01,
+    2.77001890e-02, 5.75809742e-01, 2.78513031e-01,
+    8.36209905e-01, 7.25472379e-01, 4.87173943e-01,
+    7.88311357e-01, 9.64676177e-01, 1.75752651e-01,
+    4.98112580e-01, 8.08850418e-02, 6.40981131e-01,
+    4.06647450e-01, 8.46539387e-01, 2.12620694e-01,
+    9.11012851e-01, 8.25041445e-01, 8.90065575e-01,
+    9.63626055e-01, 5.96689242e-01, 1.63372670e-01,
+    4.51640148e-01, 3.43026542e-01, 5.80658851e-01,
+    2.82327625e-01, 4.75535418e-01, 6.27760926e-01,
+    8.46314115e-01, 9.61961932e-01, 3.19806094e-01,
+    5.05508062e-01, 5.28102944e-01, 6.13045057e-01,
+    7.44714938e-01, 1.50586073e-01, 7.91878033e-01,
+    4.89839179e-01, 3.10496849e-01, 8.82309038e-01,
+    2.86922314e-01, 4.84687559e-01, 5.20838630e-01,
+    4.62955493e-01, 2.38185305e-01, 5.47259907e-02,
+    7.10916137e-01, 7.31887202e-01, 6.25602317e-01,
+    8.77741168e-01, 4.19881322e-01, 4.81222328e-01,
+    1.28224501e-01, 2.46034010e-01, 3.34971854e-01,
+    7.37216484e-01, 5.62134821e-02, 7.14089724e-01,
+    9.85549393e-01, 4.66295827e-01, 3.08722434e-03,
+    4.70237690e-01, 2.66524167e-01, 7.93875484e-01,
+    4.54795911e-02, 8.09702944e-01, 1.47709735e-02,
+    1.70082405e-01, 6.35905179e-01, 3.75379109e-01,
+    4.30315011e-01, 3.15788760e-01, 5.58065230e-01,
+    2.24643800e-01, 2.42142981e-01, 6.57283636e-01,
+    3.34921891e-01, 1.26588975e-01, 7.68064155e-01,
+    9.43856291e-01, 4.47518596e-01, 5.44453573e-01,
+    9.95764932e-01, 7.16444391e-01, 8.51019765e-01,
+    1.01179183e-01, 4.45473958e-01, 4.60327322e-01,
+    4.96895844e-02, 4.72907738e-01, 5.58987444e-01,
+    3.41027487e-01, 1.56175026e-01, 7.58283148e-01,
+    6.83600909e-01, 2.14623396e-01, 3.27348880e-01,
+    3.92517893e-01, 6.70418431e-01, 5.16440832e-01,
+    8.63140348e-01, 5.73277464e-01, 3.46608058e-01,
+    7.39396341e-01, 7.20852434e-01, 2.35653246e-02,
+    3.89935659e-01, 7.53783745e-01, 6.34563528e-01,
+    8.79339335e-01, 7.41599159e-02, 5.62433904e-01,
+    6.15553852e-01, 4.56956324e-01, 5.20047447e-01,
+    5.26845015e-02, 5.58471266e-01, 1.63632233e-01,
+    5.38936665e-02, 6.49593683e-01, 2.56838748e-01,
+    8.99035326e-01, 7.20847756e-01, 5.68954684e-01,
+    7.43684755e-01, 5.70924238e-01, 3.82318724e-01,
+    4.89328290e-01, 5.62208561e-01, 4.97540804e-02,
+    4.18011085e-01, 6.88041565e-01, 2.16234653e-01,
+    7.89548214e-01, 8.46136387e-01, 8.46816189e-01,
+    1.73842353e-01, 6.11627842e-02, 8.44440559e-01,
+    4.50646654e-01, 3.74785037e-01, 4.87196697e-01,
+    4.56276448e-01, 9.13284391e-01, 4.15715464e-01,
+    7.13597697e-01, 1.23641270e-02, 5.10031271e-01,
+    4.74601930e-02, 2.55731159e-01, 3.22090006e-01,
+    1.91165703e-01, 4.51170940e-01, 7.50843157e-01,
+    4.42420576e-01, 4.25380660e-01, 4.50667257e-01,
+    6.55689206e-01, 9.68257670e-02, 1.96528793e-01,
+    8.97343028e-01, 4.99940904e-01, 6.65504083e-01,
+    9.41828079e-01, 4.54397338e-01, 5.61893331e-01,
+    5.09839880e-01, 4.53117514e-01, 8.96804127e-02,
+    1.74888861e-01, 6.65641378e-01, 2.81668336e-01,
+    1.89532742e-01, 5.61668382e-01, 8.68330157e-02,
+    8.25092797e-01, 5.18106324e-01, 1.71904024e-01,
+    3.68385523e-01, 1.62005436e-01, 7.48507399e-01,
+    9.30274827e-01, 2.38198517e-01, 9.52222901e-01,
+    5.23587800e-01, 6.94384557e-01, 1.09338652e-01,
+    4.83356794e-01, 2.73050402e-01, 3.68027050e-01,
+    5.92366466e-01, 1.83192289e-01, 8.60376029e-01,
+    7.13926203e-01, 8.16750052e-01, 1.57890291e-01,
+    6.25691951e-01, 5.24831646e-01, 1.73873797e-01,
+    1.02429784e-01, 9.17488471e-01, 4.03584434e-01,
+    9.31170884e-01, 2.79386137e-01, 8.77745206e-01,
+    2.45200576e-01, 1.28896951e-01, 3.15713052e-01,
+    5.27874291e-01, 2.16444335e-01, 7.03883817e-01,
+    7.74738919e-02, 8.42422142e-01, 3.75598924e-01,
+    3.51002411e-01, 6.22752776e-01, 4.82407943e-01,
+    7.43107867e-01, 9.46182666e-01, 9.44344819e-01,
+    3.28124763e-01, 1.06147431e-01, 1.65102684e-01,
+    3.84060507e-01, 2.91057722e-01, 7.68173662e-02,
+    1.03543651e-01, 6.76698940e-01, 1.43141994e-01,
+    7.21342202e-01, 6.69471294e-03, 9.07298311e-01,
+    5.57080171e-01, 8.10954489e-01, 4.11120526e-01,
+    2.06407453e-01, 2.59590556e-01, 7.58512718e-01,
+    5.79873897e-01, 2.92875650e-01, 2.83686529e-01,
+    2.42829343e-01, 9.19323719e-01, 3.46832864e-01,
+    3.58238858e-01, 7.42827585e-01, 2.05760059e-01,
+    9.58438860e-01, 5.66326411e-01, 6.60292846e-01,
+    5.61095078e-02, 6.79465531e-01, 7.05118513e-01,
+    4.44713264e-01, 2.09732933e-01, 5.22732436e-01,
+    1.74396512e-01, 5.29356748e-01, 4.38475687e-01,
+    4.94036404e-01, 4.09785794e-01, 6.40025507e-01,
+    5.79371821e-01, 1.57726118e-01, 6.04572263e-01,
+    5.41072639e-01, 5.18847173e-01, 1.97093284e-01,
+    8.91767002e-01, 4.29050835e-01, 8.25490570e-01,
+    3.87699807e-01, 4.50705808e-01, 2.49371643e-01,
+    3.36074898e-01, 9.29925118e-01, 6.65393649e-01,
+    9.07275994e-01, 3.73075859e-01, 4.14044139e-03,
+    2.37463702e-01, 2.25893784e-01, 2.46900245e-01,
+    4.50350196e-01, 3.48618117e-01, 5.07193932e-01,
+    5.23435142e-01, 8.13611417e-01, 8.92715622e-01,
+    1.02623450e-01, 3.06088345e-01, 7.80461650e-01,
+    2.21453645e-01, 2.01419652e-01, 2.84254457e-01,
+    3.68286735e-01, 7.39358243e-01, 8.97879394e-01,
+    9.81599566e-01, 7.56526442e-01, 7.37645545e-01,
+    4.23976657e-02, 8.25922012e-01, 2.60956996e-01,
+    2.90702065e-01, 8.98388344e-01, 3.03733299e-01,
+    8.49071471e-01, 3.45835425e-01, 7.65458276e-01,
+    5.68094872e-01, 8.93770930e-01, 9.93161641e-01,
+    5.63368667e-02, 4.26548945e-01, 5.46745780e-01,
+    5.75674571e-01, 7.94599487e-01, 7.18935553e-02,
+    4.46492976e-01, 6.40240123e-01, 2.73246969e-01,
+    2.00465968e-01, 1.30718835e-01, 1.92492005e-01,
+    1.96617189e-01, 6.61271644e-01, 8.12687657e-01,
+    8.66342445e-01
 
    },
    -4}};
 
 typedef ConnectComponentsTest<int, float> ConnectComponentsTestF_Int;
-TEST_P(ConnectComponentsTestF_Int, Result)
-{
+TEST_P(ConnectComponentsTestF_Int, Result) {
   /**
-   * Verify the src & dst vertices on each edge have different colors
-   */
+     * Verify the src & dst vertices on each edge have different colors
+     */
   EXPECT_TRUE(final_edges == params.n_row - 1);
 }
 
-INSTANTIATE_TEST_CASE_P(ConnectComponentsTest,
-                        ConnectComponentsTestF_Int,
+INSTANTIATE_TEST_CASE_P(ConnectComponentsTest, ConnectComponentsTestF_Int,
                         ::testing::ValuesIn(fix_conn_inputsf2));
 };  // namespace sparse
 };  // end namespace raft
diff --git a/cpp/test/sparse/convert_coo.cu b/cpp/test/sparse/convert_coo.cu
index 2e4c2c1a14..ea69ecfc53 100644
--- a/cpp/test/sparse/convert_coo.cu
+++ b/cpp/test/sparse/convert_coo.cu
@@ -39,8 +39,7 @@ struct CSRtoCOOInputs {
 template <typename Index_>
 class CSRtoCOOTest : public ::testing::TestWithParam<CSRtoCOOInputs<Index_>> {
  protected:
-  void SetUp() override
-  {
+  void SetUp() override {
     params = ::testing::TestWithParam<CSRtoCOOInputs<Index_>>::GetParam();
 
     cudaStreamCreate(&stream);
@@ -49,21 +48,20 @@ class CSRtoCOOTest : public ::testing::TestWithParam<CSRtoCOOInputs<Index_>> {
     raft::allocate(result, params.verify.size(), true);
   }
 
-  void Run()
-  {
+  void Run() {
     Index_ n_rows = params.ex_scan.size();
-    Index_ nnz    = params.verify.size();
+    Index_ nnz = params.verify.size();
 
     raft::update_device(ex_scan, params.ex_scan.data(), n_rows, stream);
     raft::update_device(verify, params.verify.data(), nnz, stream);
 
     convert::csr_to_coo<Index_, 32>(ex_scan, n_rows, result, nnz, stream);
 
-    ASSERT_TRUE(raft::devArrMatch<Index_>(verify, result, nnz, raft::Compare<float>(), stream));
+    ASSERT_TRUE(raft::devArrMatch<Index_>(verify, result, nnz,
+                                          raft::Compare<float>(), stream));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(ex_scan));
     CUDA_CHECK(cudaFree(verify));
     CUDA_CHECK(cudaFree(result));
@@ -91,11 +89,9 @@ const std::vector<CSRtoCOOInputs<int64_t>> csrtocoo_inputs_64 = {
   {{0, 4, 8, 9}, {0, 0, 0, 0, 1, 1, 1, 1, 2, 3}},
 };
 
-INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest,
-                        CSRtoCOOTestI,
+INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest, CSRtoCOOTestI,
                         ::testing::ValuesIn(csrtocoo_inputs_32));
-INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest,
-                        CSRtoCOOTestL,
+INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest, CSRtoCOOTestL,
                         ::testing::ValuesIn(csrtocoo_inputs_64));
 
 }  // namespace sparse
diff --git a/cpp/test/sparse/convert_csr.cu b/cpp/test/sparse/convert_csr.cu
index b2878081ae..553ef2ddee 100644
--- a/cpp/test/sparse/convert_csr.cu
+++ b/cpp/test/sparse/convert_csr.cu
@@ -37,13 +37,14 @@ struct SparseConvertCSRInputs {
 };
 
 template <typename T>
-::std::ostream& operator<<(::std::ostream& os, const SparseConvertCSRInputs<T>& dims)
-{
+::std::ostream &operator<<(::std::ostream &os,
+                           const SparseConvertCSRInputs<T> &dims) {
   return os;
 }
 
 template <typename T>
-class SparseConvertCSRTest : public ::testing::TestWithParam<SparseConvertCSRInputs<T>> {
+class SparseConvertCSRTest
+  : public ::testing::TestWithParam<SparseConvertCSRInputs<T>> {
  protected:
   void SetUp() override {}
 
@@ -53,21 +54,22 @@ class SparseConvertCSRTest : public ::testing::TestWithParam<SparseConvertCSRInp
   SparseConvertCSRInputs<T> params;
 };
 
-const std::vector<SparseConvertCSRInputs<float>> inputsf = {{5, 10, 5, 1234ULL}};
+const std::vector<SparseConvertCSRInputs<float>> inputsf = {
+  {5, 10, 5, 1234ULL}};
 
 typedef SparseConvertCSRTest<float> SortedCOOToCSR;
-TEST_P(SortedCOOToCSR, Result)
-{
+TEST_P(SortedCOOToCSR, Result) {
   cudaStream_t stream;
   cudaStreamCreate(&stream);
-  std::shared_ptr<raft::mr::device::allocator> alloc(new raft::mr::device::default_allocator);
+  std::shared_ptr<raft::mr::device::allocator> alloc(
+    new raft::mr::device::default_allocator);
 
   int nnz = 8;
 
   int *in, *out, *exp;
 
-  int* in_h  = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3};
-  int* exp_h = new int[4]{0, 2, 4, 6};
+  int *in_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3};
+  int *exp_h = new int[4]{0, 2, 4, 6};
 
   raft::allocate(in, nnz, true);
   raft::allocate(exp, 4, true);
@@ -90,7 +92,8 @@ TEST_P(SortedCOOToCSR, Result)
   CUDA_CHECK(cudaFree(out));
 }
 
-INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, SortedCOOToCSR, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, SortedCOOToCSR,
+                        ::testing::ValuesIn(inputsf));
 
 /******************************** adj graph ********************************/
 
@@ -104,10 +107,10 @@ struct CSRAdjGraphInputs {
 };
 
 template <typename Index_>
-class CSRAdjGraphTest : public ::testing::TestWithParam<CSRAdjGraphInputs<Index_>> {
+class CSRAdjGraphTest
+  : public ::testing::TestWithParam<CSRAdjGraphInputs<Index_>> {
  protected:
-  void SetUp() override
-  {
+  void SetUp() override {
     params = ::testing::TestWithParam<CSRAdjGraphInputs<Index_>>::GetParam();
     cudaStreamCreate(&stream);
     nnz = params.verify.size();
@@ -118,21 +121,20 @@ class CSRAdjGraphTest : public ::testing::TestWithParam<CSRAdjGraphInputs<Index_
     raft::allocate(verify, nnz);
   }
 
-  void Run()
-  {
+  void Run() {
     raft::update_device(row_ind, params.row_ind.data(), params.n_rows, stream);
-    raft::update_device(
-      adj, reinterpret_cast<bool*>(params.adj.data()), params.n_rows * params.n_cols, stream);
+    raft::update_device(adj, reinterpret_cast<bool *>(params.adj.data()),
+                        params.n_rows * params.n_cols, stream);
     raft::update_device(verify, params.verify.data(), nnz, stream);
 
     convert::csr_adj_graph_batched<Index_, 32>(
       row_ind, params.n_cols, nnz, params.n_rows, adj, result, stream);
 
-    ASSERT_TRUE(raft::devArrMatch<Index_>(verify, result, nnz, raft::Compare<Index_>()));
+    ASSERT_TRUE(
+      raft::devArrMatch<Index_>(verify, result, nnz, raft::Compare<Index_>()));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(row_ind));
     CUDA_CHECK(cudaFree(adj));
     CUDA_CHECK(cudaFree(verify));
@@ -145,7 +147,7 @@ class CSRAdjGraphTest : public ::testing::TestWithParam<CSRAdjGraphInputs<Index_
   cudaStream_t stream;
   Index_ nnz;
   Index_ *row_ind, *result, *verify;
-  bool* adj;
+  bool *adj;
 };
 
 using CSRAdjGraphTestI = CSRAdjGraphTest<int>;
@@ -169,11 +171,9 @@ const std::vector<CSRAdjGraphInputs<int64_t>> csradjgraph_inputs_l = {
    {0, 1, 2, 0, 1, 2, 0, 1, 2}},
 };
 
-INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest,
-                        CSRAdjGraphTestI,
+INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, CSRAdjGraphTestI,
                         ::testing::ValuesIn(csradjgraph_inputs_i));
-INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest,
-                        CSRAdjGraphTestL,
+INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, CSRAdjGraphTestL,
                         ::testing::ValuesIn(csradjgraph_inputs_l));
 
 }  // namespace sparse
diff --git a/cpp/test/sparse/csr_row_slice.cu b/cpp/test/sparse/csr_row_slice.cu
index fe43f0d182..625772a842 100644
--- a/cpp/test/sparse/csr_row_slice.cu
+++ b/cpp/test/sparse/csr_row_slice.cu
@@ -47,19 +47,19 @@ struct CSRRowSliceInputs {
 };
 
 template <typename value_idx, typename value_t>
-::std::ostream& operator<<(::std::ostream& os, const CSRRowSliceInputs<value_idx, value_t>& dims)
-{
+::std::ostream &operator<<(::std::ostream &os,
+                           const CSRRowSliceInputs<value_idx, value_t> &dims) {
   return os;
 }
 
 template <typename value_idx, typename value_t>
-class CSRRowSliceTest : public ::testing::TestWithParam<CSRRowSliceInputs<value_idx, value_t>> {
+class CSRRowSliceTest
+  : public ::testing::TestWithParam<CSRRowSliceInputs<value_idx, value_t>> {
  protected:
-  void make_data()
-  {
-    std::vector<value_idx> indptr_h  = params.indptr_h;
+  void make_data() {
+    std::vector<value_idx> indptr_h = params.indptr_h;
     std::vector<value_idx> indices_h = params.indices_h;
-    std::vector<value_t> data_h      = params.data_h;
+    std::vector<value_t> data_h = params.data_h;
 
     allocate(indptr, indptr_h.size());
     allocate(indices, indices_h.size());
@@ -69,27 +69,31 @@ class CSRRowSliceTest : public ::testing::TestWithParam<CSRRowSliceInputs<value_
     update_device(indices, indices_h.data(), indices_h.size(), stream);
     update_device(data, data_h.data(), data_h.size(), stream);
 
-    std::vector<value_idx> out_indptr_ref_h  = params.out_indptr_ref_h;
+    std::vector<value_idx> out_indptr_ref_h = params.out_indptr_ref_h;
     std::vector<value_idx> out_indices_ref_h = params.out_indices_ref_h;
-    std::vector<value_t> out_data_ref_h      = params.out_data_ref_h;
+    std::vector<value_t> out_data_ref_h = params.out_data_ref_h;
 
     allocate(out_indptr_ref, out_indptr_ref_h.size());
     allocate(out_indices_ref, out_indices_ref_h.size());
     allocate(out_data_ref, out_data_ref_h.size());
 
-    update_device(out_indptr_ref, out_indptr_ref_h.data(), out_indptr_ref_h.size(), stream);
-    update_device(out_indices_ref, out_indices_ref_h.data(), out_indices_ref_h.size(), stream);
-    update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(), stream);
+    update_device(out_indptr_ref, out_indptr_ref_h.data(),
+                  out_indptr_ref_h.size(), stream);
+    update_device(out_indices_ref, out_indices_ref_h.data(),
+                  out_indices_ref_h.size(), stream);
+    update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(),
+                  stream);
 
     allocate(out_indptr, out_indptr_ref_h.size());
     allocate(out_indices, out_indices_ref_h.size());
     allocate(out_data, out_data_ref_h.size());
   }
 
-  void SetUp() override
-  {
-    params = ::testing::TestWithParam<CSRRowSliceInputs<value_idx, value_t>>::GetParam();
-    std::shared_ptr<raft::mr::device::allocator> alloc(new raft::mr::device::default_allocator);
+  void SetUp() override {
+    params = ::testing::TestWithParam<
+      CSRRowSliceInputs<value_idx, value_t>>::GetParam();
+    std::shared_ptr<raft::mr::device::allocator> alloc(
+      new raft::mr::device::default_allocator);
     CUDA_CHECK(cudaStreamCreate(&stream));
 
     make_data();
@@ -97,22 +101,18 @@ class CSRRowSliceTest : public ::testing::TestWithParam<CSRRowSliceInputs<value_
     int csr_start_offset;
     int csr_stop_offset;
 
-    raft::sparse::op::csr_row_slice_indptr(params.start_row,
-                                           params.stop_row,
-                                           indptr,
-                                           out_indptr,
-                                           &csr_start_offset,
-                                           &csr_stop_offset,
-                                           stream);
+    raft::sparse::op::csr_row_slice_indptr(
+      params.start_row, params.stop_row, indptr, out_indptr, &csr_start_offset,
+      &csr_stop_offset, stream);
 
-    raft::sparse::op::csr_row_slice_populate(
-      csr_start_offset, csr_stop_offset, indices, data, out_indices, out_data, stream);
+    raft::sparse::op::csr_row_slice_populate(csr_start_offset, csr_stop_offset,
+                                             indices, data, out_indices,
+                                             out_data, stream);
 
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(indptr));
     CUDA_CHECK(cudaFree(indices));
@@ -125,14 +125,15 @@ class CSRRowSliceTest : public ::testing::TestWithParam<CSRRowSliceInputs<value_
     CUDA_CHECK(cudaFree(out_data_ref));
   }
 
-  void compare()
-  {
-    ASSERT_TRUE(
-      devArrMatch(out_indptr, out_indptr_ref, params.out_indptr_ref_h.size(), Compare<value_t>()));
-    ASSERT_TRUE(devArrMatch(
-      out_indices, out_indices_ref, params.out_indices_ref_h.size(), Compare<value_t>()));
-    ASSERT_TRUE(
-      devArrMatch(out_data, out_data_ref, params.out_data_ref_h.size(), Compare<value_t>()));
+  void compare() {
+    ASSERT_TRUE(devArrMatch(out_indptr, out_indptr_ref,
+                            params.out_indptr_ref_h.size(),
+                            Compare<value_t>()));
+    ASSERT_TRUE(devArrMatch(out_indices, out_indices_ref,
+                            params.out_indices_ref_h.size(),
+                            Compare<value_t>()));
+    ASSERT_TRUE(devArrMatch(out_data, out_data_ref,
+                            params.out_data_ref_h.size(), Compare<value_t>()));
   }
 
  protected:
@@ -140,15 +141,15 @@ class CSRRowSliceTest : public ::testing::TestWithParam<CSRRowSliceInputs<value_
 
   // input data
   value_idx *indptr, *indices;
-  value_t* data;
+  value_t *data;
 
   // output data
   value_idx *out_indptr, *out_indices;
-  value_t* out_data;
+  value_t *out_data;
 
   // expected output data
   value_idx *out_indptr_ref, *out_indices_ref;
-  value_t* out_data_ref;
+  value_t *out_data_ref;
 
   CSRRowSliceInputs<value_idx, value_t> params;
 };
@@ -176,7 +177,8 @@ const std::vector<CSRRowSliceInputs<int, float>> inputs_i32_f = {
 };
 typedef CSRRowSliceTest<int, float> CSRRowSliceTestF;
 TEST_P(CSRRowSliceTestF, Result) { compare(); }
-INSTANTIATE_TEST_CASE_P(CSRRowSliceTest, CSRRowSliceTestF, ::testing::ValuesIn(inputs_i32_f));
+INSTANTIATE_TEST_CASE_P(CSRRowSliceTest, CSRRowSliceTestF,
+                        ::testing::ValuesIn(inputs_i32_f));
 
 };  // end namespace sparse
 };  // end namespace raft
diff --git a/cpp/test/sparse/csr_to_dense.cu b/cpp/test/sparse/csr_to_dense.cu
index 286493ada7..5535df4fe3 100644
--- a/cpp/test/sparse/csr_to_dense.cu
+++ b/cpp/test/sparse/csr_to_dense.cu
@@ -43,19 +43,19 @@ struct CSRToDenseInputs {
 };
 
 template <typename value_idx, typename value_t>
-::std::ostream& operator<<(::std::ostream& os, const CSRToDenseInputs<value_idx, value_t>& dims)
-{
+::std::ostream &operator<<(::std::ostream &os,
+                           const CSRToDenseInputs<value_idx, value_t> &dims) {
   return os;
 }
 
 template <typename value_idx, typename value_t>
-class CSRToDenseTest : public ::testing::TestWithParam<CSRToDenseInputs<value_idx, value_t>> {
+class CSRToDenseTest
+  : public ::testing::TestWithParam<CSRToDenseInputs<value_idx, value_t>> {
  protected:
-  void make_data()
-  {
-    std::vector<value_idx> indptr_h  = params.indptr_h;
+  void make_data() {
+    std::vector<value_idx> indptr_h = params.indptr_h;
     std::vector<value_idx> indices_h = params.indices_h;
-    std::vector<value_t> data_h      = params.data_h;
+    std::vector<value_t> data_h = params.data_h;
 
     allocate(indptr, indptr_h.size());
     allocate(indices, indices_h.size());
@@ -74,24 +74,24 @@ class CSRToDenseTest : public ::testing::TestWithParam<CSRToDenseInputs<value_id
     allocate(out, out_ref_h.size());
   }
 
-  void SetUp() override
-  {
-    params = ::testing::TestWithParam<CSRToDenseInputs<value_idx, value_t>>::GetParam();
-    std::shared_ptr<raft::mr::device::allocator> alloc(new raft::mr::device::default_allocator);
+  void SetUp() override {
+    params = ::testing::TestWithParam<
+      CSRToDenseInputs<value_idx, value_t>>::GetParam();
+    std::shared_ptr<raft::mr::device::allocator> alloc(
+      new raft::mr::device::default_allocator);
     CUDA_CHECK(cudaStreamCreate(&stream));
     CUSPARSE_CHECK(cusparseCreate(&handle));
 
     make_data();
 
-    convert::csr_to_dense(
-      handle, params.nrows, params.ncols, indptr, indices, data, params.nrows, out, stream, true);
+    convert::csr_to_dense(handle, params.nrows, params.ncols, indptr, indices,
+                          data, params.nrows, out, stream, true);
 
     CUDA_CHECK(cudaStreamSynchronize(stream));
     CUSPARSE_CHECK(cusparseDestroy(handle));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(indptr));
     CUDA_CHECK(cudaFree(indices));
@@ -100,9 +100,9 @@ class CSRToDenseTest : public ::testing::TestWithParam<CSRToDenseInputs<value_id
     CUDA_CHECK(cudaFree(out_ref));
   }
 
-  void compare()
-  {
-    ASSERT_TRUE(devArrMatch(out, out_ref, params.out_ref_h.size(), Compare<value_t>()));
+  void compare() {
+    ASSERT_TRUE(
+      devArrMatch(out, out_ref, params.out_ref_h.size(), Compare<value_t>()));
   }
 
  protected:
@@ -111,13 +111,13 @@ class CSRToDenseTest : public ::testing::TestWithParam<CSRToDenseInputs<value_id
 
   // input data
   value_idx *indptr, *indices;
-  value_t* data;
+  value_t *data;
 
   // output data
-  value_t* out;
+  value_t *out;
 
   // expected output data
-  value_t* out_ref;
+  value_t *out_ref;
 
   CSRToDenseInputs<value_idx, value_t> params;
 };
@@ -128,26 +128,13 @@ const std::vector<CSRToDenseInputs<int, float>> inputs_i32_f = {
    {0, 2, 4, 6, 8},
    {0, 1, 2, 3, 0, 1, 2, 3},  // indices
    {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f},
-   {1.0f,
-    3.0f,
-    0.0f,
-    0.0f,
-    0.0f,
-    0.0f,
-    1.0f,
-    5.0f,
-    50.0f,
-    28.0f,
-    0.0f,
-    0.0f,
-    0.0f,
-    0.0f,
-    16.0f,
-    2.0f}},
+   {1.0f, 3.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 5.0f, 50.0f, 28.0f, 0.0f, 0.0f,
+    0.0f, 0.0f, 16.0f, 2.0f}},
 };
 typedef CSRToDenseTest<int, float> CSRToDenseTestF;
 TEST_P(CSRToDenseTestF, Result) { compare(); }
-INSTANTIATE_TEST_CASE_P(CSRToDenseTest, CSRToDenseTestF, ::testing::ValuesIn(inputs_i32_f));
+INSTANTIATE_TEST_CASE_P(CSRToDenseTest, CSRToDenseTestF,
+                        ::testing::ValuesIn(inputs_i32_f));
 
 };  // end namespace sparse
 };  // end namespace raft
diff --git a/cpp/test/sparse/csr_transpose.cu b/cpp/test/sparse/csr_transpose.cu
index 87b8b17073..c257d6eb3c 100644
--- a/cpp/test/sparse/csr_transpose.cu
+++ b/cpp/test/sparse/csr_transpose.cu
@@ -49,19 +49,19 @@ struct CSRTransposeInputs {
 };
 
 template <typename value_idx, typename value_t>
-::std::ostream& operator<<(::std::ostream& os, const CSRTransposeInputs<value_idx, value_t>& dims)
-{
+::std::ostream &operator<<(::std::ostream &os,
+                           const CSRTransposeInputs<value_idx, value_t> &dims) {
   return os;
 }
 
 template <typename value_idx, typename value_t>
-class CSRTransposeTest : public ::testing::TestWithParam<CSRTransposeInputs<value_idx, value_t>> {
+class CSRTransposeTest
+  : public ::testing::TestWithParam<CSRTransposeInputs<value_idx, value_t>> {
  protected:
-  void make_data()
-  {
-    std::vector<value_idx> indptr_h  = params.indptr_h;
+  void make_data() {
+    std::vector<value_idx> indptr_h = params.indptr_h;
     std::vector<value_idx> indices_h = params.indices_h;
-    std::vector<value_t> data_h      = params.data_h;
+    std::vector<value_t> data_h = params.data_h;
 
     allocate(indptr, indptr_h.size());
     allocate(indices, indices_h.size());
@@ -71,51 +71,45 @@ class CSRTransposeTest : public ::testing::TestWithParam<CSRTransposeInputs<valu
     update_device(indices, indices_h.data(), indices_h.size(), stream);
     update_device(data, data_h.data(), data_h.size(), stream);
 
-    std::vector<value_idx> out_indptr_ref_h  = params.out_indptr_ref_h;
+    std::vector<value_idx> out_indptr_ref_h = params.out_indptr_ref_h;
     std::vector<value_idx> out_indices_ref_h = params.out_indices_ref_h;
-    std::vector<value_t> out_data_ref_h      = params.out_data_ref_h;
+    std::vector<value_t> out_data_ref_h = params.out_data_ref_h;
 
     allocate(out_indptr_ref, out_indptr_ref_h.size());
     allocate(out_indices_ref, out_indices_ref_h.size());
     allocate(out_data_ref, out_data_ref_h.size());
 
-    update_device(out_indptr_ref, out_indptr_ref_h.data(), out_indptr_ref_h.size(), stream);
-    update_device(out_indices_ref, out_indices_ref_h.data(), out_indices_ref_h.size(), stream);
-    update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(), stream);
+    update_device(out_indptr_ref, out_indptr_ref_h.data(),
+                  out_indptr_ref_h.size(), stream);
+    update_device(out_indices_ref, out_indices_ref_h.data(),
+                  out_indices_ref_h.size(), stream);
+    update_device(out_data_ref, out_data_ref_h.data(), out_data_ref_h.size(),
+                  stream);
 
     allocate(out_indptr, out_indptr_ref_h.size());
     allocate(out_indices, out_indices_ref_h.size());
     allocate(out_data, out_data_ref_h.size());
   }
 
-  void SetUp() override
-  {
-    params = ::testing::TestWithParam<CSRTransposeInputs<value_idx, value_t>>::GetParam();
-    std::shared_ptr<raft::mr::device::allocator> alloc(new raft::mr::device::default_allocator);
+  void SetUp() override {
+    params = ::testing::TestWithParam<
+      CSRTransposeInputs<value_idx, value_t>>::GetParam();
+    std::shared_ptr<raft::mr::device::allocator> alloc(
+      new raft::mr::device::default_allocator);
     CUDA_CHECK(cudaStreamCreate(&stream));
     CUSPARSE_CHECK(cusparseCreate(&handle));
 
     make_data();
 
-    raft::sparse::linalg::csr_transpose(handle,
-                                        indptr,
-                                        indices,
-                                        data,
-                                        out_indptr,
-                                        out_indices,
-                                        out_data,
-                                        params.nrows,
-                                        params.ncols,
-                                        params.nnz,
-                                        alloc,
-                                        stream);
+    raft::sparse::linalg::csr_transpose(
+      handle, indptr, indices, data, out_indptr, out_indices, out_data,
+      params.nrows, params.ncols, params.nnz, alloc, stream);
 
     CUDA_CHECK(cudaStreamSynchronize(stream));
     CUSPARSE_CHECK(cusparseDestroy(handle));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(indptr));
     CUDA_CHECK(cudaFree(indices));
@@ -128,14 +122,15 @@ class CSRTransposeTest : public ::testing::TestWithParam<CSRTransposeInputs<valu
     CUDA_CHECK(cudaFree(out_data_ref));
   }
 
-  void compare()
-  {
-    ASSERT_TRUE(
-      devArrMatch(out_indptr, out_indptr_ref, params.out_indptr_ref_h.size(), Compare<value_t>()));
-    ASSERT_TRUE(devArrMatch(
-      out_indices, out_indices_ref, params.out_indices_ref_h.size(), Compare<value_t>()));
-    ASSERT_TRUE(
-      devArrMatch(out_data, out_data_ref, params.out_data_ref_h.size(), Compare<value_t>()));
+  void compare() {
+    ASSERT_TRUE(devArrMatch(out_indptr, out_indptr_ref,
+                            params.out_indptr_ref_h.size(),
+                            Compare<value_t>()));
+    ASSERT_TRUE(devArrMatch(out_indices, out_indices_ref,
+                            params.out_indices_ref_h.size(),
+                            Compare<value_t>()));
+    ASSERT_TRUE(devArrMatch(out_data, out_data_ref,
+                            params.out_data_ref_h.size(), Compare<value_t>()));
   }
 
  protected:
@@ -144,15 +139,15 @@ class CSRTransposeTest : public ::testing::TestWithParam<CSRTransposeInputs<valu
 
   // input data
   value_idx *indptr, *indices;
-  value_t* data;
+  value_t *data;
 
   // output data
   value_idx *out_indptr, *out_indices;
-  value_t* out_data;
+  value_t *out_data;
 
   // expected output data
   value_idx *out_indptr_ref, *out_indices_ref;
-  value_t* out_data_ref;
+  value_t *out_data_ref;
 
   CSRTransposeInputs<value_idx, value_t> params;
 };
@@ -172,7 +167,8 @@ const std::vector<CSRTransposeInputs<int, float>> inputs_i32_f = {
 };
 typedef CSRTransposeTest<int, float> CSRTransposeTestF;
 TEST_P(CSRTransposeTestF, Result) { compare(); }
-INSTANTIATE_TEST_CASE_P(CSRTransposeTest, CSRTransposeTestF, ::testing::ValuesIn(inputs_i32_f));
+INSTANTIATE_TEST_CASE_P(CSRTransposeTest, CSRTransposeTestF,
+                        ::testing::ValuesIn(inputs_i32_f));
 
 };  // end namespace sparse
 };  // end namespace raft
diff --git a/cpp/test/sparse/degree.cu b/cpp/test/sparse/degree.cu
index c6b2a27273..5d687ad92b 100644
--- a/cpp/test/sparse/degree.cu
+++ b/cpp/test/sparse/degree.cu
@@ -33,7 +33,8 @@ struct SparseDegreeInputs {
 };
 
 template <typename T>
-class SparseDegreeTests : public ::testing::TestWithParam<SparseDegreeInputs<T>> {
+class SparseDegreeTests
+  : public ::testing::TestWithParam<SparseDegreeInputs<T>> {
  protected:
   void SetUp() override {}
 
@@ -46,12 +47,11 @@ class SparseDegreeTests : public ::testing::TestWithParam<SparseDegreeInputs<T>>
 const std::vector<SparseDegreeInputs<float>> inputsf = {{5, 10, 5, 1234ULL}};
 
 typedef SparseDegreeTests<float> COODegree;
-TEST_P(COODegree, Result)
-{
+TEST_P(COODegree, Result) {
   int *in_rows, *verify, *results;
 
   int in_rows_h[5] = {0, 0, 1, 2, 2};
-  int verify_h[5]  = {2, 1, 2, 0, 0};
+  int verify_h[5] = {2, 1, 2, 0, 0};
 
   raft::allocate(in_rows, 5);
   raft::allocate(verify, 5, true);
@@ -70,17 +70,16 @@ TEST_P(COODegree, Result)
 }
 
 typedef SparseDegreeTests<float> COODegreeNonzero;
-TEST_P(COODegreeNonzero, Result)
-{
+TEST_P(COODegreeNonzero, Result) {
   cudaStream_t stream;
   cudaStreamCreate(&stream);
 
   int *in_rows, *verify, *results;
-  float* in_vals;
+  float *in_vals;
 
-  int in_rows_h[5]   = {0, 0, 1, 2, 2};
+  int in_rows_h[5] = {0, 0, 1, 2, 2};
   float in_vals_h[5] = {0.0, 5.0, 0.0, 1.0, 1.0};
-  int verify_h[5]    = {1, 0, 2, 0, 0};
+  int verify_h[5] = {1, 0, 2, 0, 0};
 
   raft::allocate(in_rows, 5);
   raft::allocate(verify, 5, true);
@@ -102,8 +101,10 @@ TEST_P(COODegreeNonzero, Result)
   CUDA_CHECK(cudaStreamDestroy(stream));
 }
 
-INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegree, ::testing::ValuesIn(inputsf));
-INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegreeNonzero, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegree,
+                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegreeNonzero,
+                        ::testing::ValuesIn(inputsf));
 
 }  // namespace sparse
 }  // namespace raft
diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/test/sparse/dist_coo_spmv.cu
index 7c0db49a04..a83b93f83f 100644
--- a/cpp/test/sparse/dist_coo_spmv.cu
+++ b/cpp/test/sparse/dist_coo_spmv.cu
@@ -55,82 +55,71 @@ struct InputConfiguration {
 };
 
 using dense_smem_strategy_t = dense_smem_strategy<int, float, 1024>;
-using hash_strategy_t       = hash_strategy<int, float, 1024>;
+using hash_strategy_t = hash_strategy<int, float, 1024>;
 
 template <typename value_idx, typename value_t, typename strategy_t>
 struct SparseDistanceCOOSPMVInputs {
   InputConfiguration<value_idx, value_t> input_configuration;
 
   float capacity_threshold = 0.5;
-  int map_size             = hash_strategy<value_idx, value_t, 1024>::get_map_size();
+  int map_size = hash_strategy<value_idx, value_t, 1024>::get_map_size();
 };
 
 template <typename value_idx, typename value_t, typename strategy_t>
-::std::ostream& operator<<(::std::ostream& os,
-                           const SparseDistanceCOOSPMVInputs<value_idx, value_t, strategy_t>& dims)
-{
+::std::ostream &operator<<(
+  ::std::ostream &os,
+  const SparseDistanceCOOSPMVInputs<value_idx, value_t, strategy_t> &dims) {
   return os;
 }
 
 template <typename value_idx, typename value_t, typename strategy_t>
 class SparseDistanceCOOSPMVTest
-  : public ::testing::TestWithParam<SparseDistanceCOOSPMVInputs<value_idx, value_t, strategy_t>> {
+  : public ::testing::TestWithParam<
+      SparseDistanceCOOSPMVInputs<value_idx, value_t, strategy_t>> {
  public:
   SparseDistanceCOOSPMVTest() : dist_config(handle) {}
 
-  template <typename U, std::enable_if_t<std::is_same_v<U, hash_strategy_t>>* = nullptr>
-  U make_strategy()
-  {
+  template <typename U,
+            std::enable_if_t<std::is_same_v<U, hash_strategy_t>> * = nullptr>
+  U make_strategy() {
     return strategy_t(dist_config, params.capacity_threshold, params.map_size);
   }
 
-  template <typename U, std::enable_if_t<std::is_same_v<U, dense_smem_strategy_t>>* = nullptr>
-  U make_strategy()
-  {
+  template <typename U, std::enable_if_t<
+                          std::is_same_v<U, dense_smem_strategy_t>> * = nullptr>
+  U make_strategy() {
     return strategy_t(dist_config);
   }
 
   template <typename reduce_f, typename accum_f, typename write_f>
-  void compute_dist(reduce_f reduce_func, accum_f accum_func, write_f write_func, bool rev = true)
-  {
-    raft::mr::device::buffer<value_idx> coo_rows(dist_config.handle.get_device_allocator(),
-                                                 dist_config.handle.get_stream(),
-                                                 max(dist_config.b_nnz, dist_config.a_nnz));
-
-    raft::sparse::convert::csr_to_coo(dist_config.b_indptr,
-                                      dist_config.b_nrows,
-                                      coo_rows.data(),
-                                      dist_config.b_nnz,
+  void compute_dist(reduce_f reduce_func, accum_f accum_func,
+                    write_f write_func, bool rev = true) {
+    raft::mr::device::buffer<value_idx> coo_rows(
+      dist_config.handle.get_device_allocator(),
+      dist_config.handle.get_stream(),
+      max(dist_config.b_nnz, dist_config.a_nnz));
+
+    raft::sparse::convert::csr_to_coo(dist_config.b_indptr, dist_config.b_nrows,
+                                      coo_rows.data(), dist_config.b_nnz,
                                       dist_config.handle.get_stream());
 
     strategy_t selected_strategy = make_strategy<strategy_t>();
-    balanced_coo_pairwise_generalized_spmv<value_idx, value_t>(out_dists,
-                                                               dist_config,
-                                                               coo_rows.data(),
-                                                               reduce_func,
-                                                               accum_func,
-                                                               write_func,
-                                                               selected_strategy);
+    balanced_coo_pairwise_generalized_spmv<value_idx, value_t>(
+      out_dists, dist_config, coo_rows.data(), reduce_func, accum_func,
+      write_func, selected_strategy);
 
     if (rev) {
-      raft::sparse::convert::csr_to_coo(dist_config.a_indptr,
-                                        dist_config.a_nrows,
-                                        coo_rows.data(),
-                                        dist_config.a_nnz,
-                                        dist_config.handle.get_stream());
-
-      balanced_coo_pairwise_generalized_spmv_rev<value_idx, value_t>(out_dists,
-                                                                     dist_config,
-                                                                     coo_rows.data(),
-                                                                     reduce_func,
-                                                                     accum_func,
-                                                                     write_func,
-                                                                     selected_strategy);
+      raft::sparse::convert::csr_to_coo(
+        dist_config.a_indptr, dist_config.a_nrows, coo_rows.data(),
+        dist_config.a_nnz, dist_config.handle.get_stream());
+
+      balanced_coo_pairwise_generalized_spmv_rev<value_idx, value_t>(
+        out_dists, dist_config, coo_rows.data(), reduce_func, accum_func,
+        write_func, selected_strategy);
     }
   }
 
-  void run_spmv()
-  {
+  void run_spmv() {
     switch (params.input_configuration.metric) {
       case raft::distance::DistanceType::InnerProduct:
         compute_dist(Product(), Sum(), AtomicAdd(), true);
@@ -140,69 +129,75 @@ class SparseDistanceCOOSPMVTest
         break;
       case raft::distance::DistanceType::Canberra:
         compute_dist(
-          [] __device__(value_t a, value_t b) { return fabsf(a - b) / (fabsf(a) + fabsf(b)); },
-          Sum(),
-          AtomicAdd());
+          [] __device__(value_t a, value_t b) {
+            return fabsf(a - b) / (fabsf(a) + fabsf(b));
+          },
+          Sum(), AtomicAdd());
+        break;
+      case raft::distance::DistanceType::L1:
+        compute_dist(AbsDiff(), Sum(), AtomicAdd());
+        break;
+      case raft::distance::DistanceType::Linf:
+        compute_dist(AbsDiff(), Max(), AtomicMax());
         break;
-      case raft::distance::DistanceType::L1: compute_dist(AbsDiff(), Sum(), AtomicAdd()); break;
-      case raft::distance::DistanceType::Linf: compute_dist(AbsDiff(), Max(), AtomicMax()); break;
       case raft::distance::DistanceType::LpUnexpanded: {
-        compute_dist(PDiff(params.input_configuration.metric_arg), Sum(), AtomicAdd());
+        compute_dist(PDiff(params.input_configuration.metric_arg), Sum(),
+                     AtomicAdd());
         float p = 1.0f / params.input_configuration.metric_arg;
         raft::linalg::unaryOp<value_t>(
-          out_dists,
-          out_dists,
-          dist_config.a_nrows * dist_config.b_nrows,
+          out_dists, out_dists, dist_config.a_nrows * dist_config.b_nrows,
           [=] __device__(value_t input) { return powf(input, p); },
           dist_config.handle.get_stream());
 
       } break;
-      default: throw raft::exception("Unknown distance");
+      default:
+        throw raft::exception("Unknown distance");
     }
   }
 
  protected:
-  void make_data()
-  {
-    std::vector<value_idx> indptr_h  = params.input_configuration.indptr_h;
+  void make_data() {
+    std::vector<value_idx> indptr_h = params.input_configuration.indptr_h;
     std::vector<value_idx> indices_h = params.input_configuration.indices_h;
-    std::vector<value_t> data_h      = params.input_configuration.data_h;
+    std::vector<value_t> data_h = params.input_configuration.data_h;
 
     allocate(indptr, indptr_h.size());
     allocate(indices, indices_h.size());
     allocate(data, data_h.size());
 
-    update_device(indptr, indptr_h.data(), indptr_h.size(), handle.get_stream());
-    update_device(indices, indices_h.data(), indices_h.size(), handle.get_stream());
+    update_device(indptr, indptr_h.data(), indptr_h.size(),
+                  handle.get_stream());
+    update_device(indices, indices_h.data(), indices_h.size(),
+                  handle.get_stream());
     update_device(data, data_h.data(), data_h.size(), handle.get_stream());
 
-    std::vector<value_t> out_dists_ref_h = params.input_configuration.out_dists_ref_h;
+    std::vector<value_t> out_dists_ref_h =
+      params.input_configuration.out_dists_ref_h;
 
     allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1));
 
-    update_device(
-      out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), handle.get_stream());
+    update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(),
+                  handle.get_stream());
   }
 
-  void SetUp() override
-  {
+  void SetUp() override {
     params = ::testing::TestWithParam<
       SparseDistanceCOOSPMVInputs<value_idx, value_t, strategy_t>>::GetParam();
 
     make_data();
 
-    dist_config.b_nrows   = params.input_configuration.indptr_h.size() - 1;
-    dist_config.b_ncols   = params.input_configuration.n_cols;
-    dist_config.b_nnz     = params.input_configuration.indices_h.size();
-    dist_config.b_indptr  = indptr;
+    dist_config.b_nrows = params.input_configuration.indptr_h.size() - 1;
+    dist_config.b_ncols = params.input_configuration.n_cols;
+    dist_config.b_nnz = params.input_configuration.indices_h.size();
+    dist_config.b_indptr = indptr;
     dist_config.b_indices = indices;
-    dist_config.b_data    = data;
-    dist_config.a_nrows   = params.input_configuration.indptr_h.size() - 1;
-    dist_config.a_ncols   = params.input_configuration.n_cols;
-    dist_config.a_nnz     = params.input_configuration.indices_h.size();
-    dist_config.a_indptr  = indptr;
+    dist_config.b_data = data;
+    dist_config.a_nrows = params.input_configuration.indptr_h.size() - 1;
+    dist_config.a_ncols = params.input_configuration.n_cols;
+    dist_config.a_nnz = params.input_configuration.indices_h.size();
+    dist_config.a_indptr = indptr;
     dist_config.a_indices = indices;
-    dist_config.a_data    = data;
+    dist_config.a_data = data;
 
     int out_size = dist_config.a_nrows * dist_config.b_nrows;
 
@@ -213,8 +208,7 @@ class SparseDistanceCOOSPMVTest
     CUDA_CHECK(cudaStreamSynchronize(handle.get_stream()));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaStreamSynchronize(handle.get_stream()));
     CUDA_CHECK(cudaFree(indptr));
     CUDA_CHECK(cudaFree(indices));
@@ -223,10 +217,8 @@ class SparseDistanceCOOSPMVTest
     CUDA_CHECK(cudaFree(out_dists_ref));
   }
 
-  void compare()
-  {
-    ASSERT_TRUE(devArrMatch(out_dists_ref,
-                            out_dists,
+  void compare() {
+    ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists,
                             params.input_configuration.out_dists_ref_h.size(),
                             CompareApprox<value_t>(1e-3)));
   }
@@ -236,7 +228,7 @@ class SparseDistanceCOOSPMVTest
 
   // input data
   value_idx *indptr, *indices;
-  value_t* data;
+  value_t *data;
 
   // output data
   value_t *out_dists, *out_dists_ref;
@@ -251,7 +243,8 @@ const InputConfiguration<int, float> input_inner_product = {
   {0, 2, 4, 6, 8},
   {0, 1, 0, 1, 0, 1, 0, 1},
   {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f},
-  {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0},
+  {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0,
+   5.0},
   raft::distance::DistanceType::InnerProduct,
   0.0};
 
@@ -282,379 +275,384 @@ const InputConfiguration<int, float> input_l2_unexpanded = {
   raft::distance::DistanceType::L2Unexpanded,
   0.0};
 
-const InputConfiguration<int, float> input_canberra = {
-  10,
-  {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
-  {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4,
-   6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
-  {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167,
-   0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625,
-   0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853,
-   0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,  0.5279, 0.4885, 0.3495, 0.5079,
-   0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
-  {0.0,
-   3.3954660629919076,
-   5.6469232737388815,
-   6.373112846266441,
-   4.0212880272531715,
-   6.916281504639404,
-   5.741508386786526,
-   5.411470999663036,
-   9.0,
-   4.977014354725805,
-   3.3954660629919076,
-   0.0,
-   7.56256082439209,
-   5.540261147481582,
-   4.832322929216881,
-   4.62003193872216,
-   6.498056792320361,
-   4.309846252268695,
-   6.317531174829905,
-   6.016362684141827,
-   5.6469232737388815,
-   7.56256082439209,
-   0.0,
-   5.974878731322299,
-   4.898357301336036,
-   6.442097410320605,
-   5.227077347287883,
-   7.134101195584642,
-   5.457753923371659,
-   7.0,
-   6.373112846266441,
-   5.540261147481582,
-   5.974878731322299,
-   0.0,
-   5.5507273748583,
-   4.897749658726415,
-   9.0,
-   8.398776718824767,
-   3.908281400328807,
-   4.83431066343688,
-   4.0212880272531715,
-   4.832322929216881,
-   4.898357301336036,
-   5.5507273748583,
-   0.0,
-   6.632989819428174,
-   7.438852294822894,
-   5.6631570310967465,
-   7.579428202635459,
-   6.760811985364303,
-   6.916281504639404,
-   4.62003193872216,
-   6.442097410320605,
-   4.897749658726415,
-   6.632989819428174,
-   0.0,
-   5.249404187382862,
-   6.072559523278559,
-   4.07661278488929,
-   6.19678948003145,
-   5.741508386786526,
-   6.498056792320361,
-   5.227077347287883,
-   9.0,
-   7.438852294822894,
-   5.249404187382862,
-   0.0,
-   3.854811639654704,
-   6.652724827169063,
-   5.298236851430971,
-   5.411470999663036,
-   4.309846252268695,
-   7.134101195584642,
-   8.398776718824767,
-   5.6631570310967465,
-   6.072559523278559,
-   3.854811639654704,
-   0.0,
-   7.529184598969917,
-   6.903282911791188,
-   9.0,
-   6.317531174829905,
-   5.457753923371659,
-   3.908281400328807,
-   7.579428202635459,
-   4.07661278488929,
-   6.652724827169063,
-   7.529184598969917,
-   0.0,
-   7.0,
-   4.977014354725805,
-   6.016362684141827,
-   7.0,
-   4.83431066343688,
-   6.760811985364303,
-   6.19678948003145,
-   5.298236851430971,
-   6.903282911791188,
-   7.0,
-   0.0},
-  raft::distance::DistanceType::Canberra,
-  0.0};
-
-const InputConfiguration<int, float> input_lp_unexpanded = {
-  10,
-  {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
-  {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4,
-   6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
-  {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167,
-   0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625,
-   0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853,
-   0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,  0.5279, 0.4885, 0.3495, 0.5079,
-   0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
-  {0.0,
-   1.31462855332296,
-   1.3690307816129905,
-   1.698603990921237,
-   1.3460470789553531,
-   1.6636670712582544,
-   1.2651744044972217,
-   1.1938329352055201,
-   1.8811409082590185,
-   1.3653115050624267,
-   1.31462855332296,
-   0.0,
-   1.9447722703291133,
-   1.42818777206562,
-   1.4685491458946494,
-   1.3071999866010466,
-   1.4988622861692171,
-   0.9698559287406783,
-   1.4972023224597841,
-   1.5243383567266802,
-   1.3690307816129905,
-   1.9447722703291133,
-   0.0,
-   1.2748400840107568,
-   1.0599569946448246,
-   1.546591282841402,
-   1.147526531928459,
-   1.447002179128145,
-   1.5982242387673176,
-   1.3112533607072414,
-   1.698603990921237,
-   1.42818777206562,
-   1.2748400840107568,
-   0.0,
-   1.038121552545461,
-   1.011788365364402,
-   1.3907391109256988,
-   1.3128200942311496,
-   1.19595706584447,
-   1.3233328139624725,
-   1.3460470789553531,
-   1.4685491458946494,
-   1.0599569946448246,
-   1.038121552545461,
-   0.0,
-   1.3642741698145529,
-   1.3493868683808095,
-   1.394942694628328,
-   1.572881849642552,
-   1.380122665319464,
-   1.6636670712582544,
-   1.3071999866010466,
-   1.546591282841402,
-   1.011788365364402,
-   1.3642741698145529,
-   0.0,
-   1.018961640373018,
-   1.0114394258945634,
-   0.8338711034820684,
-   1.1247823842299223,
-   1.2651744044972217,
-   1.4988622861692171,
-   1.147526531928459,
-   1.3907391109256988,
-   1.3493868683808095,
-   1.018961640373018,
-   0.0,
-   0.7701238110357329,
-   1.245486437864406,
-   0.5551259549534626,
-   1.1938329352055201,
-   0.9698559287406783,
-   1.447002179128145,
-   1.3128200942311496,
-   1.394942694628328,
-   1.0114394258945634,
-   0.7701238110357329,
-   0.0,
-   1.1886800117391216,
-   1.0083692448135637,
-   1.8811409082590185,
-   1.4972023224597841,
-   1.5982242387673176,
-   1.19595706584447,
-   1.572881849642552,
-   0.8338711034820684,
-   1.245486437864406,
-   1.1886800117391216,
-   0.0,
-   1.3661374102525012,
-   1.3653115050624267,
-   1.5243383567266802,
-   1.3112533607072414,
-   1.3233328139624725,
-   1.380122665319464,
-   1.1247823842299223,
-   0.5551259549534626,
-   1.0083692448135637,
-   1.3661374102525012,
-   0.0},
-  raft::distance::DistanceType::LpUnexpanded,
-  2.0};
-
-const InputConfiguration<int, float> input_linf = {
-  10,
-  {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
-  {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4,
-   6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
-  {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167,
-   0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625,
-   0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853,
-   0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,  0.5279, 0.4885, 0.3495, 0.5079,
-   0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
-  {0.0,
-   0.9251771844789913,
-   0.9036452083899731,
-   0.9251771844789913,
-   0.8706483735804971,
-   0.9251771844789913,
-   0.717493881903289,
-   0.6920214832303888,
-   0.9251771844789913,
-   0.9251771844789913,
-   0.9251771844789913,
-   0.0,
-   0.9036452083899731,
-   0.8655339692155823,
-   0.8706483735804971,
-   0.8655339692155823,
-   0.8655339692155823,
-   0.6329837991017668,
-   0.8655339692155823,
-   0.8655339692155823,
-   0.9036452083899731,
-   0.9036452083899731,
-   0.0,
-   0.7988276152181608,
-   0.7028075145996631,
-   0.9036452083899731,
-   0.9036452083899731,
-   0.9036452083899731,
-   0.8429599432532096,
-   0.9036452083899731,
-   0.9251771844789913,
-   0.8655339692155823,
-   0.7988276152181608,
-   0.0,
-   0.48376552205293305,
-   0.8206394616536681,
-   0.8206394616536681,
-   0.8206394616536681,
-   0.8429599432532096,
-   0.8206394616536681,
-   0.8706483735804971,
-   0.8706483735804971,
-   0.7028075145996631,
-   0.48376552205293305,
-   0.0,
-   0.8706483735804971,
-   0.8706483735804971,
-   0.8706483735804971,
-   0.8429599432532096,
-   0.8706483735804971,
-   0.9251771844789913,
-   0.8655339692155823,
-   0.9036452083899731,
-   0.8206394616536681,
-   0.8706483735804971,
-   0.0,
-   0.8853924473642432,
-   0.535821510936138,
-   0.6497196601457607,
-   0.8853924473642432,
-   0.717493881903289,
-   0.8655339692155823,
-   0.9036452083899731,
-   0.8206394616536681,
-   0.8706483735804971,
-   0.8853924473642432,
-   0.0,
-   0.5279604218147174,
-   0.6658348373853169,
-   0.33799874888632914,
-   0.6920214832303888,
-   0.6329837991017668,
-   0.9036452083899731,
-   0.8206394616536681,
-   0.8706483735804971,
-   0.535821510936138,
-   0.5279604218147174,
-   0.0,
-   0.662579808115858,
-   0.5079750812968089,
-   0.9251771844789913,
-   0.8655339692155823,
-   0.8429599432532096,
-   0.8429599432532096,
-   0.8429599432532096,
-   0.6497196601457607,
-   0.6658348373853169,
-   0.662579808115858,
-   0.0,
-   0.8429599432532096,
-   0.9251771844789913,
-   0.8655339692155823,
-   0.9036452083899731,
-   0.8206394616536681,
-   0.8706483735804971,
-   0.8853924473642432,
-   0.33799874888632914,
-   0.5079750812968089,
-   0.8429599432532096,
-   0.0},
-  raft::distance::DistanceType::Linf,
+const InputConfiguration<int, float> input_canberra =
+  {10,
+   {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
+    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
+    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
+    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
+    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
+    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
+    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
+    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+   {0.0,
+    3.3954660629919076,
+    5.6469232737388815,
+    6.373112846266441,
+    4.0212880272531715,
+    6.916281504639404,
+    5.741508386786526,
+    5.411470999663036,
+    9.0,
+    4.977014354725805,
+    3.3954660629919076,
+    0.0,
+    7.56256082439209,
+    5.540261147481582,
+    4.832322929216881,
+    4.62003193872216,
+    6.498056792320361,
+    4.309846252268695,
+    6.317531174829905,
+    6.016362684141827,
+    5.6469232737388815,
+    7.56256082439209,
+    0.0,
+    5.974878731322299,
+    4.898357301336036,
+    6.442097410320605,
+    5.227077347287883,
+    7.134101195584642,
+    5.457753923371659,
+    7.0,
+    6.373112846266441,
+    5.540261147481582,
+    5.974878731322299,
+    0.0,
+    5.5507273748583,
+    4.897749658726415,
+    9.0,
+    8.398776718824767,
+    3.908281400328807,
+    4.83431066343688,
+    4.0212880272531715,
+    4.832322929216881,
+    4.898357301336036,
+    5.5507273748583,
+    0.0,
+    6.632989819428174,
+    7.438852294822894,
+    5.6631570310967465,
+    7.579428202635459,
+    6.760811985364303,
+    6.916281504639404,
+    4.62003193872216,
+    6.442097410320605,
+    4.897749658726415,
+    6.632989819428174,
+    0.0,
+    5.249404187382862,
+    6.072559523278559,
+    4.07661278488929,
+    6.19678948003145,
+    5.741508386786526,
+    6.498056792320361,
+    5.227077347287883,
+    9.0,
+    7.438852294822894,
+    5.249404187382862,
+    0.0,
+    3.854811639654704,
+    6.652724827169063,
+    5.298236851430971,
+    5.411470999663036,
+    4.309846252268695,
+    7.134101195584642,
+    8.398776718824767,
+    5.6631570310967465,
+    6.072559523278559,
+    3.854811639654704,
+    0.0,
+    7.529184598969917,
+    6.903282911791188,
+    9.0,
+    6.317531174829905,
+    5.457753923371659,
+    3.908281400328807,
+    7.579428202635459,
+    4.07661278488929,
+    6.652724827169063,
+    7.529184598969917,
+    0.0,
+    7.0,
+    4.977014354725805,
+    6.016362684141827,
+    7.0,
+    4.83431066343688,
+    6.760811985364303,
+    6.19678948003145,
+    5.298236851430971,
+    6.903282911791188,
+    7.0,
+    0.0},
+   raft::distance::DistanceType::Canberra,
+   0.0};
+
+const InputConfiguration<int, float> input_lp_unexpanded =
+  {10,
+   {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
+    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
+    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
+    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
+    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
+    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
+    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
+    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+   {0.0,
+    1.31462855332296,
+    1.3690307816129905,
+    1.698603990921237,
+    1.3460470789553531,
+    1.6636670712582544,
+    1.2651744044972217,
+    1.1938329352055201,
+    1.8811409082590185,
+    1.3653115050624267,
+    1.31462855332296,
+    0.0,
+    1.9447722703291133,
+    1.42818777206562,
+    1.4685491458946494,
+    1.3071999866010466,
+    1.4988622861692171,
+    0.9698559287406783,
+    1.4972023224597841,
+    1.5243383567266802,
+    1.3690307816129905,
+    1.9447722703291133,
+    0.0,
+    1.2748400840107568,
+    1.0599569946448246,
+    1.546591282841402,
+    1.147526531928459,
+    1.447002179128145,
+    1.5982242387673176,
+    1.3112533607072414,
+    1.698603990921237,
+    1.42818777206562,
+    1.2748400840107568,
+    0.0,
+    1.038121552545461,
+    1.011788365364402,
+    1.3907391109256988,
+    1.3128200942311496,
+    1.19595706584447,
+    1.3233328139624725,
+    1.3460470789553531,
+    1.4685491458946494,
+    1.0599569946448246,
+    1.038121552545461,
+    0.0,
+    1.3642741698145529,
+    1.3493868683808095,
+    1.394942694628328,
+    1.572881849642552,
+    1.380122665319464,
+    1.6636670712582544,
+    1.3071999866010466,
+    1.546591282841402,
+    1.011788365364402,
+    1.3642741698145529,
+    0.0,
+    1.018961640373018,
+    1.0114394258945634,
+    0.8338711034820684,
+    1.1247823842299223,
+    1.2651744044972217,
+    1.4988622861692171,
+    1.147526531928459,
+    1.3907391109256988,
+    1.3493868683808095,
+    1.018961640373018,
+    0.0,
+    0.7701238110357329,
+    1.245486437864406,
+    0.5551259549534626,
+    1.1938329352055201,
+    0.9698559287406783,
+    1.447002179128145,
+    1.3128200942311496,
+    1.394942694628328,
+    1.0114394258945634,
+    0.7701238110357329,
+    0.0,
+    1.1886800117391216,
+    1.0083692448135637,
+    1.8811409082590185,
+    1.4972023224597841,
+    1.5982242387673176,
+    1.19595706584447,
+    1.572881849642552,
+    0.8338711034820684,
+    1.245486437864406,
+    1.1886800117391216,
+    0.0,
+    1.3661374102525012,
+    1.3653115050624267,
+    1.5243383567266802,
+    1.3112533607072414,
+    1.3233328139624725,
+    1.380122665319464,
+    1.1247823842299223,
+    0.5551259549534626,
+    1.0083692448135637,
+    1.3661374102525012,
+    0.0},
+   raft::distance::DistanceType::LpUnexpanded,
+   2.0};
+
+const InputConfiguration<int, float> input_linf =
+  {10,
+   {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
+    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
+    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
+    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
+    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
+    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
+    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
+    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+   {0.0,
+    0.9251771844789913,
+    0.9036452083899731,
+    0.9251771844789913,
+    0.8706483735804971,
+    0.9251771844789913,
+    0.717493881903289,
+    0.6920214832303888,
+    0.9251771844789913,
+    0.9251771844789913,
+    0.9251771844789913,
+    0.0,
+    0.9036452083899731,
+    0.8655339692155823,
+    0.8706483735804971,
+    0.8655339692155823,
+    0.8655339692155823,
+    0.6329837991017668,
+    0.8655339692155823,
+    0.8655339692155823,
+    0.9036452083899731,
+    0.9036452083899731,
+    0.0,
+    0.7988276152181608,
+    0.7028075145996631,
+    0.9036452083899731,
+    0.9036452083899731,
+    0.9036452083899731,
+    0.8429599432532096,
+    0.9036452083899731,
+    0.9251771844789913,
+    0.8655339692155823,
+    0.7988276152181608,
+    0.0,
+    0.48376552205293305,
+    0.8206394616536681,
+    0.8206394616536681,
+    0.8206394616536681,
+    0.8429599432532096,
+    0.8206394616536681,
+    0.8706483735804971,
+    0.8706483735804971,
+    0.7028075145996631,
+    0.48376552205293305,
+    0.0,
+    0.8706483735804971,
+    0.8706483735804971,
+    0.8706483735804971,
+    0.8429599432532096,
+    0.8706483735804971,
+    0.9251771844789913,
+    0.8655339692155823,
+    0.9036452083899731,
+    0.8206394616536681,
+    0.8706483735804971,
+    0.0,
+    0.8853924473642432,
+    0.535821510936138,
+    0.6497196601457607,
+    0.8853924473642432,
+    0.717493881903289,
+    0.8655339692155823,
+    0.9036452083899731,
+    0.8206394616536681,
+    0.8706483735804971,
+    0.8853924473642432,
+    0.0,
+    0.5279604218147174,
+    0.6658348373853169,
+    0.33799874888632914,
+    0.6920214832303888,
+    0.6329837991017668,
+    0.9036452083899731,
+    0.8206394616536681,
+    0.8706483735804971,
+    0.535821510936138,
+    0.5279604218147174,
+    0.0,
+    0.662579808115858,
+    0.5079750812968089,
+    0.9251771844789913,
+    0.8655339692155823,
+    0.8429599432532096,
+    0.8429599432532096,
+    0.8429599432532096,
+    0.6497196601457607,
+    0.6658348373853169,
+    0.662579808115858,
+    0.0,
+    0.8429599432532096,
+    0.9251771844789913,
+    0.8655339692155823,
+    0.9036452083899731,
+    0.8206394616536681,
+    0.8706483735804971,
+    0.8853924473642432,
+    0.33799874888632914,
+    0.5079750812968089,
+    0.8429599432532096,
+    0.0},
+   raft::distance::DistanceType::Linf,
+   0.0};
+
+const InputConfiguration<int, float> input_l1 = {
+  4,
+  {0, 1, 1, 2, 4},
+  {3, 2, 0, 1},  // indices
+  {0.99296, 0.42180, 0.11687, 0.305869},
+  {
+    // dense output
+    0.0,
+    0.99296,
+    1.41476,
+    1.415707,
+    0.99296,
+    0.0,
+    0.42180,
+    0.42274,
+    1.41476,
+    0.42180,
+    0.0,
+    0.84454,
+    1.41570,
+    0.42274,
+    0.84454,
+    0.0,
+  },
+  raft::distance::DistanceType::L1,
   0.0};
 
-const InputConfiguration<int, float> input_l1 = {4,
-                                                 {0, 1, 1, 2, 4},
-                                                 {3, 2, 0, 1},  // indices
-                                                 {0.99296, 0.42180, 0.11687, 0.305869},
-                                                 {
-                                                   // dense output
-                                                   0.0,
-                                                   0.99296,
-                                                   1.41476,
-                                                   1.415707,
-                                                   0.99296,
-                                                   0.0,
-                                                   0.42180,
-                                                   0.42274,
-                                                   1.41476,
-                                                   0.42180,
-                                                   0.0,
-                                                   0.84454,
-                                                   1.41570,
-                                                   0.42274,
-                                                   0.84454,
-                                                   0.0,
-                                                 },
-                                                 raft::distance::DistanceType::L1,
-                                                 0.0};
-
 // test dense smem strategy
-const std::vector<SparseDistanceCOOSPMVInputs<int, float, dense_smem_strategy_t>>
-  inputs_dense_strategy = {{input_inner_product},
-                           {input_l2_unexpanded},
-                           {input_canberra},
-                           {input_lp_unexpanded},
-                           {input_linf},
-                           {input_l1}};
+const std::vector<
+  SparseDistanceCOOSPMVInputs<int, float, dense_smem_strategy_t>>
+  inputs_dense_strategy = {{input_inner_product}, {input_l2_unexpanded},
+                           {input_canberra},      {input_lp_unexpanded},
+                           {input_linf},          {input_l1}};
 
 typedef SparseDistanceCOOSPMVTest<int, float, dense_smem_strategy_t>
   SparseDistanceCOOSPMVTestDenseStrategyF;
@@ -664,22 +662,22 @@ INSTANTIATE_TEST_CASE_P(SparseDistanceCOOSPMVTests,
                         ::testing::ValuesIn(inputs_dense_strategy));
 
 // test hash and chunk strategy
-const std::vector<SparseDistanceCOOSPMVInputs<int, float, hash_strategy_t>> inputs_hash_strategy = {
-  {input_inner_product},
-  {input_inner_product, 0.5, 2},
-  {input_l2_unexpanded},
-  {input_l2_unexpanded, 0.5, 2},
-  {input_canberra},
-  {input_canberra, 0.5, 2},
-  {input_canberra, 0.5, 6},
-  {input_lp_unexpanded},
-  {input_lp_unexpanded, 0.5, 2},
-  {input_lp_unexpanded, 0.5, 6},
-  {input_linf},
-  {input_linf, 0.5, 2},
-  {input_linf, 0.5, 6},
-  {input_l1},
-  {input_l1, 0.5, 2}};
+const std::vector<SparseDistanceCOOSPMVInputs<int, float, hash_strategy_t>>
+  inputs_hash_strategy = {{input_inner_product},
+                          {input_inner_product, 0.5, 2},
+                          {input_l2_unexpanded},
+                          {input_l2_unexpanded, 0.5, 2},
+                          {input_canberra},
+                          {input_canberra, 0.5, 2},
+                          {input_canberra, 0.5, 6},
+                          {input_lp_unexpanded},
+                          {input_lp_unexpanded, 0.5, 2},
+                          {input_lp_unexpanded, 0.5, 6},
+                          {input_linf},
+                          {input_linf, 0.5, 2},
+                          {input_linf, 0.5, 6},
+                          {input_l1},
+                          {input_l1, 0.5, 2}};
 
 typedef SparseDistanceCOOSPMVTest<int, float, hash_strategy_t>
   SparseDistanceCOOSPMVTestHashStrategyF;
diff --git a/cpp/test/sparse/distance.cu b/cpp/test/sparse/distance.cu
index 8d6675f954..0589637061 100644
--- a/cpp/test/sparse/distance.cu
+++ b/cpp/test/sparse/distance.cu
@@ -50,8 +50,8 @@ struct SparseDistanceInputs {
 };
 
 template <typename value_idx, typename value_t>
-::std::ostream& operator<<(::std::ostream& os, const SparseDistanceInputs<value_idx, value_t>& dims)
-{
+::std::ostream &operator<<(
+  ::std::ostream &os, const SparseDistanceInputs<value_idx, value_t> &dims) {
   return os;
 }
 
@@ -61,24 +61,24 @@ class SparseDistanceTest
  public:
   SparseDistanceTest() : dist_config(handle) {}
 
-  void SetUp() override
-  {
-    params = ::testing::TestWithParam<SparseDistanceInputs<value_idx, value_t>>::GetParam();
+  void SetUp() override {
+    params = ::testing::TestWithParam<
+      SparseDistanceInputs<value_idx, value_t>>::GetParam();
 
     make_data();
 
-    dist_config.b_nrows   = params.indptr_h.size() - 1;
-    dist_config.b_ncols   = params.n_cols;
-    dist_config.b_nnz     = params.indices_h.size();
-    dist_config.b_indptr  = indptr;
+    dist_config.b_nrows = params.indptr_h.size() - 1;
+    dist_config.b_ncols = params.n_cols;
+    dist_config.b_nnz = params.indices_h.size();
+    dist_config.b_indptr = indptr;
     dist_config.b_indices = indices;
-    dist_config.b_data    = data;
-    dist_config.a_nrows   = params.indptr_h.size() - 1;
-    dist_config.a_ncols   = params.n_cols;
-    dist_config.a_nnz     = params.indices_h.size();
-    dist_config.a_indptr  = indptr;
+    dist_config.b_data = data;
+    dist_config.a_nrows = params.indptr_h.size() - 1;
+    dist_config.a_ncols = params.n_cols;
+    dist_config.a_nnz = params.indices_h.size();
+    dist_config.a_indptr = indptr;
     dist_config.a_indices = indices;
-    dist_config.a_data    = data;
+    dist_config.a_data = data;
 
     int out_size = dist_config.a_nrows * dist_config.b_nrows;
 
@@ -89,8 +89,7 @@ class SparseDistanceTest
     CUDA_CHECK(cudaStreamSynchronize(handle.get_stream()));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaStreamSynchronize(handle.get_stream()));
     CUDA_CHECK(cudaFree(indptr));
     CUDA_CHECK(cudaFree(indices));
@@ -99,34 +98,33 @@ class SparseDistanceTest
     CUDA_CHECK(cudaFree(out_dists_ref));
   }
 
-  void compare()
-  {
-    ASSERT_TRUE(devArrMatch(
-      out_dists_ref, out_dists, params.out_dists_ref_h.size(), CompareApprox<value_t>(1e-3)));
+  void compare() {
+    ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists,
+                            params.out_dists_ref_h.size(),
+                            CompareApprox<value_t>(1e-3)));
   }
 
  protected:
-  void make_data()
-  {
-    std::vector<value_idx> indptr_h  = params.indptr_h;
+  void make_data() {
+    std::vector<value_idx> indptr_h = params.indptr_h;
     std::vector<value_idx> indices_h = params.indices_h;
-    std::vector<value_t> data_h      = params.data_h;
+    std::vector<value_t> data_h = params.data_h;
 
     allocate(indptr, indptr_h.size());
     allocate(indices, indices_h.size());
     allocate(data, data_h.size());
 
-    update_device(indptr, indptr_h.data(), indptr_h.size(), handle.get_stream());
-    update_device(indices, indices_h.data(), indices_h.size(), handle.get_stream());
+    update_device(indptr, indptr_h.data(), indptr_h.size(),
+                  handle.get_stream());
+    update_device(indices, indices_h.data(), indices_h.size(),
+                  handle.get_stream());
     update_device(data, data_h.data(), data_h.size(), handle.get_stream());
 
     std::vector<value_t> out_dists_ref_h = params.out_dists_ref_h;
 
     allocate(out_dists_ref, (indptr_h.size() - 1) * (indptr_h.size() - 1));
 
-    update_device(out_dists_ref,
-                  out_dists_ref_h.data(),
-                  out_dists_ref_h.size(),
+    update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(),
                   dist_config.handle.get_stream());
   }
 
@@ -134,7 +132,7 @@ class SparseDistanceTest
 
   // input data
   value_idx *indptr, *indices;
-  value_t* data;
+  value_t *data;
 
   // output data
   value_t *out_dists, *out_dists_ref;
@@ -189,7 +187,8 @@ const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
    {0, 2, 4, 6, 8},
    {0, 1, 0, 1, 0, 1, 0, 1},
    {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f},
-   {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0},
+   {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0,
+    5.0},
    raft::distance::DistanceType::InnerProduct,
    0.0},
   {2,
@@ -220,33 +219,40 @@ const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
 
   {10,
    {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
-   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4,
-    6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
-   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167,
-    0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625,
-    0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853,
-    0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,  0.5279, 0.4885, 0.3495, 0.5079,
-    0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
-   {0.,         0.39419924, 0.54823225, 0.79593037, 0.45658883, 0.93634219, 0.58146987, 0.44940102,
-    1.,         0.76978799, 0.39419924, 0.,         0.97577154, 0.48904013, 0.48300801, 0.45087445,
-    0.73323749, 0.21050481, 0.54847744, 0.78021386, 0.54823225, 0.97577154, 0.,         0.51413997,
-    0.31195441, 0.96546343, 0.67534399, 0.81665436, 0.8321819,  1.,         0.79593037, 0.48904013,
-    0.51413997, 0.,         0.28605559, 0.35772784, 1.,         0.60889396, 0.43324829, 0.84923694,
-    0.45658883, 0.48300801, 0.31195441, 0.28605559, 0.,         0.58623212, 0.6745457,  0.60287165,
-    0.67676228, 0.73155632, 0.93634219, 0.45087445, 0.96546343, 0.35772784, 0.58623212, 0.,
-    0.77917274, 0.48390993, 0.24558392, 0.99166225, 0.58146987, 0.73323749, 0.67534399, 1.,
-    0.6745457,  0.77917274, 0.,         0.27605686, 0.76064776, 0.61547536, 0.44940102, 0.21050481,
-    0.81665436, 0.60889396, 0.60287165, 0.48390993, 0.27605686, 0.,         0.51360432, 0.68185144,
-    1.,         0.54847744, 0.8321819,  0.43324829, 0.67676228, 0.24558392, 0.76064776, 0.51360432,
-    0.,         1.,         0.76978799, 0.78021386, 1.,         0.84923694, 0.73155632, 0.99166225,
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
+    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
+    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
+    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
+    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
+    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
+    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
+    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+   {0.,         0.39419924, 0.54823225, 0.79593037, 0.45658883, 0.93634219,
+    0.58146987, 0.44940102, 1.,         0.76978799, 0.39419924, 0.,
+    0.97577154, 0.48904013, 0.48300801, 0.45087445, 0.73323749, 0.21050481,
+    0.54847744, 0.78021386, 0.54823225, 0.97577154, 0.,         0.51413997,
+    0.31195441, 0.96546343, 0.67534399, 0.81665436, 0.8321819,  1.,
+    0.79593037, 0.48904013, 0.51413997, 0.,         0.28605559, 0.35772784,
+    1.,         0.60889396, 0.43324829, 0.84923694, 0.45658883, 0.48300801,
+    0.31195441, 0.28605559, 0.,         0.58623212, 0.6745457,  0.60287165,
+    0.67676228, 0.73155632, 0.93634219, 0.45087445, 0.96546343, 0.35772784,
+    0.58623212, 0.,         0.77917274, 0.48390993, 0.24558392, 0.99166225,
+    0.58146987, 0.73323749, 0.67534399, 1.,         0.6745457,  0.77917274,
+    0.,         0.27605686, 0.76064776, 0.61547536, 0.44940102, 0.21050481,
+    0.81665436, 0.60889396, 0.60287165, 0.48390993, 0.27605686, 0.,
+    0.51360432, 0.68185144, 1.,         0.54847744, 0.8321819,  0.43324829,
+    0.67676228, 0.24558392, 0.76064776, 0.51360432, 0.,         1.,
+    0.76978799, 0.78021386, 1.,         0.84923694, 0.73155632, 0.99166225,
     0.61547536, 0.68185144, 1.,         0.},
    raft::distance::DistanceType::CosineExpanded,
    0.0},
 
   {10,
    {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
-   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4,
-    6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
+    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
+    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
    {1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
     1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
     1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.},
@@ -355,13 +361,15 @@ const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
 
   {10,
    {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
-   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4,
-    6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
-   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167,
-    0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625,
-    0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853,
-    0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,  0.5279, 0.4885, 0.3495, 0.5079,
-    0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
+    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
+    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
+    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
+    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
+    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
+    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
+    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
    {0.0,
     3.3954660629919076,
     5.6469232737388815,
@@ -467,13 +475,15 @@ const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
 
   {10,
    {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
-   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4,
-    6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
-   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167,
-    0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625,
-    0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853,
-    0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,  0.5279, 0.4885, 0.3495, 0.5079,
-    0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
+    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
+    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
+    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
+    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
+    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
+    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
+    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
    {0.0,
     1.31462855332296,
     1.3690307816129905,
@@ -579,13 +589,15 @@ const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
 
   {10,
    {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
-   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4,
-    6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
-   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167,
-    0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625,
-    0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853,
-    0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,  0.5279, 0.4885, 0.3495, 0.5079,
-    0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
+    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
+    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
+    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
+    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
+    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
+    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
+    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
    {0.0,
     0.9251771844789913,
     0.9036452083899731,
@@ -691,14 +703,17 @@ const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
 
   {15,
    {0, 5, 8, 9, 15, 20, 26, 31, 34, 38, 45},
-   {0, 1, 5,  6, 9, 1,  4,  14, 7, 3,  4,  7, 9, 11, 14, 0, 3, 7, 8, 12, 0,  2, 5,
-    7, 8, 14, 4, 9, 10, 11, 13, 4, 10, 14, 5, 6, 8,  9,  0, 2, 3, 4, 6,  10, 11},
-   {0.13537497, 0.51440163, 0.17231936, 0.02417618, 0.15372786, 0.17760507, 0.73789274, 0.08450219,
-    1.,         0.20184723, 0.18036963, 0.12581403, 0.13867603, 0.24040536, 0.11288773, 0.00290246,
-    0.09120187, 0.31190555, 0.43245423, 0.16153588, 0.3233026,  0.05279589, 0.1387149,  0.05962761,
-    0.41751856, 0.00804045, 0.03262381, 0.27507131, 0.37245804, 0.16378881, 0.15605804, 0.3867739,
-    0.24908977, 0.36413632, 0.37643732, 0.28910679, 0.0198409,  0.31461499, 0.24412279, 0.08327667,
-    0.04444576, 0.05047969, 0.26190054, 0.2077349,  0.10803964},
+   {0,  1, 5,  6,  9,  1, 4, 14, 7, 3, 4,  7, 9, 11, 14,
+    0,  3, 7,  8,  12, 0, 2, 5,  7, 8, 14, 4, 9, 10, 11,
+    13, 4, 10, 14, 5,  6, 8, 9,  0, 2, 3,  4, 6, 10, 11},
+   {0.13537497, 0.51440163, 0.17231936, 0.02417618, 0.15372786, 0.17760507,
+    0.73789274, 0.08450219, 1.,         0.20184723, 0.18036963, 0.12581403,
+    0.13867603, 0.24040536, 0.11288773, 0.00290246, 0.09120187, 0.31190555,
+    0.43245423, 0.16153588, 0.3233026,  0.05279589, 0.1387149,  0.05962761,
+    0.41751856, 0.00804045, 0.03262381, 0.27507131, 0.37245804, 0.16378881,
+    0.15605804, 0.3867739,  0.24908977, 0.36413632, 0.37643732, 0.28910679,
+    0.0198409,  0.31461499, 0.24412279, 0.08327667, 0.04444576, 0.05047969,
+    0.26190054, 0.2077349,  0.10803964},
    {1.05367121e-08, 8.35309089e-01, 1.00000000e+00, 9.24116813e-01,
     9.90039274e-01, 7.97613546e-01, 8.91271059e-01, 1.00000000e+00,
     6.64669302e-01, 8.59439512e-01, 8.35309089e-01, 1.05367121e-08,
@@ -757,25 +772,31 @@ const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
    {0, 3, 8, 12, 16, 20, 25, 30, 35, 40, 45},
    {0, 3, 4, 0, 1, 2, 3, 4, 1, 2, 3, 4, 0, 2, 3, 4, 0, 1, 3, 4, 0, 1, 2,
     3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4},
-   {0.70862347, 0.8232774,  0.12108795, 0.84527547, 0.94937088, 0.03258545, 0.99584118, 0.76835667,
-    0.34426657, 0.2357925,  0.01274851, 0.11422017, 0.3437756,  0.31967718, 0.5956055,  0.31610373,
-    0.04147273, 0.03724415, 0.21515727, 0.04751052, 0.50283183, 0.99957274, 0.01395933, 0.96032529,
-    0.88438711, 0.46095378, 0.27432481, 0.54294211, 0.54280225, 0.59503329, 0.61364678, 0.22837736,
-    0.56609561, 0.29809423, 0.76736686, 0.56460608, 0.98165371, 0.02140123, 0.19881268, 0.26057815,
-    0.31648823, 0.89874295, 0.27366735, 0.5119944,  0.11416134},
+   {0.70862347, 0.8232774,  0.12108795, 0.84527547, 0.94937088, 0.03258545,
+    0.99584118, 0.76835667, 0.34426657, 0.2357925,  0.01274851, 0.11422017,
+    0.3437756,  0.31967718, 0.5956055,  0.31610373, 0.04147273, 0.03724415,
+    0.21515727, 0.04751052, 0.50283183, 0.99957274, 0.01395933, 0.96032529,
+    0.88438711, 0.46095378, 0.27432481, 0.54294211, 0.54280225, 0.59503329,
+    0.61364678, 0.22837736, 0.56609561, 0.29809423, 0.76736686, 0.56460608,
+    0.98165371, 0.02140123, 0.19881268, 0.26057815, 0.31648823, 0.89874295,
+    0.27366735, 0.5119944,  0.11416134},
    {// dense output
-    0.,         0.48769777, 1.88014197, 0.26127048, 0.26657011, 0.7874794,  0.76962708, 1.122858,
-    1.1232498,  1.08166081, 0.48769777, 0.,         1.31332116, 0.98318907, 0.42661815, 0.09279052,
-    1.35187836, 1.38429055, 0.40658897, 0.56136388, 1.88014197, 1.31332116, 0.,         1.82943642,
-    1.54826077, 1.05918884, 1.59360067, 1.34698954, 0.60215168, 0.46993848, 0.26127048, 0.98318907,
-    1.82943642, 0.,         0.29945563, 1.08494093, 0.22934281, 0.82801925, 1.74288748, 1.50610116,
-    0.26657011, 0.42661815, 1.54826077, 0.29945563, 0.,         0.45060069, 0.77814948, 1.45245711,
-    1.18328348, 0.82486987, 0.7874794,  0.09279052, 1.05918884, 1.08494093, 0.45060069, 0.,
-    1.29899154, 1.40683824, 0.48505269, 0.53862363, 0.76962708, 1.35187836, 1.59360067, 0.22934281,
-    0.77814948, 1.29899154, 0.,         0.33202426, 1.92108999, 1.88812175, 1.122858,   1.38429055,
-    1.34698954, 0.82801925, 1.45245711, 1.40683824, 0.33202426, 0.,         1.47318624, 1.92660889,
-    1.1232498,  0.40658897, 0.60215168, 1.74288748, 1.18328348, 0.48505269, 1.92108999, 1.47318624,
-    0.,         0.24992619, 1.08166081, 0.56136388, 0.46993848, 1.50610116, 0.82486987, 0.53862363,
+    0.,         0.48769777, 1.88014197, 0.26127048, 0.26657011, 0.7874794,
+    0.76962708, 1.122858,   1.1232498,  1.08166081, 0.48769777, 0.,
+    1.31332116, 0.98318907, 0.42661815, 0.09279052, 1.35187836, 1.38429055,
+    0.40658897, 0.56136388, 1.88014197, 1.31332116, 0.,         1.82943642,
+    1.54826077, 1.05918884, 1.59360067, 1.34698954, 0.60215168, 0.46993848,
+    0.26127048, 0.98318907, 1.82943642, 0.,         0.29945563, 1.08494093,
+    0.22934281, 0.82801925, 1.74288748, 1.50610116, 0.26657011, 0.42661815,
+    1.54826077, 0.29945563, 0.,         0.45060069, 0.77814948, 1.45245711,
+    1.18328348, 0.82486987, 0.7874794,  0.09279052, 1.05918884, 1.08494093,
+    0.45060069, 0.,         1.29899154, 1.40683824, 0.48505269, 0.53862363,
+    0.76962708, 1.35187836, 1.59360067, 0.22934281, 0.77814948, 1.29899154,
+    0.,         0.33202426, 1.92108999, 1.88812175, 1.122858,   1.38429055,
+    1.34698954, 0.82801925, 1.45245711, 1.40683824, 0.33202426, 0.,
+    1.47318624, 1.92660889, 1.1232498,  0.40658897, 0.60215168, 1.74288748,
+    1.18328348, 0.48505269, 1.92108999, 1.47318624, 0.,         0.24992619,
+    1.08166081, 0.56136388, 0.46993848, 1.50610116, 0.82486987, 0.53862363,
     1.88812175, 1.92660889, 0.24992619, 0.},
    raft::distance::DistanceType::CorrelationExpanded,
    0.0},
@@ -784,11 +805,12 @@ const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
    {1, 4, 0, 4, 1, 3, 0, 1, 3, 0},
    {1., 1., 1., 1., 1., 1., 1., 1., 1., 1.},
    {// dense output
-    0.,  1.,  1.,  1., 0.8, 1., 1.,  0.8, 1., 1.,  1.,  0., 0.8, 1., 1.,  1.,  1.,  1.,  1., 1.,
-    1.,  0.8, 0.,  1., 1.,  1., 0.8, 1.,  1., 0.8, 1.,  1., 1.,  0., 1.,  1.,  1.,  1.,  1., 1.,
-    0.8, 1.,  1.,  1., 0.,  1., 1.,  0.8, 1., 1.,  1.,  1., 1.,  1., 1.,  0.,  1.,  0.8, 1., 1.,
-    1.,  1.,  0.8, 1., 1.,  1., 0.,  1.,  1., 0.8, 0.8, 1., 1.,  1., 0.8, 0.8, 1.,  0.,  1., 1.,
-    1.,  1.,  1.,  1., 1.,  1., 1.,  1.,  0., 1.,  1.,  1., 0.8, 1., 1.,  1.,  0.8, 1.,  1., 0.},
+    0., 1.,  1.,  1., 0.8, 1., 1.,  0.8, 1., 1.,  1., 0.,  0.8, 1.,  1., 1., 1.,
+    1., 1.,  1.,  1., 0.8, 0., 1.,  1.,  1., 0.8, 1., 1.,  0.8, 1.,  1., 1., 0.,
+    1., 1.,  1.,  1., 1.,  1., 0.8, 1.,  1., 1.,  0., 1.,  1.,  0.8, 1., 1., 1.,
+    1., 1.,  1.,  1., 0.,  1., 0.8, 1.,  1., 1.,  1., 0.8, 1.,  1.,  1., 0., 1.,
+    1., 0.8, 0.8, 1., 1.,  1., 0.8, 0.8, 1., 0.,  1., 1.,  1.,  1.,  1., 1., 1.,
+    1., 1.,  1.,  0., 1.,  1., 1.,  0.8, 1., 1.,  1., 0.8, 1.,  1.,  0.},
    raft::distance::DistanceType::RusselRaoExpanded,
    0.0},
   {5,
@@ -796,12 +818,13 @@ const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
    {0, 3, 4, 4, 2, 3, 0, 2, 3, 2},
    {1., 1., 1., 1., 1., 1., 1., 1., 1., 1.},
    {// dense output
-    0.,  0.2, 0.6, 0.2, 0.4, 0.2, 0.6, 0.4, 0.4, 0.2, 0.2, 0.,  0.4, 0.,  0.2, 0.,  0.4,
-    0.6, 0.2, 0.,  0.6, 0.4, 0.,  0.4, 0.2, 0.4, 0.4, 0.6, 0.6, 0.4, 0.2, 0.,  0.4, 0.,
-    0.2, 0.,  0.4, 0.6, 0.2, 0.,  0.4, 0.2, 0.2, 0.2, 0.,  0.2, 0.6, 0.8, 0.4, 0.2, 0.2,
-    0.,  0.4, 0.,  0.2, 0.,  0.4, 0.6, 0.2, 0.,  0.6, 0.4, 0.4, 0.4, 0.6, 0.4, 0.,  0.2,
-    0.2, 0.4, 0.4, 0.6, 0.6, 0.6, 0.8, 0.6, 0.2, 0.,  0.4, 0.6, 0.4, 0.2, 0.6, 0.2, 0.4,
-    0.2, 0.2, 0.4, 0.,  0.2, 0.2, 0.,  0.4, 0.,  0.2, 0.,  0.4, 0.6, 0.2, 0.},
+    0.,  0.2, 0.6, 0.2, 0.4, 0.2, 0.6, 0.4, 0.4, 0.2, 0.2, 0.,  0.4, 0.,  0.2,
+    0.,  0.4, 0.6, 0.2, 0.,  0.6, 0.4, 0.,  0.4, 0.2, 0.4, 0.4, 0.6, 0.6, 0.4,
+    0.2, 0.,  0.4, 0.,  0.2, 0.,  0.4, 0.6, 0.2, 0.,  0.4, 0.2, 0.2, 0.2, 0.,
+    0.2, 0.6, 0.8, 0.4, 0.2, 0.2, 0.,  0.4, 0.,  0.2, 0.,  0.4, 0.6, 0.2, 0.,
+    0.6, 0.4, 0.4, 0.4, 0.6, 0.4, 0.,  0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.6, 0.8,
+    0.6, 0.2, 0.,  0.4, 0.6, 0.4, 0.2, 0.6, 0.2, 0.4, 0.2, 0.2, 0.4, 0.,  0.2,
+    0.2, 0.,  0.4, 0.,  0.2, 0.,  0.4, 0.6, 0.2, 0.},
    raft::distance::DistanceType::HammingUnexpanded,
    0.0},
   {3,
@@ -845,8 +868,7 @@ const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
 
 typedef SparseDistanceTest<int, float> SparseDistanceTestF;
 TEST_P(SparseDistanceTestF, Result) { compare(); }
-INSTANTIATE_TEST_CASE_P(SparseDistanceTests,
-                        SparseDistanceTestF,
+INSTANTIATE_TEST_CASE_P(SparseDistanceTests, SparseDistanceTestF,
                         ::testing::ValuesIn(inputs_i32_f));
 
 };  // namespace distance
diff --git a/cpp/test/sparse/filter.cu b/cpp/test/sparse/filter.cu
index 02be95c8a8..f7954f899f 100644
--- a/cpp/test/sparse/filter.cu
+++ b/cpp/test/sparse/filter.cu
@@ -36,7 +36,8 @@ struct SparseFilterInputs {
 };
 
 template <typename T>
-class SparseFilterTests : public ::testing::TestWithParam<SparseFilterInputs<T>> {
+class SparseFilterTests
+  : public ::testing::TestWithParam<SparseFilterInputs<T>> {
  protected:
   void SetUp() override {}
 
@@ -49,14 +50,14 @@ class SparseFilterTests : public ::testing::TestWithParam<SparseFilterInputs<T>>
 const std::vector<SparseFilterInputs<float>> inputsf = {{5, 10, 5, 1234ULL}};
 
 typedef SparseFilterTests<float> COORemoveZeros;
-TEST_P(COORemoveZeros, Result)
-{
+TEST_P(COORemoveZeros, Result) {
   cudaStream_t stream;
   cudaStreamCreate(&stream);
-  std::shared_ptr<raft::mr::device::allocator> alloc(new raft::mr::device::default_allocator);
+  std::shared_ptr<raft::mr::device::allocator> alloc(
+    new raft::mr::device::default_allocator);
   params = ::testing::TestWithParam<SparseFilterInputs<float>>::GetParam();
 
-  float* in_h_vals = new float[params.nnz];
+  float *in_h_vals = new float[params.nnz];
 
   COO<float> in(alloc, stream, params.nnz, 5, 5);
 
@@ -69,8 +70,8 @@ TEST_P(COORemoveZeros, Result)
   in_h_vals[2] = 0;
   in_h_vals[3] = 0;
 
-  int* in_h_rows = new int[params.nnz];
-  int* in_h_cols = new int[params.nnz];
+  int *in_h_rows = new int[params.nnz];
+  int *in_h_cols = new int[params.nnz];
 
   for (int i = 0; i < params.nnz; i++) {
     in_h_rows[i] = params.nnz - i - 1;
@@ -86,9 +87,9 @@ TEST_P(COORemoveZeros, Result)
   int out_rows_ref_h[2] = {0, 3};
   int out_cols_ref_h[2] = {4, 1};
 
-  float* out_vals_ref_h = (float*)malloc(2 * sizeof(float));
-  out_vals_ref_h[0]     = in_h_vals[4];
-  out_vals_ref_h[1]     = in_h_vals[1];
+  float *out_vals_ref_h = (float *)malloc(2 * sizeof(float));
+  out_vals_ref_h[0] = in_h_vals[4];
+  out_vals_ref_h[1] = in_h_vals[1];
 
   COO<float> out_ref(alloc, stream, 2, 5, 5);
   COO<float> out(alloc, stream);
@@ -99,9 +100,12 @@ TEST_P(COORemoveZeros, Result)
 
   op::coo_remove_zeros<32, float>(&in, &out, alloc, stream);
 
-  ASSERT_TRUE(raft::devArrMatch<int>(out_ref.rows(), out.rows(), 2, raft::Compare<int>()));
-  ASSERT_TRUE(raft::devArrMatch<int>(out_ref.cols(), out.cols(), 2, raft::Compare<int>()));
-  ASSERT_TRUE(raft::devArrMatch<float>(out_ref.vals(), out.vals(), 2, raft::Compare<float>()));
+  ASSERT_TRUE(raft::devArrMatch<int>(out_ref.rows(), out.rows(), 2,
+                                     raft::Compare<int>()));
+  ASSERT_TRUE(raft::devArrMatch<int>(out_ref.cols(), out.cols(), 2,
+                                     raft::Compare<int>()));
+  ASSERT_TRUE(raft::devArrMatch<float>(out_ref.vals(), out.vals(), 2,
+                                       raft::Compare<float>()));
 
   CUDA_CHECK(cudaStreamDestroy(stream));
   free(out_vals_ref_h);
@@ -111,7 +115,8 @@ TEST_P(COORemoveZeros, Result)
   delete[] in_h_vals;
 }
 
-INSTANTIATE_TEST_CASE_P(SparseFilterTests, COORemoveZeros, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(SparseFilterTests, COORemoveZeros,
+                        ::testing::ValuesIn(inputsf));
 
 }  // namespace sparse
 }  // namespace raft
diff --git a/cpp/test/sparse/knn.cu b/cpp/test/sparse/knn.cu
index ca9da0bc05..8c3bf36318 100644
--- a/cpp/test/sparse/knn.cu
+++ b/cpp/test/sparse/knn.cu
@@ -50,53 +50,39 @@ struct SparseKNNInputs {
   int batch_size_index = 2;
   int batch_size_query = 2;
 
-  raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded;
+  raft::distance::DistanceType metric =
+    raft::distance::DistanceType::L2SqrtExpanded;
 };
 
 template <typename value_idx, typename value_t>
-::std::ostream& operator<<(::std::ostream& os, const SparseKNNInputs<value_idx, value_t>& dims)
-{
+::std::ostream &operator<<(::std::ostream &os,
+                           const SparseKNNInputs<value_idx, value_t> &dims) {
   return os;
 }
 
 template <typename value_idx, typename value_t>
-class SparseKNNTest : public ::testing::TestWithParam<SparseKNNInputs<value_idx, value_t>> {
+class SparseKNNTest
+  : public ::testing::TestWithParam<SparseKNNInputs<value_idx, value_t>> {
  public:
-  void SetUp() override
-  {
-    params = ::testing::TestWithParam<SparseKNNInputs<value_idx, value_t>>::GetParam();
+  void SetUp() override {
+    params =
+      ::testing::TestWithParam<SparseKNNInputs<value_idx, value_t>>::GetParam();
 
     n_rows = params.indptr_h.size() - 1;
-    nnz    = params.indices_h.size();
-    k      = params.k;
+    nnz = params.indices_h.size();
+    k = params.k;
 
     make_data();
 
-    raft::sparse::selection::brute_force_knn<value_idx, value_t>(indptr,
-                                                                 indices,
-                                                                 data,
-                                                                 nnz,
-                                                                 n_rows,
-                                                                 params.n_cols,
-                                                                 indptr,
-                                                                 indices,
-                                                                 data,
-                                                                 nnz,
-                                                                 n_rows,
-                                                                 params.n_cols,
-                                                                 out_indices,
-                                                                 out_dists,
-                                                                 k,
-                                                                 handle,
-                                                                 params.batch_size_index,
-                                                                 params.batch_size_query,
-                                                                 params.metric);
+    raft::sparse::selection::brute_force_knn<value_idx, value_t>(
+      indptr, indices, data, nnz, n_rows, params.n_cols, indptr, indices, data,
+      nnz, n_rows, params.n_cols, out_indices, out_dists, k, handle,
+      params.batch_size_index, params.batch_size_query, params.metric);
 
     CUDA_CHECK(cudaStreamSynchronize(handle.get_stream()));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(indptr));
     CUDA_CHECK(cudaFree(indices));
     CUDA_CHECK(cudaFree(data));
@@ -106,37 +92,39 @@ class SparseKNNTest : public ::testing::TestWithParam<SparseKNNInputs<value_idx,
     CUDA_CHECK(cudaFree(out_dists_ref));
   }
 
-  void compare()
-  {
-    ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, n_rows * k, CompareApprox<value_t>(1e-4)));
-    ASSERT_TRUE(devArrMatch(out_indices_ref, out_indices, n_rows * k, Compare<value_idx>()));
+  void compare() {
+    ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, n_rows * k,
+                            CompareApprox<value_t>(1e-4)));
+    ASSERT_TRUE(devArrMatch(out_indices_ref, out_indices, n_rows * k,
+                            Compare<value_idx>()));
   }
 
  protected:
-  void make_data()
-  {
-    std::vector<value_idx> indptr_h  = params.indptr_h;
+  void make_data() {
+    std::vector<value_idx> indptr_h = params.indptr_h;
     std::vector<value_idx> indices_h = params.indices_h;
-    std::vector<value_t> data_h      = params.data_h;
+    std::vector<value_t> data_h = params.data_h;
 
     allocate(indptr, indptr_h.size());
     allocate(indices, indices_h.size());
     allocate(data, data_h.size());
 
-    update_device(indptr, indptr_h.data(), indptr_h.size(), handle.get_stream());
-    update_device(indices, indices_h.data(), indices_h.size(), handle.get_stream());
+    update_device(indptr, indptr_h.data(), indptr_h.size(),
+                  handle.get_stream());
+    update_device(indices, indices_h.data(), indices_h.size(),
+                  handle.get_stream());
     update_device(data, data_h.data(), data_h.size(), handle.get_stream());
 
-    std::vector<value_t> out_dists_ref_h     = params.out_dists_ref_h;
+    std::vector<value_t> out_dists_ref_h = params.out_dists_ref_h;
     std::vector<value_idx> out_indices_ref_h = params.out_indices_ref_h;
 
     allocate(out_indices_ref, out_indices_ref_h.size());
     allocate(out_dists_ref, out_dists_ref_h.size());
 
-    update_device(
-      out_indices_ref, out_indices_ref_h.data(), out_indices_ref_h.size(), handle.get_stream());
-    update_device(
-      out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), handle.get_stream());
+    update_device(out_indices_ref, out_indices_ref_h.data(),
+                  out_indices_ref_h.size(), handle.get_stream());
+    update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(),
+                  handle.get_stream());
 
     allocate(out_dists, n_rows * k);
     allocate(out_indices, n_rows * k);
@@ -148,14 +136,14 @@ class SparseKNNTest : public ::testing::TestWithParam<SparseKNNInputs<value_idx,
 
   // input data
   value_idx *indptr, *indices;
-  value_t* data;
+  value_t *data;
 
   // output data
-  value_idx* out_indices;
-  value_t* out_dists;
+  value_idx *out_indices;
+  value_t *out_dists;
 
-  value_idx* out_indices_ref;
-  value_t* out_dists_ref;
+  value_idx *out_indices_ref;
+  value_t *out_dists_ref;
 
   SparseKNNInputs<value_idx, value_t> params;
 };
@@ -173,7 +161,8 @@ const std::vector<SparseKNNInputs<int, float>> inputs_i32_f = {
    raft::distance::DistanceType::L2SqrtExpanded}};
 typedef SparseKNNTest<int, float> SparseKNNTestF;
 TEST_P(SparseKNNTestF, Result) { compare(); }
-INSTANTIATE_TEST_CASE_P(SparseKNNTest, SparseKNNTestF, ::testing::ValuesIn(inputs_i32_f));
+INSTANTIATE_TEST_CASE_P(SparseKNNTest, SparseKNNTestF,
+                        ::testing::ValuesIn(inputs_i32_f));
 
 };  // end namespace selection
 };  // end namespace sparse
diff --git a/cpp/test/sparse/knn_graph.cu b/cpp/test/sparse/knn_graph.cu
index f660e68aa3..ec41b32374 100644
--- a/cpp/test/sparse/knn_graph.cu
+++ b/cpp/test/sparse/knn_graph.cu
@@ -29,9 +29,8 @@ namespace raft {
 namespace sparse {
 
 template <typename value_idx, typename value_t>
-__global__ void assert_symmetry(
-  value_idx* rows, value_idx* cols, value_t* vals, value_idx nnz, value_idx* sum)
-{
+__global__ void assert_symmetry(value_idx *rows, value_idx *cols, value_t *vals,
+                                value_idx nnz, value_idx *sum) {
   int tid = blockDim.x * blockIdx.x + threadIdx.x;
 
   if (tid >= nnz) return;
@@ -51,21 +50,22 @@ struct KNNGraphInputs {
 };
 
 template <typename value_idx, typename value_t>
-::std::ostream& operator<<(::std::ostream& os, const KNNGraphInputs<value_idx, value_t>& dims)
-{
+::std::ostream &operator<<(::std::ostream &os,
+                           const KNNGraphInputs<value_idx, value_t> &dims) {
   return os;
 }
 
 template <typename value_idx, typename value_t>
-class KNNGraphTest : public ::testing::TestWithParam<KNNGraphInputs<value_idx, value_t>> {
-  void SetUp() override
-  {
-    params = ::testing::TestWithParam<KNNGraphInputs<value_idx, value_t>>::GetParam();
+class KNNGraphTest
+  : public ::testing::TestWithParam<KNNGraphInputs<value_idx, value_t>> {
+  void SetUp() override {
+    params =
+      ::testing::TestWithParam<KNNGraphInputs<value_idx, value_t>>::GetParam();
 
     raft::handle_t handle;
 
     auto alloc = handle.get_device_allocator();
-    stream     = handle.get_stream();
+    stream = handle.get_stream();
 
     out = new raft::sparse::COO<value_t, value_idx>(alloc, stream);
 
@@ -74,7 +74,8 @@ class KNNGraphTest : public ::testing::TestWithParam<KNNGraphInputs<value_idx, v
     update_device(X, params.X.data(), params.X.size(), stream);
 
     raft::sparse::selection::knn_graph(
-      handle, X, params.m, params.n, raft::distance::DistanceType::L2Unexpanded, *out);
+      handle, X, params.m, params.n, raft::distance::DistanceType::L2Unexpanded,
+      *out);
 
     rmm::device_uvector<value_idx> sum(1, stream);
 
@@ -90,8 +91,7 @@ class KNNGraphTest : public ::testing::TestWithParam<KNNGraphInputs<value_idx, v
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(X));
 
     delete out;
@@ -101,9 +101,9 @@ class KNNGraphTest : public ::testing::TestWithParam<KNNGraphInputs<value_idx, v
   cudaStream_t stream;
 
   // input data
-  raft::sparse::COO<value_t, value_idx>* out;
+  raft::sparse::COO<value_t, value_idx> *out;
 
-  value_t* X;
+  value_t *X;
 
   value_idx sum_h;
 
@@ -115,15 +115,13 @@ const std::vector<KNNGraphInputs<int, float>> knn_graph_inputs_fint = {
   {4, 2, {0, 100, 0.01, 0.02, 5000, 10000, -5, -2}, 2}};
 
 typedef KNNGraphTest<int, float> KNNGraphTestF_int;
-TEST_P(KNNGraphTestF_int, Result)
-{
+TEST_P(KNNGraphTestF_int, Result) {
   // nnz should not be larger than twice m * k
   ASSERT_TRUE(out->nnz <= (params.m * params.k * 2));
   ASSERT_TRUE(sum_h == 0);
 }
 
-INSTANTIATE_TEST_CASE_P(KNNGraphTest,
-                        KNNGraphTestF_int,
+INSTANTIATE_TEST_CASE_P(KNNGraphTest, KNNGraphTestF_int,
                         ::testing::ValuesIn(knn_graph_inputs_fint));
 
 }  // namespace sparse
diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/sparse/linkage.cu
index 0ca7cec4e9..ce567e4298 100644
--- a/cpp/test/sparse/linkage.cu
+++ b/cpp/test/sparse/linkage.cu
@@ -55,44 +55,45 @@ struct LinkageInputs {
  * @param b: number of pairs of points that both the clusters have classified differently
  */
 template <typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y>
-__global__ void computeTheNumerator(
-  const T* firstClusterArray, const T* secondClusterArray, uint64_t size, uint64_t* a, uint64_t* b)
-{
-  // calculating the indices of pairs of datapoints compared by the current thread
+__global__ void computeTheNumerator(const T* firstClusterArray,
+                                    const T* secondClusterArray, uint64_t size,
+                                    uint64_t* a, uint64_t* b) {
+  //calculating the indices of pairs of datapoints compared by the current thread
   uint64_t j = threadIdx.x + blockIdx.x * blockDim.x;
   uint64_t i = threadIdx.y + blockIdx.y * blockDim.y;
 
-  // thread-local variables to count a and b
+  //thread-local variables to count a and b
   uint64_t myA = 0, myB = 0;
 
   if (i < size && j < size && j < i) {
-    // checking if the pair have been classified the same by both the clusters
+    //checking if the pair have been classified the same by both the clusters
     if (firstClusterArray[i] == firstClusterArray[j] &&
         secondClusterArray[i] == secondClusterArray[j]) {
       ++myA;
     }
 
-    // checking if the pair have been classified differently by both the clusters
+    //checking if the pair have been classified differently by both the clusters
     else if (firstClusterArray[i] != firstClusterArray[j] &&
              secondClusterArray[i] != secondClusterArray[j]) {
       ++myB;
     }
   }
 
-  // specialize blockReduce for a 2D block of 1024 threads of type uint64_t
-  typedef cub::BlockReduce<uint64_t, BLOCK_DIM_X, cub::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y>
+  //specialize blockReduce for a 2D block of 1024 threads of type uint64_t
+  typedef cub::BlockReduce<uint64_t, BLOCK_DIM_X,
+                           cub::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y>
     BlockReduce;
 
-  // Allocate shared memory for blockReduce
+  //Allocate shared memory for blockReduce
   __shared__ typename BlockReduce::TempStorage temp_storage;
 
-  // summing up thread-local counts specific to a block
+  //summing up thread-local counts specific to a block
   myA = BlockReduce(temp_storage).Sum(myA);
   __syncthreads();
   myB = BlockReduce(temp_storage).Sum(myB);
   __syncthreads();
 
-  // executed once per block
+  //executed once per block
   if (threadIdx.x == 0 && threadIdx.y == 0) {
     raft::myAtomicAdd<unsigned long long int>((unsigned long long int*)a, myA);
     raft::myAtomicAdd<unsigned long long int>((unsigned long long int*)b, myB);
@@ -100,105 +101,102 @@ __global__ void computeTheNumerator(
 }
 
 /**
- * @brief Function to calculate RandIndex
- * <a href="https://en.wikipedia.org/wiki/Rand_index">more info on rand index</a>
- * @param firstClusterArray: the array of classes of type T
- * @param secondClusterArray: the array of classes of type T
- * @param size: the size of the data points of type uint64_t
- * @param allocator: object that takes care of temporary device memory allocation of type
- * std::shared_ptr<MLCommon::deviceAllocator>
- * @param stream: the cudaStream object
- */
+* @brief Function to calculate RandIndex
+* <a href="https://en.wikipedia.org/wiki/Rand_index">more info on rand index</a>
+* @param firstClusterArray: the array of classes of type T
+* @param secondClusterArray: the array of classes of type T
+* @param size: the size of the data points of type uint64_t
+* @param allocator: object that takes care of temporary device memory allocation of type std::shared_ptr<MLCommon::deviceAllocator>
+* @param stream: the cudaStream object
+*/
 template <typename T>
-double compute_rand_index(T* firstClusterArray,
-                          T* secondClusterArray,
-                          uint64_t size,
-                          std::shared_ptr<raft::mr::device::allocator> allocator,
-                          cudaStream_t stream)
-{
-  // rand index for size less than 2 is not defined
+double compute_rand_index(
+  T* firstClusterArray, T* secondClusterArray, uint64_t size,
+  std::shared_ptr<raft::mr::device::allocator> allocator, cudaStream_t stream) {
+  //rand index for size less than 2 is not defined
   ASSERT(size >= 2, "Rand Index for size less than 2 not defined!");
 
-  // allocating and initializing memory for a and b in the GPU
+  //allocating and initializing memory for a and b in the GPU
   raft::mr::device::buffer<uint64_t> arr_buf(allocator, stream, 2);
   CUDA_CHECK(cudaMemsetAsync(arr_buf.data(), 0, 2 * sizeof(uint64_t), stream));
 
-  // kernel configuration
+  //kernel configuration
   static const int BLOCK_DIM_Y = 16, BLOCK_DIM_X = 16;
   dim3 numThreadsPerBlock(BLOCK_DIM_X, BLOCK_DIM_Y);
   dim3 numBlocks(raft::ceildiv<int>(size, numThreadsPerBlock.x),
                  raft::ceildiv<int>(size, numThreadsPerBlock.y));
 
-  // calling the kernel
-  computeTheNumerator<T, BLOCK_DIM_X, BLOCK_DIM_Y><<<numBlocks, numThreadsPerBlock, 0, stream>>>(
-    firstClusterArray, secondClusterArray, size, arr_buf.data(), arr_buf.data() + 1);
+  //calling the kernel
+  computeTheNumerator<T, BLOCK_DIM_X, BLOCK_DIM_Y>
+    <<<numBlocks, numThreadsPerBlock, 0, stream>>>(
+      firstClusterArray, secondClusterArray, size, arr_buf.data(),
+      arr_buf.data() + 1);
 
-  // synchronizing and updating the calculated values of a and b from device to host
+  //synchronizing and updating the calculated values of a and b from device to host
   uint64_t ab_host[2] = {0};
   raft::update_host(ab_host, arr_buf.data(), 2, stream);
   CUDA_CHECK(cudaStreamSynchronize(stream));
 
-  // error handling
+  //error handling
   CUDA_CHECK(cudaGetLastError());
 
-  // denominator
+  //denominator
   uint64_t nChooseTwo = size * (size - 1) / 2;
 
-  // calculating the rand_index
+  //calculating the rand_index
   return (double)(((double)(ab_host[0] + ab_host[1])) / (double)nChooseTwo);
 }
 
 template <typename T, typename IdxT>
-::std::ostream& operator<<(::std::ostream& os, const LinkageInputs<T, IdxT>& dims)
-{
+::std::ostream& operator<<(::std::ostream& os,
+                           const LinkageInputs<T, IdxT>& dims) {
   return os;
 }
 
 template <typename T, typename IdxT>
 class LinkageTest : public ::testing::TestWithParam<LinkageInputs<T, IdxT>> {
  protected:
-  void basicTest()
-  {
+  void basicTest() {
     raft::handle_t handle;
 
     params = ::testing::TestWithParam<LinkageInputs<T, IdxT>>::GetParam();
 
-    rmm::device_uvector<T> data(params.n_row * params.n_col, handle.get_stream());
+    rmm::device_uvector<T> data(params.n_row * params.n_col,
+                                handle.get_stream());
 
     // Allocate result labels and expected labels on device
     raft::allocate(labels, params.n_row);
     raft::allocate(labels_ref, params.n_row);
 
-    raft::copy(data.data(), params.data.data(), data.size(), handle.get_stream());
-    raft::copy(labels_ref, params.expected_labels.data(), params.n_row, handle.get_stream());
+    raft::copy(data.data(), params.data.data(), data.size(),
+               handle.get_stream());
+    raft::copy(labels_ref, params.expected_labels.data(), params.n_row,
+               handle.get_stream());
 
     raft::hierarchy::linkage_output<IdxT, T> out_arrs;
     out_arrs.labels = labels;
 
-    rmm::device_uvector<IdxT> out_children(params.n_row * 2, handle.get_stream());
+    rmm::device_uvector<IdxT> out_children(params.n_row * 2,
+                                           handle.get_stream());
 
     out_arrs.children = out_children.data();
 
-    raft::hierarchy::single_linkage<IdxT, T, raft::hierarchy::LinkageDistance::KNN_GRAPH>(
-      handle,
-      data.data(),
-      params.n_row,
-      params.n_col,
-      raft::distance::DistanceType::L2SqrtExpanded,
-      &out_arrs,
-      params.c,
+    raft::hierarchy::single_linkage<
+      IdxT, T, raft::hierarchy::LinkageDistance::KNN_GRAPH>(
+      handle, data.data(), params.n_row, params.n_col,
+      raft::distance::DistanceType::L2SqrtExpanded, &out_arrs, params.c,
       params.n_clusters);
 
     CUDA_CHECK(cudaStreamSynchronize(handle.get_stream()));
 
-    score = compute_rand_index(
-      labels, labels_ref, params.n_row, handle.get_device_allocator(), handle.get_stream());
+    score =
+      compute_rand_index(labels, labels_ref, params.n_row,
+                         handle.get_device_allocator(), handle.get_stream());
   }
 
   void SetUp() override { basicTest(); }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(labels));
     CUDA_CHECK(cudaFree(labels_ref));
   }
@@ -214,12 +212,14 @@ const std::vector<LinkageInputs<float, int>> linkage_inputsf2 = {
   // Test n_clusters == n_points
   {10,
    5,
-   {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, 0.77782677, 0.43772379,
-    0.4035871,  0.3282796,  0.47544681, 0.59862974, 0.12319357, 0.06239463, 0.28200272, 0.1345717,
-    0.50498218, 0.5113505,  0.16233086, 0.62165332, 0.42281548, 0.933117,   0.41386077, 0.23264562,
-    0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674,  0.84854131, 0.28890216,
-    0.85267903, 0.74703138, 0.83842071, 0.34942792, 0.27864171, 0.70911132, 0.21338564, 0.32035554,
-    0.73788331, 0.46926692, 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396,
+   {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392,
+    0.77782677, 0.43772379, 0.4035871,  0.3282796,  0.47544681, 0.59862974,
+    0.12319357, 0.06239463, 0.28200272, 0.1345717,  0.50498218, 0.5113505,
+    0.16233086, 0.62165332, 0.42281548, 0.933117,   0.41386077, 0.23264562,
+    0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674,
+    0.84854131, 0.28890216, 0.85267903, 0.74703138, 0.83842071, 0.34942792,
+    0.27864171, 0.70911132, 0.21338564, 0.32035554, 0.73788331, 0.46926692,
+    0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396,
     0.76166195, 0.66613745},
    {9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
    10,
@@ -227,7 +227,8 @@ const std::vector<LinkageInputs<float, int>> linkage_inputsf2 = {
   //  // Test outlier points
   {9,
    2,
-   {-1, -50, 3, 4, 5000, 10000, 1, 3, 4, 5, 0.000005, 0.00002, 2000000, 500000, 10, 50, 30, 5},
+   {-1, -50, 3, 4, 5000, 10000, 1, 3, 4, 5, 0.000005, 0.00002, 2000000, 500000,
+    10, 50, 30, 5},
    {6, 0, 5, 0, 0, 4, 3, 2, 1},
    7,
    -1},
@@ -235,12 +236,14 @@ const std::vector<LinkageInputs<float, int>> linkage_inputsf2 = {
   // Test n_clusters == (n_points / 2)
   {10,
    5,
-   {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, 0.77782677, 0.43772379,
-    0.4035871,  0.3282796,  0.47544681, 0.59862974, 0.12319357, 0.06239463, 0.28200272, 0.1345717,
-    0.50498218, 0.5113505,  0.16233086, 0.62165332, 0.42281548, 0.933117,   0.41386077, 0.23264562,
-    0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674,  0.84854131, 0.28890216,
-    0.85267903, 0.74703138, 0.83842071, 0.34942792, 0.27864171, 0.70911132, 0.21338564, 0.32035554,
-    0.73788331, 0.46926692, 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396,
+   {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392,
+    0.77782677, 0.43772379, 0.4035871,  0.3282796,  0.47544681, 0.59862974,
+    0.12319357, 0.06239463, 0.28200272, 0.1345717,  0.50498218, 0.5113505,
+    0.16233086, 0.62165332, 0.42281548, 0.933117,   0.41386077, 0.23264562,
+    0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674,
+    0.84854131, 0.28890216, 0.85267903, 0.74703138, 0.83842071, 0.34942792,
+    0.27864171, 0.70911132, 0.21338564, 0.32035554, 0.73788331, 0.46926692,
+    0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396,
     0.76166195, 0.66613745},
    {1, 0, 4, 0, 0, 3, 2, 0, 2, 1},
    5,
@@ -249,173 +252,340 @@ const std::vector<LinkageInputs<float, int>> linkage_inputsf2 = {
   // Test n_points == 100
   {100,
    10,
-   {6.26168372e-01, 9.30437651e-01, 6.02450208e-01, 2.73025296e-01, 9.53050619e-01, 3.32164396e-01,
-    6.88942598e-01, 5.79163537e-01, 6.70341547e-01, 2.70140602e-02, 9.30429671e-01, 7.17721157e-01,
-    9.89948537e-01, 7.75253347e-01, 1.34491522e-02, 2.48522428e-02, 3.51413378e-01, 7.64405834e-01,
-    7.86373507e-01, 7.18748577e-01, 8.66998621e-01, 6.80316582e-01, 2.51288712e-01, 4.91078420e-01,
-    3.76246281e-01, 4.86828710e-01, 5.67464772e-01, 5.30734742e-01, 8.99478296e-01, 7.66699088e-01,
-    9.49339111e-01, 3.55248484e-01, 9.06046929e-01, 4.48407772e-01, 6.96395305e-01, 2.44277335e-01,
-    7.74840000e-01, 5.21046603e-01, 4.66423971e-02, 5.12019638e-02, 8.95019614e-01, 5.28956953e-01,
-    4.31536306e-01, 5.83857744e-01, 4.41787364e-01, 4.68656523e-01, 5.73971433e-01, 6.79989654e-01,
-    3.19650588e-01, 6.12579596e-01, 6.49126442e-02, 8.39131142e-01, 2.85252117e-01, 5.84848929e-01,
-    9.46507115e-01, 8.58440748e-01, 3.61528940e-01, 2.44215959e-01, 3.80101125e-01, 4.57128957e-02,
-    8.82216988e-01, 8.31498633e-01, 7.23474381e-01, 7.75788607e-01, 1.40864146e-01, 6.62092382e-01,
-    5.13985168e-01, 3.00686418e-01, 8.70109949e-01, 2.43187753e-01, 2.89391938e-01, 2.84214238e-01,
-    8.70985521e-01, 8.77491176e-01, 6.72537226e-01, 3.30929686e-01, 1.85934324e-01, 9.16222614e-01,
-    6.18239142e-01, 2.64768597e-01, 5.76145451e-01, 8.62961369e-01, 6.84757925e-01, 7.60549082e-01,
-    1.27645356e-01, 4.51004673e-01, 3.92292980e-01, 4.63170803e-01, 4.35449330e-02, 2.17583404e-01,
-    5.71832605e-02, 2.06763039e-01, 3.70116249e-01, 2.09750028e-01, 6.17283019e-01, 8.62549231e-01,
-    9.84156240e-02, 2.66249156e-01, 3.87635103e-01, 2.85591012e-02, 4.24826068e-01, 4.45795088e-01,
-    6.86227676e-01, 1.08848960e-01, 5.96731841e-02, 3.71770228e-01, 1.91548833e-01, 6.95136078e-01,
-    9.00700636e-01, 8.76363105e-01, 2.67334632e-01, 1.80619709e-01, 7.94060419e-01, 1.42854171e-02,
-    1.09372387e-01, 8.74028108e-01, 6.46403232e-01, 4.86588834e-01, 5.93446175e-02, 6.11886291e-01,
-    8.83865057e-01, 3.15879821e-01, 2.27043992e-01, 9.76764951e-01, 6.15620336e-01, 9.76199360e-01,
-    2.40548962e-01, 3.21795663e-01, 8.75087904e-02, 8.11234663e-01, 6.96070480e-01, 8.12062321e-01,
-    1.21958818e-01, 3.44348628e-02, 8.72630414e-01, 3.06162776e-01, 1.76043529e-02, 9.45894971e-01,
-    5.33896401e-01, 6.21642973e-01, 4.93062535e-01, 4.48984262e-01, 2.24560379e-01, 4.24052195e-02,
-    4.43447610e-01, 8.95646149e-01, 6.05220676e-01, 1.81840491e-01, 9.70831206e-01, 2.12563586e-02,
-    6.92582693e-01, 7.55946922e-01, 7.95086143e-01, 6.05328941e-01, 3.99350764e-01, 4.32846636e-01,
-    9.81114529e-01, 4.98266428e-01, 6.37127930e-03, 1.59085889e-01, 6.34682067e-05, 5.59429440e-01,
-    7.38827633e-01, 8.93214770e-01, 2.16494306e-01, 9.35430573e-02, 4.75665868e-02, 7.80503518e-01,
-    7.86240041e-01, 7.06854594e-01, 2.13725879e-02, 7.68246091e-01, 4.50234808e-01, 5.21231104e-01,
-    5.01989826e-03, 4.22081572e-02, 1.65337732e-01, 8.54134740e-01, 4.99430262e-01, 8.94525601e-01,
-    1.14028379e-01, 3.69739861e-01, 1.32955599e-01, 2.65563824e-01, 2.52811151e-01, 1.44792843e-01,
-    6.88449594e-01, 4.44921417e-01, 8.23296587e-01, 1.93266317e-01, 1.19033309e-01, 1.36368966e-01,
-    3.42600285e-01, 5.64505195e-01, 5.57594559e-01, 7.44257892e-01, 8.38231569e-02, 4.11548847e-01,
-    3.21010077e-01, 8.55081359e-01, 4.30105779e-01, 1.16229135e-01, 9.87731964e-02, 3.14712335e-01,
-    4.50880592e-01, 2.72289598e-01, 6.31615256e-01, 8.97432958e-01, 4.44764250e-01, 8.03776440e-01,
-    2.68767748e-02, 2.43374608e-01, 4.02141103e-01, 4.98881209e-01, 5.33173003e-01, 8.82890436e-01,
-    7.16149148e-01, 4.19664401e-01, 2.29335357e-01, 2.88637806e-01, 3.44696803e-01, 6.78171906e-01,
-    5.69849716e-01, 5.86454477e-01, 3.54474989e-01, 9.03876540e-01, 6.45980000e-01, 6.34887593e-01,
-    7.88039746e-02, 2.04814126e-01, 7.82251754e-01, 2.43147074e-01, 7.50951808e-01, 1.72799092e-02,
-    2.95349590e-01, 6.57991826e-01, 8.81214312e-01, 5.73970708e-01, 2.77610881e-01, 1.82155097e-01,
-    7.69797417e-02, 6.44792402e-01, 9.46950998e-01, 7.73064845e-01, 6.04733624e-01, 5.80094567e-01,
-    1.67498426e-01, 2.66514296e-01, 6.50140368e-01, 1.91170299e-01, 2.08752199e-01, 3.01664091e-01,
-    9.85033484e-01, 2.92909152e-01, 8.65816607e-01, 1.85222119e-01, 2.28814559e-01, 1.34286382e-02,
-    2.89234322e-01, 8.18668708e-01, 4.71706924e-01, 9.23199803e-01, 2.80879188e-01, 1.47319284e-01,
-    4.13915748e-01, 9.31274932e-02, 6.66322195e-01, 9.66953974e-01, 3.19405786e-01, 6.69486551e-01,
-    5.03096313e-02, 6.95225201e-01, 5.78469859e-01, 6.29481655e-01, 1.39252534e-01, 1.22564968e-01,
-    6.80663678e-01, 6.34607157e-01, 6.42765834e-01, 1.57127410e-02, 2.92132086e-01, 5.24423878e-01,
-    4.68676824e-01, 2.86003928e-01, 7.18608322e-01, 8.95617933e-01, 5.48844309e-01, 1.74517278e-01,
-    5.24379196e-01, 2.13526524e-01, 5.88375435e-01, 9.88560185e-01, 4.17435771e-01, 6.14438688e-01,
-    9.53760881e-01, 5.27151288e-01, 7.03017278e-01, 3.44448559e-01, 4.47059676e-01, 2.83414901e-01,
-    1.98979011e-01, 4.24917361e-01, 5.73172761e-01, 2.32398853e-02, 1.65887230e-01, 4.05552785e-01,
-    9.29665524e-01, 2.26135696e-01, 9.20563384e-01, 7.65259963e-01, 4.54820075e-01, 8.97710267e-01,
-    3.78559302e-03, 9.15219382e-01, 3.55705698e-01, 6.94905124e-01, 8.58540202e-01, 3.89790666e-01,
-    2.49478206e-01, 7.93679304e-01, 4.75830027e-01, 4.40425353e-01, 3.70579459e-01, 1.40578049e-01,
-    1.70386675e-01, 7.04056121e-01, 4.85963102e-01, 9.68450060e-01, 6.77178001e-01, 2.65934654e-01,
-    2.58915007e-01, 6.70052890e-01, 2.61945109e-01, 8.46207759e-01, 1.01928951e-01, 2.85611334e-01,
-    2.45776933e-01, 2.66658783e-01, 3.71724077e-01, 4.34319025e-01, 4.24407347e-01, 7.15417683e-01,
-    8.07997684e-01, 1.64296275e-01, 6.01638065e-01, 8.60606804e-02, 2.68719187e-01, 5.11764101e-01,
-    9.75844338e-01, 7.81226782e-01, 2.20925515e-01, 7.18135040e-01, 9.82395577e-01, 8.39160243e-01,
-    9.08058083e-01, 6.88010677e-01, 8.14271847e-01, 5.12460821e-01, 1.17311345e-01, 5.96075228e-01,
-    9.17455497e-01, 2.12052706e-01, 7.04074603e-01, 8.72872565e-02, 8.76047818e-01, 6.96235046e-01,
-    8.54801557e-01, 2.49729159e-01, 9.76594604e-01, 2.87386363e-01, 2.36461559e-02, 9.94075254e-01,
-    4.25193986e-01, 7.61869994e-01, 5.13334255e-01, 6.44711165e-02, 8.92156689e-01, 3.55235167e-01,
-    1.08154647e-01, 8.78446825e-01, 2.43833016e-01, 9.23071293e-01, 2.72724115e-01, 9.46631338e-01,
-    3.74510294e-01, 4.08451278e-02, 9.78392777e-01, 3.65079221e-01, 6.37199516e-01, 5.51144906e-01,
-    5.25978080e-01, 1.42803678e-01, 4.05451674e-01, 7.79788219e-01, 6.26009784e-01, 3.35249497e-01,
-    1.43159543e-02, 1.80363779e-01, 5.05096904e-01, 2.82619947e-01, 5.83561392e-01, 3.10951324e-01,
-    8.73223968e-01, 4.38545619e-01, 4.81348800e-01, 6.68497085e-01, 3.79345401e-01, 9.58832501e-01,
-    1.89869550e-01, 2.34083070e-01, 2.94066207e-01, 5.74892667e-02, 6.92106828e-02, 9.61127686e-02,
-    6.72650672e-02, 8.47345378e-01, 2.80916761e-01, 7.32177357e-03, 9.80785961e-01, 5.73192225e-02,
-    8.48781331e-01, 8.83225408e-01, 7.34398275e-01, 7.70381941e-01, 6.20778343e-01, 8.96822048e-01,
-    5.40732486e-01, 3.69704071e-01, 5.77305837e-01, 2.08221827e-01, 7.34275341e-01, 1.06110900e-01,
-    3.49496706e-01, 8.34948910e-01, 1.56403291e-02, 6.78576376e-01, 8.96141268e-01, 5.94835119e-01,
-    1.43943153e-01, 3.49618530e-01, 2.10440392e-01, 3.46585620e-01, 1.05153093e-01, 3.45446174e-01,
-    2.72177079e-01, 7.07946300e-01, 4.33717726e-02, 3.31232203e-01, 3.91874320e-01, 4.76338141e-01,
-    6.22777789e-01, 2.95989228e-02, 4.32855769e-01, 7.61049310e-01, 3.63279149e-01, 9.47210350e-01,
-    6.43721247e-01, 6.58025802e-01, 1.05247633e-02, 5.29974442e-01, 7.30675767e-01, 4.30041079e-01,
-    6.62634841e-01, 8.25936616e-01, 9.91253704e-01, 6.79399281e-01, 5.44177006e-01, 7.52876048e-01,
-    3.32139049e-01, 7.98732398e-01, 7.38865223e-01, 9.16055132e-01, 6.11736493e-01, 9.63672879e-01,
-    1.83778839e-01, 7.27558919e-02, 5.91602822e-01, 3.25235484e-01, 2.34741217e-01, 9.52346277e-01,
-    9.18556407e-01, 9.35373324e-01, 6.89209070e-01, 2.56049054e-01, 6.17975395e-01, 7.82285691e-01,
-    9.84983432e-01, 6.62322741e-01, 2.04144457e-01, 3.98446577e-01, 1.38918297e-01, 3.05919921e-01,
-    3.14043787e-01, 5.91072666e-01, 7.44703771e-01, 8.92272567e-01, 9.78017873e-01, 9.01203161e-01,
-    1.41526372e-01, 4.14878484e-01, 6.80683651e-01, 5.01733152e-02, 8.14635389e-01, 2.27926375e-01,
-    9.03269815e-01, 8.68443745e-01, 9.86939190e-01, 7.40779486e-01, 2.61005311e-01, 3.19276232e-01,
-    9.69509248e-01, 1.11908818e-01, 4.49198556e-01, 1.27056715e-01, 3.84064823e-01, 5.14591811e-01,
-    2.10747488e-01, 9.53884090e-01, 8.43167950e-01, 4.51187972e-01, 3.75331782e-01, 6.23566461e-01,
-    3.55290379e-01, 2.95705968e-01, 1.69622690e-01, 1.42981830e-01, 2.72180991e-01, 9.46468040e-01,
-    3.70932500e-01, 9.94292830e-01, 4.62587505e-01, 7.14817405e-01, 2.45370540e-02, 3.00906377e-01,
-    5.75768304e-01, 9.71448393e-01, 6.95574827e-02, 3.93693854e-01, 5.29306116e-01, 5.04694554e-01,
-    6.73797120e-02, 6.76596969e-01, 5.50948898e-01, 3.24909641e-01, 7.70337719e-01, 6.51842631e-03,
-    3.03264879e-01, 7.61037886e-03, 2.72289601e-01, 1.50502041e-01, 6.71103888e-02, 7.41503703e-01,
-    1.92088941e-01, 2.19043977e-01, 9.09320161e-01, 2.37993569e-01, 6.18107973e-02, 8.31447852e-01,
-    2.23355609e-01, 1.84789435e-01, 4.16104518e-01, 4.21573859e-01, 8.72446305e-02, 2.97294197e-01,
-    4.50328256e-01, 8.72199917e-01, 2.51279916e-01, 4.86219272e-01, 7.57071329e-01, 4.85655942e-01,
-    1.06187277e-01, 4.92341327e-01, 1.46017513e-01, 5.25421017e-01, 4.22637906e-01, 2.24685018e-01,
-    8.72648431e-01, 5.54051490e-01, 1.80745062e-01, 2.12756336e-01, 5.20883169e-01, 7.60363654e-01,
-    8.30254678e-01, 5.00003328e-01, 4.69017439e-01, 6.38105527e-01, 3.50638261e-02, 5.22217353e-02,
-    9.06516882e-02, 8.52975842e-01, 1.19985883e-01, 3.74926753e-01, 6.50302066e-01, 1.98875727e-01,
-    6.28362507e-02, 4.32693501e-01, 3.10500685e-01, 6.20732833e-01, 4.58503272e-01, 3.20790034e-01,
-    7.91284868e-01, 7.93054570e-01, 2.93406765e-01, 8.95399023e-01, 1.06441034e-01, 7.53085241e-02,
-    8.67523104e-01, 1.47963482e-01, 1.25584706e-01, 3.81545040e-02, 6.34338619e-01, 1.76368938e-02,
-    5.75553531e-02, 5.31607516e-01, 2.63869588e-01, 9.41945823e-01, 9.24028838e-02, 5.21496463e-01,
-    7.74866558e-01, 5.65210610e-01, 7.28015327e-02, 6.51963790e-01, 8.94727453e-01, 4.49571590e-01,
-    1.29932405e-01, 8.64026259e-01, 9.92599934e-01, 7.43721560e-01, 8.87300215e-01, 1.06369925e-01,
-    8.11335531e-01, 7.87734900e-01, 9.87344678e-01, 5.32502820e-01, 4.42612382e-01, 9.64041183e-01,
-    1.66085871e-01, 1.12937664e-01, 5.24423470e-01, 6.54689333e-01, 4.59119726e-01, 5.22774091e-01,
-    3.08722276e-02, 6.26979315e-01, 4.49754105e-01, 8.07495757e-01, 2.34199499e-01, 1.67765675e-01,
-    9.22168418e-01, 3.73210378e-01, 8.04432575e-01, 5.61890354e-01, 4.47025593e-01, 6.43155678e-01,
-    2.40407640e-01, 5.91631279e-01, 1.59369206e-01, 7.75799090e-01, 8.32067212e-01, 5.59791576e-02,
-    6.39105224e-01, 4.85274738e-01, 2.12630838e-01, 2.81431312e-02, 7.16205363e-01, 6.83885011e-01,
-    5.23869697e-01, 9.99418314e-01, 8.35331599e-01, 4.69877463e-02, 6.74712562e-01, 7.99273684e-01,
-    2.77001890e-02, 5.75809742e-01, 2.78513031e-01, 8.36209905e-01, 7.25472379e-01, 4.87173943e-01,
-    7.88311357e-01, 9.64676177e-01, 1.75752651e-01, 4.98112580e-01, 8.08850418e-02, 6.40981131e-01,
-    4.06647450e-01, 8.46539387e-01, 2.12620694e-01, 9.11012851e-01, 8.25041445e-01, 8.90065575e-01,
-    9.63626055e-01, 5.96689242e-01, 1.63372670e-01, 4.51640148e-01, 3.43026542e-01, 5.80658851e-01,
-    2.82327625e-01, 4.75535418e-01, 6.27760926e-01, 8.46314115e-01, 9.61961932e-01, 3.19806094e-01,
-    5.05508062e-01, 5.28102944e-01, 6.13045057e-01, 7.44714938e-01, 1.50586073e-01, 7.91878033e-01,
-    4.89839179e-01, 3.10496849e-01, 8.82309038e-01, 2.86922314e-01, 4.84687559e-01, 5.20838630e-01,
-    4.62955493e-01, 2.38185305e-01, 5.47259907e-02, 7.10916137e-01, 7.31887202e-01, 6.25602317e-01,
-    8.77741168e-01, 4.19881322e-01, 4.81222328e-01, 1.28224501e-01, 2.46034010e-01, 3.34971854e-01,
-    7.37216484e-01, 5.62134821e-02, 7.14089724e-01, 9.85549393e-01, 4.66295827e-01, 3.08722434e-03,
-    4.70237690e-01, 2.66524167e-01, 7.93875484e-01, 4.54795911e-02, 8.09702944e-01, 1.47709735e-02,
-    1.70082405e-01, 6.35905179e-01, 3.75379109e-01, 4.30315011e-01, 3.15788760e-01, 5.58065230e-01,
-    2.24643800e-01, 2.42142981e-01, 6.57283636e-01, 3.34921891e-01, 1.26588975e-01, 7.68064155e-01,
-    9.43856291e-01, 4.47518596e-01, 5.44453573e-01, 9.95764932e-01, 7.16444391e-01, 8.51019765e-01,
-    1.01179183e-01, 4.45473958e-01, 4.60327322e-01, 4.96895844e-02, 4.72907738e-01, 5.58987444e-01,
-    3.41027487e-01, 1.56175026e-01, 7.58283148e-01, 6.83600909e-01, 2.14623396e-01, 3.27348880e-01,
-    3.92517893e-01, 6.70418431e-01, 5.16440832e-01, 8.63140348e-01, 5.73277464e-01, 3.46608058e-01,
-    7.39396341e-01, 7.20852434e-01, 2.35653246e-02, 3.89935659e-01, 7.53783745e-01, 6.34563528e-01,
-    8.79339335e-01, 7.41599159e-02, 5.62433904e-01, 6.15553852e-01, 4.56956324e-01, 5.20047447e-01,
-    5.26845015e-02, 5.58471266e-01, 1.63632233e-01, 5.38936665e-02, 6.49593683e-01, 2.56838748e-01,
-    8.99035326e-01, 7.20847756e-01, 5.68954684e-01, 7.43684755e-01, 5.70924238e-01, 3.82318724e-01,
-    4.89328290e-01, 5.62208561e-01, 4.97540804e-02, 4.18011085e-01, 6.88041565e-01, 2.16234653e-01,
-    7.89548214e-01, 8.46136387e-01, 8.46816189e-01, 1.73842353e-01, 6.11627842e-02, 8.44440559e-01,
-    4.50646654e-01, 3.74785037e-01, 4.87196697e-01, 4.56276448e-01, 9.13284391e-01, 4.15715464e-01,
-    7.13597697e-01, 1.23641270e-02, 5.10031271e-01, 4.74601930e-02, 2.55731159e-01, 3.22090006e-01,
-    1.91165703e-01, 4.51170940e-01, 7.50843157e-01, 4.42420576e-01, 4.25380660e-01, 4.50667257e-01,
-    6.55689206e-01, 9.68257670e-02, 1.96528793e-01, 8.97343028e-01, 4.99940904e-01, 6.65504083e-01,
-    9.41828079e-01, 4.54397338e-01, 5.61893331e-01, 5.09839880e-01, 4.53117514e-01, 8.96804127e-02,
-    1.74888861e-01, 6.65641378e-01, 2.81668336e-01, 1.89532742e-01, 5.61668382e-01, 8.68330157e-02,
-    8.25092797e-01, 5.18106324e-01, 1.71904024e-01, 3.68385523e-01, 1.62005436e-01, 7.48507399e-01,
-    9.30274827e-01, 2.38198517e-01, 9.52222901e-01, 5.23587800e-01, 6.94384557e-01, 1.09338652e-01,
-    4.83356794e-01, 2.73050402e-01, 3.68027050e-01, 5.92366466e-01, 1.83192289e-01, 8.60376029e-01,
-    7.13926203e-01, 8.16750052e-01, 1.57890291e-01, 6.25691951e-01, 5.24831646e-01, 1.73873797e-01,
-    1.02429784e-01, 9.17488471e-01, 4.03584434e-01, 9.31170884e-01, 2.79386137e-01, 8.77745206e-01,
-    2.45200576e-01, 1.28896951e-01, 3.15713052e-01, 5.27874291e-01, 2.16444335e-01, 7.03883817e-01,
-    7.74738919e-02, 8.42422142e-01, 3.75598924e-01, 3.51002411e-01, 6.22752776e-01, 4.82407943e-01,
-    7.43107867e-01, 9.46182666e-01, 9.44344819e-01, 3.28124763e-01, 1.06147431e-01, 1.65102684e-01,
-    3.84060507e-01, 2.91057722e-01, 7.68173662e-02, 1.03543651e-01, 6.76698940e-01, 1.43141994e-01,
-    7.21342202e-01, 6.69471294e-03, 9.07298311e-01, 5.57080171e-01, 8.10954489e-01, 4.11120526e-01,
-    2.06407453e-01, 2.59590556e-01, 7.58512718e-01, 5.79873897e-01, 2.92875650e-01, 2.83686529e-01,
-    2.42829343e-01, 9.19323719e-01, 3.46832864e-01, 3.58238858e-01, 7.42827585e-01, 2.05760059e-01,
-    9.58438860e-01, 5.66326411e-01, 6.60292846e-01, 5.61095078e-02, 6.79465531e-01, 7.05118513e-01,
-    4.44713264e-01, 2.09732933e-01, 5.22732436e-01, 1.74396512e-01, 5.29356748e-01, 4.38475687e-01,
-    4.94036404e-01, 4.09785794e-01, 6.40025507e-01, 5.79371821e-01, 1.57726118e-01, 6.04572263e-01,
-    5.41072639e-01, 5.18847173e-01, 1.97093284e-01, 8.91767002e-01, 4.29050835e-01, 8.25490570e-01,
-    3.87699807e-01, 4.50705808e-01, 2.49371643e-01, 3.36074898e-01, 9.29925118e-01, 6.65393649e-01,
-    9.07275994e-01, 3.73075859e-01, 4.14044139e-03, 2.37463702e-01, 2.25893784e-01, 2.46900245e-01,
-    4.50350196e-01, 3.48618117e-01, 5.07193932e-01, 5.23435142e-01, 8.13611417e-01, 8.92715622e-01,
-    1.02623450e-01, 3.06088345e-01, 7.80461650e-01, 2.21453645e-01, 2.01419652e-01, 2.84254457e-01,
-    3.68286735e-01, 7.39358243e-01, 8.97879394e-01, 9.81599566e-01, 7.56526442e-01, 7.37645545e-01,
-    4.23976657e-02, 8.25922012e-01, 2.60956996e-01, 2.90702065e-01, 8.98388344e-01, 3.03733299e-01,
-    8.49071471e-01, 3.45835425e-01, 7.65458276e-01, 5.68094872e-01, 8.93770930e-01, 9.93161641e-01,
-    5.63368667e-02, 4.26548945e-01, 5.46745780e-01, 5.75674571e-01, 7.94599487e-01, 7.18935553e-02,
-    4.46492976e-01, 6.40240123e-01, 2.73246969e-01, 2.00465968e-01, 1.30718835e-01, 1.92492005e-01,
-    1.96617189e-01, 6.61271644e-01, 8.12687657e-01, 8.66342445e-01
+   {6.26168372e-01, 9.30437651e-01, 6.02450208e-01,
+    2.73025296e-01, 9.53050619e-01, 3.32164396e-01,
+    6.88942598e-01, 5.79163537e-01, 6.70341547e-01,
+    2.70140602e-02, 9.30429671e-01, 7.17721157e-01,
+    9.89948537e-01, 7.75253347e-01, 1.34491522e-02,
+    2.48522428e-02, 3.51413378e-01, 7.64405834e-01,
+    7.86373507e-01, 7.18748577e-01, 8.66998621e-01,
+    6.80316582e-01, 2.51288712e-01, 4.91078420e-01,
+    3.76246281e-01, 4.86828710e-01, 5.67464772e-01,
+    5.30734742e-01, 8.99478296e-01, 7.66699088e-01,
+    9.49339111e-01, 3.55248484e-01, 9.06046929e-01,
+    4.48407772e-01, 6.96395305e-01, 2.44277335e-01,
+    7.74840000e-01, 5.21046603e-01, 4.66423971e-02,
+    5.12019638e-02, 8.95019614e-01, 5.28956953e-01,
+    4.31536306e-01, 5.83857744e-01, 4.41787364e-01,
+    4.68656523e-01, 5.73971433e-01, 6.79989654e-01,
+    3.19650588e-01, 6.12579596e-01, 6.49126442e-02,
+    8.39131142e-01, 2.85252117e-01, 5.84848929e-01,
+    9.46507115e-01, 8.58440748e-01, 3.61528940e-01,
+    2.44215959e-01, 3.80101125e-01, 4.57128957e-02,
+    8.82216988e-01, 8.31498633e-01, 7.23474381e-01,
+    7.75788607e-01, 1.40864146e-01, 6.62092382e-01,
+    5.13985168e-01, 3.00686418e-01, 8.70109949e-01,
+    2.43187753e-01, 2.89391938e-01, 2.84214238e-01,
+    8.70985521e-01, 8.77491176e-01, 6.72537226e-01,
+    3.30929686e-01, 1.85934324e-01, 9.16222614e-01,
+    6.18239142e-01, 2.64768597e-01, 5.76145451e-01,
+    8.62961369e-01, 6.84757925e-01, 7.60549082e-01,
+    1.27645356e-01, 4.51004673e-01, 3.92292980e-01,
+    4.63170803e-01, 4.35449330e-02, 2.17583404e-01,
+    5.71832605e-02, 2.06763039e-01, 3.70116249e-01,
+    2.09750028e-01, 6.17283019e-01, 8.62549231e-01,
+    9.84156240e-02, 2.66249156e-01, 3.87635103e-01,
+    2.85591012e-02, 4.24826068e-01, 4.45795088e-01,
+    6.86227676e-01, 1.08848960e-01, 5.96731841e-02,
+    3.71770228e-01, 1.91548833e-01, 6.95136078e-01,
+    9.00700636e-01, 8.76363105e-01, 2.67334632e-01,
+    1.80619709e-01, 7.94060419e-01, 1.42854171e-02,
+    1.09372387e-01, 8.74028108e-01, 6.46403232e-01,
+    4.86588834e-01, 5.93446175e-02, 6.11886291e-01,
+    8.83865057e-01, 3.15879821e-01, 2.27043992e-01,
+    9.76764951e-01, 6.15620336e-01, 9.76199360e-01,
+    2.40548962e-01, 3.21795663e-01, 8.75087904e-02,
+    8.11234663e-01, 6.96070480e-01, 8.12062321e-01,
+    1.21958818e-01, 3.44348628e-02, 8.72630414e-01,
+    3.06162776e-01, 1.76043529e-02, 9.45894971e-01,
+    5.33896401e-01, 6.21642973e-01, 4.93062535e-01,
+    4.48984262e-01, 2.24560379e-01, 4.24052195e-02,
+    4.43447610e-01, 8.95646149e-01, 6.05220676e-01,
+    1.81840491e-01, 9.70831206e-01, 2.12563586e-02,
+    6.92582693e-01, 7.55946922e-01, 7.95086143e-01,
+    6.05328941e-01, 3.99350764e-01, 4.32846636e-01,
+    9.81114529e-01, 4.98266428e-01, 6.37127930e-03,
+    1.59085889e-01, 6.34682067e-05, 5.59429440e-01,
+    7.38827633e-01, 8.93214770e-01, 2.16494306e-01,
+    9.35430573e-02, 4.75665868e-02, 7.80503518e-01,
+    7.86240041e-01, 7.06854594e-01, 2.13725879e-02,
+    7.68246091e-01, 4.50234808e-01, 5.21231104e-01,
+    5.01989826e-03, 4.22081572e-02, 1.65337732e-01,
+    8.54134740e-01, 4.99430262e-01, 8.94525601e-01,
+    1.14028379e-01, 3.69739861e-01, 1.32955599e-01,
+    2.65563824e-01, 2.52811151e-01, 1.44792843e-01,
+    6.88449594e-01, 4.44921417e-01, 8.23296587e-01,
+    1.93266317e-01, 1.19033309e-01, 1.36368966e-01,
+    3.42600285e-01, 5.64505195e-01, 5.57594559e-01,
+    7.44257892e-01, 8.38231569e-02, 4.11548847e-01,
+    3.21010077e-01, 8.55081359e-01, 4.30105779e-01,
+    1.16229135e-01, 9.87731964e-02, 3.14712335e-01,
+    4.50880592e-01, 2.72289598e-01, 6.31615256e-01,
+    8.97432958e-01, 4.44764250e-01, 8.03776440e-01,
+    2.68767748e-02, 2.43374608e-01, 4.02141103e-01,
+    4.98881209e-01, 5.33173003e-01, 8.82890436e-01,
+    7.16149148e-01, 4.19664401e-01, 2.29335357e-01,
+    2.88637806e-01, 3.44696803e-01, 6.78171906e-01,
+    5.69849716e-01, 5.86454477e-01, 3.54474989e-01,
+    9.03876540e-01, 6.45980000e-01, 6.34887593e-01,
+    7.88039746e-02, 2.04814126e-01, 7.82251754e-01,
+    2.43147074e-01, 7.50951808e-01, 1.72799092e-02,
+    2.95349590e-01, 6.57991826e-01, 8.81214312e-01,
+    5.73970708e-01, 2.77610881e-01, 1.82155097e-01,
+    7.69797417e-02, 6.44792402e-01, 9.46950998e-01,
+    7.73064845e-01, 6.04733624e-01, 5.80094567e-01,
+    1.67498426e-01, 2.66514296e-01, 6.50140368e-01,
+    1.91170299e-01, 2.08752199e-01, 3.01664091e-01,
+    9.85033484e-01, 2.92909152e-01, 8.65816607e-01,
+    1.85222119e-01, 2.28814559e-01, 1.34286382e-02,
+    2.89234322e-01, 8.18668708e-01, 4.71706924e-01,
+    9.23199803e-01, 2.80879188e-01, 1.47319284e-01,
+    4.13915748e-01, 9.31274932e-02, 6.66322195e-01,
+    9.66953974e-01, 3.19405786e-01, 6.69486551e-01,
+    5.03096313e-02, 6.95225201e-01, 5.78469859e-01,
+    6.29481655e-01, 1.39252534e-01, 1.22564968e-01,
+    6.80663678e-01, 6.34607157e-01, 6.42765834e-01,
+    1.57127410e-02, 2.92132086e-01, 5.24423878e-01,
+    4.68676824e-01, 2.86003928e-01, 7.18608322e-01,
+    8.95617933e-01, 5.48844309e-01, 1.74517278e-01,
+    5.24379196e-01, 2.13526524e-01, 5.88375435e-01,
+    9.88560185e-01, 4.17435771e-01, 6.14438688e-01,
+    9.53760881e-01, 5.27151288e-01, 7.03017278e-01,
+    3.44448559e-01, 4.47059676e-01, 2.83414901e-01,
+    1.98979011e-01, 4.24917361e-01, 5.73172761e-01,
+    2.32398853e-02, 1.65887230e-01, 4.05552785e-01,
+    9.29665524e-01, 2.26135696e-01, 9.20563384e-01,
+    7.65259963e-01, 4.54820075e-01, 8.97710267e-01,
+    3.78559302e-03, 9.15219382e-01, 3.55705698e-01,
+    6.94905124e-01, 8.58540202e-01, 3.89790666e-01,
+    2.49478206e-01, 7.93679304e-01, 4.75830027e-01,
+    4.40425353e-01, 3.70579459e-01, 1.40578049e-01,
+    1.70386675e-01, 7.04056121e-01, 4.85963102e-01,
+    9.68450060e-01, 6.77178001e-01, 2.65934654e-01,
+    2.58915007e-01, 6.70052890e-01, 2.61945109e-01,
+    8.46207759e-01, 1.01928951e-01, 2.85611334e-01,
+    2.45776933e-01, 2.66658783e-01, 3.71724077e-01,
+    4.34319025e-01, 4.24407347e-01, 7.15417683e-01,
+    8.07997684e-01, 1.64296275e-01, 6.01638065e-01,
+    8.60606804e-02, 2.68719187e-01, 5.11764101e-01,
+    9.75844338e-01, 7.81226782e-01, 2.20925515e-01,
+    7.18135040e-01, 9.82395577e-01, 8.39160243e-01,
+    9.08058083e-01, 6.88010677e-01, 8.14271847e-01,
+    5.12460821e-01, 1.17311345e-01, 5.96075228e-01,
+    9.17455497e-01, 2.12052706e-01, 7.04074603e-01,
+    8.72872565e-02, 8.76047818e-01, 6.96235046e-01,
+    8.54801557e-01, 2.49729159e-01, 9.76594604e-01,
+    2.87386363e-01, 2.36461559e-02, 9.94075254e-01,
+    4.25193986e-01, 7.61869994e-01, 5.13334255e-01,
+    6.44711165e-02, 8.92156689e-01, 3.55235167e-01,
+    1.08154647e-01, 8.78446825e-01, 2.43833016e-01,
+    9.23071293e-01, 2.72724115e-01, 9.46631338e-01,
+    3.74510294e-01, 4.08451278e-02, 9.78392777e-01,
+    3.65079221e-01, 6.37199516e-01, 5.51144906e-01,
+    5.25978080e-01, 1.42803678e-01, 4.05451674e-01,
+    7.79788219e-01, 6.26009784e-01, 3.35249497e-01,
+    1.43159543e-02, 1.80363779e-01, 5.05096904e-01,
+    2.82619947e-01, 5.83561392e-01, 3.10951324e-01,
+    8.73223968e-01, 4.38545619e-01, 4.81348800e-01,
+    6.68497085e-01, 3.79345401e-01, 9.58832501e-01,
+    1.89869550e-01, 2.34083070e-01, 2.94066207e-01,
+    5.74892667e-02, 6.92106828e-02, 9.61127686e-02,
+    6.72650672e-02, 8.47345378e-01, 2.80916761e-01,
+    7.32177357e-03, 9.80785961e-01, 5.73192225e-02,
+    8.48781331e-01, 8.83225408e-01, 7.34398275e-01,
+    7.70381941e-01, 6.20778343e-01, 8.96822048e-01,
+    5.40732486e-01, 3.69704071e-01, 5.77305837e-01,
+    2.08221827e-01, 7.34275341e-01, 1.06110900e-01,
+    3.49496706e-01, 8.34948910e-01, 1.56403291e-02,
+    6.78576376e-01, 8.96141268e-01, 5.94835119e-01,
+    1.43943153e-01, 3.49618530e-01, 2.10440392e-01,
+    3.46585620e-01, 1.05153093e-01, 3.45446174e-01,
+    2.72177079e-01, 7.07946300e-01, 4.33717726e-02,
+    3.31232203e-01, 3.91874320e-01, 4.76338141e-01,
+    6.22777789e-01, 2.95989228e-02, 4.32855769e-01,
+    7.61049310e-01, 3.63279149e-01, 9.47210350e-01,
+    6.43721247e-01, 6.58025802e-01, 1.05247633e-02,
+    5.29974442e-01, 7.30675767e-01, 4.30041079e-01,
+    6.62634841e-01, 8.25936616e-01, 9.91253704e-01,
+    6.79399281e-01, 5.44177006e-01, 7.52876048e-01,
+    3.32139049e-01, 7.98732398e-01, 7.38865223e-01,
+    9.16055132e-01, 6.11736493e-01, 9.63672879e-01,
+    1.83778839e-01, 7.27558919e-02, 5.91602822e-01,
+    3.25235484e-01, 2.34741217e-01, 9.52346277e-01,
+    9.18556407e-01, 9.35373324e-01, 6.89209070e-01,
+    2.56049054e-01, 6.17975395e-01, 7.82285691e-01,
+    9.84983432e-01, 6.62322741e-01, 2.04144457e-01,
+    3.98446577e-01, 1.38918297e-01, 3.05919921e-01,
+    3.14043787e-01, 5.91072666e-01, 7.44703771e-01,
+    8.92272567e-01, 9.78017873e-01, 9.01203161e-01,
+    1.41526372e-01, 4.14878484e-01, 6.80683651e-01,
+    5.01733152e-02, 8.14635389e-01, 2.27926375e-01,
+    9.03269815e-01, 8.68443745e-01, 9.86939190e-01,
+    7.40779486e-01, 2.61005311e-01, 3.19276232e-01,
+    9.69509248e-01, 1.11908818e-01, 4.49198556e-01,
+    1.27056715e-01, 3.84064823e-01, 5.14591811e-01,
+    2.10747488e-01, 9.53884090e-01, 8.43167950e-01,
+    4.51187972e-01, 3.75331782e-01, 6.23566461e-01,
+    3.55290379e-01, 2.95705968e-01, 1.69622690e-01,
+    1.42981830e-01, 2.72180991e-01, 9.46468040e-01,
+    3.70932500e-01, 9.94292830e-01, 4.62587505e-01,
+    7.14817405e-01, 2.45370540e-02, 3.00906377e-01,
+    5.75768304e-01, 9.71448393e-01, 6.95574827e-02,
+    3.93693854e-01, 5.29306116e-01, 5.04694554e-01,
+    6.73797120e-02, 6.76596969e-01, 5.50948898e-01,
+    3.24909641e-01, 7.70337719e-01, 6.51842631e-03,
+    3.03264879e-01, 7.61037886e-03, 2.72289601e-01,
+    1.50502041e-01, 6.71103888e-02, 7.41503703e-01,
+    1.92088941e-01, 2.19043977e-01, 9.09320161e-01,
+    2.37993569e-01, 6.18107973e-02, 8.31447852e-01,
+    2.23355609e-01, 1.84789435e-01, 4.16104518e-01,
+    4.21573859e-01, 8.72446305e-02, 2.97294197e-01,
+    4.50328256e-01, 8.72199917e-01, 2.51279916e-01,
+    4.86219272e-01, 7.57071329e-01, 4.85655942e-01,
+    1.06187277e-01, 4.92341327e-01, 1.46017513e-01,
+    5.25421017e-01, 4.22637906e-01, 2.24685018e-01,
+    8.72648431e-01, 5.54051490e-01, 1.80745062e-01,
+    2.12756336e-01, 5.20883169e-01, 7.60363654e-01,
+    8.30254678e-01, 5.00003328e-01, 4.69017439e-01,
+    6.38105527e-01, 3.50638261e-02, 5.22217353e-02,
+    9.06516882e-02, 8.52975842e-01, 1.19985883e-01,
+    3.74926753e-01, 6.50302066e-01, 1.98875727e-01,
+    6.28362507e-02, 4.32693501e-01, 3.10500685e-01,
+    6.20732833e-01, 4.58503272e-01, 3.20790034e-01,
+    7.91284868e-01, 7.93054570e-01, 2.93406765e-01,
+    8.95399023e-01, 1.06441034e-01, 7.53085241e-02,
+    8.67523104e-01, 1.47963482e-01, 1.25584706e-01,
+    3.81545040e-02, 6.34338619e-01, 1.76368938e-02,
+    5.75553531e-02, 5.31607516e-01, 2.63869588e-01,
+    9.41945823e-01, 9.24028838e-02, 5.21496463e-01,
+    7.74866558e-01, 5.65210610e-01, 7.28015327e-02,
+    6.51963790e-01, 8.94727453e-01, 4.49571590e-01,
+    1.29932405e-01, 8.64026259e-01, 9.92599934e-01,
+    7.43721560e-01, 8.87300215e-01, 1.06369925e-01,
+    8.11335531e-01, 7.87734900e-01, 9.87344678e-01,
+    5.32502820e-01, 4.42612382e-01, 9.64041183e-01,
+    1.66085871e-01, 1.12937664e-01, 5.24423470e-01,
+    6.54689333e-01, 4.59119726e-01, 5.22774091e-01,
+    3.08722276e-02, 6.26979315e-01, 4.49754105e-01,
+    8.07495757e-01, 2.34199499e-01, 1.67765675e-01,
+    9.22168418e-01, 3.73210378e-01, 8.04432575e-01,
+    5.61890354e-01, 4.47025593e-01, 6.43155678e-01,
+    2.40407640e-01, 5.91631279e-01, 1.59369206e-01,
+    7.75799090e-01, 8.32067212e-01, 5.59791576e-02,
+    6.39105224e-01, 4.85274738e-01, 2.12630838e-01,
+    2.81431312e-02, 7.16205363e-01, 6.83885011e-01,
+    5.23869697e-01, 9.99418314e-01, 8.35331599e-01,
+    4.69877463e-02, 6.74712562e-01, 7.99273684e-01,
+    2.77001890e-02, 5.75809742e-01, 2.78513031e-01,
+    8.36209905e-01, 7.25472379e-01, 4.87173943e-01,
+    7.88311357e-01, 9.64676177e-01, 1.75752651e-01,
+    4.98112580e-01, 8.08850418e-02, 6.40981131e-01,
+    4.06647450e-01, 8.46539387e-01, 2.12620694e-01,
+    9.11012851e-01, 8.25041445e-01, 8.90065575e-01,
+    9.63626055e-01, 5.96689242e-01, 1.63372670e-01,
+    4.51640148e-01, 3.43026542e-01, 5.80658851e-01,
+    2.82327625e-01, 4.75535418e-01, 6.27760926e-01,
+    8.46314115e-01, 9.61961932e-01, 3.19806094e-01,
+    5.05508062e-01, 5.28102944e-01, 6.13045057e-01,
+    7.44714938e-01, 1.50586073e-01, 7.91878033e-01,
+    4.89839179e-01, 3.10496849e-01, 8.82309038e-01,
+    2.86922314e-01, 4.84687559e-01, 5.20838630e-01,
+    4.62955493e-01, 2.38185305e-01, 5.47259907e-02,
+    7.10916137e-01, 7.31887202e-01, 6.25602317e-01,
+    8.77741168e-01, 4.19881322e-01, 4.81222328e-01,
+    1.28224501e-01, 2.46034010e-01, 3.34971854e-01,
+    7.37216484e-01, 5.62134821e-02, 7.14089724e-01,
+    9.85549393e-01, 4.66295827e-01, 3.08722434e-03,
+    4.70237690e-01, 2.66524167e-01, 7.93875484e-01,
+    4.54795911e-02, 8.09702944e-01, 1.47709735e-02,
+    1.70082405e-01, 6.35905179e-01, 3.75379109e-01,
+    4.30315011e-01, 3.15788760e-01, 5.58065230e-01,
+    2.24643800e-01, 2.42142981e-01, 6.57283636e-01,
+    3.34921891e-01, 1.26588975e-01, 7.68064155e-01,
+    9.43856291e-01, 4.47518596e-01, 5.44453573e-01,
+    9.95764932e-01, 7.16444391e-01, 8.51019765e-01,
+    1.01179183e-01, 4.45473958e-01, 4.60327322e-01,
+    4.96895844e-02, 4.72907738e-01, 5.58987444e-01,
+    3.41027487e-01, 1.56175026e-01, 7.58283148e-01,
+    6.83600909e-01, 2.14623396e-01, 3.27348880e-01,
+    3.92517893e-01, 6.70418431e-01, 5.16440832e-01,
+    8.63140348e-01, 5.73277464e-01, 3.46608058e-01,
+    7.39396341e-01, 7.20852434e-01, 2.35653246e-02,
+    3.89935659e-01, 7.53783745e-01, 6.34563528e-01,
+    8.79339335e-01, 7.41599159e-02, 5.62433904e-01,
+    6.15553852e-01, 4.56956324e-01, 5.20047447e-01,
+    5.26845015e-02, 5.58471266e-01, 1.63632233e-01,
+    5.38936665e-02, 6.49593683e-01, 2.56838748e-01,
+    8.99035326e-01, 7.20847756e-01, 5.68954684e-01,
+    7.43684755e-01, 5.70924238e-01, 3.82318724e-01,
+    4.89328290e-01, 5.62208561e-01, 4.97540804e-02,
+    4.18011085e-01, 6.88041565e-01, 2.16234653e-01,
+    7.89548214e-01, 8.46136387e-01, 8.46816189e-01,
+    1.73842353e-01, 6.11627842e-02, 8.44440559e-01,
+    4.50646654e-01, 3.74785037e-01, 4.87196697e-01,
+    4.56276448e-01, 9.13284391e-01, 4.15715464e-01,
+    7.13597697e-01, 1.23641270e-02, 5.10031271e-01,
+    4.74601930e-02, 2.55731159e-01, 3.22090006e-01,
+    1.91165703e-01, 4.51170940e-01, 7.50843157e-01,
+    4.42420576e-01, 4.25380660e-01, 4.50667257e-01,
+    6.55689206e-01, 9.68257670e-02, 1.96528793e-01,
+    8.97343028e-01, 4.99940904e-01, 6.65504083e-01,
+    9.41828079e-01, 4.54397338e-01, 5.61893331e-01,
+    5.09839880e-01, 4.53117514e-01, 8.96804127e-02,
+    1.74888861e-01, 6.65641378e-01, 2.81668336e-01,
+    1.89532742e-01, 5.61668382e-01, 8.68330157e-02,
+    8.25092797e-01, 5.18106324e-01, 1.71904024e-01,
+    3.68385523e-01, 1.62005436e-01, 7.48507399e-01,
+    9.30274827e-01, 2.38198517e-01, 9.52222901e-01,
+    5.23587800e-01, 6.94384557e-01, 1.09338652e-01,
+    4.83356794e-01, 2.73050402e-01, 3.68027050e-01,
+    5.92366466e-01, 1.83192289e-01, 8.60376029e-01,
+    7.13926203e-01, 8.16750052e-01, 1.57890291e-01,
+    6.25691951e-01, 5.24831646e-01, 1.73873797e-01,
+    1.02429784e-01, 9.17488471e-01, 4.03584434e-01,
+    9.31170884e-01, 2.79386137e-01, 8.77745206e-01,
+    2.45200576e-01, 1.28896951e-01, 3.15713052e-01,
+    5.27874291e-01, 2.16444335e-01, 7.03883817e-01,
+    7.74738919e-02, 8.42422142e-01, 3.75598924e-01,
+    3.51002411e-01, 6.22752776e-01, 4.82407943e-01,
+    7.43107867e-01, 9.46182666e-01, 9.44344819e-01,
+    3.28124763e-01, 1.06147431e-01, 1.65102684e-01,
+    3.84060507e-01, 2.91057722e-01, 7.68173662e-02,
+    1.03543651e-01, 6.76698940e-01, 1.43141994e-01,
+    7.21342202e-01, 6.69471294e-03, 9.07298311e-01,
+    5.57080171e-01, 8.10954489e-01, 4.11120526e-01,
+    2.06407453e-01, 2.59590556e-01, 7.58512718e-01,
+    5.79873897e-01, 2.92875650e-01, 2.83686529e-01,
+    2.42829343e-01, 9.19323719e-01, 3.46832864e-01,
+    3.58238858e-01, 7.42827585e-01, 2.05760059e-01,
+    9.58438860e-01, 5.66326411e-01, 6.60292846e-01,
+    5.61095078e-02, 6.79465531e-01, 7.05118513e-01,
+    4.44713264e-01, 2.09732933e-01, 5.22732436e-01,
+    1.74396512e-01, 5.29356748e-01, 4.38475687e-01,
+    4.94036404e-01, 4.09785794e-01, 6.40025507e-01,
+    5.79371821e-01, 1.57726118e-01, 6.04572263e-01,
+    5.41072639e-01, 5.18847173e-01, 1.97093284e-01,
+    8.91767002e-01, 4.29050835e-01, 8.25490570e-01,
+    3.87699807e-01, 4.50705808e-01, 2.49371643e-01,
+    3.36074898e-01, 9.29925118e-01, 6.65393649e-01,
+    9.07275994e-01, 3.73075859e-01, 4.14044139e-03,
+    2.37463702e-01, 2.25893784e-01, 2.46900245e-01,
+    4.50350196e-01, 3.48618117e-01, 5.07193932e-01,
+    5.23435142e-01, 8.13611417e-01, 8.92715622e-01,
+    1.02623450e-01, 3.06088345e-01, 7.80461650e-01,
+    2.21453645e-01, 2.01419652e-01, 2.84254457e-01,
+    3.68286735e-01, 7.39358243e-01, 8.97879394e-01,
+    9.81599566e-01, 7.56526442e-01, 7.37645545e-01,
+    4.23976657e-02, 8.25922012e-01, 2.60956996e-01,
+    2.90702065e-01, 8.98388344e-01, 3.03733299e-01,
+    8.49071471e-01, 3.45835425e-01, 7.65458276e-01,
+    5.68094872e-01, 8.93770930e-01, 9.93161641e-01,
+    5.63368667e-02, 4.26548945e-01, 5.46745780e-01,
+    5.75674571e-01, 7.94599487e-01, 7.18935553e-02,
+    4.46492976e-01, 6.40240123e-01, 2.73246969e-01,
+    2.00465968e-01, 1.30718835e-01, 1.92492005e-01,
+    1.96617189e-01, 6.61271644e-01, 8.12687657e-01,
+    8.66342445e-01
 
    },
    {0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -428,5 +598,6 @@ const std::vector<LinkageInputs<float, int>> linkage_inputsf2 = {
 typedef LinkageTest<float, int> LinkageTestF_Int;
 TEST_P(LinkageTestF_Int, Result) { EXPECT_TRUE(score == 1.0); }
 
-INSTANTIATE_TEST_CASE_P(LinkageTest, LinkageTestF_Int, ::testing::ValuesIn(linkage_inputsf2));
+INSTANTIATE_TEST_CASE_P(LinkageTest, LinkageTestF_Int,
+                        ::testing::ValuesIn(linkage_inputsf2));
 }  // end namespace raft
diff --git a/cpp/test/sparse/norm.cu b/cpp/test/sparse/norm.cu
index 4897d8194b..7adbbf8b9a 100644
--- a/cpp/test/sparse/norm.cu
+++ b/cpp/test/sparse/norm.cu
@@ -39,11 +39,12 @@ struct CSRRowNormalizeInputs {
 };
 
 template <typename Type_f, typename Index_>
-class CSRRowNormalizeTest : public ::testing::TestWithParam<CSRRowNormalizeInputs<Type_f, Index_>> {
+class CSRRowNormalizeTest
+  : public ::testing::TestWithParam<CSRRowNormalizeInputs<Type_f, Index_>> {
  protected:
-  void SetUp() override
-  {
-    params = ::testing::TestWithParam<CSRRowNormalizeInputs<Type_f, Index_>>::GetParam();
+  void SetUp() override {
+    params = ::testing::TestWithParam<
+      CSRRowNormalizeInputs<Type_f, Index_>>::GetParam();
     cudaStreamCreate(&stream);
 
     raft::allocate(in_vals, params.in_vals.size());
@@ -52,10 +53,9 @@ class CSRRowNormalizeTest : public ::testing::TestWithParam<CSRRowNormalizeInput
     raft::allocate(result, params.verify.size(), true);
   }
 
-  void Run()
-  {
+  void Run() {
     Index_ n_rows = params.ex_scan.size();
-    Index_ nnz    = params.in_vals.size();
+    Index_ nnz = params.in_vals.size();
 
     raft::update_device(ex_scan, params.ex_scan.data(), n_rows, stream);
     raft::update_device(in_vals, params.in_vals.data(), nnz, stream);
@@ -63,18 +63,20 @@ class CSRRowNormalizeTest : public ::testing::TestWithParam<CSRRowNormalizeInput
 
     switch (params.method) {
       case MAX:
-        linalg::csr_row_normalize_max<32, Type_f>(ex_scan, in_vals, nnz, n_rows, result, stream);
+        linalg::csr_row_normalize_max<32, Type_f>(ex_scan, in_vals, nnz, n_rows,
+                                                  result, stream);
         break;
       case L1:
-        linalg::csr_row_normalize_l1<32, Type_f>(ex_scan, in_vals, nnz, n_rows, result, stream);
+        linalg::csr_row_normalize_l1<32, Type_f>(ex_scan, in_vals, nnz, n_rows,
+                                                 result, stream);
         break;
     }
 
-    ASSERT_TRUE(raft::devArrMatch<Type_f>(verify, result, nnz, raft::Compare<Type_f>()));
+    ASSERT_TRUE(
+      raft::devArrMatch<Type_f>(verify, result, nnz, raft::Compare<Type_f>()));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(ex_scan));
     CUDA_CHECK(cudaFree(in_vals));
     CUDA_CHECK(cudaFree(verify));
@@ -85,7 +87,7 @@ class CSRRowNormalizeTest : public ::testing::TestWithParam<CSRRowNormalizeInput
  protected:
   CSRRowNormalizeInputs<Type_f, Index_> params;
   cudaStream_t stream;
-  Index_* ex_scan;
+  Index_ *ex_scan;
   Type_f *in_vals, *result, *verify;
 };
 
@@ -116,11 +118,9 @@ const std::vector<CSRRowNormalizeInputs<double, int>> csrnormalize_inputs_d = {
    {0.5, 0.5, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 1, 0.0}},
 };
 
-INSTANTIATE_TEST_CASE_P(SparseNormTest,
-                        CSRRowNormalizeTestF,
+INSTANTIATE_TEST_CASE_P(SparseNormTest, CSRRowNormalizeTestF,
                         ::testing::ValuesIn(csrnormalize_inputs_f));
-INSTANTIATE_TEST_CASE_P(SparseNormTest,
-                        CSRRowNormalizeTestD,
+INSTANTIATE_TEST_CASE_P(SparseNormTest, CSRRowNormalizeTestD,
                         ::testing::ValuesIn(csrnormalize_inputs_d));
 
 }  // namespace sparse
diff --git a/cpp/test/sparse/reduce.cu b/cpp/test/sparse/reduce.cu
index 44098214d2..50b5dc5993 100644
--- a/cpp/test/sparse/reduce.cu
+++ b/cpp/test/sparse/reduce.cu
@@ -42,19 +42,19 @@ struct SparseReduceInputs {
 };
 
 template <typename value_t, typename value_idx>
-class SparseReduceTest : public ::testing::TestWithParam<SparseReduceInputs<value_t, value_idx>> {
+class SparseReduceTest
+  : public ::testing::TestWithParam<SparseReduceInputs<value_t, value_idx>> {
  protected:
-  void SetUp() override
-  {
-    params = ::testing::TestWithParam<SparseReduceInputs<value_t, value_idx>>::GetParam();
+  void SetUp() override {
+    params = ::testing::TestWithParam<
+      SparseReduceInputs<value_t, value_idx>>::GetParam();
   }
 
-  void Run()
-  {
+  void Run() {
     raft::handle_t handle;
 
     auto d_alloc = handle.get_device_allocator();
-    auto stream  = handle.get_stream();
+    auto stream = handle.get_stream();
 
     rmm::device_uvector<value_idx> in_rows(params.in_rows.size(), stream);
     rmm::device_uvector<value_idx> in_cols(params.in_cols.size(), stream);
@@ -63,29 +63,30 @@ class SparseReduceTest : public ::testing::TestWithParam<SparseReduceInputs<valu
     rmm::device_uvector<value_idx> out_cols(params.out_cols.size(), stream);
     rmm::device_uvector<value_t> out_vals(params.out_vals.size(), stream);
 
-    raft::update_device(in_rows.data(), params.in_rows.data(), params.in_rows.size(), stream);
-    raft::update_device(in_cols.data(), params.in_cols.data(), params.in_cols.size(), stream);
-    raft::update_device(in_vals.data(), params.in_vals.data(), params.in_vals.size(), stream);
-    raft::update_device(out_rows.data(), params.out_rows.data(), params.out_rows.size(), stream);
-    raft::update_device(out_cols.data(), params.out_cols.data(), params.out_cols.size(), stream);
-    raft::update_device(out_vals.data(), params.out_vals.data(), params.out_vals.size(), stream);
+    raft::update_device(in_rows.data(), params.in_rows.data(),
+                        params.in_rows.size(), stream);
+    raft::update_device(in_cols.data(), params.in_cols.data(),
+                        params.in_cols.size(), stream);
+    raft::update_device(in_vals.data(), params.in_vals.data(),
+                        params.in_vals.size(), stream);
+    raft::update_device(out_rows.data(), params.out_rows.data(),
+                        params.out_rows.size(), stream);
+    raft::update_device(out_cols.data(), params.out_cols.data(),
+                        params.out_cols.size(), stream);
+    raft::update_device(out_vals.data(), params.out_vals.data(),
+                        params.out_vals.size(), stream);
 
     raft::sparse::COO<value_t, value_idx> out(d_alloc, stream);
-    raft::sparse::op::max_duplicates(handle,
-                                     out,
-                                     in_rows.data(),
-                                     in_cols.data(),
-                                     in_vals.data(),
-                                     params.in_rows.size(),
-                                     params.m,
-                                     params.n);
+    raft::sparse::op::max_duplicates(handle, out, in_rows.data(),
+                                     in_cols.data(), in_vals.data(),
+                                     params.in_rows.size(), params.m, params.n);
 
     ASSERT_TRUE(raft::devArrMatch<value_idx>(
       out_rows.data(), out.rows(), out.nnz, raft::Compare<value_idx>()));
     ASSERT_TRUE(raft::devArrMatch<value_idx>(
       out_cols.data(), out.cols(), out.nnz, raft::Compare<value_idx>()));
-    ASSERT_TRUE(
-      raft::devArrMatch<value_t>(out_vals.data(), out.vals(), out.nnz, raft::Compare<value_t>()));
+    ASSERT_TRUE(raft::devArrMatch<value_t>(out_vals.data(), out.vals(), out.nnz,
+                                           raft::Compare<value_t>()));
   }
 
   void TearDown() override {}
@@ -114,8 +115,7 @@ const std::vector<SparseReduceInputs<float, int>> max_reduce_inputs_f = {
    4},
 };
 
-INSTANTIATE_TEST_CASE_P(SparseReduceTest,
-                        SparseReduceTestF,
+INSTANTIATE_TEST_CASE_P(SparseReduceTest, SparseReduceTestF,
                         ::testing::ValuesIn(max_reduce_inputs_f));
 
 }  // namespace sparse
diff --git a/cpp/test/sparse/row_op.cu b/cpp/test/sparse/row_op.cu
index feefa7baa3..b64fa25883 100644
--- a/cpp/test/sparse/row_op.cu
+++ b/cpp/test/sparse/row_op.cu
@@ -38,47 +38,43 @@ struct CSRRowOpInputs {
 /** Wrapper to call csr_row_op because the enclosing function of a __device__
  *  lambda cannot have private ot protected access within the class. */
 template <typename Type_f, typename Index_>
-void csr_row_op_wrapper(
-  const Index_* row_ind, Index_ n_rows, Index_ nnz, Type_f* result, cudaStream_t stream)
-{
+void csr_row_op_wrapper(const Index_ *row_ind, Index_ n_rows, Index_ nnz,
+                        Type_f *result, cudaStream_t stream) {
   op::csr_row_op<Index_, 32>(
-    row_ind,
-    n_rows,
-    nnz,
+    row_ind, n_rows, nnz,
     [result] __device__(Index_ row, Index_ start_idx, Index_ stop_idx) {
-      for (Index_ i = start_idx; i < stop_idx; i++)
-        result[i] = row;
+      for (Index_ i = start_idx; i < stop_idx; i++) result[i] = row;
     },
     stream);
 }
 
 template <typename Type_f, typename Index_>
-class CSRRowOpTest : public ::testing::TestWithParam<CSRRowOpInputs<Type_f, Index_>> {
+class CSRRowOpTest
+  : public ::testing::TestWithParam<CSRRowOpInputs<Type_f, Index_>> {
  protected:
-  void SetUp() override
-  {
-    params = ::testing::TestWithParam<CSRRowOpInputs<Type_f, Index_>>::GetParam();
+  void SetUp() override {
+    params =
+      ::testing::TestWithParam<CSRRowOpInputs<Type_f, Index_>>::GetParam();
     cudaStreamCreate(&stream);
     n_rows = params.ex_scan.size();
-    nnz    = params.verify.size();
+    nnz = params.verify.size();
 
     raft::allocate(verify, nnz);
     raft::allocate(ex_scan, n_rows);
     raft::allocate(result, nnz, true);
   }
 
-  void Run()
-  {
+  void Run() {
     raft::update_device(ex_scan, params.ex_scan.data(), n_rows, stream);
     raft::update_device(verify, params.verify.data(), nnz, stream);
 
     csr_row_op_wrapper<Type_f, Index_>(ex_scan, n_rows, nnz, result, stream);
 
-    ASSERT_TRUE(raft::devArrMatch<Type_f>(verify, result, nnz, raft::Compare<Type_f>()));
+    ASSERT_TRUE(
+      raft::devArrMatch<Type_f>(verify, result, nnz, raft::Compare<Type_f>()));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(ex_scan));
     CUDA_CHECK(cudaFree(verify));
     CUDA_CHECK(cudaFree(result));
@@ -89,7 +85,7 @@ class CSRRowOpTest : public ::testing::TestWithParam<CSRRowOpInputs<Type_f, Inde
   CSRRowOpInputs<Type_f, Index_> params;
   cudaStream_t stream;
   Index_ n_rows, nnz;
-  Index_* ex_scan;
+  Index_ *ex_scan;
   Type_f *result, *verify;
 };
 
@@ -106,8 +102,10 @@ const std::vector<CSRRowOpInputs<double, int>> csrrowop_inputs_d = {
   {{0, 4, 8, 9}, {0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 2.0, 3.0}},
 };
 
-INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestF, ::testing::ValuesIn(csrrowop_inputs_f));
-INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestD, ::testing::ValuesIn(csrrowop_inputs_d));
+INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestF,
+                        ::testing::ValuesIn(csrrowop_inputs_f));
+INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestD,
+                        ::testing::ValuesIn(csrrowop_inputs_d));
 
 }  // namespace sparse
 }  // namespace raft
diff --git a/cpp/test/sparse/selection.cu b/cpp/test/sparse/selection.cu
index 5d3b2a8317..46f2f6a844 100644
--- a/cpp/test/sparse/selection.cu
+++ b/cpp/test/sparse/selection.cu
@@ -45,9 +45,8 @@ struct SparseSelectionInputs {
 };
 
 template <typename value_idx, typename value_t>
-::std::ostream& operator<<(::std::ostream& os,
-                           const SparseSelectionInputs<value_idx, value_t>& dims)
-{
+::std::ostream &operator<<(
+  ::std::ostream &os, const SparseSelectionInputs<value_idx, value_t> &dims) {
   return os;
 }
 
@@ -55,8 +54,7 @@ template <typename value_idx, typename value_t>
 class SparseSelectionTest
   : public ::testing::TestWithParam<SparseSelectionInputs<value_idx, value_t>> {
  protected:
-  void make_data()
-  {
+  void make_data() {
     std::vector<value_t> dists_h = params.dists_h;
 
     allocate(dists, n_rows * n_cols);
@@ -65,39 +63,42 @@ class SparseSelectionTest
     allocate(inds, n_rows * n_cols);
     iota_fill(inds, n_rows, n_cols, stream);
 
-    std::vector<value_t> out_dists_ref_h     = params.out_dists_ref_h;
+    std::vector<value_t> out_dists_ref_h = params.out_dists_ref_h;
     std::vector<value_idx> out_indices_ref_h = params.out_indices_ref_h;
 
     allocate(out_indices_ref, out_indices_ref_h.size());
     allocate(out_dists_ref, out_dists_ref_h.size());
 
-    update_device(out_indices_ref, out_indices_ref_h.data(), out_indices_ref_h.size(), stream);
-    update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(), stream);
+    update_device(out_indices_ref, out_indices_ref_h.data(),
+                  out_indices_ref_h.size(), stream);
+    update_device(out_dists_ref, out_dists_ref_h.data(), out_dists_ref_h.size(),
+                  stream);
 
     allocate(out_dists, n_rows * k);
     allocate(out_indices, n_rows * k);
   }
 
-  void SetUp() override
-  {
-    params = ::testing::TestWithParam<SparseSelectionInputs<value_idx, value_t>>::GetParam();
-    std::shared_ptr<raft::mr::device::allocator> alloc(new raft::mr::device::default_allocator);
+  void SetUp() override {
+    params = ::testing::TestWithParam<
+      SparseSelectionInputs<value_idx, value_t>>::GetParam();
+    std::shared_ptr<raft::mr::device::allocator> alloc(
+      new raft::mr::device::default_allocator);
     CUDA_CHECK(cudaStreamCreate(&stream));
 
     n_rows = params.n_rows;
     n_cols = params.n_cols;
-    k      = params.k;
+    k = params.k;
 
     make_data();
 
-    raft::sparse::selection::select_k(
-      dists, inds, n_rows, n_cols, out_dists, out_indices, params.select_min, k, stream);
+    raft::sparse::selection::select_k(dists, inds, n_rows, n_cols, out_dists,
+                                      out_indices, params.select_min, k,
+                                      stream);
 
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaStreamSynchronize(stream));
 
     CUDA_CHECK(cudaFree(dists));
@@ -110,10 +111,11 @@ class SparseSelectionTest
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void compare()
-  {
-    ASSERT_TRUE(devArrMatch(out_dists_ref, out_dists, n_rows * k, Compare<value_t>()));
-    ASSERT_TRUE(devArrMatch(out_indices_ref, out_indices, n_rows * k, Compare<value_idx>()));
+  void compare() {
+    ASSERT_TRUE(
+      devArrMatch(out_dists_ref, out_dists, n_rows * k, Compare<value_t>()));
+    ASSERT_TRUE(devArrMatch(out_indices_ref, out_indices, n_rows * k,
+                            Compare<value_idx>()));
   }
 
  protected:
@@ -122,15 +124,15 @@ class SparseSelectionTest
   int n_rows, n_cols, k;
 
   // input data
-  value_t* dists;
-  value_idx* inds;
+  value_t *dists;
+  value_idx *inds;
 
   // output data
-  value_idx* out_indices;
-  value_t* out_dists;
+  value_idx *out_indices;
+  value_t *out_dists;
 
-  value_idx* out_indices_ref;
-  value_t* out_dists_ref;
+  value_idx *out_indices_ref;
+  value_t *out_dists_ref;
 
   SparseSelectionInputs<value_idx, value_t> params;
 };
@@ -147,8 +149,7 @@ const std::vector<SparseSelectionInputs<int, float>> inputs_i32_f = {
    true}};
 typedef SparseSelectionTest<int, float> SparseSelectionTestF;
 TEST_P(SparseSelectionTestF, Result) { compare(); }
-INSTANTIATE_TEST_CASE_P(SparseSelectionTest,
-                        SparseSelectionTestF,
+INSTANTIATE_TEST_CASE_P(SparseSelectionTest, SparseSelectionTestF,
                         ::testing::ValuesIn(inputs_i32_f));
 
 };  // end namespace selection
diff --git a/cpp/test/sparse/sort.cu b/cpp/test/sparse/sort.cu
index e154d19d34..b9a8b849eb 100644
--- a/cpp/test/sparse/sort.cu
+++ b/cpp/test/sparse/sort.cu
@@ -47,27 +47,27 @@ class SparseSortTest : public ::testing::TestWithParam<SparseSortInput<T>> {
 const std::vector<SparseSortInput<float>> inputsf = {{5, 10, 5, 1234ULL}};
 
 typedef SparseSortTest<float> COOSort;
-TEST_P(COOSort, Result)
-{
+TEST_P(COOSort, Result) {
   int *in_rows, *in_cols, *verify;
-  float* in_vals;
+  float *in_vals;
 
   params = ::testing::TestWithParam<SparseSortInput<float>>::GetParam();
   raft::random::Rng r(params.seed);
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
-  std::shared_ptr<raft::mr::device::allocator> alloc(new raft::mr::device::default_allocator);
+  std::shared_ptr<raft::mr::device::allocator> alloc(
+    new raft::mr::device::default_allocator);
 
   raft::allocate(in_vals, params.nnz);
   r.uniform(in_vals, params.nnz, float(-1.0), float(1.0), stream);
 
-  int* in_rows_h = (int*)malloc(params.nnz * sizeof(int));
-  int* in_cols_h = (int*)malloc(params.nnz * sizeof(int));
-  int* verify_h  = (int*)malloc(params.nnz * sizeof(int));
+  int *in_rows_h = (int *)malloc(params.nnz * sizeof(int));
+  int *in_cols_h = (int *)malloc(params.nnz * sizeof(int));
+  int *verify_h = (int *)malloc(params.nnz * sizeof(int));
 
   for (int i = 0; i < params.nnz; i++) {
     in_rows_h[i] = params.nnz - i - 1;
-    verify_h[i]  = i;
+    verify_h[i] = i;
     in_cols_h[i] = i;
   }
 
@@ -80,9 +80,11 @@ TEST_P(COOSort, Result)
   raft::update_device(in_cols, in_cols_h, params.nnz, stream);
   raft::update_device(verify, verify_h, params.nnz, stream);
 
-  op::coo_sort(params.m, params.n, params.nnz, in_rows, in_cols, in_vals, alloc, stream);
+  op::coo_sort(params.m, params.n, params.nnz, in_rows, in_cols, in_vals, alloc,
+               stream);
 
-  ASSERT_TRUE(raft::devArrMatch<int>(verify, in_rows, params.nnz, raft::Compare<int>()));
+  ASSERT_TRUE(
+    raft::devArrMatch<int>(verify, in_rows, params.nnz, raft::Compare<int>()));
 
   delete[] in_rows_h;
   delete[] in_cols_h;
diff --git a/cpp/test/sparse/symmetrize.cu b/cpp/test/sparse/symmetrize.cu
index 6a66daa769..d104028d2b 100644
--- a/cpp/test/sparse/symmetrize.cu
+++ b/cpp/test/sparse/symmetrize.cu
@@ -29,9 +29,8 @@ namespace raft {
 namespace sparse {
 
 template <typename value_idx, typename value_t>
-__global__ void assert_symmetry(
-  value_idx* rows, value_idx* cols, value_t* vals, value_idx nnz, value_idx* sum)
-{
+__global__ void assert_symmetry(value_idx *rows, value_idx *cols, value_t *vals,
+                                value_idx nnz, value_idx *sum) {
   int tid = blockDim.x * blockIdx.x + threadIdx.x;
 
   if (tid >= nnz) return;
@@ -50,21 +49,19 @@ struct SparseSymmetrizeInputs {
 };
 
 template <typename value_idx, typename value_t>
-::std::ostream& operator<<(::std::ostream& os,
-                           const SparseSymmetrizeInputs<value_idx, value_t>& dims)
-{
+::std::ostream &operator<<(
+  ::std::ostream &os, const SparseSymmetrizeInputs<value_idx, value_t> &dims) {
   return os;
 }
 
 template <typename value_idx, typename value_t>
-class SparseSymmetrizeTest
-  : public ::testing::TestWithParam<SparseSymmetrizeInputs<value_idx, value_t>> {
+class SparseSymmetrizeTest : public ::testing::TestWithParam<
+                               SparseSymmetrizeInputs<value_idx, value_t>> {
  protected:
-  void make_data()
-  {
-    std::vector<value_idx> indptr_h  = params.indptr_h;
+  void make_data() {
+    std::vector<value_idx> indptr_h = params.indptr_h;
     std::vector<value_idx> indices_h = params.indices_h;
-    std::vector<value_t> data_h      = params.data_h;
+    std::vector<value_t> data_h = params.data_h;
 
     allocate(indptr, indptr_h.size());
     allocate(indices, indices_h.size());
@@ -75,19 +72,19 @@ class SparseSymmetrizeTest
     update_device(data, data_h.data(), data_h.size(), stream);
   }
 
-  void SetUp() override
-  {
-    params = ::testing::TestWithParam<SparseSymmetrizeInputs<value_idx, value_t>>::GetParam();
+  void SetUp() override {
+    params = ::testing::TestWithParam<
+      SparseSymmetrizeInputs<value_idx, value_t>>::GetParam();
 
     raft::handle_t handle;
 
     auto alloc = handle.get_device_allocator();
-    stream     = handle.get_stream();
+    stream = handle.get_stream();
 
     make_data();
 
-    value_idx m   = params.indptr_h.size() - 1;
-    value_idx n   = params.n_cols;
+    value_idx m = params.indptr_h.size() - 1;
+    value_idx n = params.n_cols;
     value_idx nnz = params.indices_h.size();
 
     raft::mr::device::buffer<value_idx> coo_rows(alloc, stream, nnz);
@@ -96,8 +93,8 @@ class SparseSymmetrizeTest
 
     raft::sparse::COO<value_t, value_idx> out(alloc, stream);
 
-    raft::sparse::linalg::symmetrize(
-      handle, coo_rows.data(), indices, data, m, n, coo_rows.size(), out);
+    raft::sparse::linalg::symmetrize(handle, coo_rows.data(), indices, data, m,
+                                     n, coo_rows.size(), out);
 
     raft::mr::device::buffer<value_idx> sum(alloc, stream, 1);
 
@@ -110,8 +107,7 @@ class SparseSymmetrizeTest
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaStreamSynchronize(stream));
     CUDA_CHECK(cudaFree(indptr));
     CUDA_CHECK(cudaFree(indices));
@@ -123,7 +119,7 @@ class SparseSymmetrizeTest
 
   // input data
   value_idx *indptr, *indices;
-  value_t* data;
+  value_t *data;
 
   value_idx sum_h;
 
@@ -137,7 +133,8 @@ struct COOSymmetrizeInputs {
 };
 
 template <typename T>
-class COOSymmetrizeTest : public ::testing::TestWithParam<COOSymmetrizeInputs<T>> {
+class COOSymmetrizeTest
+  : public ::testing::TestWithParam<COOSymmetrizeInputs<T>> {
  protected:
   void SetUp() override {}
 
@@ -147,8 +144,7 @@ class COOSymmetrizeTest : public ::testing::TestWithParam<COOSymmetrizeInputs<T>
 const std::vector<COOSymmetrizeInputs<float>> inputsf = {{5, 10, 5, 1234ULL}};
 
 typedef COOSymmetrizeTest<float> COOSymmetrize;
-TEST_P(COOSymmetrize, Result)
-{
+TEST_P(COOSymmetrize, Result) {
   cudaStream_t stream;
   cudaStreamCreate(&stream);
 
@@ -157,14 +153,16 @@ TEST_P(COOSymmetrize, Result)
 
   int nnz = 8;
 
-  int* in_rows_h   = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3};
-  int* in_cols_h   = new int[nnz]{1, 3, 2, 3, 0, 1, 0, 2};
-  float* in_vals_h = new float[nnz]{0.5, 1.0, 0.5, 0.5, 0.5, 0.0, 0.5, 0.5};
+  int *in_rows_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3};
+  int *in_cols_h = new int[nnz]{1, 3, 2, 3, 0, 1, 0, 2};
+  float *in_vals_h = new float[nnz]{0.5, 1.0, 0.5, 0.5, 0.5, 0.0, 0.5, 0.5};
 
-  int* exp_rows_h = new int[nnz * 2]{1, 0, 0, 0, 1, 3, 1, 0, 0, 2, 2, 0, 3, 2, 3, 0};
-  int* exp_cols_h = new int[nnz * 2]{0, 1, 3, 0, 2, 1, 3, 0, 2, 0, 1, 0, 0, 3, 2, 0};
-  float* exp_vals_h =
-    new float[nnz * 2]{0.5, 0.5, 1.5, 0, 0.5, 0.5, 0.5, 0, 0.5, 0.5, 0.5, 0, 1.5, 0.5, 0.5, 0.0};
+  int *exp_rows_h =
+    new int[nnz * 2]{1, 0, 0, 0, 1, 3, 1, 0, 0, 2, 2, 0, 3, 2, 3, 0};
+  int *exp_cols_h =
+    new int[nnz * 2]{0, 1, 3, 0, 2, 1, 3, 0, 2, 0, 1, 0, 0, 3, 2, 0};
+  float *exp_vals_h = new float[nnz * 2]{0.5, 0.5, 1.5, 0, 0.5, 0.5, 0.5, 0,
+                                         0.5, 0.5, 0.5, 0, 1.5, 0.5, 0.5, 0.0};
 
   COO<float> in(alloc, stream, nnz, 4, 4);
   raft::update_device(in.rows(), *&in_rows_h, nnz, stream);
@@ -174,19 +172,22 @@ TEST_P(COOSymmetrize, Result)
   COO<float> out(alloc, stream);
 
   linalg::coo_symmetrize<32, float>(
-    &in,
-    &out,
-    [] __device__(int row, int col, float val, float trans) { return val + trans; },
-    alloc,
-    stream);
+    &in, &out,
+    [] __device__(int row, int col, float val, float trans) {
+      return val + trans;
+    },
+    alloc, stream);
 
   CUDA_CHECK(cudaStreamSynchronize(stream));
   std::cout << out << std::endl;
 
   ASSERT_TRUE(out.nnz == nnz * 2);
-  ASSERT_TRUE(raft::devArrMatch<int>(out.rows(), exp_rows_h, out.nnz, raft::Compare<int>()));
-  ASSERT_TRUE(raft::devArrMatch<int>(out.cols(), exp_cols_h, out.nnz, raft::Compare<int>()));
-  ASSERT_TRUE(raft::devArrMatch<float>(out.vals(), exp_vals_h, out.nnz, raft::Compare<float>()));
+  ASSERT_TRUE(raft::devArrMatch<int>(out.rows(), exp_rows_h, out.nnz,
+                                     raft::Compare<int>()));
+  ASSERT_TRUE(raft::devArrMatch<int>(out.cols(), exp_cols_h, out.nnz,
+                                     raft::Compare<int>()));
+  ASSERT_TRUE(raft::devArrMatch<float>(out.vals(), exp_vals_h, out.nnz,
+                                       raft::Compare<float>()));
 
   cudaStreamDestroy(stream);
 
@@ -199,7 +200,8 @@ TEST_P(COOSymmetrize, Result)
   delete[] exp_vals_h;
 }
 
-INSTANTIATE_TEST_CASE_P(COOSymmetrizeTest, COOSymmetrize, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(COOSymmetrizeTest, COOSymmetrize,
+                        ::testing::ValuesIn(inputsf));
 
 const std::vector<SparseSymmetrizeInputs<int, float>> symm_inputs_fint = {
   // Test n_clusters == n_points
@@ -219,8 +221,7 @@ const std::vector<SparseSymmetrizeInputs<int, float>> symm_inputs_fint = {
 typedef SparseSymmetrizeTest<int, float> SparseSymmetrizeTestF_int;
 TEST_P(SparseSymmetrizeTestF_int, Result) { ASSERT_TRUE(sum_h == 0); }
 
-INSTANTIATE_TEST_CASE_P(SparseSymmetrizeTest,
-                        SparseSymmetrizeTestF_int,
+INSTANTIATE_TEST_CASE_P(SparseSymmetrizeTest, SparseSymmetrizeTestF_int,
                         ::testing::ValuesIn(symm_inputs_fint));
 
 }  // namespace sparse
diff --git a/cpp/test/spatial/haversine.cu b/cpp/test/spatial/haversine.cu
index 8d35960d6a..def1f1685b 100644
--- a/cpp/test/spatial/haversine.cu
+++ b/cpp/test/spatial/haversine.cu
@@ -29,8 +29,7 @@ namespace knn {
 template <typename value_idx, typename value_t>
 class HaversineKNNTest : public ::testing::Test {
  protected:
-  void basicTest()
-  {
+  void basicTest() {
     auto alloc = std::make_shared<raft::mr::device::default_allocator>();
 
     // Allocate input
@@ -45,37 +44,31 @@ class HaversineKNNTest : public ::testing::Test {
     raft::allocate(d_pred_D, n * n);
 
     // make testdata on host
-    std::vector<value_t> h_train_inputs = {0.71113885,
-                                           -1.29215058,
-                                           0.59613176,
-                                           -2.08048115,
-                                           0.74932804,
-                                           -1.33634042,
-                                           0.51486728,
-                                           -1.65962873,
-                                           0.53154002,
-                                           -1.47049808,
-                                           0.72891737,
-                                           -1.54095137};
+    std::vector<value_t> h_train_inputs = {
+      0.71113885, -1.29215058, 0.59613176, -2.08048115,
+      0.74932804, -1.33634042, 0.51486728, -1.65962873,
+      0.53154002, -1.47049808, 0.72891737, -1.54095137};
 
     h_train_inputs.resize(n);
     raft::update_device(d_train_inputs, h_train_inputs.data(), n * d, 0);
 
-    std::vector<value_t> h_res_D = {0., 0.05041587, 0.18767063, 0.23048252, 0.35749438, 0.62925595,
-                                    0., 0.36575755, 0.44288665, 0.5170737,  0.59501296, 0.62925595,
-                                    0., 0.05041587, 0.152463,   0.2426416,  0.34925285, 0.59501296,
-                                    0., 0.16461092, 0.2345792,  0.34925285, 0.35749438, 0.36575755,
-                                    0., 0.16461092, 0.20535265, 0.23048252, 0.2426416,  0.5170737,
-                                    0., 0.152463,   0.18767063, 0.20535265, 0.2345792,  0.44288665};
+    std::vector<value_t> h_res_D = {
+      0., 0.05041587, 0.18767063, 0.23048252, 0.35749438, 0.62925595,
+      0., 0.36575755, 0.44288665, 0.5170737,  0.59501296, 0.62925595,
+      0., 0.05041587, 0.152463,   0.2426416,  0.34925285, 0.59501296,
+      0., 0.16461092, 0.2345792,  0.34925285, 0.35749438, 0.36575755,
+      0., 0.16461092, 0.20535265, 0.23048252, 0.2426416,  0.5170737,
+      0., 0.152463,   0.18767063, 0.20535265, 0.2345792,  0.44288665};
     h_res_D.resize(n * n);
     raft::update_device(d_ref_D, h_res_D.data(), n * n, 0);
 
-    std::vector<value_idx> h_res_I = {0, 2, 5, 4, 3, 1, 1, 3, 5, 4, 2, 0, 2, 0, 5, 4, 3, 1,
-                                      3, 4, 5, 2, 0, 1, 4, 3, 5, 0, 2, 1, 5, 2, 0, 4, 3, 1};
+    std::vector<value_idx> h_res_I = {0, 2, 5, 4, 3, 1, 1, 3, 5, 4, 2, 0,
+                                      2, 0, 5, 4, 3, 1, 3, 4, 5, 2, 0, 1,
+                                      4, 3, 5, 0, 2, 1, 5, 2, 0, 4, 3, 1};
     h_res_I.resize(n * n);
     raft::update_device<value_idx>(d_ref_I, h_res_I.data(), n * n, 0);
 
-    std::vector<value_t*> input_vec  = {d_train_inputs};
+    std::vector<value_t *> input_vec = {d_train_inputs};
     std::vector<value_idx> sizes_vec = {n};
 
     cudaStream_t stream;
@@ -89,8 +82,7 @@ class HaversineKNNTest : public ::testing::Test {
 
   void SetUp() override { basicTest(); }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(d_train_inputs));
     CUDA_CHECK(cudaFree(d_pred_I));
     CUDA_CHECK(cudaFree(d_pred_D));
@@ -99,26 +91,27 @@ class HaversineKNNTest : public ::testing::Test {
   }
 
  protected:
-  value_t* d_train_inputs;
+  value_t *d_train_inputs;
 
   int n = 6;
   int d = 2;
 
   int k = 6;
 
-  value_idx* d_pred_I;
-  value_t* d_pred_D;
+  value_idx *d_pred_I;
+  value_t *d_pred_D;
 
-  value_idx* d_ref_I;
-  value_t* d_ref_D;
+  value_idx *d_ref_I;
+  value_t *d_ref_D;
 };
 
 typedef HaversineKNNTest<int, float> HaversineKNNTestF;
 
-TEST_F(HaversineKNNTestF, Fit)
-{
-  ASSERT_TRUE(raft::devArrMatch(d_ref_D, d_pred_D, n * n, raft::CompareApprox<float>(1e-3)));
-  ASSERT_TRUE(raft::devArrMatch(d_ref_I, d_pred_I, n * n, raft::Compare<int>()));
+TEST_F(HaversineKNNTestF, Fit) {
+  ASSERT_TRUE(raft::devArrMatch(d_ref_D, d_pred_D, n * n,
+                                raft::CompareApprox<float>(1e-3)));
+  ASSERT_TRUE(
+    raft::devArrMatch(d_ref_I, d_pred_I, n * n, raft::Compare<int>()));
 }
 
 }  // namespace knn
diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu
index d4e35c9d54..2b1ef89f7a 100644
--- a/cpp/test/spatial/knn.cu
+++ b/cpp/test/spatial/knn.cu
@@ -31,18 +31,18 @@ struct KNNInputs {
   std::vector<int> labels;
 };
 
-__global__ void build_actual_output(
-  int* output, int n_rows, int k, const int* idx_labels, const int64_t* indices)
-{
+__global__ void build_actual_output(int *output, int n_rows, int k,
+                                    const int *idx_labels,
+                                    const int64_t *indices) {
   int element = threadIdx.x + blockDim.x * blockIdx.x;
   if (element >= n_rows * k) return;
 
-  int ind         = (int)indices[element];
+  int ind = (int)indices[element];
   output[element] = idx_labels[ind];
 }
 
-__global__ void build_expected_output(int* output, int n_rows, int k, const int* labels)
-{
+__global__ void build_expected_output(int *output, int n_rows, int k,
+                                      const int *labels) {
   int row = threadIdx.x + blockDim.x * blockIdx.x;
   if (row >= n_rows) return;
 
@@ -55,33 +55,25 @@ __global__ void build_expected_output(int* output, int n_rows, int k, const int*
 template <typename T>
 class KNNTest : public ::testing::TestWithParam<KNNInputs> {
  protected:
-  void testBruteForce()
-  {
-    raft::print_device_vector("Input array: ", input_, rows_ * cols_, std::cout);
+  void testBruteForce() {
+    raft::print_device_vector("Input array: ", input_, rows_ * cols_,
+                              std::cout);
     std::cout << "K: " << k_ << "\n";
-    raft::print_device_vector("Labels array: ", search_labels_, rows_, std::cout);
+    raft::print_device_vector("Labels array: ", search_labels_, rows_,
+                              std::cout);
 
     auto stream = handle_.get_stream();
 
     raft::allocate(actual_labels_, rows_ * k_, true);
     raft::allocate(expected_labels_, rows_ * k_, true);
 
-    std::vector<float*> input_vec;
+    std::vector<float *> input_vec;
     std::vector<int> sizes_vec;
     input_vec.push_back(input_);
     sizes_vec.push_back(rows_);
 
-    brute_force_knn(handle_,
-                    input_vec,
-                    sizes_vec,
-                    cols_,
-                    search_data_,
-                    rows_,
-                    indices_,
-                    distances_,
-                    k_,
-                    true,
-                    true);
+    brute_force_knn(handle_, input_vec, sizes_vec, cols_, search_data_, rows_,
+                    indices_, distances_, k_, true, true);
 
     build_actual_output<<<raft::ceildiv(rows_ * k_, 32), 32, 0, stream>>>(
       actual_labels_, rows_, k_, search_labels_, indices_);
@@ -89,20 +81,24 @@ class KNNTest : public ::testing::TestWithParam<KNNInputs> {
     build_expected_output<<<raft::ceildiv(rows_ * k_, 32), 32, 0, stream>>>(
       expected_labels_, rows_, k_, search_labels_);
 
-    raft::print_device_vector("Output indices: ", indices_, rows_ * k_, std::cout);
-    raft::print_device_vector("Output distances: ", distances_, rows_ * k_, std::cout);
-    raft::print_device_vector("Output labels: ", actual_labels_, rows_ * k_, std::cout);
-    raft::print_device_vector("Expected labels: ", expected_labels_, rows_ * k_, std::cout);
-
-    ASSERT_TRUE(devArrMatch(expected_labels_, actual_labels_, rows_ * k_, raft::Compare<int>()));
+    raft::print_device_vector("Output indices: ", indices_, rows_ * k_,
+                              std::cout);
+    raft::print_device_vector("Output distances: ", distances_, rows_ * k_,
+                              std::cout);
+    raft::print_device_vector("Output labels: ", actual_labels_, rows_ * k_,
+                              std::cout);
+    raft::print_device_vector("Expected labels: ", expected_labels_, rows_ * k_,
+                              std::cout);
+
+    ASSERT_TRUE(devArrMatch(expected_labels_, actual_labels_, rows_ * k_,
+                            raft::Compare<int>()));
   }
 
-  void SetUp() override
-  {
+  void SetUp() override {
     params_ = ::testing::TestWithParam<KNNInputs>::GetParam();
-    rows_   = params_.input.size();
-    cols_   = params_.input[0].size();
-    k_      = params_.k;
+    rows_ = params_.input.size();
+    cols_ = params_.input[0].size();
+    k_ = params_.k;
 
     std::vector<float> row_major_input;
     for (int i = 0; i < params_.input.size(); ++i) {
@@ -111,12 +107,14 @@ class KNNTest : public ::testing::TestWithParam<KNNInputs> {
       }
     }
     rmm::device_buffer input_d = rmm::device_buffer(
-      row_major_input.data(), row_major_input.size() * sizeof(float), handle_.get_stream());
-    float* input_ptr = static_cast<float*>(input_d.data());
+      row_major_input.data(), row_major_input.size() * sizeof(float),
+      handle_.get_stream());
+    float *input_ptr = static_cast<float *>(input_d.data());
 
     rmm::device_buffer labels_d = rmm::device_buffer(
-      params_.labels.data(), params_.labels.size() * sizeof(int), handle_.get_stream());
-    int* labels_ptr = static_cast<int*>(labels_d.data());
+      params_.labels.data(), params_.labels.size() * sizeof(int),
+      handle_.get_stream());
+    int *labels_ptr = static_cast<int *>(labels_d.data());
 
     raft::allocate(input_, rows_ * cols_, true);
     raft::allocate(search_data_, rows_ * cols_, true);
@@ -129,8 +127,7 @@ class KNNTest : public ::testing::TestWithParam<KNNInputs> {
     raft::copy(search_labels_, labels_ptr, rows_, handle_.get_stream());
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(search_data_));
     CUDA_CHECK(cudaFree(indices_));
     CUDA_CHECK(cudaFree(distances_));
@@ -142,15 +139,15 @@ class KNNTest : public ::testing::TestWithParam<KNNInputs> {
   KNNInputs params_;
   int rows_;
   int cols_;
-  float* input_;
-  float* search_data_;
-  int64_t* indices_;
-  float* distances_;
+  float *input_;
+  float *search_data_;
+  int64_t *indices_;
+  float *distances_;
   int k_;
 
-  int* search_labels_;
-  int* actual_labels_;
-  int* expected_labels_;
+  int *search_labels_;
+  int *actual_labels_;
+  int *expected_labels_;
 };
 
 const std::vector<KNNInputs> inputs = {
diff --git a/cpp/test/spectral_matrix.cu b/cpp/test/spectral_matrix.cu
index 2d7d713717..e5c2d52764 100644
--- a/cpp/test/spectral_matrix.cu
+++ b/cpp/test/spectral_matrix.cu
@@ -32,8 +32,7 @@ struct csr_view_t {
   index_type number_of_edges;
 };
 }  // namespace
-TEST(Raft, SpectralMatrices)
-{
+TEST(Raft, SpectralMatrices) {
   using namespace matrix;
   using index_type = int;
   using value_type = double;
@@ -50,18 +49,19 @@ TEST(Raft, SpectralMatrices)
   index_type* ro{nullptr};
   index_type* ci{nullptr};
   value_type* vs{nullptr};
-  index_type nnz   = 0;
+  index_type nnz = 0;
   index_type nrows = 0;
   sparse_matrix_t<index_type, value_type> sm1{h, ro, ci, vs, nrows, nnz};
   sparse_matrix_t<index_type, value_type> sm2{h, csr_v};
   ASSERT_EQ(nullptr, sm1.row_offsets_);
   ASSERT_EQ(nullptr, sm2.row_offsets_);
 
-  auto stream    = h.get_stream();
+  auto stream = h.get_stream();
   auto t_exe_pol = thrust::cuda::par.on(stream);
 
   auto cnstr_lm1 = [&h, t_exe_pol, ro, ci, vs, nrows, nnz](void) {
-    laplacian_matrix_t<index_type, value_type> lm1{h, t_exe_pol, ro, ci, vs, nrows, nnz};
+    laplacian_matrix_t<index_type, value_type> lm1{h,  t_exe_pol, ro, ci,
+                                                   vs, nrows,     nnz};
   };
   EXPECT_ANY_THROW(cnstr_lm1());  // because of nullptr ptr args
 
@@ -71,7 +71,8 @@ TEST(Raft, SpectralMatrices)
   EXPECT_ANY_THROW(cnstr_lm2());  // because of nullptr ptr args
 
   auto cnstr_mm1 = [&h, t_exe_pol, ro, ci, vs, nrows, nnz](void) {
-    modularity_matrix_t<index_type, value_type> mm1{h, t_exe_pol, ro, ci, vs, nrows, nnz};
+    modularity_matrix_t<index_type, value_type> mm1{h,  t_exe_pol, ro, ci,
+                                                    vs, nrows,     nnz};
   };
   EXPECT_ANY_THROW(cnstr_mm1());  // because of nullptr ptr args
 
diff --git a/cpp/test/stats/mean.cu b/cpp/test/stats/mean.cu
index 8eb2f91952..4a3b0ed196 100644
--- a/cpp/test/stats/mean.cu
+++ b/cpp/test/stats/mean.cu
@@ -35,16 +35,14 @@ struct MeanInputs {
 };
 
 template <typename T>
-::std::ostream& operator<<(::std::ostream& os, const MeanInputs<T>& dims)
-{
+::std::ostream &operator<<(::std::ostream &os, const MeanInputs<T> &dims) {
   return os;
 }
 
 template <typename T>
 class MeanTest : public ::testing::TestWithParam<MeanInputs<T>> {
  protected:
-  void SetUp() override
-  {
+  void SetUp() override {
     params = ::testing::TestWithParam<MeanInputs<T>>::GetParam();
     raft::random::Rng r(params.seed);
 
@@ -61,15 +59,13 @@ class MeanTest : public ::testing::TestWithParam<MeanInputs<T>> {
     meanSGtest(data, stream);
   }
 
-  void meanSGtest(T* data, cudaStream_t stream)
-  {
+  void meanSGtest(T *data, cudaStream_t stream) {
     int rows = params.rows, cols = params.cols;
 
     mean(mean_act, data, cols, rows, params.sample, params.rowMajor, stream);
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(data));
     CUDA_CHECK(cudaFree(mean_act));
   }
@@ -82,52 +78,52 @@ class MeanTest : public ::testing::TestWithParam<MeanInputs<T>> {
 // Note: For 1024 samples, 256 experiments, a mean of 1.0 with stddev=1.0, the
 // measured mean (of a normal distribution) will fall outside of an epsilon of
 // 0.15 only 4/10000 times. (epsilon of 0.1 will fail 30/100 times)
-const std::vector<MeanInputs<float>> inputsf = {{0.15f, 1.f, 1024, 32, true, false, 1234ULL},
-                                                {0.15f, 1.f, 1024, 64, true, false, 1234ULL},
-                                                {0.15f, 1.f, 1024, 128, true, false, 1234ULL},
-                                                {0.15f, 1.f, 1024, 256, true, false, 1234ULL},
-                                                {0.15f, -1.f, 1024, 32, false, false, 1234ULL},
-                                                {0.15f, -1.f, 1024, 64, false, false, 1234ULL},
-                                                {0.15f, -1.f, 1024, 128, false, false, 1234ULL},
-                                                {0.15f, -1.f, 1024, 256, false, false, 1234ULL},
-                                                {0.15f, 1.f, 1024, 32, true, true, 1234ULL},
-                                                {0.15f, 1.f, 1024, 64, true, true, 1234ULL},
-                                                {0.15f, 1.f, 1024, 128, true, true, 1234ULL},
-                                                {0.15f, 1.f, 1024, 256, true, true, 1234ULL},
-                                                {0.15f, -1.f, 1024, 32, false, true, 1234ULL},
-                                                {0.15f, -1.f, 1024, 64, false, true, 1234ULL},
-                                                {0.15f, -1.f, 1024, 128, false, true, 1234ULL},
-                                                {0.15f, -1.f, 1024, 256, false, true, 1234ULL}};
-
-const std::vector<MeanInputs<double>> inputsd = {{0.15, 1.0, 1024, 32, true, false, 1234ULL},
-                                                 {0.15, 1.0, 1024, 64, true, false, 1234ULL},
-                                                 {0.15, 1.0, 1024, 128, true, false, 1234ULL},
-                                                 {0.15, 1.0, 1024, 256, true, false, 1234ULL},
-                                                 {0.15, -1.0, 1024, 32, false, false, 1234ULL},
-                                                 {0.15, -1.0, 1024, 64, false, false, 1234ULL},
-                                                 {0.15, -1.0, 1024, 128, false, false, 1234ULL},
-                                                 {0.15, -1.0, 1024, 256, false, false, 1234ULL},
-                                                 {0.15, 1.0, 1024, 32, true, true, 1234ULL},
-                                                 {0.15, 1.0, 1024, 64, true, true, 1234ULL},
-                                                 {0.15, 1.0, 1024, 128, true, true, 1234ULL},
-                                                 {0.15, 1.0, 1024, 256, true, true, 1234ULL},
-                                                 {0.15, -1.0, 1024, 32, false, true, 1234ULL},
-                                                 {0.15, -1.0, 1024, 64, false, true, 1234ULL},
-                                                 {0.15, -1.0, 1024, 128, false, true, 1234ULL},
-                                                 {0.15, -1.0, 1024, 256, false, true, 1234ULL}};
+const std::vector<MeanInputs<float>> inputsf = {
+  {0.15f, 1.f, 1024, 32, true, false, 1234ULL},
+  {0.15f, 1.f, 1024, 64, true, false, 1234ULL},
+  {0.15f, 1.f, 1024, 128, true, false, 1234ULL},
+  {0.15f, 1.f, 1024, 256, true, false, 1234ULL},
+  {0.15f, -1.f, 1024, 32, false, false, 1234ULL},
+  {0.15f, -1.f, 1024, 64, false, false, 1234ULL},
+  {0.15f, -1.f, 1024, 128, false, false, 1234ULL},
+  {0.15f, -1.f, 1024, 256, false, false, 1234ULL},
+  {0.15f, 1.f, 1024, 32, true, true, 1234ULL},
+  {0.15f, 1.f, 1024, 64, true, true, 1234ULL},
+  {0.15f, 1.f, 1024, 128, true, true, 1234ULL},
+  {0.15f, 1.f, 1024, 256, true, true, 1234ULL},
+  {0.15f, -1.f, 1024, 32, false, true, 1234ULL},
+  {0.15f, -1.f, 1024, 64, false, true, 1234ULL},
+  {0.15f, -1.f, 1024, 128, false, true, 1234ULL},
+  {0.15f, -1.f, 1024, 256, false, true, 1234ULL}};
+
+const std::vector<MeanInputs<double>> inputsd = {
+  {0.15, 1.0, 1024, 32, true, false, 1234ULL},
+  {0.15, 1.0, 1024, 64, true, false, 1234ULL},
+  {0.15, 1.0, 1024, 128, true, false, 1234ULL},
+  {0.15, 1.0, 1024, 256, true, false, 1234ULL},
+  {0.15, -1.0, 1024, 32, false, false, 1234ULL},
+  {0.15, -1.0, 1024, 64, false, false, 1234ULL},
+  {0.15, -1.0, 1024, 128, false, false, 1234ULL},
+  {0.15, -1.0, 1024, 256, false, false, 1234ULL},
+  {0.15, 1.0, 1024, 32, true, true, 1234ULL},
+  {0.15, 1.0, 1024, 64, true, true, 1234ULL},
+  {0.15, 1.0, 1024, 128, true, true, 1234ULL},
+  {0.15, 1.0, 1024, 256, true, true, 1234ULL},
+  {0.15, -1.0, 1024, 32, false, true, 1234ULL},
+  {0.15, -1.0, 1024, 64, false, true, 1234ULL},
+  {0.15, -1.0, 1024, 128, false, true, 1234ULL},
+  {0.15, -1.0, 1024, 256, false, true, 1234ULL}};
 
 typedef MeanTest<float> MeanTestF;
-TEST_P(MeanTestF, Result)
-{
-  ASSERT_TRUE(
-    devArrMatch(params.mean, mean_act, params.cols, CompareApprox<float>(params.tolerance)));
+TEST_P(MeanTestF, Result) {
+  ASSERT_TRUE(devArrMatch(params.mean, mean_act, params.cols,
+                          CompareApprox<float>(params.tolerance)));
 }
 
 typedef MeanTest<double> MeanTestD;
-TEST_P(MeanTestD, Result)
-{
-  ASSERT_TRUE(
-    devArrMatch(params.mean, mean_act, params.cols, CompareApprox<double>(params.tolerance)));
+TEST_P(MeanTestD, Result) {
+  ASSERT_TRUE(devArrMatch(params.mean, mean_act, params.cols,
+                          CompareApprox<double>(params.tolerance)));
 }
 
 INSTANTIATE_TEST_SUITE_P(MeanTests, MeanTestF, ::testing::ValuesIn(inputsf));
diff --git a/cpp/test/stats/mean_center.cu b/cpp/test/stats/mean_center.cu
index 67df0def05..8b0d607561 100644
--- a/cpp/test/stats/mean_center.cu
+++ b/cpp/test/stats/mean_center.cu
@@ -34,16 +34,16 @@ struct MeanCenterInputs {
 };
 
 template <typename T, typename IdxType>
-::std::ostream& operator<<(::std::ostream& os, const MeanCenterInputs<T, IdxType>& dims)
-{
+::std::ostream &operator<<(::std::ostream &os,
+                           const MeanCenterInputs<T, IdxType> &dims) {
   return os;
 }
 
 template <typename T, typename IdxType>
-class MeanCenterTest : public ::testing::TestWithParam<MeanCenterInputs<T, IdxType>> {
+class MeanCenterTest
+  : public ::testing::TestWithParam<MeanCenterInputs<T, IdxType>> {
  protected:
-  void SetUp() override
-  {
+  void SetUp() override {
     params = ::testing::TestWithParam<MeanCenterInputs<T, IdxType>>::GetParam();
     raft::random::Rng r(params.seed);
 
@@ -51,7 +51,7 @@ class MeanCenterTest : public ::testing::TestWithParam<MeanCenterInputs<T, IdxTy
     CUDA_CHECK(cudaStreamCreate(&stream));
 
     auto rows = params.rows, cols = params.cols;
-    auto len       = rows * cols;
+    auto len = rows * cols;
     IdxType vecLen = params.bcastAlongRows ? cols : rows;
 
     raft::allocate(out, len);
@@ -59,15 +59,16 @@ class MeanCenterTest : public ::testing::TestWithParam<MeanCenterInputs<T, IdxTy
     raft::allocate(data, len);
     raft::allocate(meanVec, vecLen);
     r.normal(data, len, params.mean, (T)1.0, stream);
-    raft::stats::mean(meanVec, data, cols, rows, params.sample, params.rowMajor, stream);
-    meanCenter(out, data, meanVec, cols, rows, params.rowMajor, params.bcastAlongRows, stream);
-    raft::linalg::naiveMatVec(
-      out_ref, data, meanVec, cols, rows, params.rowMajor, params.bcastAlongRows, (T)-1.0);
+    raft::stats::mean(meanVec, data, cols, rows, params.sample, params.rowMajor,
+                      stream);
+    meanCenter(out, data, meanVec, cols, rows, params.rowMajor,
+               params.bcastAlongRows, stream);
+    raft::linalg::naiveMatVec(out_ref, data, meanVec, cols, rows,
+                              params.rowMajor, params.bcastAlongRows, (T)-1.0);
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(out));
     CUDA_CHECK(cudaFree(out_ref));
     CUDA_CHECK(cudaFree(data));
@@ -105,11 +106,12 @@ const std::vector<MeanCenterInputs<float, int>> inputsf_i32 = {
   {0.05f, -1.f, 1024, 64, false, true, false, 1234ULL},
   {0.05f, -1.f, 1024, 128, false, true, false, 1234ULL}};
 typedef MeanCenterTest<float, int> MeanCenterTestF_i32;
-TEST_P(MeanCenterTestF_i32, Result)
-{
-  ASSERT_TRUE(devArrMatch(out, out_ref, params.cols, raft::CompareApprox<float>(params.tolerance)));
+TEST_P(MeanCenterTestF_i32, Result) {
+  ASSERT_TRUE(devArrMatch(out, out_ref, params.cols,
+                          raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i32, ::testing::ValuesIn(inputsf_i32));
+INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i32,
+                         ::testing::ValuesIn(inputsf_i32));
 
 const std::vector<MeanCenterInputs<float, size_t>> inputsf_i64 = {
   {0.05f, 1.f, 1024, 32, true, false, true, 1234ULL},
@@ -137,11 +139,12 @@ const std::vector<MeanCenterInputs<float, size_t>> inputsf_i64 = {
   {0.05f, -1.f, 1024, 64, false, true, false, 1234ULL},
   {0.05f, -1.f, 1024, 128, false, true, false, 1234ULL}};
 typedef MeanCenterTest<float, size_t> MeanCenterTestF_i64;
-TEST_P(MeanCenterTestF_i64, Result)
-{
-  ASSERT_TRUE(devArrMatch(out, out_ref, params.cols, raft::CompareApprox<float>(params.tolerance)));
+TEST_P(MeanCenterTestF_i64, Result) {
+  ASSERT_TRUE(devArrMatch(out, out_ref, params.cols,
+                          raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i64, ::testing::ValuesIn(inputsf_i64));
+INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i64,
+                         ::testing::ValuesIn(inputsf_i64));
 
 const std::vector<MeanCenterInputs<double, int>> inputsd_i32 = {
   {0.05, 1.0, 1024, 32, true, false, true, 1234ULL},
@@ -169,12 +172,12 @@ const std::vector<MeanCenterInputs<double, int>> inputsd_i32 = {
   {0.05, -1.0, 1024, 64, false, true, false, 1234ULL},
   {0.05, -1.0, 1024, 128, false, true, false, 1234ULL}};
 typedef MeanCenterTest<double, int> MeanCenterTestD_i32;
-TEST_P(MeanCenterTestD_i32, Result)
-{
-  ASSERT_TRUE(
-    devArrMatch(out, out_ref, params.cols, raft::CompareApprox<double>(params.tolerance)));
+TEST_P(MeanCenterTestD_i32, Result) {
+  ASSERT_TRUE(devArrMatch(out, out_ref, params.cols,
+                          raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i32, ::testing::ValuesIn(inputsd_i32));
+INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i32,
+                         ::testing::ValuesIn(inputsd_i32));
 
 const std::vector<MeanCenterInputs<double, size_t>> inputsd_i64 = {
   {0.05, 1.0, 1024, 32, true, false, true, 1234ULL},
@@ -202,12 +205,12 @@ const std::vector<MeanCenterInputs<double, size_t>> inputsd_i64 = {
   {0.05, -1.0, 1024, 64, false, true, false, 1234ULL},
   {0.05, -1.0, 1024, 128, false, true, false, 1234ULL}};
 typedef MeanCenterTest<double, size_t> MeanCenterTestD_i64;
-TEST_P(MeanCenterTestD_i64, Result)
-{
-  ASSERT_TRUE(
-    devArrMatch(out, out_ref, params.cols, raft::CompareApprox<double>(params.tolerance)));
+TEST_P(MeanCenterTestD_i64, Result) {
+  ASSERT_TRUE(devArrMatch(out, out_ref, params.cols,
+                          raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i64, ::testing::ValuesIn(inputsd_i64));
+INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i64,
+                         ::testing::ValuesIn(inputsd_i64));
 
 }  // end namespace stats
 }  // end namespace raft
diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu
index 8b7f75171b..ff2698788f 100644
--- a/cpp/test/stats/stddev.cu
+++ b/cpp/test/stats/stddev.cu
@@ -34,16 +34,14 @@ struct StdDevInputs {
 };
 
 template <typename T>
-::std::ostream& operator<<(::std::ostream& os, const StdDevInputs<T>& dims)
-{
+::std::ostream &operator<<(::std::ostream &os, const StdDevInputs<T> &dims) {
   return os;
 }
 
 template <typename T>
 class StdDevTest : public ::testing::TestWithParam<StdDevInputs<T>> {
  protected:
-  void SetUp() override
-  {
+  void SetUp() override {
     params = ::testing::TestWithParam<StdDevInputs<T>>::GetParam();
     random::Rng r(params.seed);
     int rows = params.rows, cols = params.cols;
@@ -60,21 +58,21 @@ class StdDevTest : public ::testing::TestWithParam<StdDevInputs<T>> {
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void stdVarSGtest(T* data, cudaStream_t stream)
-  {
+  void stdVarSGtest(T *data, cudaStream_t stream) {
     int rows = params.rows, cols = params.cols;
 
     mean(mean_act, data, cols, rows, params.sample, params.rowMajor, stream);
 
-    stddev(stddev_act, data, mean_act, cols, rows, params.sample, params.rowMajor, stream);
+    stddev(stddev_act, data, mean_act, cols, rows, params.sample,
+           params.rowMajor, stream);
 
-    vars(vars_act, data, mean_act, cols, rows, params.sample, params.rowMajor, stream);
+    vars(vars_act, data, mean_act, cols, rows, params.sample, params.rowMajor,
+         stream);
 
     raft::matrix::seqRoot(vars_act, T(1), cols, stream);
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(data));
     CUDA_CHECK(cudaFree(mean_act));
     CUDA_CHECK(cudaFree(stddev_act));
@@ -123,28 +121,28 @@ const std::vector<StdDevInputs<double>> inputsd = {
   {0.1, -1.0, 2.0, 1024, 256, false, true, 1234ULL}};
 
 typedef StdDevTest<float> StdDevTestF;
-TEST_P(StdDevTestF, Result)
-{
-  ASSERT_TRUE(
-    devArrMatch(params.stddev, stddev_act, params.cols, CompareApprox<float>(params.tolerance)));
+TEST_P(StdDevTestF, Result) {
+  ASSERT_TRUE(devArrMatch(params.stddev, stddev_act, params.cols,
+                          CompareApprox<float>(params.tolerance)));
 
-  ASSERT_TRUE(
-    devArrMatch(stddev_act, vars_act, params.cols, CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(stddev_act, vars_act, params.cols,
+                          CompareApprox<float>(params.tolerance)));
 }
 
 typedef StdDevTest<double> StdDevTestD;
-TEST_P(StdDevTestD, Result)
-{
-  ASSERT_TRUE(
-    devArrMatch(params.stddev, stddev_act, params.cols, CompareApprox<double>(params.tolerance)));
+TEST_P(StdDevTestD, Result) {
+  ASSERT_TRUE(devArrMatch(params.stddev, stddev_act, params.cols,
+                          CompareApprox<double>(params.tolerance)));
 
-  ASSERT_TRUE(
-    devArrMatch(stddev_act, vars_act, params.cols, CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(stddev_act, vars_act, params.cols,
+                          CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestF, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestF,
+                         ::testing::ValuesIn(inputsf));
 
-INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestD, ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestD,
+                         ::testing::ValuesIn(inputsd));
 
 }  // end namespace stats
 }  // end namespace raft
diff --git a/cpp/test/stats/sum.cu b/cpp/test/stats/sum.cu
index 89e81708cc..c3140d4588 100644
--- a/cpp/test/stats/sum.cu
+++ b/cpp/test/stats/sum.cu
@@ -32,17 +32,15 @@ struct SumInputs {
 };
 
 template <typename T>
-::std::ostream& operator<<(::std::ostream& os, const SumInputs<T>& dims)
-{
+::std::ostream &operator<<(::std::ostream &os, const SumInputs<T> &dims) {
   return os;
 }
 
 template <typename T>
 class SumTest : public ::testing::TestWithParam<SumInputs<T>> {
  protected:
-  void SetUp() override
-  {
-    params   = ::testing::TestWithParam<SumInputs<T>>::GetParam();
+  void SetUp() override {
+    params = ::testing::TestWithParam<SumInputs<T>>::GetParam();
     int rows = params.rows, cols = params.cols;
     int len = rows * cols;
     cudaStream_t stream;
@@ -61,8 +59,7 @@ class SumTest : public ::testing::TestWithParam<SumInputs<T>> {
     CUDA_CHECK(cudaStreamDestroy(stream));
   }
 
-  void TearDown() override
-  {
+  void TearDown() override {
     CUDA_CHECK(cudaFree(data));
     CUDA_CHECK(cudaFree(sum_act));
   }
@@ -79,17 +76,15 @@ const std::vector<SumInputs<double>> inputsd = {{0.05, 1024, 32, 1234ULL},
                                                 {0.05, 1024, 256, 1234ULL}};
 
 typedef SumTest<float> SumTestF;
-TEST_P(SumTestF, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(
-    float(params.rows), sum_act, params.cols, raft::CompareApprox<float>(params.tolerance)));
+TEST_P(SumTestF, Result) {
+  ASSERT_TRUE(raft::devArrMatch(float(params.rows), sum_act, params.cols,
+                                raft::CompareApprox<float>(params.tolerance)));
 }
 
 typedef SumTest<double> SumTestD;
-TEST_P(SumTestD, Result)
-{
-  ASSERT_TRUE(raft::devArrMatch(
-    double(params.rows), sum_act, params.cols, raft::CompareApprox<double>(params.tolerance)));
+TEST_P(SumTestD, Result) {
+  ASSERT_TRUE(raft::devArrMatch(double(params.rows), sum_act, params.cols,
+                                raft::CompareApprox<double>(params.tolerance)));
 }
 
 INSTANTIATE_TEST_CASE_P(SumTests, SumTestF, ::testing::ValuesIn(inputsf));
diff --git a/cpp/test/test_utils.h b/cpp/test/test_utils.h
index ca09d9c855..b8e8fe3fa0 100644
--- a/cpp/test/test_utils.h
+++ b/cpp/test/test_utils.h
@@ -25,16 +25,15 @@ namespace raft {
 
 template <typename T>
 struct Compare {
-  bool operator()(const T& a, const T& b) const { return a == b; }
+  bool operator()(const T &a, const T &b) const { return a == b; }
 };
 
 template <typename T>
 struct CompareApprox {
   CompareApprox(T eps_) : eps(eps_) {}
-  bool operator()(const T& a, const T& b) const
-  {
-    T diff  = abs(a - b);
-    T m     = std::max(abs(a), abs(b));
+  bool operator()(const T &a, const T &b) const {
+    T diff = abs(a - b);
+    T m = std::max(abs(a), abs(b));
     T ratio = diff >= eps ? diff / m : diff;
 
     return (ratio <= eps);
@@ -47,10 +46,9 @@ struct CompareApprox {
 template <typename T>
 struct CompareApproxAbs {
   CompareApproxAbs(T eps_) : eps(eps_) {}
-  bool operator()(const T& a, const T& b) const
-  {
-    T diff  = abs(abs(a) - abs(b));
-    T m     = std::max(abs(a), abs(b));
+  bool operator()(const T &a, const T &b) const {
+    T diff = abs(abs(a) - abs(b));
+    T m = std::max(abs(a), abs(b));
     T ratio = diff >= eps ? diff / m : diff;
     return (ratio <= eps);
   }
@@ -60,26 +58,25 @@ struct CompareApproxAbs {
 };
 
 template <typename T>
-T abs(const T& a)
-{
+T abs(const T &a) {
   return a > T(0) ? a : -a;
 }
 
 /*
- * @brief Helper function to compare 2 device n-D arrays with custom comparison
- * @tparam T the data type of the arrays
- * @tparam L the comparator lambda or object function
- * @param expected expected value(s)
- * @param actual actual values
- * @param eq_compare the comparator
- * @param stream cuda stream
- * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
- * @{
- */
+     * @brief Helper function to compare 2 device n-D arrays with custom comparison
+     * @tparam T the data type of the arrays
+     * @tparam L the comparator lambda or object function
+     * @param expected expected value(s)
+     * @param actual actual values
+     * @param eq_compare the comparator
+     * @param stream cuda stream
+     * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
+     * @{
+     */
 template <typename T, typename L>
-testing::AssertionResult devArrMatch(
-  const T* expected, const T* actual, size_t size, L eq_compare, cudaStream_t stream = 0)
-{
+testing::AssertionResult devArrMatch(const T *expected, const T *actual,
+                                     size_t size, L eq_compare,
+                                     cudaStream_t stream = 0) {
   std::unique_ptr<T[]> exp_h(new T[size]);
   std::unique_ptr<T[]> act_h(new T[size]);
   raft::update_host<T>(exp_h.get(), expected, size, stream);
@@ -89,16 +86,16 @@ testing::AssertionResult devArrMatch(
     auto exp = exp_h.get()[i];
     auto act = act_h.get()[i];
     if (!eq_compare(exp, act)) {
-      return testing::AssertionFailure() << "actual=" << act << " != expected=" << exp << " @" << i;
+      return testing::AssertionFailure()
+             << "actual=" << act << " != expected=" << exp << " @" << i;
     }
   }
   return testing::AssertionSuccess();
 }
 
 template <typename T, typename L>
-testing::AssertionResult devArrMatch(
-  T expected, const T* actual, size_t size, L eq_compare, cudaStream_t stream = 0)
-{
+testing::AssertionResult devArrMatch(T expected, const T *actual, size_t size,
+                                     L eq_compare, cudaStream_t stream = 0) {
   std::unique_ptr<T[]> act_h(new T[size]);
   raft::update_host<T>(act_h.get(), actual, size, stream);
   CUDA_CHECK(cudaStreamSynchronize(stream));
@@ -113,13 +110,9 @@ testing::AssertionResult devArrMatch(
 }
 
 template <typename T, typename L>
-testing::AssertionResult devArrMatch(const T* expected,
-                                     const T* actual,
-                                     size_t rows,
-                                     size_t cols,
-                                     L eq_compare,
-                                     cudaStream_t stream = 0)
-{
+testing::AssertionResult devArrMatch(const T *expected, const T *actual,
+                                     size_t rows, size_t cols, L eq_compare,
+                                     cudaStream_t stream = 0) {
   size_t size = rows * cols;
   std::unique_ptr<T[]> exp_h(new T[size]);
   std::unique_ptr<T[]> act_h(new T[size]);
@@ -133,7 +126,8 @@ testing::AssertionResult devArrMatch(const T* expected,
       auto act = act_h.get()[idx];
       if (!eq_compare(exp, act)) {
         return testing::AssertionFailure()
-               << "actual=" << act << " != expected=" << exp << " @" << i << "," << j;
+               << "actual=" << act << " != expected=" << exp << " @" << i << ","
+               << j;
       }
     }
   }
@@ -141,9 +135,9 @@ testing::AssertionResult devArrMatch(const T* expected,
 }
 
 template <typename T, typename L>
-testing::AssertionResult devArrMatch(
-  T expected, const T* actual, size_t rows, size_t cols, L eq_compare, cudaStream_t stream = 0)
-{
+testing::AssertionResult devArrMatch(T expected, const T *actual, size_t rows,
+                                     size_t cols, L eq_compare,
+                                     cudaStream_t stream = 0) {
   size_t size = rows * cols;
   std::unique_ptr<T[]> act_h(new T[size]);
   raft::update_host<T>(act_h.get(), actual, size, stream);
@@ -154,7 +148,8 @@ testing::AssertionResult devArrMatch(
       auto act = act_h.get()[idx];
       if (!eq_compare(expected, act)) {
         return testing::AssertionFailure()
-               << "actual=" << act << " != expected=" << expected << " @" << i << "," << j;
+               << "actual=" << act << " != expected=" << expected << " @" << i
+               << "," << j;
       }
     }
   }
@@ -162,24 +157,24 @@ testing::AssertionResult devArrMatch(
 }
 
 /*
- * @brief Helper function to compare a device n-D arrays with an expected array
- * on the host, using a custom comparison
- * @tparam T the data type of the arrays
- * @tparam L the comparator lambda or object function
- * @param expected_h host array of expected value(s)
- * @param actual_d device array actual values
- * @param eq_compare the comparator
- * @param stream cuda stream
- * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
- */
+     * @brief Helper function to compare a device n-D arrays with an expected array
+     * on the host, using a custom comparison
+     * @tparam T the data type of the arrays
+     * @tparam L the comparator lambda or object function
+     * @param expected_h host array of expected value(s)
+     * @param actual_d device array actual values
+     * @param eq_compare the comparator
+     * @param stream cuda stream
+     * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
+     */
 template <typename T, typename L>
-testing::AssertionResult devArrMatchHost(
-  const T* expected_h, const T* actual_d, size_t size, L eq_compare, cudaStream_t stream = 0)
-{
+testing::AssertionResult devArrMatchHost(const T *expected_h, const T *actual_d,
+                                         size_t size, L eq_compare,
+                                         cudaStream_t stream = 0) {
   std::unique_ptr<T[]> act_h(new T[size]);
   raft::update_host<T>(act_h.get(), actual_d, size, stream);
   CUDA_CHECK(cudaStreamSynchronize(stream));
-  bool ok   = true;
+  bool ok = true;
   auto fail = testing::AssertionFailure();
   for (size_t i(0); i < size; ++i) {
     auto exp = expected_h[i];
@@ -194,19 +189,19 @@ testing::AssertionResult devArrMatchHost(
 }
 
 /*
- * @brief Helper function to compare diagonal values of a 2D matrix
- * @tparam T the data type of the arrays
- * @tparam L the comparator lambda or object function
- * @param expected expected value along diagonal
- * @param actual actual matrix
- * @param eq_compare the comparator
- * @param stream cuda stream
- * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
- */
+     * @brief Helper function to compare diagonal values of a 2D matrix
+     * @tparam T the data type of the arrays
+     * @tparam L the comparator lambda or object function
+     * @param expected expected value along diagonal
+     * @param actual actual matrix
+     * @param eq_compare the comparator
+     * @param stream cuda stream
+     * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
+     */
 template <typename T, typename L>
-testing::AssertionResult diagonalMatch(
-  T expected, const T* actual, size_t rows, size_t cols, L eq_compare, cudaStream_t stream = 0)
-{
+testing::AssertionResult diagonalMatch(T expected, const T *actual, size_t rows,
+                                       size_t cols, L eq_compare,
+                                       cudaStream_t stream = 0) {
   size_t size = rows * cols;
   std::unique_ptr<T[]> act_h(new T[size]);
   raft::update_host<T>(act_h.get(), actual, size, stream);
@@ -218,7 +213,8 @@ testing::AssertionResult diagonalMatch(
       auto act = act_h.get()[idx];
       if (!eq_compare(expected, act)) {
         return testing::AssertionFailure()
-               << "actual=" << act << " != expected=" << expected << " @" << i << "," << j;
+               << "actual=" << act << " != expected=" << expected << " @" << i
+               << "," << j;
       }
     }
   }
@@ -226,10 +222,10 @@ testing::AssertionResult diagonalMatch(
 }
 
 template <typename T, typename L>
-testing::AssertionResult match(const T expected, T actual, L eq_compare)
-{
+testing::AssertionResult match(const T expected, T actual, L eq_compare) {
   if (!eq_compare(expected, actual)) {
-    return testing::AssertionFailure() << "actual=" << actual << " != expected=" << expected;
+    return testing::AssertionFailure()
+           << "actual=" << actual << " != expected=" << expected;
   }
   return testing::AssertionSuccess();
 }

From fc7eba1c87363081c3060344e9f6949659ccb896 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Wed, 24 Nov 2021 18:01:20 -0500
Subject: [PATCH 4/5] Formatting changes

---
 cpp/include/raft.hpp                          |    3 +-
 cpp/include/raft/cache/cache_util.cuh         |  105 +-
 cpp/include/raft/common/cub_wrappers.cuh      |   42 +-
 .../raft/common/device_loads_stores.cuh       |   87 +-
 cpp/include/raft/common/scatter.cuh           |   77 +-
 cpp/include/raft/comms/comms.hpp              |  359 ++--
 cpp/include/raft/comms/helper.hpp             |   35 +-
 cpp/include/raft/comms/mpi_comms.hpp          |  314 ++--
 cpp/include/raft/comms/std_comms.hpp          |  327 ++--
 cpp/include/raft/comms/test.hpp               |  239 ++-
 cpp/include/raft/comms/ucp_helper.hpp         |  138 +-
 cpp/include/raft/comms/util.hpp               |  114 +-
 cpp/include/raft/cuda_utils.cuh               |  259 ++-
 cpp/include/raft/cudart_utils.h               |  238 +--
 cpp/include/raft/device_atomics.cuh           |  285 ++-
 cpp/include/raft/distance/detail/canberra.cuh |  136 +-
 .../raft/distance/detail/chebyshev.cuh        |  136 +-
 .../raft/distance/detail/correlation.cuh      |  255 ++-
 cpp/include/raft/distance/detail/cosine.cuh   |  175 +-
 cpp/include/raft/distance/detail/distance.cuh |  698 +++++---
 .../raft/distance/detail/euclidean.cuh        |  314 ++--
 .../raft/distance/detail/fused_l2_nn.cuh      |  230 ++-
 cpp/include/raft/distance/detail/hamming.cuh  |  155 +-
 .../raft/distance/detail/hellinger.cuh        |  154 +-
 .../raft/distance/detail/jensen_shannon.cuh   |  155 +-
 .../raft/distance/detail/kl_divergence.cuh    |  253 ++-
 cpp/include/raft/distance/detail/l1.cuh       |  128 +-
 .../raft/distance/detail/minkowski.cuh        |  139 +-
 .../detail/pairwise_distance_base.cuh         |  169 +-
 .../raft/distance/detail/russell_rao.cuh      |  137 +-
 cpp/include/raft/distance/distance.hpp        |  369 ++--
 cpp/include/raft/distance/fused_l2_nn.hpp     |   41 +-
 cpp/include/raft/error.hpp                    |   50 +-
 cpp/include/raft/handle.hpp                   |  120 +-
 cpp/include/raft/integer_utils.h              |   55 +-
 cpp/include/raft/label/classlabels.cuh        |  118 +-
 cpp/include/raft/label/merge_labels.cuh       |   31 +-
 cpp/include/raft/lap/d_structs.h              |   20 +-
 cpp/include/raft/lap/lap.cuh                  |  158 +-
 cpp/include/raft/lap/lap_functions.cuh        |  366 ++--
 cpp/include/raft/lap/lap_kernels.cuh          |  343 ++--
 cpp/include/raft/linalg/add.cuh               |   35 +-
 cpp/include/raft/linalg/binary_op.cuh         |   61 +-
 .../raft/linalg/cholesky_r1_update.cuh        |   63 +-
 .../raft/linalg/coalesced_reduction.cuh       |   55 +-
 cpp/include/raft/linalg/contractions.cuh      |   79 +-
 cpp/include/raft/linalg/cublas_wrappers.h     |  937 +++++++---
 cpp/include/raft/linalg/cusolver_wrappers.h   | 1317 ++++++++++----
 cpp/include/raft/linalg/divide.cuh            |    7 +-
 cpp/include/raft/linalg/eig.cuh               |  214 ++-
 cpp/include/raft/linalg/eltwise.cuh           |   56 +-
 cpp/include/raft/linalg/gemm.cuh              |   85 +-
 cpp/include/raft/linalg/gemv.h                |   88 +-
 cpp/include/raft/linalg/init.h                |    6 +-
 cpp/include/raft/linalg/lanczos.hpp           |  786 +++++---
 cpp/include/raft/linalg/map.cuh               |   31 +-
 cpp/include/raft/linalg/map_then_reduce.cuh   |   92 +-
 cpp/include/raft/linalg/matrix_vector_op.cuh  |  108 +-
 .../raft/linalg/mean_squared_error.cuh        |   10 +-
 cpp/include/raft/linalg/multiply.cuh          |    7 +-
 cpp/include/raft/linalg/norm.cuh              |   92 +-
 cpp/include/raft/linalg/qr.cuh                |   83 +-
 cpp/include/raft/linalg/reduce.cuh            |   37 +-
 cpp/include/raft/linalg/strided_reduction.cuh |   74 +-
 cpp/include/raft/linalg/subtract.cuh          |   34 +-
 cpp/include/raft/linalg/svd.cuh               |  227 ++-
 cpp/include/raft/linalg/transpose.h           |   60 +-
 cpp/include/raft/linalg/unary_op.cuh          |   86 +-
 cpp/include/raft/matrix/detail/math.cuh       |   35 +-
 cpp/include/raft/matrix/detail/matrix.cuh     |   86 +-
 cpp/include/raft/matrix/math.hpp              |  255 ++-
 cpp/include/raft/matrix/matrix.hpp            |  160 +-
 cpp/include/raft/mr/buffer_base.hpp           |   59 +-
 cpp/include/raft/mr/device/allocator.hpp      |    9 +-
 cpp/include/raft/mr/device/buffer.hpp         |   14 +-
 cpp/include/raft/mr/host/allocator.hpp        |   13 +-
 cpp/include/raft/mr/host/buffer.hpp           |   21 +-
 cpp/include/raft/pow2_utils.cuh               |   47 +-
 cpp/include/raft/random/detail/rng_impl.cuh   |  432 +++--
 cpp/include/raft/random/rng.hpp               |  101 +-
 cpp/include/raft/sparse/convert/coo.cuh       |   20 +-
 cpp/include/raft/sparse/convert/csr.cuh       |  114 +-
 cpp/include/raft/sparse/convert/dense.cuh     |   35 +-
 cpp/include/raft/sparse/coo.cuh               |  192 +-
 cpp/include/raft/sparse/csr.cuh               |  129 +-
 cpp/include/raft/sparse/cusparse_wrappers.h   | 1590 ++++++++++++-----
 cpp/include/raft/sparse/distance/common.h     |   18 +-
 .../sparse/distance/detail/bin_distance.cuh   |  187 +-
 .../raft/sparse/distance/detail/coo_spmv.cuh  |  118 +-
 .../distance/detail/coo_spmv_kernel.cuh       |  196 +-
 .../coo_spmv_strategies/base_strategy.cuh     |  138 +-
 .../coo_mask_row_iterators.cuh                |  166 +-
 .../dense_smem_strategy.cuh                   |  104 +-
 .../coo_spmv_strategies/hash_strategy.cuh     |  277 +--
 .../sparse/distance/detail/ip_distance.cuh    |   39 +-
 .../sparse/distance/detail/l2_distance.cuh    |  384 ++--
 .../sparse/distance/detail/lp_distance.cuh    |  199 ++-
 .../raft/sparse/distance/detail/operators.cuh |   29 +-
 .../raft/sparse/distance/detail/utils.cuh     |    6 +-
 cpp/include/raft/sparse/distance/distance.hpp |   65 +-
 cpp/include/raft/sparse/hierarchy/common.h    |   10 +-
 .../sparse/hierarchy/detail/agglomerative.cuh |  128 +-
 .../hierarchy/detail/connectivities.cuh       |   83 +-
 .../raft/sparse/hierarchy/detail/mst.cuh      |   84 +-
 .../raft/sparse/hierarchy/single_linkage.hpp  |   62 +-
 cpp/include/raft/sparse/linalg/add.cuh        |  116 +-
 cpp/include/raft/sparse/linalg/degree.cuh     |   56 +-
 cpp/include/raft/sparse/linalg/norm.cuh       |   51 +-
 cpp/include/raft/sparse/linalg/spectral.cuh   |   65 +-
 cpp/include/raft/sparse/linalg/symmetrize.cuh |  154 +-
 cpp/include/raft/sparse/linalg/transpose.h    |   60 +-
 .../raft/sparse/mst/detail/mst_kernels.cuh    |  160 +-
 .../raft/sparse/mst/detail/mst_solver_inl.cuh |  258 +--
 cpp/include/raft/sparse/mst/detail/utils.cuh  |   19 +-
 cpp/include/raft/sparse/mst/mst.cuh           |   34 +-
 cpp/include/raft/sparse/mst/mst_solver.cuh    |   48 +-
 cpp/include/raft/sparse/op/filter.cuh         |  105 +-
 cpp/include/raft/sparse/op/reduce.cuh         |   54 +-
 cpp/include/raft/sparse/op/row_op.cuh         |   16 +-
 cpp/include/raft/sparse/op/slice.h            |   34 +-
 cpp/include/raft/sparse/op/sort.h             |   21 +-
 .../sparse/selection/connect_components.cuh   |  214 ++-
 cpp/include/raft/sparse/selection/knn.cuh     |  441 +++--
 .../raft/sparse/selection/knn_graph.cuh       |   52 +-
 cpp/include/raft/sparse/utils.h               |   22 +-
 cpp/include/raft/spatial/knn/ann.hpp          |   31 +-
 cpp/include/raft/spatial/knn/ann_common.h     |   10 +-
 cpp/include/raft/spatial/knn/ball_cover.hpp   |   82 +-
 .../raft/spatial/knn/ball_cover_common.h      |   37 +-
 .../knn/detail/ann_quantized_faiss.cuh        |  130 +-
 .../raft/spatial/knn/detail/ball_cover.cuh    |  322 ++--
 .../spatial/knn/detail/ball_cover/common.cuh  |   26 +-
 .../knn/detail/ball_cover/registers.cuh       |  613 +++++--
 .../spatial/knn/detail/block_select_faiss.cuh |   80 +-
 .../raft/spatial/knn/detail/common_faiss.h    |   37 +-
 .../raft/spatial/knn/detail/fused_l2_knn.cuh  |  802 ++++++---
 .../spatial/knn/detail/haversine_distance.cuh |   56 +-
 .../knn/detail/knn_brute_force_faiss.cuh      |  188 +-
 .../raft/spatial/knn/detail/processing.hpp    |  121 +-
 .../spatial/knn/detail/selection_faiss.cuh    |   99 +-
 .../spatial/knn/detail/warp_select_faiss.cuh  |  276 +--
 cpp/include/raft/spatial/knn/knn.hpp          |   75 +-
 cpp/include/raft/spectral/cluster_solvers.hpp |   39 +-
 cpp/include/raft/spectral/eigen_solvers.hpp   |   66 +-
 cpp/include/raft/spectral/kmeans.hpp          |  402 +++--
 cpp/include/raft/spectral/lapack.hpp          |  552 ++++--
 cpp/include/raft/spectral/matrix_wrappers.hpp |  260 +--
 .../raft/spectral/modularity_maximization.hpp |   44 +-
 cpp/include/raft/spectral/partition.hpp       |   53 +-
 cpp/include/raft/spectral/spectral_util.hpp   |  118 +-
 cpp/include/raft/spectral/warn_dbg.hpp        |    4 +-
 cpp/include/raft/stats/detail/mean.cuh        |   42 +-
 cpp/include/raft/stats/detail/stddev.cuh      |  136 +-
 cpp/include/raft/stats/detail/sum.cuh         |   38 +-
 cpp/include/raft/stats/mean.hpp               |    5 +-
 cpp/include/raft/stats/mean_center.hpp        |   45 +-
 cpp/include/raft/stats/stddev.hpp             |   22 +-
 cpp/include/raft/stats/sum.hpp                |    4 +-
 cpp/include/raft/vectorized.cuh               |  128 +-
 cpp/test/cluster_solvers.cu                   |   16 +-
 cpp/test/cudart_utils.cpp                     |    3 +-
 cpp/test/distance/dist_adj.cu                 |   94 +-
 cpp/test/distance/dist_canberra.cu            |   24 +-
 cpp/test/distance/dist_chebyshev.cu           |   24 +-
 cpp/test/distance/dist_correlation.cu         |   24 +-
 cpp/test/distance/dist_cos.cu                 |   25 +-
 cpp/test/distance/dist_euc_exp.cu             |   24 +-
 cpp/test/distance/dist_euc_unexp.cu           |   20 +-
 cpp/test/distance/dist_hamming.cu             |   24 +-
 cpp/test/distance/dist_hellinger.cu           |   24 +-
 cpp/test/distance/dist_jensen_shannon.cu      |   20 +-
 cpp/test/distance/dist_kl_divergence.cu       |   20 +-
 cpp/test/distance/dist_l1.cu                  |   24 +-
 cpp/test/distance/dist_minkowski.cu           |   23 +-
 cpp/test/distance/dist_russell_rao.cu         |   24 +-
 cpp/test/distance/distance_base.cuh           |  311 ++--
 cpp/test/distance/fused_l2_nn.cu              |  208 ++-
 cpp/test/eigen_solvers.cu                     |   31 +-
 cpp/test/handle.cpp                           |   18 +-
 cpp/test/integer_utils.cpp                    |    6 +-
 cpp/test/label/label.cu                       |   26 +-
 cpp/test/label/merge_labels.cu                |   67 +-
 cpp/test/lap/lap.cu                           |   93 +-
 cpp/test/linalg/add.cu                        |   14 +-
 cpp/test/linalg/add.cuh                       |   17 +-
 cpp/test/linalg/binary_op.cu                  |   94 +-
 cpp/test/linalg/binary_op.cuh                 |   17 +-
 cpp/test/linalg/cholesky_r1.cu                |   50 +-
 cpp/test/linalg/coalesced_reduction.cu        |   64 +-
 cpp/test/linalg/divide.cu                     |   53 +-
 cpp/test/linalg/eig.cu                        |  206 ++-
 cpp/test/linalg/eig_sel.cu                    |  100 +-
 cpp/test/linalg/eltwise.cu                    |  104 +-
 cpp/test/linalg/gemm_layout.cu                |   63 +-
 cpp/test/linalg/gemv.cu                       |   76 +-
 cpp/test/linalg/map.cu                        |  108 +-
 cpp/test/linalg/map_then_reduce.cu            |  101 +-
 cpp/test/linalg/matrix_vector_op.cu           |  128 +-
 cpp/test/linalg/matrix_vector_op.cuh          |   73 +-
 cpp/test/linalg/multiply.cu                   |   33 +-
 cpp/test/linalg/norm.cu                       |  150 +-
 cpp/test/linalg/reduce.cu                     |   86 +-
 cpp/test/linalg/reduce.cuh                    |   51 +-
 cpp/test/linalg/strided_reduction.cu          |   57 +-
 cpp/test/linalg/subtract.cu                   |   75 +-
 cpp/test/linalg/svd.cu                        |  120 +-
 cpp/test/linalg/transpose.cu                  |   63 +-
 cpp/test/linalg/unary_op.cu                   |   47 +-
 cpp/test/linalg/unary_op.cuh                  |   17 +-
 cpp/test/matrix/math.cu                       |  213 +--
 cpp/test/matrix/matrix.cu                     |   81 +-
 cpp/test/mr/device/buffer.cpp                 |   16 +-
 cpp/test/mr/host/buffer.cpp                   |    9 +-
 cpp/test/mst.cu                               |  182 +-
 cpp/test/pow2_utils.cu                        |   28 +-
 cpp/test/random/rng.cu                        |  210 +--
 cpp/test/random/rng_int.cu                    |   60 +-
 cpp/test/random/sample_without_replacement.cu |   42 +-
 cpp/test/sparse/add.cu                        |  118 +-
 cpp/test/sparse/connect_components.cu         |  593 +++---
 cpp/test/sparse/convert_coo.cu                |   22 +-
 cpp/test/sparse/convert_csr.cu                |   55 +-
 cpp/test/sparse/csr_row_slice.cu              |   77 +-
 cpp/test/sparse/csr_to_dense.cu               |   64 +-
 cpp/test/sparse/csr_transpose.cu              |   70 +-
 cpp/test/sparse/degree.cu                     |   45 +-
 cpp/test/sparse/dist_coo_spmv.cu              |  922 +++++-----
 cpp/test/sparse/distance.cu                   |  244 ++-
 cpp/test/sparse/filter.cu                     |   30 +-
 cpp/test/sparse/knn.cu                        |   78 +-
 cpp/test/sparse/knn_graph.cu                  |   32 +-
 cpp/test/sparse/linkage.cu                    |  632 +++----
 cpp/test/sparse/norm.cu                       |   25 +-
 cpp/test/sparse/reduce.cu                     |   48 +-
 cpp/test/sparse/row_op.cu                     |   43 +-
 cpp/test/sparse/sort.cu                       |   19 +-
 cpp/test/sparse/symmetrize.cu                 |   86 +-
 cpp/test/spatial/ball_cover.cu                |  200 ++-
 cpp/test/spatial/fused_l2_knn.cu              |  108 +-
 cpp/test/spatial/haversine.cu                 |   71 +-
 cpp/test/spatial/knn.cu                       |   82 +-
 cpp/test/spatial/selection.cu                 |   55 +-
 cpp/test/spatial/spatial_data.h               |   31 +-
 cpp/test/spectral_matrix.cu                   |   13 +-
 cpp/test/stats/mean.cu                        |   98 +-
 cpp/test/stats/mean_center.cu                 |   80 +-
 cpp/test/stats/stddev.cu                      |   52 +-
 cpp/test/stats/sum.cu                         |   23 +-
 cpp/test/test_utils.h                         |  143 +-
 249 files changed, 19679 insertions(+), 13246 deletions(-)

diff --git a/cpp/include/raft.hpp b/cpp/include/raft.hpp
index f380d276b2..08f836d3a8 100644
--- a/cpp/include/raft.hpp
+++ b/cpp/include/raft.hpp
@@ -21,7 +21,8 @@ namespace raft {
 /* Function for testing RAFT include
  *
  * @return message indicating RAFT has been included succesfully*/
-inline std::string test_raft() {
+inline std::string test_raft()
+{
   std::string status = "RAFT Setup succesfully";
   return status;
 }
diff --git a/cpp/include/raft/cache/cache_util.cuh b/cpp/include/raft/cache/cache_util.cuh
index a65227c402..dc9327bb94 100644
--- a/cpp/include/raft/cache/cache_util.cuh
+++ b/cpp/include/raft/cache/cache_util.cuh
@@ -42,17 +42,16 @@ namespace cache {
  * @param [out] out vectors collected from the cache, size [n_vec * n]
  */
 template <typename math_t, typename idx_t, typename int_t>
-__global__ void get_vecs(const math_t *cache, int_t n_vec,
-                         const idx_t *cache_idx, int_t n, math_t *out) {
+__global__ void get_vecs(
+  const math_t* cache, int_t n_vec, const idx_t* cache_idx, int_t n, math_t* out)
+{
   int tid = threadIdx.x + blockIdx.x * blockDim.x;
   int row = tid % n_vec;  // row idx
   if (tid < n_vec * n) {
-    size_t out_col = tid / n_vec;  // col idx
+    size_t out_col   = tid / n_vec;  // col idx
     size_t cache_col = cache_idx[out_col];
     if (cache_idx[out_col] >= 0) {
-      if (row + out_col * n_vec < (size_t)n_vec * n) {
-        out[tid] = cache[row + cache_col * n_vec];
-      }
+      if (row + out_col * n_vec < (size_t)n_vec * n) { out[tid] = cache[row + cache_col * n_vec]; }
     }
   }
 }
@@ -84,21 +83,26 @@ __global__ void get_vecs(const math_t *cache, int_t n_vec,
  * @param [in] n_cache_vecs
  */
 template <typename math_t>
-__global__ void store_vecs(const math_t *tile, int n_tile, int n_vec,
-                           const int *tile_idx, int n, const int *cache_idx,
-                           math_t *cache, int n_cache_vecs) {
+__global__ void store_vecs(const math_t* tile,
+                           int n_tile,
+                           int n_vec,
+                           const int* tile_idx,
+                           int n,
+                           const int* cache_idx,
+                           math_t* cache,
+                           int n_cache_vecs)
+{
   int tid = threadIdx.x + blockIdx.x * blockDim.x;
   int row = tid % n_vec;  // row idx
   if (tid < n_vec * n) {
-    int tile_col = tid / n_vec;  // col idx
-    int data_col = tile_idx ? tile_idx[tile_col] : tile_col;
+    int tile_col  = tid / n_vec;  // col idx
+    int data_col  = tile_idx ? tile_idx[tile_col] : tile_col;
     int cache_col = cache_idx[tile_col];
 
     // We ignore negative values. The rest of the checks should be fulfilled
     // if the cache is used properly
     if (cache_col >= 0 && cache_col < n_cache_vecs && data_col < n_tile) {
-      cache[row + (size_t)cache_col * n_vec] =
-        tile[row + (size_t)data_col * n_vec];
+      cache[row + (size_t)cache_col * n_vec] = tile[row + (size_t)data_col * n_vec];
     }
   }
 }
@@ -121,14 +125,15 @@ int DI hash(int key, int n_cache_sets) { return key % n_cache_sets; }
  * @return the index of the first element in the array for which
  * array[idx] >= value. If there is no such value, then return n.
  */
-int DI arg_first_ge(const int *array, int n, int val) {
+int DI arg_first_ge(const int* array, int n, int val)
+{
   int start = 0;
-  int end = n - 1;
+  int end   = n - 1;
   if (array[0] == val) return 0;
   if (array[end] < val) return n;
   while (start + 1 < end) {
     int q = (start + end + 1) / 2;
-    //invariants:
+    // invariants:
     // start < end
     // start < q <=end
     // array[start] < val && array[end] <=val
@@ -157,7 +162,8 @@ int DI arg_first_ge(const int *array, int n, int val) {
  * @return the idx of the k-th occurance of val in array, or -1 if
  * the value is not found.
  */
-int DI find_nth_occurrence(const int *array, int n, int val, int k) {
+int DI find_nth_occurrence(const int* array, int n, int val, int k)
+{
   int q = arg_first_ge(array, n, val);
   if (q + k < n && array[q + k] == val) {
     q += k;
@@ -196,10 +202,10 @@ int DI find_nth_occurrence(const int *array, int n, int val, int k) {
  *   Each block should give a different pointer for rank.
  */
 template <int nthreads, int associativity>
-DI void rank_set_entries(const int *cache_time, int n_cache_sets, int *rank) {
+DI void rank_set_entries(const int* cache_time, int n_cache_sets, int* rank)
+{
   const int items_per_thread = raft::ceildiv(associativity, nthreads);
-  typedef cub::BlockRadixSort<int, nthreads, items_per_thread, int>
-    BlockRadixSort;
+  typedef cub::BlockRadixSort<int, nthreads, items_per_thread, int> BlockRadixSort;
   __shared__ typename BlockRadixSort::TempStorage temp_storage;
 
   int key[items_per_thread];
@@ -208,8 +214,8 @@ DI void rank_set_entries(const int *cache_time, int n_cache_sets, int *rank) {
   int block_offset = blockIdx.x * associativity;
 
   for (int j = 0; j < items_per_thread; j++) {
-    int k = threadIdx.x + j * nthreads;
-    int t = (k < associativity) ? cache_time[block_offset + k] : 32768;
+    int k  = threadIdx.x + j * nthreads;
+    int t  = (k < associativity) ? cache_time[block_offset + k] : 32768;
     key[j] = t;
     val[j] = k;
   }
@@ -217,9 +223,7 @@ DI void rank_set_entries(const int *cache_time, int n_cache_sets, int *rank) {
   BlockRadixSort(temp_storage).Sort(key, val);
 
   for (int j = 0; j < items_per_thread; j++) {
-    if (val[j] < associativity) {
-      rank[val[j]] = threadIdx.x * items_per_thread + j;
-    }
+    if (val[j] < associativity) { rank[val[j]] = threadIdx.x * items_per_thread + j; }
   }
   __syncthreads();
 }
@@ -252,9 +256,15 @@ DI void rank_set_entries(const int *cache_time, int n_cache_sets, int *rank) {
  *   not be cached, size [n]
  */
 template <int nthreads, int associativity>
-__global__ void assign_cache_idx(const int *keys, int n, const int *cache_set,
-                                 int *cached_keys, int n_cache_sets,
-                                 int *cache_time, int time, int *cache_idx) {
+__global__ void assign_cache_idx(const int* keys,
+                                 int n,
+                                 const int* cache_set,
+                                 int* cached_keys,
+                                 int n_cache_sets,
+                                 int* cache_time,
+                                 int time,
+                                 int* cache_idx)
+{
   int block_offset = blockIdx.x * associativity;
 
   const int items_per_thread = raft::ceildiv(associativity, nthreads);
@@ -273,7 +283,7 @@ __global__ void assign_cache_idx(const int *keys, int n, const int *cache_set,
   // these elements are assigned -1.
 
   for (int j = 0; j < items_per_thread; j++) {
-    int i = threadIdx.x + j * nthreads;
+    int i     = threadIdx.x + j * nthreads;
     int t_idx = block_offset + i;
     bool mask = (i < associativity);
     // whether this slot is available for writing
@@ -284,10 +294,10 @@ __global__ void assign_cache_idx(const int *keys, int n, const int *cache_set,
     if (mask) {
       int k = find_nth_occurrence(cache_set, n, blockIdx.x, rank[i]);
       if (k > -1) {
-        int key_val = keys[k];
+        int key_val        = keys[k];
         cached_keys[t_idx] = key_val;
-        cache_idx[k] = t_idx;
-        cache_time[t_idx] = time;
+        cache_idx[k]       = t_idx;
+        cache_time[t_idx]  = time;
       }
     }
   }
@@ -315,21 +325,28 @@ namespace {
  * @param [inout] cached_keys keys stored in the cache, size [n_cache_sets * associativity]
  * @param [in] n_cache_sets number of cache sets
  * @param [in] associativity number of keys in cache set
- * @param [inout] cache_time time stamp when the indices were cached, size [n_cache_sets * associativity]
+ * @param [inout] cache_time time stamp when the indices were cached, size [n_cache_sets *
+ * associativity]
  * @param [out] cache_idx cache indices of the working set elements, size [n]
  * @param [out] is_cached whether the element is cached size[n]
  * @param [in] time iteration counter (used for time stamping)
  */
-__global__ void get_cache_idx(int *keys, int n, int *cached_keys,
-                              int n_cache_sets, int associativity,
-                              int *cache_time, int *cache_idx, bool *is_cached,
-                              int time) {
+__global__ void get_cache_idx(int* keys,
+                              int n,
+                              int* cached_keys,
+                              int n_cache_sets,
+                              int associativity,
+                              int* cache_time,
+                              int* cache_idx,
+                              bool* is_cached,
+                              int time)
+{
   int tid = threadIdx.x + blockIdx.x * blockDim.x;
   if (tid < n) {
-    int widx = keys[tid];
-    int sidx = hash(widx, n_cache_sets);
-    int cidx = sidx * associativity;
-    int i = 0;
+    int widx   = keys[tid];
+    int sidx   = hash(widx, n_cache_sets);
+    int cidx   = sidx * associativity;
+    int i      = 0;
     bool found = false;
     // search for empty spot and the least recently used spot
     while (i < associativity && !found) {
@@ -338,9 +355,9 @@ __global__ void get_cache_idx(int *keys, int n, int *cached_keys,
     }
     is_cached[tid] = found;
     if (found) {
-      cidx = cidx + i - 1;
-      cache_time[cidx] = time;  //update time stamp
-      cache_idx[tid] = cidx;    //exact cache idx
+      cidx             = cidx + i - 1;
+      cache_time[cidx] = time;  // update time stamp
+      cache_idx[tid]   = cidx;  // exact cache idx
     } else {
       cache_idx[tid] = sidx;  // assign cache set
     }
diff --git a/cpp/include/raft/common/cub_wrappers.cuh b/cpp/include/raft/common/cub_wrappers.cuh
index 8e3519fea5..32a46968b6 100644
--- a/cpp/include/raft/common/cub_wrappers.cuh
+++ b/cpp/include/raft/common/cub_wrappers.cuh
@@ -22,28 +22,32 @@
 namespace raft {
 
 /**
-     * @brief Convenience wrapper over cub's SortPairs method
-     * @tparam KeyT key type
-     * @tparam ValueT value type
-     * @param workspace workspace buffer which will get resized if not enough space
-     * @param inKeys input keys array
-     * @param outKeys output keys array
-     * @param inVals input values array
-     * @param outVals output values array
-     * @param len array length
-     * @param stream cuda stream
-     */
+ * @brief Convenience wrapper over cub's SortPairs method
+ * @tparam KeyT key type
+ * @tparam ValueT value type
+ * @param workspace workspace buffer which will get resized if not enough space
+ * @param inKeys input keys array
+ * @param outKeys output keys array
+ * @param inVals input values array
+ * @param outVals output values array
+ * @param len array length
+ * @param stream cuda stream
+ */
 template <typename KeyT, typename ValueT>
-void sortPairs(rmm::device_uvector<char> &workspace, const KeyT *inKeys,
-               KeyT *outKeys, const ValueT *inVals, ValueT *outVals, int len,
-               cudaStream_t stream) {
+void sortPairs(rmm::device_uvector<char>& workspace,
+               const KeyT* inKeys,
+               KeyT* outKeys,
+               const ValueT* inVals,
+               ValueT* outVals,
+               int len,
+               cudaStream_t stream)
+{
   size_t worksize;
-  cub::DeviceRadixSort::SortPairs(nullptr, worksize, inKeys, outKeys, inVals,
-                                  outVals, len, 0, sizeof(KeyT) * 8, stream);
+  cub::DeviceRadixSort::SortPairs(
+    nullptr, worksize, inKeys, outKeys, inVals, outVals, len, 0, sizeof(KeyT) * 8, stream);
   workspace.resize(worksize, stream);
-  cub::DeviceRadixSort::SortPairs(workspace.data(), worksize, inKeys, outKeys,
-                                  inVals, outVals, len, 0, sizeof(KeyT) * 8,
-                                  stream);
+  cub::DeviceRadixSort::SortPairs(
+    workspace.data(), worksize, inKeys, outKeys, inVals, outVals, len, 0, sizeof(KeyT) * 8, stream);
 }
 
 }  // namespace raft
diff --git a/cpp/include/raft/common/device_loads_stores.cuh b/cpp/include/raft/common/device_loads_stores.cuh
index bb2b019ecb..41dc9cab08 100644
--- a/cpp/include/raft/common/device_loads_stores.cuh
+++ b/cpp/include/raft/common/device_loads_stores.cuh
@@ -31,40 +31,43 @@ namespace raft {
  * @param[out] addr shared memory address (should be aligned to vector size)
  * @param[in]  x    data to be stored at this address
  */
-DI void sts(float* addr, const float& x) {
+DI void sts(float* addr, const float& x)
+{
   auto s1 = __cvta_generic_to_shared(reinterpret_cast<float*>(addr));
   asm volatile("st.shared.f32 [%0], {%1};" : : "l"(s1), "f"(x));
 }
-DI void sts(float* addr, const float (&x)[1]) {
+DI void sts(float* addr, const float (&x)[1])
+{
   auto s1 = __cvta_generic_to_shared(reinterpret_cast<float*>(addr));
   asm volatile("st.shared.f32 [%0], {%1};" : : "l"(s1), "f"(x[0]));
 }
-DI void sts(float* addr, const float (&x)[2]) {
+DI void sts(float* addr, const float (&x)[2])
+{
   auto s2 = __cvta_generic_to_shared(reinterpret_cast<float2*>(addr));
-  asm volatile("st.shared.v2.f32 [%0], {%1, %2};"
-               :
-               : "l"(s2), "f"(x[0]), "f"(x[1]));
+  asm volatile("st.shared.v2.f32 [%0], {%1, %2};" : : "l"(s2), "f"(x[0]), "f"(x[1]));
 }
-DI void sts(float* addr, const float (&x)[4]) {
+DI void sts(float* addr, const float (&x)[4])
+{
   auto s4 = __cvta_generic_to_shared(reinterpret_cast<float4*>(addr));
   asm volatile("st.shared.v4.f32 [%0], {%1, %2, %3, %4};"
                :
                : "l"(s4), "f"(x[0]), "f"(x[1]), "f"(x[2]), "f"(x[3]));
 }
 
-DI void sts(double* addr, const double& x) {
+DI void sts(double* addr, const double& x)
+{
   auto s1 = __cvta_generic_to_shared(reinterpret_cast<double*>(addr));
   asm volatile("st.shared.f64 [%0], {%1};" : : "l"(s1), "d"(x));
 }
-DI void sts(double* addr, const double (&x)[1]) {
+DI void sts(double* addr, const double (&x)[1])
+{
   auto s1 = __cvta_generic_to_shared(reinterpret_cast<double*>(addr));
   asm volatile("st.shared.f64 [%0], {%1};" : : "l"(s1), "d"(x[0]));
 }
-DI void sts(double* addr, const double (&x)[2]) {
+DI void sts(double* addr, const double (&x)[2])
+{
   auto s2 = __cvta_generic_to_shared(reinterpret_cast<double2*>(addr));
-  asm volatile("st.shared.v2.f64 [%0], {%1, %2};"
-               :
-               : "l"(s2), "d"(x[0]), "d"(x[1]));
+  asm volatile("st.shared.v2.f64 [%0], {%1, %2};" : : "l"(s2), "d"(x[0]), "d"(x[1]));
 }
 /** @} */
 
@@ -80,39 +83,42 @@ DI void sts(double* addr, const double (&x)[2]) {
  * @param[in]  addr shared memory address from where to load
  *                  (should be aligned to vector size)
  */
-DI void lds(float& x, float* addr) {
+DI void lds(float& x, float* addr)
+{
   auto s1 = __cvta_generic_to_shared(reinterpret_cast<float*>(addr));
   asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x) : "l"(s1));
 }
-DI void lds(float (&x)[1], float* addr) {
+DI void lds(float (&x)[1], float* addr)
+{
   auto s1 = __cvta_generic_to_shared(reinterpret_cast<float*>(addr));
   asm volatile("ld.shared.f32 {%0}, [%1];" : "=f"(x[0]) : "l"(s1));
 }
-DI void lds(float (&x)[2], float* addr) {
+DI void lds(float (&x)[2], float* addr)
+{
   auto s2 = __cvta_generic_to_shared(reinterpret_cast<float2*>(addr));
-  asm volatile("ld.shared.v2.f32 {%0, %1}, [%2];"
-               : "=f"(x[0]), "=f"(x[1])
-               : "l"(s2));
+  asm volatile("ld.shared.v2.f32 {%0, %1}, [%2];" : "=f"(x[0]), "=f"(x[1]) : "l"(s2));
 }
-DI void lds(float (&x)[4], float* addr) {
+DI void lds(float (&x)[4], float* addr)
+{
   auto s4 = __cvta_generic_to_shared(reinterpret_cast<float4*>(addr));
   asm volatile("ld.shared.v4.f32 {%0, %1, %2, %3}, [%4];"
                : "=f"(x[0]), "=f"(x[1]), "=f"(x[2]), "=f"(x[3])
                : "l"(s4));
 }
-DI void lds(double& x, double* addr) {
+DI void lds(double& x, double* addr)
+{
   auto s1 = __cvta_generic_to_shared(reinterpret_cast<double*>(addr));
   asm volatile("ld.shared.f64 {%0}, [%1];" : "=d"(x) : "l"(s1));
 }
-DI void lds(double (&x)[1], double* addr) {
+DI void lds(double (&x)[1], double* addr)
+{
   auto s1 = __cvta_generic_to_shared(reinterpret_cast<double*>(addr));
   asm volatile("ld.shared.f64 {%0}, [%1];" : "=d"(x[0]) : "l"(s1));
 }
-DI void lds(double (&x)[2], double* addr) {
+DI void lds(double (&x)[2], double* addr)
+{
   auto s2 = __cvta_generic_to_shared(reinterpret_cast<double2*>(addr));
-  asm volatile("ld.shared.v2.f64 {%0, %1}, [%2];"
-               : "=d"(x[0]), "=d"(x[1])
-               : "l"(s2));
+  asm volatile("ld.shared.v2.f64 {%0, %1}, [%2];" : "=d"(x[0]), "=d"(x[1]) : "l"(s2));
 }
 /** @} */
 
@@ -123,32 +129,35 @@ DI void lds(double (&x)[2], double* addr) {
  * @param[out] x    data to be loaded from global memory
  * @param[in]  addr address in global memory from where to load
  */
-DI void ldg(float& x, const float* addr) {
+DI void ldg(float& x, const float* addr)
+{
   asm volatile("ld.global.cg.f32 %0, [%1];" : "=f"(x) : "l"(addr));
 }
-DI void ldg(float (&x)[1], const float* addr) {
+DI void ldg(float (&x)[1], const float* addr)
+{
   asm volatile("ld.global.cg.f32 %0, [%1];" : "=f"(x[0]) : "l"(addr));
 }
-DI void ldg(float (&x)[2], const float* addr) {
-  asm volatile("ld.global.cg.v2.f32 {%0, %1}, [%2];"
-               : "=f"(x[0]), "=f"(x[1])
-               : "l"(addr));
+DI void ldg(float (&x)[2], const float* addr)
+{
+  asm volatile("ld.global.cg.v2.f32 {%0, %1}, [%2];" : "=f"(x[0]), "=f"(x[1]) : "l"(addr));
 }
-DI void ldg(float (&x)[4], const float* addr) {
+DI void ldg(float (&x)[4], const float* addr)
+{
   asm volatile("ld.global.cg.v4.f32 {%0, %1, %2, %3}, [%4];"
                : "=f"(x[0]), "=f"(x[1]), "=f"(x[2]), "=f"(x[3])
                : "l"(addr));
 }
-DI void ldg(double& x, const double* addr) {
+DI void ldg(double& x, const double* addr)
+{
   asm volatile("ld.global.cg.f64 %0, [%1];" : "=d"(x) : "l"(addr));
 }
-DI void ldg(double (&x)[1], const double* addr) {
+DI void ldg(double (&x)[1], const double* addr)
+{
   asm volatile("ld.global.cg.f64 %0, [%1];" : "=d"(x[0]) : "l"(addr));
 }
-DI void ldg(double (&x)[2], const double* addr) {
-  asm volatile("ld.global.cg.v2.f64 {%0, %1}, [%2];"
-               : "=d"(x[0]), "=d"(x[1])
-               : "l"(addr));
+DI void ldg(double (&x)[2], const double* addr)
+{
+  asm volatile("ld.global.cg.v2.f64 {%0, %1}, [%2];" : "=d"(x[0]), "=d"(x[1]) : "l"(addr));
 }
 /** @} */
 
diff --git a/cpp/include/raft/common/scatter.cuh b/cpp/include/raft/common/scatter.cuh
index 785794461e..b228ac5499 100644
--- a/cpp/include/raft/common/scatter.cuh
+++ b/cpp/include/raft/common/scatter.cuh
@@ -22,8 +22,8 @@
 namespace raft {
 
 template <typename DataT, int VecLen, typename Lambda, typename IdxT>
-__global__ void scatterKernel(DataT *out, const DataT *in, const IdxT *idx,
-                              IdxT len, Lambda op) {
+__global__ void scatterKernel(DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op)
+{
   typedef TxN_t<DataT, VecLen> DataVec;
   typedef TxN_t<IdxT, VecLen> IdxVec;
   IdxT tid = threadIdx.x + ((IdxT)blockIdx.x * blockDim.x);
@@ -34,61 +34,60 @@ __global__ void scatterKernel(DataT *out, const DataT *in, const IdxT *idx,
   DataVec dataIn;
 #pragma unroll
   for (int i = 0; i < VecLen; ++i) {
-    auto inPos = idxIn.val.data[i];
+    auto inPos         = idxIn.val.data[i];
     dataIn.val.data[i] = op(in[inPos], tid + i);
   }
   dataIn.store(out, tid);
 }
 
 template <typename DataT, int VecLen, typename Lambda, typename IdxT, int TPB>
-void scatterImpl(DataT *out, const DataT *in, const IdxT *idx, IdxT len,
-                 Lambda op, cudaStream_t stream) {
+void scatterImpl(
+  DataT* out, const DataT* in, const IdxT* idx, IdxT len, Lambda op, cudaStream_t stream)
+{
   const IdxT nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxT)TPB);
-  scatterKernel<DataT, VecLen, Lambda, IdxT>
-    <<<nblks, TPB, 0, stream>>>(out, in, idx, len, op);
+  scatterKernel<DataT, VecLen, Lambda, IdxT><<<nblks, TPB, 0, stream>>>(out, in, idx, len, op);
   CUDA_CHECK(cudaGetLastError());
 }
 
 /**
-     * @brief Performs scatter operation based on the input indexing array
-     * @tparam DataT data type whose array gets scattered
-     * @tparam IdxT indexing type
-     * @tparam TPB threads-per-block in the final kernel launched
-     * @tparam Lambda the device-lambda performing a unary operation on the loaded
-     * data before it gets scattered
-     * @param out the output array
-     * @param in the input array
-     * @param idx the indexing array
-     * @param len number of elements in the input array
-     * @param stream cuda stream where to launch work
-     * @param op the device-lambda with signature `DataT func(DataT, IdxT);`. This
-     * will be applied to every element before scattering it to the right location.
-     * The second param in this method will be the destination index.
-     */
-template <typename DataT, typename IdxT,
-          typename Lambda = raft::Nop<DataT, IdxT>, int TPB = 256>
-void scatter(DataT *out, const DataT *in, const IdxT *idx, IdxT len,
-             cudaStream_t stream, Lambda op = raft::Nop<DataT, IdxT>()) {
+ * @brief Performs scatter operation based on the input indexing array
+ * @tparam DataT data type whose array gets scattered
+ * @tparam IdxT indexing type
+ * @tparam TPB threads-per-block in the final kernel launched
+ * @tparam Lambda the device-lambda performing a unary operation on the loaded
+ * data before it gets scattered
+ * @param out the output array
+ * @param in the input array
+ * @param idx the indexing array
+ * @param len number of elements in the input array
+ * @param stream cuda stream where to launch work
+ * @param op the device-lambda with signature `DataT func(DataT, IdxT);`. This
+ * will be applied to every element before scattering it to the right location.
+ * The second param in this method will be the destination index.
+ */
+template <typename DataT, typename IdxT, typename Lambda = raft::Nop<DataT, IdxT>, int TPB = 256>
+void scatter(DataT* out,
+             const DataT* in,
+             const IdxT* idx,
+             IdxT len,
+             cudaStream_t stream,
+             Lambda op = raft::Nop<DataT, IdxT>())
+{
   if (len <= 0) return;
-  constexpr size_t DataSize = sizeof(DataT);
-  constexpr size_t IdxSize = sizeof(IdxT);
+  constexpr size_t DataSize   = sizeof(DataT);
+  constexpr size_t IdxSize    = sizeof(IdxT);
   constexpr size_t MaxPerElem = DataSize > IdxSize ? DataSize : IdxSize;
-  size_t bytes = len * MaxPerElem;
+  size_t bytes                = len * MaxPerElem;
   if (16 / MaxPerElem && bytes % 16 == 0) {
-    scatterImpl<DataT, 16 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len,
-                                                           op, stream);
+    scatterImpl<DataT, 16 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op, stream);
   } else if (8 / MaxPerElem && bytes % 8 == 0) {
-    scatterImpl<DataT, 8 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op,
-                                                          stream);
+    scatterImpl<DataT, 8 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op, stream);
   } else if (4 / MaxPerElem && bytes % 4 == 0) {
-    scatterImpl<DataT, 4 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op,
-                                                          stream);
+    scatterImpl<DataT, 4 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op, stream);
   } else if (2 / MaxPerElem && bytes % 2 == 0) {
-    scatterImpl<DataT, 2 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op,
-                                                          stream);
+    scatterImpl<DataT, 2 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op, stream);
   } else if (1 / MaxPerElem) {
-    scatterImpl<DataT, 1 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op,
-                                                          stream);
+    scatterImpl<DataT, 1 / MaxPerElem, Lambda, IdxT, TPB>(out, in, idx, len, op, stream);
   } else {
     scatterImpl<DataT, 1, Lambda, IdxT, TPB>(out, in, idx, len, op, stream);
   }
diff --git a/cpp/include/raft/comms/comms.hpp b/cpp/include/raft/comms/comms.hpp
index bd8a4ce9e7..68b8e723e9 100644
--- a/cpp/include/raft/comms/comms.hpp
+++ b/cpp/include/raft/comms/comms.hpp
@@ -25,16 +25,7 @@ namespace raft {
 namespace comms {
 
 typedef unsigned int request_t;
-enum class datatype_t {
-  CHAR,
-  UINT8,
-  INT32,
-  UINT32,
-  INT64,
-  UINT64,
-  FLOAT32,
-  FLOAT64
-};
+enum class datatype_t { CHAR, UINT8, INT32, UINT32, INT64, UINT64, FLOAT32, FLOAT64 };
 enum class op_t { SUM, PROD, MIN, MAX };
 
 /**
@@ -50,42 +41,50 @@ template <typename value_t>
 constexpr datatype_t get_type();
 
 template <>
-constexpr datatype_t get_type<char>() {
+constexpr datatype_t get_type<char>()
+{
   return datatype_t::CHAR;
 }
 
 template <>
-constexpr datatype_t get_type<uint8_t>() {
+constexpr datatype_t get_type<uint8_t>()
+{
   return datatype_t::UINT8;
 }
 
 template <>
-constexpr datatype_t get_type<int>() {
+constexpr datatype_t get_type<int>()
+{
   return datatype_t::INT32;
 }
 
 template <>
-constexpr datatype_t get_type<uint32_t>() {
+constexpr datatype_t get_type<uint32_t>()
+{
   return datatype_t::UINT32;
 }
 
 template <>
-constexpr datatype_t get_type<int64_t>() {
+constexpr datatype_t get_type<int64_t>()
+{
   return datatype_t::INT64;
 }
 
 template <>
-constexpr datatype_t get_type<uint64_t>() {
+constexpr datatype_t get_type<uint64_t>()
+{
   return datatype_t::UINT64;
 }
 
 template <>
-constexpr datatype_t get_type<float>() {
+constexpr datatype_t get_type<float>()
+{
   return datatype_t::FLOAT32;
 }
 
 template <>
-constexpr datatype_t get_type<double>() {
+constexpr datatype_t get_type<double>()
+{
   return datatype_t::FLOAT64;
 }
 
@@ -95,76 +94,106 @@ class comms_iface {
   virtual int get_rank() const = 0;
 
   virtual std::unique_ptr<comms_iface> comm_split(int color, int key) const = 0;
-  virtual void barrier() const = 0;
+  virtual void barrier() const                                              = 0;
 
   virtual status_t sync_stream(cudaStream_t stream) const = 0;
 
-  virtual void isend(const void* buf, size_t size, int dest, int tag,
-                     request_t* request) const = 0;
+  virtual void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const = 0;
 
-  virtual void irecv(void* buf, size_t size, int source, int tag,
-                     request_t* request) const = 0;
+  virtual void irecv(void* buf, size_t size, int source, int tag, request_t* request) const = 0;
 
   virtual void waitall(int count, request_t array_of_requests[]) const = 0;
 
-  virtual void allreduce(const void* sendbuff, void* recvbuff, size_t count,
-                         datatype_t datatype, op_t op,
+  virtual void allreduce(const void* sendbuff,
+                         void* recvbuff,
+                         size_t count,
+                         datatype_t datatype,
+                         op_t op,
                          cudaStream_t stream) const = 0;
 
-  virtual void bcast(void* buff, size_t count, datatype_t datatype, int root,
-                     cudaStream_t stream) const = 0;
+  virtual void bcast(
+    void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const = 0;
 
-  virtual void bcast(const void* sendbuff, void* recvbuff, size_t count,
-                     datatype_t datatype, int root,
+  virtual void bcast(const void* sendbuff,
+                     void* recvbuff,
+                     size_t count,
+                     datatype_t datatype,
+                     int root,
                      cudaStream_t stream) const = 0;
 
-  virtual void reduce(const void* sendbuff, void* recvbuff, size_t count,
-                      datatype_t datatype, op_t op, int root,
+  virtual void reduce(const void* sendbuff,
+                      void* recvbuff,
+                      size_t count,
+                      datatype_t datatype,
+                      op_t op,
+                      int root,
                       cudaStream_t stream) const = 0;
 
-  virtual void allgather(const void* sendbuff, void* recvbuff, size_t sendcount,
-                         datatype_t datatype, cudaStream_t stream) const = 0;
-
-  virtual void allgatherv(const void* sendbuf, void* recvbuf,
-                          const size_t* recvcounts, const size_t* displs,
-                          datatype_t datatype, cudaStream_t stream) const = 0;
+  virtual void allgather(const void* sendbuff,
+                         void* recvbuff,
+                         size_t sendcount,
+                         datatype_t datatype,
+                         cudaStream_t stream) const = 0;
 
-  virtual void gather(const void* sendbuff, void* recvbuff, size_t sendcount,
-                      datatype_t datatype, int root,
+  virtual void allgatherv(const void* sendbuf,
+                          void* recvbuf,
+                          const size_t* recvcounts,
+                          const size_t* displs,
+                          datatype_t datatype,
+                          cudaStream_t stream) const = 0;
+
+  virtual void gather(const void* sendbuff,
+                      void* recvbuff,
+                      size_t sendcount,
+                      datatype_t datatype,
+                      int root,
                       cudaStream_t stream) const = 0;
 
-  virtual void gatherv(const void* sendbuf, void* recvbuf, size_t sendcount,
-                       const size_t* recvcounts, const size_t* displs,
-                       datatype_t datatype, int root,
+  virtual void gatherv(const void* sendbuf,
+                       void* recvbuf,
+                       size_t sendcount,
+                       const size_t* recvcounts,
+                       const size_t* displs,
+                       datatype_t datatype,
+                       int root,
                        cudaStream_t stream) const = 0;
 
-  virtual void reducescatter(const void* sendbuff, void* recvbuff,
-                             size_t recvcount, datatype_t datatype, op_t op,
+  virtual void reducescatter(const void* sendbuff,
+                             void* recvbuff,
+                             size_t recvcount,
+                             datatype_t datatype,
+                             op_t op,
                              cudaStream_t stream) const = 0;
 
   // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock
-  virtual void device_send(const void* buf, size_t size, int dest,
-                           cudaStream_t stream) const = 0;
+  virtual void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const = 0;
 
   // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock
-  virtual void device_recv(void* buf, size_t size, int source,
-                           cudaStream_t stream) const = 0;
-
-  virtual void device_sendrecv(const void* sendbuf, size_t sendsize, int dest,
-                               void* recvbuf, size_t recvsize, int source,
+  virtual void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const = 0;
+
+  virtual void device_sendrecv(const void* sendbuf,
+                               size_t sendsize,
+                               int dest,
+                               void* recvbuf,
+                               size_t recvsize,
+                               int source,
                                cudaStream_t stream) const = 0;
 
-  virtual void device_multicast_sendrecv(
-    const void* sendbuf, std::vector<size_t> const& sendsizes,
-    std::vector<size_t> const& sendoffsets, std::vector<int> const& dests,
-    void* recvbuf, std::vector<size_t> const& recvsizes,
-    std::vector<size_t> const& recvoffsets, std::vector<int> const& sources,
-    cudaStream_t stream) const = 0;
+  virtual void device_multicast_sendrecv(const void* sendbuf,
+                                         std::vector<size_t> const& sendsizes,
+                                         std::vector<size_t> const& sendoffsets,
+                                         std::vector<int> const& dests,
+                                         void* recvbuf,
+                                         std::vector<size_t> const& recvsizes,
+                                         std::vector<size_t> const& recvoffsets,
+                                         std::vector<int> const& sources,
+                                         cudaStream_t stream) const = 0;
 };
 
 class comms_t {
  public:
-  comms_t(std::unique_ptr<comms_iface> impl) : impl_(impl.release()) {
+  comms_t(std::unique_ptr<comms_iface> impl) : impl_(impl.release())
+  {
     ASSERT(nullptr != impl_.get(), "ERROR: Invalid comms_iface used!");
   }
 
@@ -191,7 +220,8 @@ class comms_t {
    * @param color ranks w/ the same color are placed in the same communicator
    * @param key controls rank assignment
    */
-  std::unique_ptr<comms_iface> comm_split(int color, int key) const {
+  std::unique_ptr<comms_iface> comm_split(int color, int key) const
+  {
     return impl_->comm_split(color, key);
   }
 
@@ -208,9 +238,7 @@ class comms_t {
    *
    * @param stream the cuda stream to sync collective operations on
    */
-  status_t sync_stream(cudaStream_t stream) const {
-    return impl_->sync_stream(stream);
-  }
+  status_t sync_stream(cudaStream_t stream) const { return impl_->sync_stream(stream); }
 
   /**
    * Performs an asynchronous point-to-point send
@@ -223,10 +251,9 @@ class comms_t {
    * 		This will be used in `waitall()` to synchronize until the message is delivered (or fails).
    */
   template <typename value_t>
-  void isend(const value_t* buf, size_t size, int dest, int tag,
-             request_t* request) const {
-    impl_->isend(static_cast<const void*>(buf), size * sizeof(value_t), dest,
-                 tag, request);
+  void isend(const value_t* buf, size_t size, int dest, int tag, request_t* request) const
+  {
+    impl_->isend(static_cast<const void*>(buf), size * sizeof(value_t), dest, tag, request);
   }
 
   /**
@@ -240,10 +267,9 @@ class comms_t {
    * 		This will be used in `waitall()` to synchronize until the message is delivered (or fails).
    */
   template <typename value_t>
-  void irecv(value_t* buf, size_t size, int source, int tag,
-             request_t* request) const {
-    impl_->irecv(static_cast<void*>(buf), size * sizeof(value_t), source, tag,
-                 request);
+  void irecv(value_t* buf, size_t size, int source, int tag, request_t* request) const
+  {
+    impl_->irecv(static_cast<void*>(buf), size * sizeof(value_t), source, tag, request);
   }
 
   /**
@@ -251,7 +277,8 @@ class comms_t {
    * @param count number of requests to synchronize on
    * @param array_of_requests an array of request_t objects returned from isend/irecv
    */
-  void waitall(int count, request_t array_of_requests[]) const {
+  void waitall(int count, request_t array_of_requests[]) const
+  {
     impl_->waitall(count, array_of_requests);
   }
 
@@ -265,11 +292,15 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void allreduce(const value_t* sendbuff, value_t* recvbuff, size_t count,
-                 op_t op, cudaStream_t stream) const {
+  void allreduce(
+    const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op, cudaStream_t stream) const
+  {
     impl_->allreduce(static_cast<const void*>(sendbuff),
-                     static_cast<void*>(recvbuff), count, get_type<value_t>(),
-                     op, stream);
+                     static_cast<void*>(recvbuff),
+                     count,
+                     get_type<value_t>(),
+                     op,
+                     stream);
   }
 
   /**
@@ -281,9 +312,9 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const {
-    impl_->bcast(static_cast<void*>(buff), count, get_type<value_t>(), root,
-                 stream);
+  void bcast(value_t* buff, size_t count, int root, cudaStream_t stream) const
+  {
+    impl_->bcast(static_cast<void*>(buff), count, get_type<value_t>(), root, stream);
   }
 
   /**
@@ -296,10 +327,14 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void bcast(const value_t* sendbuff, value_t* recvbuff, size_t count, int root,
-             cudaStream_t stream) const {
+  void bcast(
+    const value_t* sendbuff, value_t* recvbuff, size_t count, int root, cudaStream_t stream) const
+  {
     impl_->bcast(static_cast<const void*>(sendbuff),
-                 static_cast<void*>(recvbuff), count, get_type<value_t>(), root,
+                 static_cast<void*>(recvbuff),
+                 count,
+                 get_type<value_t>(),
+                 root,
                  stream);
   }
 
@@ -314,11 +349,20 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void reduce(const value_t* sendbuff, value_t* recvbuff, size_t count, op_t op,
-              int root, cudaStream_t stream) const {
+  void reduce(const value_t* sendbuff,
+              value_t* recvbuff,
+              size_t count,
+              op_t op,
+              int root,
+              cudaStream_t stream) const
+  {
     impl_->reduce(static_cast<const void*>(sendbuff),
-                  static_cast<void*>(recvbuff), count, get_type<value_t>(), op,
-                  root, stream);
+                  static_cast<void*>(recvbuff),
+                  count,
+                  get_type<value_t>(),
+                  op,
+                  root,
+                  stream);
   }
 
   /**
@@ -330,11 +374,16 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void allgather(const value_t* sendbuff, value_t* recvbuff, size_t sendcount,
-                 cudaStream_t stream) const {
+  void allgather(const value_t* sendbuff,
+                 value_t* recvbuff,
+                 size_t sendcount,
+                 cudaStream_t stream) const
+  {
     impl_->allgather(static_cast<const void*>(sendbuff),
-                     static_cast<void*>(recvbuff), sendcount,
-                     get_type<value_t>(), stream);
+                     static_cast<void*>(recvbuff),
+                     sendcount,
+                     get_type<value_t>(),
+                     stream);
   }
 
   /**
@@ -349,12 +398,18 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void allgatherv(const value_t* sendbuf, value_t* recvbuf,
-                  const size_t* recvcounts, const size_t* displs,
-                  cudaStream_t stream) const {
+  void allgatherv(const value_t* sendbuf,
+                  value_t* recvbuf,
+                  const size_t* recvcounts,
+                  const size_t* displs,
+                  cudaStream_t stream) const
+  {
     impl_->allgatherv(static_cast<const void*>(sendbuf),
-                      static_cast<void*>(recvbuf), recvcounts, displs,
-                      get_type<value_t>(), stream);
+                      static_cast<void*>(recvbuf),
+                      recvcounts,
+                      displs,
+                      get_type<value_t>(),
+                      stream);
   }
 
   /**
@@ -367,11 +422,18 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void gather(const value_t* sendbuff, value_t* recvbuff, size_t sendcount,
-              int root, cudaStream_t stream) const {
+  void gather(const value_t* sendbuff,
+              value_t* recvbuff,
+              size_t sendcount,
+              int root,
+              cudaStream_t stream) const
+  {
     impl_->gather(static_cast<const void*>(sendbuff),
-                  static_cast<void*>(recvbuff), sendcount, get_type<value_t>(),
-                  root, stream);
+                  static_cast<void*>(recvbuff),
+                  sendcount,
+                  get_type<value_t>(),
+                  root,
+                  stream);
   }
 
   /**
@@ -388,12 +450,22 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void gatherv(const value_t* sendbuf, value_t* recvbuf, size_t sendcount,
-               const size_t* recvcounts, const size_t* displs, int root,
-               cudaStream_t stream) const {
+  void gatherv(const value_t* sendbuf,
+               value_t* recvbuf,
+               size_t sendcount,
+               const size_t* recvcounts,
+               const size_t* displs,
+               int root,
+               cudaStream_t stream) const
+  {
     impl_->gatherv(static_cast<const void*>(sendbuf),
-                   static_cast<void*>(recvbuf), sendcount, recvcounts, displs,
-                   get_type<value_t>(), root, stream);
+                   static_cast<void*>(recvbuf),
+                   sendcount,
+                   recvcounts,
+                   displs,
+                   get_type<value_t>(),
+                   root,
+                   stream);
   }
 
   /**
@@ -406,11 +478,18 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void reducescatter(const value_t* sendbuff, value_t* recvbuff,
-                     size_t recvcount, op_t op, cudaStream_t stream) const {
+  void reducescatter(const value_t* sendbuff,
+                     value_t* recvbuff,
+                     size_t recvcount,
+                     op_t op,
+                     cudaStream_t stream) const
+  {
     impl_->reducescatter(static_cast<const void*>(sendbuff),
-                         static_cast<void*>(recvbuff), recvcount,
-                         get_type<value_t>(), op, stream);
+                         static_cast<void*>(recvbuff),
+                         recvcount,
+                         get_type<value_t>(),
+                         op,
+                         stream);
   }
 
   /**
@@ -425,10 +504,9 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void device_send(const value_t* buf, size_t size, int dest,
-                   cudaStream_t stream) const {
-    impl_->device_send(static_cast<const void*>(buf), size * sizeof(value_t),
-                       dest, stream);
+  void device_send(const value_t* buf, size_t size, int dest, cudaStream_t stream) const
+  {
+    impl_->device_send(static_cast<const void*>(buf), size * sizeof(value_t), dest, stream);
   }
 
   /**
@@ -443,10 +521,9 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void device_recv(value_t* buf, size_t size, int source,
-                   cudaStream_t stream) const {
-    impl_->device_recv(static_cast<void*>(buf), size * sizeof(value_t), source,
-                       stream);
+  void device_recv(value_t* buf, size_t size, int source, cudaStream_t stream) const
+  {
+    impl_->device_recv(static_cast<void*>(buf), size * sizeof(value_t), source, stream);
   }
 
   /**
@@ -462,12 +539,21 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void device_sendrecv(const value_t* sendbuf, size_t sendsize, int dest,
-                       value_t* recvbuf, size_t recvsize, int source,
-                       cudaStream_t stream) const {
-    impl_->device_sendrecv(
-      static_cast<const void*>(sendbuf), sendsize * sizeof(value_t), dest,
-      static_cast<void*>(recvbuf), recvsize * sizeof(value_t), source, stream);
+  void device_sendrecv(const value_t* sendbuf,
+                       size_t sendsize,
+                       int dest,
+                       value_t* recvbuf,
+                       size_t recvsize,
+                       int source,
+                       cudaStream_t stream) const
+  {
+    impl_->device_sendrecv(static_cast<const void*>(sendbuf),
+                           sendsize * sizeof(value_t),
+                           dest,
+                           static_cast<void*>(recvbuf),
+                           recvsize * sizeof(value_t),
+                           source,
+                           stream);
   }
 
   /**
@@ -485,28 +571,37 @@ class comms_t {
    * @param stream CUDA stream to synchronize operation
    */
   template <typename value_t>
-  void device_multicast_sendrecv(
-    const value_t* sendbuf, std::vector<size_t> const& sendsizes,
-    std::vector<size_t> const& sendoffsets, std::vector<int> const& dests,
-    value_t* recvbuf, std::vector<size_t> const& recvsizes,
-    std::vector<size_t> const& recvoffsets, std::vector<int> const& sources,
-    cudaStream_t stream) const {
-    auto sendbytesizes = sendsizes;
+  void device_multicast_sendrecv(const value_t* sendbuf,
+                                 std::vector<size_t> const& sendsizes,
+                                 std::vector<size_t> const& sendoffsets,
+                                 std::vector<int> const& dests,
+                                 value_t* recvbuf,
+                                 std::vector<size_t> const& recvsizes,
+                                 std::vector<size_t> const& recvoffsets,
+                                 std::vector<int> const& sources,
+                                 cudaStream_t stream) const
+  {
+    auto sendbytesizes   = sendsizes;
     auto sendbyteoffsets = sendoffsets;
     for (size_t i = 0; i < sendsizes.size(); ++i) {
       sendbytesizes[i] *= sizeof(value_t);
       sendbyteoffsets[i] *= sizeof(value_t);
     }
-    auto recvbytesizes = recvsizes;
+    auto recvbytesizes   = recvsizes;
     auto recvbyteoffsets = recvoffsets;
     for (size_t i = 0; i < recvsizes.size(); ++i) {
       recvbytesizes[i] *= sizeof(value_t);
       recvbyteoffsets[i] *= sizeof(value_t);
     }
     impl_->device_multicast_sendrecv(static_cast<const void*>(sendbuf),
-                                     sendbytesizes, sendbyteoffsets, dests,
-                                     static_cast<void*>(recvbuf), recvbytesizes,
-                                     recvbyteoffsets, sources, stream);
+                                     sendbytesizes,
+                                     sendbyteoffsets,
+                                     dests,
+                                     static_cast<void*>(recvbuf),
+                                     recvbytesizes,
+                                     recvbyteoffsets,
+                                     sources,
+                                     stream);
   }
 
  private:
diff --git a/cpp/include/raft/comms/helper.hpp b/cpp/include/raft/comms/helper.hpp
index e01490d728..2be5b0d23f 100644
--- a/cpp/include/raft/comms/helper.hpp
+++ b/cpp/include/raft/comms/helper.hpp
@@ -36,12 +36,12 @@ namespace comms {
  * @param num_ranks number of ranks in communicator clique
  * @param rank rank of local instance
  */
-void build_comms_nccl_only(handle_t *handle, ncclComm_t nccl_comm,
-                           int num_ranks, int rank) {
+void build_comms_nccl_only(handle_t* handle, ncclComm_t nccl_comm, int num_ranks, int rank)
+{
   cudaStream_t stream = handle->get_stream();
 
-  auto communicator = std::make_shared<comms_t>(std::unique_ptr<comms_iface>(
-    new raft::comms::std_comms(nccl_comm, num_ranks, rank, stream)));
+  auto communicator = std::make_shared<comms_t>(
+    std::unique_ptr<comms_iface>(new raft::comms::std_comms(nccl_comm, num_ranks, rank, stream)));
   handle->set_comms(communicator);
 }
 
@@ -60,20 +60,20 @@ void build_comms_nccl_only(handle_t *handle, ncclComm_t nccl_comm,
  * @param num_ranks number of ranks in communicator clique
  * @param rank rank of local instance
  */
-void build_comms_nccl_ucx(handle_t *handle, ncclComm_t nccl_comm,
-                          void *ucp_worker, void *eps, int num_ranks,
-                          int rank) {
-  auto eps_sp = std::make_shared<ucp_ep_h *>(new ucp_ep_h[num_ranks]);
+void build_comms_nccl_ucx(
+  handle_t* handle, ncclComm_t nccl_comm, void* ucp_worker, void* eps, int num_ranks, int rank)
+{
+  auto eps_sp = std::make_shared<ucp_ep_h*>(new ucp_ep_h[num_ranks]);
 
-  auto size_t_ep_arr = reinterpret_cast<size_t *>(eps);
+  auto size_t_ep_arr = reinterpret_cast<size_t*>(eps);
 
   for (int i = 0; i < num_ranks; i++) {
-    size_t ptr = size_t_ep_arr[i];
-    auto ucp_ep_v = reinterpret_cast<ucp_ep_h *>(*eps_sp);
+    size_t ptr    = size_t_ep_arr[i];
+    auto ucp_ep_v = reinterpret_cast<ucp_ep_h*>(*eps_sp);
 
     if (ptr != 0) {
       auto eps_ptr = reinterpret_cast<ucp_ep_h>(size_t_ep_arr[i]);
-      ucp_ep_v[i] = eps_ptr;
+      ucp_ep_v[i]  = eps_ptr;
     } else {
       ucp_ep_v[i] = nullptr;
     }
@@ -81,18 +81,19 @@ void build_comms_nccl_ucx(handle_t *handle, ncclComm_t nccl_comm,
 
   cudaStream_t stream = handle->get_stream();
 
-  auto communicator = std::make_shared<comms_t>(
-    std::unique_ptr<comms_iface>(new raft::comms::std_comms(
+  auto communicator =
+    std::make_shared<comms_t>(std::unique_ptr<comms_iface>(new raft::comms::std_comms(
       nccl_comm, (ucp_worker_h)ucp_worker, eps_sp, num_ranks, rank, stream)));
   handle->set_comms(communicator);
 }
 
-inline void nccl_unique_id_from_char(ncclUniqueId *id, char *uniqueId,
-                                     int size) {
+inline void nccl_unique_id_from_char(ncclUniqueId* id, char* uniqueId, int size)
+{
   memcpy(id->internal, uniqueId, size);
 }
 
-inline void get_unique_id(char *uid, int size) {
+inline void get_unique_id(char* uid, int size)
+{
   ncclUniqueId id;
   ncclGetUniqueId(&id);
 
diff --git a/cpp/include/raft/comms/mpi_comms.hpp b/cpp/include/raft/comms/mpi_comms.hpp
index 067c7bd0ab..3091cd53a9 100644
--- a/cpp/include/raft/comms/mpi_comms.hpp
+++ b/cpp/include/raft/comms/mpi_comms.hpp
@@ -32,16 +32,16 @@
 #include <raft/error.hpp>
 #include <raft/handle.hpp>
 
-#define MPI_TRY(call)                                                          \
-  do {                                                                         \
-    int status = call;                                                         \
-    if (MPI_SUCCESS != status) {                                               \
-      int mpi_error_string_lenght = 0;                                         \
-      char mpi_error_string[MPI_MAX_ERROR_STRING];                             \
-      MPI_Error_string(status, mpi_error_string, &mpi_error_string_lenght);    \
-      RAFT_EXPECTS(MPI_SUCCESS == status, "ERROR: MPI call='%s'. Reason:%s\n", \
-                   #call, mpi_error_string);                                   \
-    }                                                                          \
+#define MPI_TRY(call)                                                                         \
+  do {                                                                                        \
+    int status = call;                                                                        \
+    if (MPI_SUCCESS != status) {                                                              \
+      int mpi_error_string_lenght = 0;                                                        \
+      char mpi_error_string[MPI_MAX_ERROR_STRING];                                            \
+      MPI_Error_string(status, mpi_error_string, &mpi_error_string_lenght);                   \
+      RAFT_EXPECTS(                                                                           \
+        MPI_SUCCESS == status, "ERROR: MPI call='%s'. Reason:%s\n", #call, mpi_error_string); \
+    }                                                                                         \
   } while (0)
 
 #define MPI_TRY_NO_THROW(call)                                              \
@@ -51,48 +51,41 @@
       int mpi_error_string_lenght = 0;                                      \
       char mpi_error_string[MPI_MAX_ERROR_STRING];                          \
       MPI_Error_string(status, mpi_error_string, &mpi_error_string_lenght); \
-      printf("MPI call='%s' at file=%s line=%d failed with %s ", #call,     \
-             __FILE__, __LINE__, mpi_error_string);                         \
+      printf("MPI call='%s' at file=%s line=%d failed with %s ",            \
+             #call,                                                         \
+             __FILE__,                                                      \
+             __LINE__,                                                      \
+             mpi_error_string);                                             \
     }                                                                       \
   } while (0)
 
 namespace raft {
 namespace comms {
 
-constexpr MPI_Datatype get_mpi_datatype(const datatype_t datatype) {
+constexpr MPI_Datatype get_mpi_datatype(const datatype_t datatype)
+{
   switch (datatype) {
-    case datatype_t::CHAR:
-      return MPI_CHAR;
-    case datatype_t::UINT8:
-      return MPI_UNSIGNED_CHAR;
-    case datatype_t::INT32:
-      return MPI_INT;
-    case datatype_t::UINT32:
-      return MPI_UNSIGNED;
-    case datatype_t::INT64:
-      return MPI_LONG_LONG;
-    case datatype_t::UINT64:
-      return MPI_UNSIGNED_LONG_LONG;
-    case datatype_t::FLOAT32:
-      return MPI_FLOAT;
-    case datatype_t::FLOAT64:
-      return MPI_DOUBLE;
+    case datatype_t::CHAR: return MPI_CHAR;
+    case datatype_t::UINT8: return MPI_UNSIGNED_CHAR;
+    case datatype_t::INT32: return MPI_INT;
+    case datatype_t::UINT32: return MPI_UNSIGNED;
+    case datatype_t::INT64: return MPI_LONG_LONG;
+    case datatype_t::UINT64: return MPI_UNSIGNED_LONG_LONG;
+    case datatype_t::FLOAT32: return MPI_FLOAT;
+    case datatype_t::FLOAT64: return MPI_DOUBLE;
     default:
       // Execution should never reach here. This takes care of compiler warning.
       return MPI_DOUBLE;
   }
 }
 
-constexpr MPI_Op get_mpi_op(const op_t op) {
+constexpr MPI_Op get_mpi_op(const op_t op)
+{
   switch (op) {
-    case op_t::SUM:
-      return MPI_SUM;
-    case op_t::PROD:
-      return MPI_PROD;
-    case op_t::MIN:
-      return MPI_MIN;
-    case op_t::MAX:
-      return MPI_MAX;
+    case op_t::SUM: return MPI_SUM;
+    case op_t::PROD: return MPI_PROD;
+    case op_t::MIN: return MPI_MIN;
+    case op_t::MAX: return MPI_MAX;
     default:
       // Execution should never reach here. This takes care of compiler warning.
       return MPI_MAX;
@@ -102,38 +95,35 @@ constexpr MPI_Op get_mpi_op(const op_t op) {
 class mpi_comms : public comms_iface {
  public:
   mpi_comms(MPI_Comm comm, const bool owns_mpi_comm)
-    : owns_mpi_comm_(owns_mpi_comm),
-      mpi_comm_(comm),
-      size_(0),
-      rank_(1),
-      next_request_id_(0) {
+    : owns_mpi_comm_(owns_mpi_comm), mpi_comm_(comm), size_(0), rank_(1), next_request_id_(0)
+  {
     int mpi_is_initialized = 0;
     MPI_TRY(MPI_Initialized(&mpi_is_initialized));
     RAFT_EXPECTS(mpi_is_initialized, "ERROR: MPI is not initialized!");
     MPI_TRY(MPI_Comm_size(mpi_comm_, &size_));
     MPI_TRY(MPI_Comm_rank(mpi_comm_, &rank_));
-    //get NCCL unique ID at rank 0 and broadcast it to all others
+    // get NCCL unique ID at rank 0 and broadcast it to all others
     ncclUniqueId id;
     if (0 == rank_) NCCL_TRY(ncclGetUniqueId(&id));
     MPI_TRY(MPI_Bcast((void*)&id, sizeof(id), MPI_BYTE, 0, mpi_comm_));
 
-    //initializing NCCL
+    // initializing NCCL
     NCCL_TRY(ncclCommInitRank(&nccl_comm_, size_, id, rank_));
   }
 
-  virtual ~mpi_comms() {
-    //finalizing NCCL
+  virtual ~mpi_comms()
+  {
+    // finalizing NCCL
     NCCL_TRY_NO_THROW(ncclCommDestroy(nccl_comm_));
-    if (owns_mpi_comm_) {
-      MPI_TRY_NO_THROW(MPI_Comm_free(&mpi_comm_));
-    }
+    if (owns_mpi_comm_) { MPI_TRY_NO_THROW(MPI_Comm_free(&mpi_comm_)); }
   }
 
   int get_size() const { return size_; }
 
   int get_rank() const { return rank_; }
 
-  std::unique_ptr<comms_iface> comm_split(int color, int key) const {
+  std::unique_ptr<comms_iface> comm_split(int color, int key) const
+  {
     MPI_Comm new_comm;
     MPI_TRY(MPI_Comm_split(mpi_comm_, color, key, &new_comm));
     return std::unique_ptr<comms_iface>(new mpi_comms(new_comm, true));
@@ -141,15 +131,15 @@ class mpi_comms : public comms_iface {
 
   void barrier() const { MPI_TRY(MPI_Barrier(mpi_comm_)); }
 
-  void isend(const void* buf, size_t size, int dest, int tag,
-             request_t* request) const {
+  void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const
+  {
     MPI_Request mpi_req;
     request_t req_id;
     if (free_requests_.empty()) {
       req_id = next_request_id_++;
     } else {
       auto it = free_requests_.begin();
-      req_id = *it;
+      req_id  = *it;
       free_requests_.erase(it);
     }
     MPI_TRY(MPI_Isend(buf, size, MPI_BYTE, dest, tag, mpi_comm_, &mpi_req));
@@ -157,15 +147,15 @@ class mpi_comms : public comms_iface {
     *request = req_id;
   }
 
-  void irecv(void* buf, size_t size, int source, int tag,
-             request_t* request) const {
+  void irecv(void* buf, size_t size, int source, int tag, request_t* request) const
+  {
     MPI_Request mpi_req;
     request_t req_id;
     if (free_requests_.empty()) {
       req_id = next_request_id_++;
     } else {
       auto it = free_requests_.begin();
-      req_id = *it;
+      req_id  = *it;
       free_requests_.erase(it);
     }
 
@@ -174,7 +164,8 @@ class mpi_comms : public comms_iface {
     *request = req_id;
   }
 
-  void waitall(int count, request_t array_of_requests[]) const {
+  void waitall(int count, request_t array_of_requests[]) const
+  {
     std::vector<MPI_Request> requests;
     requests.reserve(count);
     for (int i = 0; i < count; ++i) {
@@ -189,94 +180,149 @@ class mpi_comms : public comms_iface {
     MPI_TRY(MPI_Waitall(requests.size(), requests.data(), MPI_STATUSES_IGNORE));
   }
 
-  void allreduce(const void* sendbuff, void* recvbuff, size_t count,
-                 datatype_t datatype, op_t op, cudaStream_t stream) const {
-    NCCL_TRY(ncclAllReduce(sendbuff, recvbuff, count,
-                           get_nccl_datatype(datatype), get_nccl_op(op),
-                           nccl_comm_, stream));
+  void allreduce(const void* sendbuff,
+                 void* recvbuff,
+                 size_t count,
+                 datatype_t datatype,
+                 op_t op,
+                 cudaStream_t stream) const
+  {
+    NCCL_TRY(ncclAllReduce(
+      sendbuff, recvbuff, count, get_nccl_datatype(datatype), get_nccl_op(op), nccl_comm_, stream));
   }
 
-  void bcast(void* buff, size_t count, datatype_t datatype, int root,
-             cudaStream_t stream) const {
-    NCCL_TRY(ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root,
-                           nccl_comm_, stream));
+  void bcast(void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const
+  {
+    NCCL_TRY(
+      ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, nccl_comm_, stream));
   }
 
-  void bcast(const void* sendbuff, void* recvbuff, size_t count,
-             datatype_t datatype, int root, cudaStream_t stream) const {
-    NCCL_TRY(ncclBroadcast(sendbuff, recvbuff, count,
-                           get_nccl_datatype(datatype), root, nccl_comm_,
-                           stream));
+  void bcast(const void* sendbuff,
+             void* recvbuff,
+             size_t count,
+             datatype_t datatype,
+             int root,
+             cudaStream_t stream) const
+  {
+    NCCL_TRY(ncclBroadcast(
+      sendbuff, recvbuff, count, get_nccl_datatype(datatype), root, nccl_comm_, stream));
   }
 
-  void reduce(const void* sendbuff, void* recvbuff, size_t count,
-              datatype_t datatype, op_t op, int root,
-              cudaStream_t stream) const {
-    NCCL_TRY(ncclReduce(sendbuff, recvbuff, count, get_nccl_datatype(datatype),
-                        get_nccl_op(op), root, nccl_comm_, stream));
+  void reduce(const void* sendbuff,
+              void* recvbuff,
+              size_t count,
+              datatype_t datatype,
+              op_t op,
+              int root,
+              cudaStream_t stream) const
+  {
+    NCCL_TRY(ncclReduce(sendbuff,
+                        recvbuff,
+                        count,
+                        get_nccl_datatype(datatype),
+                        get_nccl_op(op),
+                        root,
+                        nccl_comm_,
+                        stream));
   }
 
-  void allgather(const void* sendbuff, void* recvbuff, size_t sendcount,
-                 datatype_t datatype, cudaStream_t stream) const {
-    NCCL_TRY(ncclAllGather(sendbuff, recvbuff, sendcount,
-                           get_nccl_datatype(datatype), nccl_comm_, stream));
+  void allgather(const void* sendbuff,
+                 void* recvbuff,
+                 size_t sendcount,
+                 datatype_t datatype,
+                 cudaStream_t stream) const
+  {
+    NCCL_TRY(ncclAllGather(
+      sendbuff, recvbuff, sendcount, get_nccl_datatype(datatype), nccl_comm_, stream));
   }
 
-  void allgatherv(const void* sendbuf, void* recvbuf, const size_t* recvcounts,
-                  const size_t* displs, datatype_t datatype,
-                  cudaStream_t stream) const {
-    //From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - https://arxiv.org/pdf/1812.05964.pdf
-    //Listing 1 on page 4.
+  void allgatherv(const void* sendbuf,
+                  void* recvbuf,
+                  const size_t* recvcounts,
+                  const size_t* displs,
+                  datatype_t datatype,
+                  cudaStream_t stream) const
+  {
+    // From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" -
+    // https://arxiv.org/pdf/1812.05964.pdf Listing 1 on page 4.
     for (int root = 0; root < size_; ++root) {
-      NCCL_TRY(ncclBroadcast(sendbuf,
-                             static_cast<char*>(recvbuf) +
-                               displs[root] * get_datatype_size(datatype),
-                             recvcounts[root], get_nccl_datatype(datatype),
-                             root, nccl_comm_, stream));
+      NCCL_TRY(
+        ncclBroadcast(sendbuf,
+                      static_cast<char*>(recvbuf) + displs[root] * get_datatype_size(datatype),
+                      recvcounts[root],
+                      get_nccl_datatype(datatype),
+                      root,
+                      nccl_comm_,
+                      stream));
     }
   }
 
-  void gather(const void* sendbuff, void* recvbuff, size_t sendcount,
-              datatype_t datatype, int root, cudaStream_t stream) const {
+  void gather(const void* sendbuff,
+              void* recvbuff,
+              size_t sendcount,
+              datatype_t datatype,
+              int root,
+              cudaStream_t stream) const
+  {
     size_t dtype_size = get_datatype_size(datatype);
     NCCL_TRY(ncclGroupStart());
     if (get_rank() == root) {
       for (int r = 0; r < get_size(); ++r) {
-        NCCL_TRY(ncclRecv(
-          static_cast<char*>(recvbuff) + sendcount * r * dtype_size, sendcount,
-          get_nccl_datatype(datatype), r, nccl_comm_, stream));
+        NCCL_TRY(ncclRecv(static_cast<char*>(recvbuff) + sendcount * r * dtype_size,
+                          sendcount,
+                          get_nccl_datatype(datatype),
+                          r,
+                          nccl_comm_,
+                          stream));
       }
     }
-    NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root,
-                      nccl_comm_, stream));
+    NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream));
     NCCL_TRY(ncclGroupEnd());
   }
 
-  void gatherv(const void* sendbuff, void* recvbuff, size_t sendcount,
-               const size_t* recvcounts, const size_t* displs,
-               datatype_t datatype, int root, cudaStream_t stream) const {
+  void gatherv(const void* sendbuff,
+               void* recvbuff,
+               size_t sendcount,
+               const size_t* recvcounts,
+               const size_t* displs,
+               datatype_t datatype,
+               int root,
+               cudaStream_t stream) const
+  {
     size_t dtype_size = get_datatype_size(datatype);
     NCCL_TRY(ncclGroupStart());
     if (get_rank() == root) {
       for (int r = 0; r < get_size(); ++r) {
         NCCL_TRY(ncclRecv(static_cast<char*>(recvbuff) + displs[r] * dtype_size,
-                          recvcounts[r], get_nccl_datatype(datatype), r,
-                          nccl_comm_, stream));
+                          recvcounts[r],
+                          get_nccl_datatype(datatype),
+                          r,
+                          nccl_comm_,
+                          stream));
       }
     }
-    NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root,
-                      nccl_comm_, stream));
+    NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream));
     NCCL_TRY(ncclGroupEnd());
   }
 
-  void reducescatter(const void* sendbuff, void* recvbuff, size_t recvcount,
-                     datatype_t datatype, op_t op, cudaStream_t stream) const {
-    NCCL_TRY(ncclReduceScatter(sendbuff, recvbuff, recvcount,
-                               get_nccl_datatype(datatype), get_nccl_op(op),
-                               nccl_comm_, stream));
+  void reducescatter(const void* sendbuff,
+                     void* recvbuff,
+                     size_t recvcount,
+                     datatype_t datatype,
+                     op_t op,
+                     cudaStream_t stream) const
+  {
+    NCCL_TRY(ncclReduceScatter(sendbuff,
+                               recvbuff,
+                               recvcount,
+                               get_nccl_datatype(datatype),
+                               get_nccl_op(op),
+                               nccl_comm_,
+                               stream));
   }
 
-  status_t sync_stream(cudaStream_t stream) const {
+  status_t sync_stream(cudaStream_t stream) const
+  {
     cudaError_t cudaErr;
     ncclResult_t ncclErr, ncclAsyncErr;
     while (1) {
@@ -309,45 +355,58 @@ class mpi_comms : public comms_iface {
   };
 
   // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock
-  void device_send(const void* buf, size_t size, int dest,
-                   cudaStream_t stream) const {
+  void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const
+  {
     NCCL_TRY(ncclSend(buf, size, ncclUint8, dest, nccl_comm_, stream));
   }
 
   // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock
-  void device_recv(void* buf, size_t size, int source,
-                   cudaStream_t stream) const {
+  void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const
+  {
     NCCL_TRY(ncclRecv(buf, size, ncclUint8, source, nccl_comm_, stream));
   }
 
-  void device_sendrecv(const void* sendbuf, size_t sendsize, int dest,
-                       void* recvbuf, size_t recvsize, int source,
-                       cudaStream_t stream) const {
+  void device_sendrecv(const void* sendbuf,
+                       size_t sendsize,
+                       int dest,
+                       void* recvbuf,
+                       size_t recvsize,
+                       int source,
+                       cudaStream_t stream) const
+  {
     // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock
     NCCL_TRY(ncclGroupStart());
     NCCL_TRY(ncclSend(sendbuf, sendsize, ncclUint8, dest, nccl_comm_, stream));
-    NCCL_TRY(
-      ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream));
+    NCCL_TRY(ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream));
     NCCL_TRY(ncclGroupEnd());
   }
 
   void device_multicast_sendrecv(const void* sendbuf,
                                  std::vector<size_t> const& sendsizes,
                                  std::vector<size_t> const& sendoffsets,
-                                 std::vector<int> const& dests, void* recvbuf,
+                                 std::vector<int> const& dests,
+                                 void* recvbuf,
                                  std::vector<size_t> const& recvsizes,
                                  std::vector<size_t> const& recvoffsets,
                                  std::vector<int> const& sources,
-                                 cudaStream_t stream) const {
+                                 cudaStream_t stream) const
+  {
     // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock
     NCCL_TRY(ncclGroupStart());
     for (size_t i = 0; i < sendsizes.size(); ++i) {
       NCCL_TRY(ncclSend(static_cast<const char*>(sendbuf) + sendoffsets[i],
-                        sendsizes[i], ncclUint8, dests[i], nccl_comm_, stream));
+                        sendsizes[i],
+                        ncclUint8,
+                        dests[i],
+                        nccl_comm_,
+                        stream));
     }
     for (size_t i = 0; i < recvsizes.size(); ++i) {
       NCCL_TRY(ncclRecv(static_cast<char*>(recvbuf) + recvoffsets[i],
-                        recvsizes[i], ncclUint8, sources[i], nccl_comm_,
+                        recvsizes[i],
+                        ncclUint8,
+                        sources[i],
+                        nccl_comm_,
                         stream));
     }
     NCCL_TRY(ncclGroupEnd());
@@ -365,9 +424,10 @@ class mpi_comms : public comms_iface {
   mutable std::unordered_set<request_t> free_requests_;
 };
 
-inline void initialize_mpi_comms(handle_t* handle, MPI_Comm comm) {
-  auto communicator = std::make_shared<comms_t>(
-    std::unique_ptr<comms_iface>(new mpi_comms(comm, true)));
+inline void initialize_mpi_comms(handle_t* handle, MPI_Comm comm)
+{
+  auto communicator =
+    std::make_shared<comms_t>(std::unique_ptr<comms_iface>(new mpi_comms(comm, true)));
   handle->set_comms(communicator);
 };
 
diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp
index 47559b1718..1647c29667 100644
--- a/cpp/include/raft/comms/std_comms.hpp
+++ b/cpp/include/raft/comms/std_comms.hpp
@@ -64,9 +64,13 @@ class std_comms : public comms_iface {
    * @param stream cuda stream for synchronizing and ordering collective operations
    * @param subcomms_ucp use ucp for subcommunicators
    */
-  std_comms(ncclComm_t nccl_comm, ucp_worker_h ucp_worker,
-            std::shared_ptr<ucp_ep_h *> eps, int num_ranks, int rank,
-            cudaStream_t stream, bool subcomms_ucp = true)
+  std_comms(ncclComm_t nccl_comm,
+            ucp_worker_h ucp_worker,
+            std::shared_ptr<ucp_ep_h*> eps,
+            int num_ranks,
+            int rank,
+            cudaStream_t stream,
+            bool subcomms_ucp = true)
     : nccl_comm_(nccl_comm),
       stream_(stream),
       status_(2, stream),
@@ -75,7 +79,8 @@ class std_comms : public comms_iface {
       subcomms_ucp_(subcomms_ucp),
       ucp_worker_(ucp_worker),
       ucp_eps_(eps),
-      next_request_id_(0) {
+      next_request_id_(0)
+  {
     initialize();
   };
 
@@ -86,18 +91,19 @@ class std_comms : public comms_iface {
    * @param rank rank of the current worker
    * @param stream stream for ordering collective operations
    */
-  std_comms(const ncclComm_t nccl_comm, int num_ranks, int rank,
-            cudaStream_t stream)
+  std_comms(const ncclComm_t nccl_comm, int num_ranks, int rank, cudaStream_t stream)
     : nccl_comm_(nccl_comm),
       stream_(stream),
       status_(2, stream),
       num_ranks_(num_ranks),
       rank_(rank),
-      subcomms_ucp_(false) {
+      subcomms_ucp_(false)
+  {
     initialize();
   };
 
-  void initialize() {
+  void initialize()
+  {
     sendbuff_ = status_.data();
     recvbuff_ = status_.data() + 1;
   }
@@ -106,17 +112,16 @@ class std_comms : public comms_iface {
 
   int get_rank() const { return rank_; }
 
-  std::unique_ptr<comms_iface> comm_split(int color, int key) const {
+  std::unique_ptr<comms_iface> comm_split(int color, int key) const
+  {
     rmm::device_uvector<int> d_colors(get_size(), stream_);
     rmm::device_uvector<int> d_keys(get_size(), stream_);
 
     update_device(d_colors.data() + get_rank(), &color, 1, stream_);
     update_device(d_keys.data() + get_rank(), &key, 1, stream_);
 
-    allgather(d_colors.data() + get_rank(), d_colors.data(), 1,
-              datatype_t::INT32, stream_);
-    allgather(d_keys.data() + get_rank(), d_keys.data(), 1, datatype_t::INT32,
-              stream_);
+    allgather(d_colors.data() + get_rank(), d_colors.data(), 1, datatype_t::INT32, stream_);
+    allgather(d_keys.data() + get_rank(), d_keys.data(), 1, datatype_t::INT32, stream_);
     this->sync_stream(stream_);
 
     std::vector<int> h_colors(get_size());
@@ -133,9 +138,7 @@ class std_comms : public comms_iface {
     for (int i = 0; i < get_size(); ++i) {
       if (h_colors[i] == color) {
         subcomm_ranks.push_back(i);
-        if (ucp_worker_ != nullptr && subcomms_ucp_) {
-          new_ucx_ptrs.push_back((*ucp_eps_)[i]);
-        }
+        if (ucp_worker_ != nullptr && subcomms_ucp_) { new_ucx_ptrs.push_back((*ucp_eps_)[i]); }
       }
     }
 
@@ -144,8 +147,7 @@ class std_comms : public comms_iface {
       NCCL_TRY(ncclGetUniqueId(&id));
       std::vector<request_t> requests(subcomm_ranks.size() - 1);
       for (size_t i = 1; i < subcomm_ranks.size(); ++i) {
-        isend(&id, sizeof(ncclUniqueId), subcomm_ranks[i], color,
-              requests.data() + (i - 1));
+        isend(&id, sizeof(ncclUniqueId), subcomm_ranks[i], color, requests.data() + (i - 1));
       }
       waitall(requests.size(), requests.data());
     } else {
@@ -160,17 +162,22 @@ class std_comms : public comms_iface {
     NCCL_TRY(ncclCommInitRank(&nccl_comm, subcomm_ranks.size(), id, key));
 
     if (ucp_worker_ != nullptr && subcomms_ucp_) {
-      auto eps_sp = std::make_shared<ucp_ep_h *>(new_ucx_ptrs.data());
-      return std::unique_ptr<comms_iface>(
-        new std_comms(nccl_comm, (ucp_worker_h)ucp_worker_, eps_sp,
-                      subcomm_ranks.size(), key, stream_, subcomms_ucp_));
+      auto eps_sp = std::make_shared<ucp_ep_h*>(new_ucx_ptrs.data());
+      return std::unique_ptr<comms_iface>(new std_comms(nccl_comm,
+                                                        (ucp_worker_h)ucp_worker_,
+                                                        eps_sp,
+                                                        subcomm_ranks.size(),
+                                                        key,
+                                                        stream_,
+                                                        subcomms_ucp_));
     } else {
       return std::unique_ptr<comms_iface>(
         new std_comms(nccl_comm, subcomm_ranks.size(), key, stream_));
     }
   }
 
-  void barrier() const {
+  void barrier() const
+  {
     CUDA_CHECK(cudaMemsetAsync(sendbuff_, 1, sizeof(int), stream_));
     CUDA_CHECK(cudaMemsetAsync(recvbuff_, 1, sizeof(int), stream_));
 
@@ -180,39 +187,37 @@ class std_comms : public comms_iface {
            "ERROR: syncStream failed. This can be caused by a failed rank_.");
   }
 
-  void get_request_id(request_t *req) const {
+  void get_request_id(request_t* req) const
+  {
     request_t req_id;
 
     if (this->free_requests_.empty())
       req_id = this->next_request_id_++;
     else {
       auto it = this->free_requests_.begin();
-      req_id = *it;
+      req_id  = *it;
       this->free_requests_.erase(it);
     }
     *req = req_id;
   }
 
-  void isend(const void *buf, size_t size, int dest, int tag,
-             request_t *request) const {
-    ASSERT(ucp_worker_ != nullptr,
-           "ERROR: UCX comms not initialized on communicator.");
+  void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const
+  {
+    ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator.");
 
     get_request_id(request);
     ucp_ep_h ep_ptr = (*ucp_eps_)[dest];
 
-    ucp_request *ucp_req = (ucp_request *)malloc(sizeof(ucp_request));
+    ucp_request* ucp_req = (ucp_request*)malloc(sizeof(ucp_request));
 
-    this->ucp_handler_.ucp_isend(ucp_req, ep_ptr, buf, size, tag,
-                                 default_tag_mask, get_rank());
+    this->ucp_handler_.ucp_isend(ucp_req, ep_ptr, buf, size, tag, default_tag_mask, get_rank());
 
     requests_in_flight_.insert(std::make_pair(*request, ucp_req));
   }
 
-  void irecv(void *buf, size_t size, int source, int tag,
-             request_t *request) const {
-    ASSERT(ucp_worker_ != nullptr,
-           "ERROR: UCX comms not initialized on communicator.");
+  void irecv(void* buf, size_t size, int source, int tag, request_t* request) const
+  {
+    ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator.");
 
     get_request_id(request);
 
@@ -220,18 +225,17 @@ class std_comms : public comms_iface {
 
     ucp_tag_t tag_mask = default_tag_mask;
 
-    ucp_request *ucp_req = (ucp_request *)malloc(sizeof(ucp_request));
-    ucp_handler_.ucp_irecv(ucp_req, ucp_worker_, ep_ptr, buf, size, tag,
-                           tag_mask, source);
+    ucp_request* ucp_req = (ucp_request*)malloc(sizeof(ucp_request));
+    ucp_handler_.ucp_irecv(ucp_req, ucp_worker_, ep_ptr, buf, size, tag, tag_mask, source);
 
     requests_in_flight_.insert(std::make_pair(*request, ucp_req));
   }
 
-  void waitall(int count, request_t array_of_requests[]) const {
-    ASSERT(ucp_worker_ != nullptr,
-           "ERROR: UCX comms not initialized on communicator.");
+  void waitall(int count, request_t array_of_requests[]) const
+  {
+    ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator.");
 
-    std::vector<ucp_request *> requests;
+    std::vector<ucp_request*> requests;
     requests.reserve(count);
 
     time_t start = time(NULL);
@@ -239,7 +243,8 @@ class std_comms : public comms_iface {
     for (int i = 0; i < count; ++i) {
       auto req_it = requests_in_flight_.find(array_of_requests[i]);
       ASSERT(requests_in_flight_.end() != req_it,
-             "ERROR: waitall on invalid request: %d", array_of_requests[i]);
+             "ERROR: waitall on invalid request: %d",
+             array_of_requests[i]);
       requests.push_back(req_it->second);
       free_requests_.insert(req_it->first);
       requests_in_flight_.erase(req_it);
@@ -252,8 +257,7 @@ class std_comms : public comms_iface {
       // in 10 or more seconds.
       ASSERT(now - start < 10, "Timed out waiting for requests.");
 
-      for (std::vector<ucp_request *>::iterator it = requests.begin();
-           it != requests.end();) {
+      for (std::vector<ucp_request*>::iterator it = requests.begin(); it != requests.end();) {
         bool restart = false;  // resets the timeout when any progress was made
 
         // Causes UCP to progress through the send/recv message queue
@@ -266,10 +270,8 @@ class std_comms : public comms_iface {
         // If the message needs release, we know it will be sent/received
         // asynchronously, so we will need to track and verify its state
         if (req->needs_release) {
-          ASSERT(UCS_PTR_IS_PTR(req->req),
-                 "UCX Request Error. Request is not valid UCX pointer");
-          ASSERT(!UCS_PTR_IS_ERR(req->req), "UCX Request Error: %d\n",
-                 UCS_PTR_STATUS(req->req));
+          ASSERT(UCS_PTR_IS_PTR(req->req), "UCX Request Error. Request is not valid UCX pointer");
+          ASSERT(!UCS_PTR_IS_ERR(req->req), "UCX Request Error: %d\n", UCS_PTR_STATUS(req->req));
           ASSERT(req->req->completed == 1 || req->req->completed == 0,
                  "request->completed not a valid value: %d\n",
                  req->req->completed);
@@ -290,101 +292,154 @@ class std_comms : public comms_iface {
           ++it;
         }
         // if any progress was made, reset the timeout start time
-        if (restart) {
-          start = time(NULL);
-        }
+        if (restart) { start = time(NULL); }
       }
     }
   }
 
-  void allreduce(const void *sendbuff, void *recvbuff, size_t count,
-                 datatype_t datatype, op_t op, cudaStream_t stream) const {
-    NCCL_TRY(ncclAllReduce(sendbuff, recvbuff, count,
-                           get_nccl_datatype(datatype), get_nccl_op(op),
-                           nccl_comm_, stream));
+  void allreduce(const void* sendbuff,
+                 void* recvbuff,
+                 size_t count,
+                 datatype_t datatype,
+                 op_t op,
+                 cudaStream_t stream) const
+  {
+    NCCL_TRY(ncclAllReduce(
+      sendbuff, recvbuff, count, get_nccl_datatype(datatype), get_nccl_op(op), nccl_comm_, stream));
   }
 
-  void bcast(void *buff, size_t count, datatype_t datatype, int root,
-             cudaStream_t stream) const {
-    NCCL_TRY(ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root,
-                           nccl_comm_, stream));
+  void bcast(void* buff, size_t count, datatype_t datatype, int root, cudaStream_t stream) const
+  {
+    NCCL_TRY(
+      ncclBroadcast(buff, buff, count, get_nccl_datatype(datatype), root, nccl_comm_, stream));
   }
 
-  void bcast(const void *sendbuff, void *recvbuff, size_t count,
-             datatype_t datatype, int root, cudaStream_t stream) const {
-    NCCL_TRY(ncclBroadcast(sendbuff, recvbuff, count,
-                           get_nccl_datatype(datatype), root, nccl_comm_,
-                           stream));
+  void bcast(const void* sendbuff,
+             void* recvbuff,
+             size_t count,
+             datatype_t datatype,
+             int root,
+             cudaStream_t stream) const
+  {
+    NCCL_TRY(ncclBroadcast(
+      sendbuff, recvbuff, count, get_nccl_datatype(datatype), root, nccl_comm_, stream));
   }
 
-  void reduce(const void *sendbuff, void *recvbuff, size_t count,
-              datatype_t datatype, op_t op, int root,
-              cudaStream_t stream) const {
-    NCCL_TRY(ncclReduce(sendbuff, recvbuff, count, get_nccl_datatype(datatype),
-                        get_nccl_op(op), root, nccl_comm_, stream));
+  void reduce(const void* sendbuff,
+              void* recvbuff,
+              size_t count,
+              datatype_t datatype,
+              op_t op,
+              int root,
+              cudaStream_t stream) const
+  {
+    NCCL_TRY(ncclReduce(sendbuff,
+                        recvbuff,
+                        count,
+                        get_nccl_datatype(datatype),
+                        get_nccl_op(op),
+                        root,
+                        nccl_comm_,
+                        stream));
   }
 
-  void allgather(const void *sendbuff, void *recvbuff, size_t sendcount,
-                 datatype_t datatype, cudaStream_t stream) const {
-    NCCL_TRY(ncclAllGather(sendbuff, recvbuff, sendcount,
-                           get_nccl_datatype(datatype), nccl_comm_, stream));
+  void allgather(const void* sendbuff,
+                 void* recvbuff,
+                 size_t sendcount,
+                 datatype_t datatype,
+                 cudaStream_t stream) const
+  {
+    NCCL_TRY(ncclAllGather(
+      sendbuff, recvbuff, sendcount, get_nccl_datatype(datatype), nccl_comm_, stream));
   }
 
-  void allgatherv(const void *sendbuf, void *recvbuf, const size_t *recvcounts,
-                  const size_t *displs, datatype_t datatype,
-                  cudaStream_t stream) const {
-    //From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" - https://arxiv.org/pdf/1812.05964.pdf
-    //Listing 1 on page 4.
+  void allgatherv(const void* sendbuf,
+                  void* recvbuf,
+                  const size_t* recvcounts,
+                  const size_t* displs,
+                  datatype_t datatype,
+                  cudaStream_t stream) const
+  {
+    // From: "An Empirical Evaluation of Allgatherv on Multi-GPU Systems" -
+    // https://arxiv.org/pdf/1812.05964.pdf Listing 1 on page 4.
     for (int root = 0; root < num_ranks_; ++root) {
       size_t dtype_size = get_datatype_size(datatype);
-      NCCL_TRY(ncclBroadcast(
-        sendbuf, static_cast<char *>(recvbuf) + displs[root] * dtype_size,
-        recvcounts[root], get_nccl_datatype(datatype), root, nccl_comm_,
-        stream));
+      NCCL_TRY(ncclBroadcast(sendbuf,
+                             static_cast<char*>(recvbuf) + displs[root] * dtype_size,
+                             recvcounts[root],
+                             get_nccl_datatype(datatype),
+                             root,
+                             nccl_comm_,
+                             stream));
     }
   }
 
-  void gather(const void *sendbuff, void *recvbuff, size_t sendcount,
-              datatype_t datatype, int root, cudaStream_t stream) const {
+  void gather(const void* sendbuff,
+              void* recvbuff,
+              size_t sendcount,
+              datatype_t datatype,
+              int root,
+              cudaStream_t stream) const
+  {
     size_t dtype_size = get_datatype_size(datatype);
     NCCL_TRY(ncclGroupStart());
     if (get_rank() == root) {
       for (int r = 0; r < get_size(); ++r) {
-        NCCL_TRY(ncclRecv(
-          static_cast<char *>(recvbuff) + sendcount * r * dtype_size, sendcount,
-          get_nccl_datatype(datatype), r, nccl_comm_, stream));
+        NCCL_TRY(ncclRecv(static_cast<char*>(recvbuff) + sendcount * r * dtype_size,
+                          sendcount,
+                          get_nccl_datatype(datatype),
+                          r,
+                          nccl_comm_,
+                          stream));
       }
     }
-    NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root,
-                      nccl_comm_, stream));
+    NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream));
     NCCL_TRY(ncclGroupEnd());
   }
 
-  void gatherv(const void *sendbuff, void *recvbuff, size_t sendcount,
-               const size_t *recvcounts, const size_t *displs,
-               datatype_t datatype, int root, cudaStream_t stream) const {
+  void gatherv(const void* sendbuff,
+               void* recvbuff,
+               size_t sendcount,
+               const size_t* recvcounts,
+               const size_t* displs,
+               datatype_t datatype,
+               int root,
+               cudaStream_t stream) const
+  {
     size_t dtype_size = get_datatype_size(datatype);
     NCCL_TRY(ncclGroupStart());
     if (get_rank() == root) {
       for (int r = 0; r < get_size(); ++r) {
-        NCCL_TRY(ncclRecv(
-          static_cast<char *>(recvbuff) + displs[r] * dtype_size, recvcounts[r],
-          get_nccl_datatype(datatype), r, nccl_comm_, stream));
+        NCCL_TRY(ncclRecv(static_cast<char*>(recvbuff) + displs[r] * dtype_size,
+                          recvcounts[r],
+                          get_nccl_datatype(datatype),
+                          r,
+                          nccl_comm_,
+                          stream));
       }
     }
-    NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root,
-                      nccl_comm_, stream));
+    NCCL_TRY(ncclSend(sendbuff, sendcount, get_nccl_datatype(datatype), root, nccl_comm_, stream));
     NCCL_TRY(ncclGroupEnd());
   }
 
-  void reducescatter(const void *sendbuff, void *recvbuff, size_t recvcount,
-                     datatype_t datatype, op_t op, cudaStream_t stream) const {
-    NCCL_TRY(ncclReduceScatter(sendbuff, recvbuff, recvcount,
-                               get_nccl_datatype(datatype), get_nccl_op(op),
-                               nccl_comm_, stream));
+  void reducescatter(const void* sendbuff,
+                     void* recvbuff,
+                     size_t recvcount,
+                     datatype_t datatype,
+                     op_t op,
+                     cudaStream_t stream) const
+  {
+    NCCL_TRY(ncclReduceScatter(sendbuff,
+                               recvbuff,
+                               recvcount,
+                               get_nccl_datatype(datatype),
+                               get_nccl_op(op),
+                               nccl_comm_,
+                               stream));
   }
 
-  status_t sync_stream(cudaStream_t stream) const {
+  status_t sync_stream(cudaStream_t stream) const
+  {
     cudaError_t cudaErr;
     ncclResult_t ncclErr, ncclAsyncErr;
     while (1) {
@@ -417,45 +472,58 @@ class std_comms : public comms_iface {
   }
 
   // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock
-  void device_send(const void *buf, size_t size, int dest,
-                   cudaStream_t stream) const {
+  void device_send(const void* buf, size_t size, int dest, cudaStream_t stream) const
+  {
     NCCL_TRY(ncclSend(buf, size, ncclUint8, dest, nccl_comm_, stream));
   }
 
   // if a thread is sending & receiving at the same time, use device_sendrecv to avoid deadlock
-  void device_recv(void *buf, size_t size, int source,
-                   cudaStream_t stream) const {
+  void device_recv(void* buf, size_t size, int source, cudaStream_t stream) const
+  {
     NCCL_TRY(ncclRecv(buf, size, ncclUint8, source, nccl_comm_, stream));
   }
 
-  void device_sendrecv(const void *sendbuf, size_t sendsize, int dest,
-                       void *recvbuf, size_t recvsize, int source,
-                       cudaStream_t stream) const {
+  void device_sendrecv(const void* sendbuf,
+                       size_t sendsize,
+                       int dest,
+                       void* recvbuf,
+                       size_t recvsize,
+                       int source,
+                       cudaStream_t stream) const
+  {
     // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock
     NCCL_TRY(ncclGroupStart());
     NCCL_TRY(ncclSend(sendbuf, sendsize, ncclUint8, dest, nccl_comm_, stream));
-    NCCL_TRY(
-      ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream));
+    NCCL_TRY(ncclRecv(recvbuf, recvsize, ncclUint8, source, nccl_comm_, stream));
     NCCL_TRY(ncclGroupEnd());
   }
 
-  void device_multicast_sendrecv(const void *sendbuf,
-                                 std::vector<size_t> const &sendsizes,
-                                 std::vector<size_t> const &sendoffsets,
-                                 std::vector<int> const &dests, void *recvbuf,
-                                 std::vector<size_t> const &recvsizes,
-                                 std::vector<size_t> const &recvoffsets,
-                                 std::vector<int> const &sources,
-                                 cudaStream_t stream) const {
+  void device_multicast_sendrecv(const void* sendbuf,
+                                 std::vector<size_t> const& sendsizes,
+                                 std::vector<size_t> const& sendoffsets,
+                                 std::vector<int> const& dests,
+                                 void* recvbuf,
+                                 std::vector<size_t> const& recvsizes,
+                                 std::vector<size_t> const& recvoffsets,
+                                 std::vector<int> const& sources,
+                                 cudaStream_t stream) const
+  {
     // ncclSend/ncclRecv pair needs to be inside ncclGroupStart/ncclGroupEnd to avoid deadlock
     NCCL_TRY(ncclGroupStart());
     for (size_t i = 0; i < sendsizes.size(); ++i) {
-      NCCL_TRY(ncclSend(static_cast<const char *>(sendbuf) + sendoffsets[i],
-                        sendsizes[i], ncclUint8, dests[i], nccl_comm_, stream));
+      NCCL_TRY(ncclSend(static_cast<const char*>(sendbuf) + sendoffsets[i],
+                        sendsizes[i],
+                        ncclUint8,
+                        dests[i],
+                        nccl_comm_,
+                        stream));
     }
     for (size_t i = 0; i < recvsizes.size(); ++i) {
-      NCCL_TRY(ncclRecv(static_cast<char *>(recvbuf) + recvoffsets[i],
-                        recvsizes[i], ncclUint8, sources[i], nccl_comm_,
+      NCCL_TRY(ncclRecv(static_cast<char*>(recvbuf) + recvoffsets[i],
+                        recvsizes[i],
+                        ncclUint8,
+                        sources[i],
+                        nccl_comm_,
                         stream));
     }
     NCCL_TRY(ncclGroupEnd());
@@ -475,10 +543,9 @@ class std_comms : public comms_iface {
 
   comms_ucp_handler ucp_handler_;
   ucp_worker_h ucp_worker_;
-  std::shared_ptr<ucp_ep_h *> ucp_eps_;
+  std::shared_ptr<ucp_ep_h*> ucp_eps_;
   mutable request_t next_request_id_;
-  mutable std::unordered_map<request_t, struct ucp_request *>
-    requests_in_flight_;
+  mutable std::unordered_map<request_t, struct ucp_request*> requests_in_flight_;
   mutable std::unordered_set<request_t> free_requests_;
 };
 }  // end namespace comms
diff --git a/cpp/include/raft/comms/test.hpp b/cpp/include/raft/comms/test.hpp
index 39086de25d..5f87bf41fa 100644
--- a/cpp/include/raft/comms/test.hpp
+++ b/cpp/include/raft/comms/test.hpp
@@ -35,24 +35,23 @@ namespace comms {
  *
  * @param[in] handle the raft handle to use. This is expected to already have an
  *        initialized comms instance.
-*  @param[in] root the root rank id
+ *  @param[in] root the root rank id
  */
-bool test_collective_allreduce(const handle_t &handle, int root) {
-  comms_t const &communicator = handle.get_comms();
+bool test_collective_allreduce(const handle_t& handle, int root)
+{
+  comms_t const& communicator = handle.get_comms();
 
   int const send = 1;
 
   cudaStream_t stream = handle.get_stream();
 
   rmm::device_scalar<int> temp_d(stream);
-  CUDA_CHECK(
-    cudaMemcpyAsync(temp_d.data(), &send, 1, cudaMemcpyHostToDevice, stream));
+  CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, 1, cudaMemcpyHostToDevice, stream));
 
   communicator.allreduce(temp_d.data(), temp_d.data(), 1, op_t::SUM, stream);
 
   int temp_h = 0;
-  CUDA_CHECK(
-    cudaMemcpyAsync(&temp_h, temp_d.data(), 1, cudaMemcpyDeviceToHost, stream));
+  CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), 1, cudaMemcpyDeviceToHost, stream));
   CUDA_CHECK(cudaStreamSynchronize(stream));
   communicator.barrier();
 
@@ -67,10 +66,11 @@ bool test_collective_allreduce(const handle_t &handle, int root) {
  *
  * @param[in] handle the raft handle to use. This is expected to already have an
  *        initialized comms instance.
-*  @param[in] root the root rank id
+ *  @param[in] root the root rank id
  */
-bool test_collective_broadcast(const handle_t &handle, int root) {
-  comms_t const &communicator = handle.get_comms();
+bool test_collective_broadcast(const handle_t& handle, int root)
+{
+  comms_t const& communicator = handle.get_comms();
 
   int const send = root;
 
@@ -79,14 +79,12 @@ bool test_collective_broadcast(const handle_t &handle, int root) {
   rmm::device_scalar<int> temp_d(stream);
 
   if (communicator.get_rank() == root)
-    CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int),
-                               cudaMemcpyHostToDevice, stream));
+    CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream));
 
   communicator.bcast(temp_d.data(), 1, root, stream);
   communicator.sync_stream(stream);
   int temp_h = -1;  // Verify more than one byte is being sent
-  CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int),
-                             cudaMemcpyDeviceToHost, stream));
+  CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream));
   CUDA_CHECK(cudaStreamSynchronize(stream));
   communicator.barrier();
 
@@ -101,10 +99,11 @@ bool test_collective_broadcast(const handle_t &handle, int root) {
  *
  * @param[in] handle the raft handle to use. This is expected to already have an
  *        initialized comms instance.
-*  @param[in] root the root rank id
+ *  @param[in] root the root rank id
  */
-bool test_collective_reduce(const handle_t &handle, int root) {
-  comms_t const &communicator = handle.get_comms();
+bool test_collective_reduce(const handle_t& handle, int root)
+{
+  comms_t const& communicator = handle.get_comms();
 
   int const send = root;
 
@@ -112,14 +111,12 @@ bool test_collective_reduce(const handle_t &handle, int root) {
 
   rmm::device_scalar<int> temp_d(stream);
 
-  CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int),
-                             cudaMemcpyHostToDevice, stream));
+  CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream));
 
   communicator.reduce(temp_d.data(), temp_d.data(), 1, op_t::SUM, root, stream);
   communicator.sync_stream(stream);
   int temp_h = -1;  // Verify more than one byte is being sent
-  CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int),
-                             cudaMemcpyDeviceToHost, stream));
+  CUDA_CHECK(cudaMemcpyAsync(&temp_h, temp_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream));
   CUDA_CHECK(cudaStreamSynchronize(stream));
   communicator.barrier();
 
@@ -137,10 +134,11 @@ bool test_collective_reduce(const handle_t &handle, int root) {
  *
  * @param[in] handle the raft handle to use. This is expected to already have an
  *        initialized comms instance.
-*  @param[in] root the root rank id
+ *  @param[in] root the root rank id
  */
-bool test_collective_allgather(const handle_t &handle, int root) {
-  comms_t const &communicator = handle.get_comms();
+bool test_collective_allgather(const handle_t& handle, int root)
+{
+  comms_t const& communicator = handle.get_comms();
 
   int const send = communicator.get_rank();
 
@@ -149,16 +147,13 @@ bool test_collective_allgather(const handle_t &handle, int root) {
   rmm::device_scalar<int> temp_d(stream);
   rmm::device_uvector<int> recv_d(communicator.get_size(), stream);
 
-  CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int),
-                             cudaMemcpyHostToDevice, stream));
+  CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream));
 
   communicator.allgather(temp_d.data(), recv_d.data(), 1, stream);
   communicator.sync_stream(stream);
-  int
-    temp_h[communicator.get_size()];  // Verify more than one byte is being sent
-  CUDA_CHECK(cudaMemcpyAsync(&temp_h, recv_d.data(),
-                             sizeof(int) * communicator.get_size(),
-                             cudaMemcpyDeviceToHost, stream));
+  int temp_h[communicator.get_size()];  // Verify more than one byte is being sent
+  CUDA_CHECK(cudaMemcpyAsync(
+    &temp_h, recv_d.data(), sizeof(int) * communicator.get_size(), cudaMemcpyDeviceToHost, stream));
   CUDA_CHECK(cudaStreamSynchronize(stream));
   communicator.barrier();
 
@@ -176,30 +171,29 @@ bool test_collective_allgather(const handle_t &handle, int root) {
  *
  * @param[in] handle the raft handle to use. This is expected to already have an
  *        initialized comms instance.
-*  @param[in] root the root rank id
+ *  @param[in] root the root rank id
  */
-bool test_collective_gather(const handle_t &handle, int root) {
-  comms_t const &communicator = handle.get_comms();
+bool test_collective_gather(const handle_t& handle, int root)
+{
+  comms_t const& communicator = handle.get_comms();
 
   int const send = communicator.get_rank();
 
   cudaStream_t stream = handle.get_stream();
 
   rmm::device_scalar<int> temp_d(stream);
-  rmm::device_uvector<int> recv_d(
-    communicator.get_rank() == root ? communicator.get_size() : 0, stream);
+  rmm::device_uvector<int> recv_d(communicator.get_rank() == root ? communicator.get_size() : 0,
+                                  stream);
 
-  CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int),
-                             cudaMemcpyHostToDevice, stream));
+  CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), &send, sizeof(int), cudaMemcpyHostToDevice, stream));
 
   communicator.gather(temp_d.data(), recv_d.data(), 1, root, stream);
   communicator.sync_stream(stream);
 
   if (communicator.get_rank() == root) {
     std::vector<int> temp_h(communicator.get_size(), 0);
-    CUDA_CHECK(cudaMemcpyAsync(temp_h.data(), recv_d.data(),
-                               sizeof(int) * temp_h.size(),
-                               cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaMemcpyAsync(
+      temp_h.data(), recv_d.data(), sizeof(int) * temp_h.size(), cudaMemcpyDeviceToHost, stream));
     CUDA_CHECK(cudaStreamSynchronize(stream));
 
     for (int i = 0; i < communicator.get_size(); i++) {
@@ -214,45 +208,47 @@ bool test_collective_gather(const handle_t &handle, int root) {
  *
  * @param[in] handle the raft handle to use. This is expected to already have an
  *        initialized comms instance.
-*  @param[in] root the root rank id
+ *  @param[in] root the root rank id
  */
-bool test_collective_gatherv(const handle_t &handle, int root) {
-  comms_t const &communicator = handle.get_comms();
+bool test_collective_gatherv(const handle_t& handle, int root)
+{
+  comms_t const& communicator = handle.get_comms();
 
   std::vector<size_t> sendcounts(communicator.get_size());
   std::iota(sendcounts.begin(), sendcounts.end(), size_t{1});
   std::vector<size_t> displacements(communicator.get_size() + 1, 0);
-  std::partial_sum(sendcounts.begin(), sendcounts.end(),
-                   displacements.begin() + 1);
+  std::partial_sum(sendcounts.begin(), sendcounts.end(), displacements.begin() + 1);
 
-  std::vector<int> sends(displacements[communicator.get_rank() + 1] -
-                           displacements[communicator.get_rank()],
-                         communicator.get_rank());
+  std::vector<int> sends(
+    displacements[communicator.get_rank() + 1] - displacements[communicator.get_rank()],
+    communicator.get_rank());
 
   cudaStream_t stream = handle.get_stream();
 
   rmm::device_uvector<int> temp_d(sends.size(), stream);
-  rmm::device_uvector<int> recv_d(
-    communicator.get_rank() == root ? displacements.back() : 0, stream);
+  rmm::device_uvector<int> recv_d(communicator.get_rank() == root ? displacements.back() : 0,
+                                  stream);
 
-  CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(),
-                             sends.size() * sizeof(int), cudaMemcpyHostToDevice,
-                             stream));
+  CUDA_CHECK(cudaMemcpyAsync(
+    temp_d.data(), sends.data(), sends.size() * sizeof(int), cudaMemcpyHostToDevice, stream));
 
   communicator.gatherv(
-    temp_d.data(), recv_d.data(), temp_d.size(),
-    communicator.get_rank() == root ? sendcounts.data()
-                                    : static_cast<size_t *>(nullptr),
-    communicator.get_rank() == root ? displacements.data()
-                                    : static_cast<size_t *>(nullptr),
-    root, stream);
+    temp_d.data(),
+    recv_d.data(),
+    temp_d.size(),
+    communicator.get_rank() == root ? sendcounts.data() : static_cast<size_t*>(nullptr),
+    communicator.get_rank() == root ? displacements.data() : static_cast<size_t*>(nullptr),
+    root,
+    stream);
   communicator.sync_stream(stream);
 
   if (communicator.get_rank() == root) {
     std::vector<int> temp_h(displacements.back(), 0);
-    CUDA_CHECK(cudaMemcpyAsync(temp_h.data(), recv_d.data(),
+    CUDA_CHECK(cudaMemcpyAsync(temp_h.data(),
+                               recv_d.data(),
                                sizeof(int) * displacements.back(),
-                               cudaMemcpyDeviceToHost, stream));
+                               cudaMemcpyDeviceToHost,
+                               stream));
     CUDA_CHECK(cudaStreamSynchronize(stream));
 
     for (int i = 0; i < communicator.get_size(); i++) {
@@ -271,10 +267,11 @@ bool test_collective_gatherv(const handle_t &handle, int root) {
  *
  * @param[in] handle the raft handle to use. This is expected to already have an
  *        initialized comms instance.
-*  @param[in] root the root rank id
+ *  @param[in] root the root rank id
  */
-bool test_collective_reducescatter(const handle_t &handle, int root) {
-  comms_t const &communicator = handle.get_comms();
+bool test_collective_reducescatter(const handle_t& handle, int root)
+{
+  comms_t const& communicator = handle.get_comms();
 
   std::vector<int> sends(communicator.get_size(), 1);
 
@@ -283,16 +280,13 @@ bool test_collective_reducescatter(const handle_t &handle, int root) {
   rmm::device_uvector<int> temp_d(sends.size(), stream);
   rmm::device_scalar<int> recv_d(stream);
 
-  CUDA_CHECK(cudaMemcpyAsync(temp_d.data(), sends.data(),
-                             sends.size() * sizeof(int), cudaMemcpyHostToDevice,
-                             stream));
+  CUDA_CHECK(cudaMemcpyAsync(
+    temp_d.data(), sends.data(), sends.size() * sizeof(int), cudaMemcpyHostToDevice, stream));
 
-  communicator.reducescatter(temp_d.data(), recv_d.data(), 1, op_t::SUM,
-                             stream);
+  communicator.reducescatter(temp_d.data(), recv_d.data(), 1, op_t::SUM, stream);
   communicator.sync_stream(stream);
   int temp_h = -1;  // Verify more than one byte is being sent
-  CUDA_CHECK(cudaMemcpyAsync(&temp_h, recv_d.data(), sizeof(int),
-                             cudaMemcpyDeviceToHost, stream));
+  CUDA_CHECK(cudaMemcpyAsync(&temp_h, recv_d.data(), sizeof(int), cudaMemcpyDeviceToHost, stream));
   CUDA_CHECK(cudaStreamSynchronize(stream));
   communicator.barrier();
 
@@ -309,9 +303,10 @@ bool test_collective_reducescatter(const handle_t &handle, int root) {
  *        initialized comms instance.
  * @param[in] numTrials number of iterations of all-to-all messaging to perform
  */
-bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) {
-  comms_t const &communicator = h.get_comms();
-  int const rank = communicator.get_rank();
+bool test_pointToPoint_simple_send_recv(const handle_t& h, int numTrials)
+{
+  comms_t const& communicator = h.get_comms();
+  int const rank              = communicator.get_rank();
 
   bool ret = true;
   for (int i = 0; i < numTrials; i++) {
@@ -320,11 +315,11 @@ bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) {
     std::vector<request_t> requests;
     requests.resize(2 * (communicator.get_size() - 1));
     int request_idx = 0;
-    //post receives
+    // post receives
     for (int r = 0; r < communicator.get_size(); ++r) {
       if (r != rank) {
-        communicator.irecv(received_data.data() + request_idx, 1, r, 0,
-                           requests.data() + request_idx);
+        communicator.irecv(
+          received_data.data() + request_idx, 1, r, 0, requests.data() + request_idx);
         ++request_idx;
       }
     }
@@ -360,8 +355,7 @@ bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) {
       communicator.barrier();
     }
 
-    if (communicator.get_rank() == 0)
-      std::cout << "=========================" << std::endl;
+    if (communicator.get_rank() == 0) std::cout << "=========================" << std::endl;
   }
 
   return ret;
@@ -374,10 +368,11 @@ bool test_pointToPoint_simple_send_recv(const handle_t &h, int numTrials) {
  *        initialized comms instance.
  * @param numTrials number of iterations of send or receive messaging to perform
  */
-bool test_pointToPoint_device_send_or_recv(const handle_t &h, int numTrials) {
-  comms_t const &communicator = h.get_comms();
-  int const rank = communicator.get_rank();
-  cudaStream_t stream = h.get_stream();
+bool test_pointToPoint_device_send_or_recv(const handle_t& h, int numTrials)
+{
+  comms_t const& communicator = h.get_comms();
+  int const rank              = communicator.get_rank();
+  cudaStream_t stream         = h.get_stream();
 
   bool ret = true;
   for (int i = 0; i < numTrials; i++) {
@@ -400,13 +395,9 @@ bool test_pointToPoint_device_send_or_recv(const handle_t &h, int numTrials) {
 
     communicator.sync_stream(stream);
 
-    if (!sender && received_data.value(stream) != rank - 1) {
-      ret = false;
-    }
+    if (!sender && received_data.value(stream) != rank - 1) { ret = false; }
 
-    if (communicator.get_rank() == 0) {
-      std::cout << "=========================" << std::endl;
-    }
+    if (communicator.get_rank() == 0) { std::cout << "=========================" << std::endl; }
   }
 
   return ret;
@@ -419,10 +410,11 @@ bool test_pointToPoint_device_send_or_recv(const handle_t &h, int numTrials) {
  *        initialized comms instance.
  * @param numTrials number of iterations of send or receive messaging to perform
  */
-bool test_pointToPoint_device_sendrecv(const handle_t &h, int numTrials) {
-  comms_t const &communicator = h.get_comms();
-  int const rank = communicator.get_rank();
-  cudaStream_t stream = h.get_stream();
+bool test_pointToPoint_device_sendrecv(const handle_t& h, int numTrials)
+{
+  comms_t const& communicator = h.get_comms();
+  int const rank              = communicator.get_rank();
+  cudaStream_t stream         = h.get_stream();
 
   bool ret = true;
   for (int i = 0; i < numTrials; i++) {
@@ -436,12 +428,12 @@ bool test_pointToPoint_device_sendrecv(const handle_t &h, int numTrials) {
 
     if (rank % 2 == 0) {
       if (rank + 1 < communicator.get_size()) {
-        communicator.device_sendrecv(sent_data.data(), 1, rank + 1,
-                                     received_data.data(), 1, rank + 1, stream);
+        communicator.device_sendrecv(
+          sent_data.data(), 1, rank + 1, received_data.data(), 1, rank + 1, stream);
       }
     } else {
-      communicator.device_sendrecv(sent_data.data(), 1, rank - 1,
-                                   received_data.data(), 1, rank - 1, stream);
+      communicator.device_sendrecv(
+        sent_data.data(), 1, rank - 1, received_data.data(), 1, rank - 1, stream);
     }
 
     communicator.sync_stream(stream);
@@ -451,9 +443,7 @@ bool test_pointToPoint_device_sendrecv(const handle_t &h, int numTrials) {
       ret = false;
     }
 
-    if (communicator.get_rank() == 0) {
-      std::cout << "=========================" << std::endl;
-    }
+    if (communicator.get_rank() == 0) { std::cout << "=========================" << std::endl; }
   }
 
   return ret;
@@ -466,11 +456,11 @@ bool test_pointToPoint_device_sendrecv(const handle_t &h, int numTrials) {
  *        initialized comms instance.
  * @param numTrials number of iterations of send or receive messaging to perform
  */
-bool test_pointToPoint_device_multicast_sendrecv(const handle_t &h,
-                                                 int numTrials) {
-  comms_t const &communicator = h.get_comms();
-  int const rank = communicator.get_rank();
-  cudaStream_t stream = h.get_stream();
+bool test_pointToPoint_device_multicast_sendrecv(const handle_t& h, int numTrials)
+{
+  comms_t const& communicator = h.get_comms();
+  int const rank              = communicator.get_rank();
+  cudaStream_t stream         = h.get_stream();
 
   bool ret = true;
   for (int i = 0; i < numTrials; i++) {
@@ -493,25 +483,26 @@ bool test_pointToPoint_device_multicast_sendrecv(const handle_t &h,
     std::vector<int> srcs(communicator.get_size());
     std::iota(srcs.begin(), srcs.end(), int{0});
 
-    communicator.device_multicast_sendrecv(
-      sent_data.data(), sendsizes, sendoffsets, dests, received_data.data(),
-      recvsizes, recvoffsets, srcs, stream);
+    communicator.device_multicast_sendrecv(sent_data.data(),
+                                           sendsizes,
+                                           sendoffsets,
+                                           dests,
+                                           received_data.data(),
+                                           recvsizes,
+                                           recvoffsets,
+                                           srcs,
+                                           stream);
 
     communicator.sync_stream(stream);
 
     std::vector<int> h_received_data(communicator.get_size());
-    raft::update_host(h_received_data.data(), received_data.data(),
-                      received_data.size(), stream);
+    raft::update_host(h_received_data.data(), received_data.data(), received_data.size(), stream);
     CUDA_TRY(cudaStreamSynchronize(stream));
     for (int i = 0; i < communicator.get_size(); ++i) {
-      if (h_received_data[i] != i) {
-        ret = false;
-      }
+      if (h_received_data[i] != i) { ret = false; }
     }
 
-    if (communicator.get_rank() == 0) {
-      std::cout << "=========================" << std::endl;
-    }
+    if (communicator.get_rank() == 0) { std::cout << "=========================" << std::endl; }
   }
 
   return ret;
@@ -524,20 +515,20 @@ bool test_pointToPoint_device_multicast_sendrecv(const handle_t &h,
  *        initialized comms instance.
  * @param n_colors number of different colors to test
  */
-bool test_commsplit(const handle_t &h, int n_colors) {
-  comms_t const &communicator = h.get_comms();
-  int const rank = communicator.get_rank();
-  int const size = communicator.get_size();
+bool test_commsplit(const handle_t& h, int n_colors)
+{
+  comms_t const& communicator = h.get_comms();
+  int const rank              = communicator.get_rank();
+  int const size              = communicator.get_size();
 
   if (n_colors > size) n_colors = size;
 
   // first we need to assign to a color, then assign the rank within the color
   int color = rank % n_colors;
-  int key = rank / n_colors;
+  int key   = rank / n_colors;
 
   handle_t new_handle(1);
-  auto shared_comm =
-    std::make_shared<comms_t>(communicator.comm_split(color, key));
+  auto shared_comm = std::make_shared<comms_t>(communicator.comm_split(color, key));
   new_handle.set_comms(shared_comm);
 
   return test_collective_allreduce(new_handle, 0);
diff --git a/cpp/include/raft/comms/ucp_helper.hpp b/cpp/include/raft/comms/ucp_helper.hpp
index 226b6f0527..89c7b25630 100644
--- a/cpp/include/raft/comms/ucp_helper.hpp
+++ b/cpp/include/raft/comms/ucp_helper.hpp
@@ -25,16 +25,19 @@
 namespace raft {
 namespace comms {
 
-typedef void (*dlsym_print_info)(ucp_ep_h, FILE *);
-typedef void (*dlsym_rec_free)(void *);
+typedef void (*dlsym_print_info)(ucp_ep_h, FILE*);
+typedef void (*dlsym_rec_free)(void*);
 typedef int (*dlsym_worker_progress)(ucp_worker_h);
 
-typedef ucs_status_ptr_t (*dlsym_send)(ucp_ep_h, const void *, size_t,
-                                       ucp_datatype_t, ucp_tag_t,
-                                       ucp_send_callback_t);
-typedef ucs_status_ptr_t (*dlsym_recv)(ucp_worker_h, void *, size_t count,
-                                       ucp_datatype_t datatype, ucp_tag_t,
-                                       ucp_tag_t, ucp_tag_recv_callback_t);
+typedef ucs_status_ptr_t (*dlsym_send)(
+  ucp_ep_h, const void*, size_t, ucp_datatype_t, ucp_tag_t, ucp_send_callback_t);
+typedef ucs_status_ptr_t (*dlsym_recv)(ucp_worker_h,
+                                       void*,
+                                       size_t count,
+                                       ucp_datatype_t datatype,
+                                       ucp_tag_t,
+                                       ucp_tag_t,
+                                       ucp_tag_recv_callback_t);
 
 /**
  * Standard UCX request object that will be passed
@@ -55,9 +58,9 @@ struct ucx_context {
  */
 class ucp_request {
  public:
-  struct ucx_context *req;
-  bool needs_release = true;
-  int other_rank = -1;
+  struct ucx_context* req;
+  bool needs_release   = true;
+  int other_rank       = -1;
   bool is_send_request = false;
 };
 
@@ -67,18 +70,19 @@ static const ucp_tag_t default_tag_mask = -1;
 /**
  * @brief Asynchronous send callback sets request to completed
  */
-static void send_callback(void *request, ucs_status_t status) {
-  struct ucx_context *context = (struct ucx_context *)request;
-  context->completed = 1;
+static void send_callback(void* request, ucs_status_t status)
+{
+  struct ucx_context* context = (struct ucx_context*)request;
+  context->completed          = 1;
 }
 
 /**
  * @brief Asynchronous recv callback sets request to completed
  */
-static void recv_callback(void *request, ucs_status_t status,
-                          ucp_tag_recv_info_t *info) {
-  struct ucx_context *context = (struct ucx_context *)request;
-  context->completed = 1;
+static void recv_callback(void* request, ucs_status_t status, ucp_tag_recv_info_t* info)
+{
+  struct ucx_context* context = (struct ucx_context*)request;
+  context->completed          = 1;
 }
 
 /**
@@ -87,7 +91,8 @@ static void recv_callback(void *request, ucs_status_t status,
  */
 class comms_ucp_handler {
  public:
-  comms_ucp_handler() {
+  comms_ucp_handler()
+  {
     load_ucp_handle();
     load_send_func();
     load_recv_func();
@@ -99,7 +104,7 @@ class comms_ucp_handler {
   ~comms_ucp_handler() { dlclose(ucp_handle); }
 
  private:
-  void *ucp_handle;
+  void* ucp_handle;
 
   dlsym_print_info print_info_func;
   dlsym_rec_free req_free_func;
@@ -107,7 +112,8 @@ class comms_ucp_handler {
   dlsym_send send_func;
   dlsym_recv recv_func;
 
-  void load_ucp_handle() {
+  void load_ucp_handle()
+  {
     ucp_handle = dlopen("libucp.so", RTLD_LAZY | RTLD_NOLOAD | RTLD_NODELETE);
     if (!ucp_handle) {
       ucp_handle = dlopen("libucp.so", RTLD_LAZY | RTLD_NODELETE);
@@ -117,51 +123,56 @@ class comms_ucp_handler {
     dlerror();
   }
 
-  void assert_dlerror() {
-    char *error = dlerror();
+  void assert_dlerror()
+  {
+    char* error = dlerror();
     ASSERT(error == NULL, "Error loading function symbol: %s\n", error);
   }
 
-  void load_send_func() {
+  void load_send_func()
+  {
     send_func = (dlsym_send)dlsym(ucp_handle, "ucp_tag_send_nb");
     assert_dlerror();
   }
 
-  void load_free_req_func() {
+  void load_free_req_func()
+  {
     req_free_func = (dlsym_rec_free)dlsym(ucp_handle, "ucp_request_free");
     assert_dlerror();
   }
 
-  void load_print_info_func() {
+  void load_print_info_func()
+  {
     print_info_func = (dlsym_print_info)dlsym(ucp_handle, "ucp_ep_print_info");
     assert_dlerror();
   }
 
-  void load_worker_progress_func() {
-    worker_progress_func =
-      (dlsym_worker_progress)dlsym(ucp_handle, "ucp_worker_progress");
+  void load_worker_progress_func()
+  {
+    worker_progress_func = (dlsym_worker_progress)dlsym(ucp_handle, "ucp_worker_progress");
     assert_dlerror();
   }
 
-  void load_recv_func() {
+  void load_recv_func()
+  {
     recv_func = (dlsym_recv)dlsym(ucp_handle, "ucp_tag_recv_nb");
     assert_dlerror();
   }
 
-  ucp_tag_t build_message_tag(int rank, int tag) const {
+  ucp_tag_t build_message_tag(int rank, int tag) const
+  {
     // keeping the rank in the lower bits enables debugging.
     return ((uint32_t)tag << 31) | (uint32_t)rank;
   }
 
  public:
-  int ucp_progress(ucp_worker_h worker) const {
-    return (*(worker_progress_func))(worker);
-  }
+  int ucp_progress(ucp_worker_h worker) const { return (*(worker_progress_func))(worker); }
 
   /**
    * @brief Frees any memory underlying the given ucp request object
    */
-  void free_ucp_request(ucp_request *request) const {
+  void free_ucp_request(ucp_request* request) const
+  {
     if (request->needs_release) {
       request->req->completed = 0;
       (*(req_free_func))(request->req);
@@ -172,56 +183,67 @@ class comms_ucp_handler {
   /**
    * @brief Asynchronously send data to the given endpoint using the given tag
    */
-  void ucp_isend(ucp_request *req, ucp_ep_h ep_ptr, const void *buf,
-                 size_t size, int tag, ucp_tag_t tag_mask, int rank) const {
+  void ucp_isend(ucp_request* req,
+                 ucp_ep_h ep_ptr,
+                 const void* buf,
+                 size_t size,
+                 int tag,
+                 ucp_tag_t tag_mask,
+                 int rank) const
+  {
     ucp_tag_t ucp_tag = build_message_tag(rank, tag);
 
-    ucs_status_ptr_t send_result = (*(send_func))(
-      ep_ptr, buf, size, ucp_dt_make_contig(1), ucp_tag, send_callback);
-    struct ucx_context *ucp_req = (struct ucx_context *)send_result;
+    ucs_status_ptr_t send_result =
+      (*(send_func))(ep_ptr, buf, size, ucp_dt_make_contig(1), ucp_tag, send_callback);
+    struct ucx_context* ucp_req = (struct ucx_context*)send_result;
 
     if (UCS_PTR_IS_ERR(send_result)) {
       ASSERT(!UCS_PTR_IS_ERR(send_result),
              "unable to send UCX data message (%d)\n",
              UCS_PTR_STATUS(send_result));
       /**
-     * If the request didn't fail, but it's not OK, it is in flight.
-     * Expect the handler to be invoked
-     */
+       * If the request didn't fail, but it's not OK, it is in flight.
+       * Expect the handler to be invoked
+       */
     } else if (UCS_PTR_STATUS(send_result) != UCS_OK) {
       /**
-      * If the request is OK, it's already been completed and we don't need to wait on it.
-      * The request will be a nullptr, however, so we need to create a new request
-      * and set it to completed to make the "waitall()" function work properly.
-      */
+       * If the request is OK, it's already been completed and we don't need to wait on it.
+       * The request will be a nullptr, however, so we need to create a new request
+       * and set it to completed to make the "waitall()" function work properly.
+       */
       req->needs_release = true;
     } else {
       req->needs_release = false;
     }
 
-    req->other_rank = rank;
+    req->other_rank      = rank;
     req->is_send_request = true;
-    req->req = ucp_req;
+    req->req             = ucp_req;
   }
 
   /**
    * @brief Asynchronously receive data from given endpoint with the given tag.
    */
-  void ucp_irecv(ucp_request *req, ucp_worker_h worker, ucp_ep_h ep_ptr,
-                 void *buf, size_t size, int tag, ucp_tag_t tag_mask,
-                 int sender_rank) const {
+  void ucp_irecv(ucp_request* req,
+                 ucp_worker_h worker,
+                 ucp_ep_h ep_ptr,
+                 void* buf,
+                 size_t size,
+                 int tag,
+                 ucp_tag_t tag_mask,
+                 int sender_rank) const
+  {
     ucp_tag_t ucp_tag = build_message_tag(sender_rank, tag);
 
     ucs_status_ptr_t recv_result =
-      (*(recv_func))(worker, buf, size, ucp_dt_make_contig(1), ucp_tag,
-                     tag_mask, recv_callback);
+      (*(recv_func))(worker, buf, size, ucp_dt_make_contig(1), ucp_tag, tag_mask, recv_callback);
 
-    struct ucx_context *ucp_req = (struct ucx_context *)recv_result;
+    struct ucx_context* ucp_req = (struct ucx_context*)recv_result;
 
-    req->req = ucp_req;
-    req->needs_release = true;
+    req->req             = ucp_req;
+    req->needs_release   = true;
     req->is_send_request = false;
-    req->other_rank = sender_rank;
+    req->other_rank      = sender_rank;
 
     ASSERT(!UCS_PTR_IS_ERR(recv_result),
            "unable to receive UCX data message (%d)\n",
diff --git a/cpp/include/raft/comms/util.hpp b/cpp/include/raft/comms/util.hpp
index f3216abc37..1b0548fc00 100644
--- a/cpp/include/raft/comms/util.hpp
+++ b/cpp/include/raft/comms/util.hpp
@@ -26,88 +26,70 @@
  * Invokes a NCCL runtime API function call, if the call does not return ncclSuccess, throws an
  * exception detailing the NCCL error that occurred
  */
-#define NCCL_TRY(call)                                                        \
-  do {                                                                        \
-    ncclResult_t const status = (call);                                       \
-    if (ncclSuccess != status) {                                              \
-      std::string msg{};                                                      \
-      SET_ERROR_MSG(msg,                                                      \
-                    "NCCL error encountered at: ", "call='%s', Reason=%d:%s", \
-                    #call, status, ncclGetErrorString(status));               \
-      throw raft::logic_error(msg);                                           \
-    }                                                                         \
+#define NCCL_TRY(call)                             \
+  do {                                             \
+    ncclResult_t const status = (call);            \
+    if (ncclSuccess != status) {                   \
+      std::string msg{};                           \
+      SET_ERROR_MSG(msg,                           \
+                    "NCCL error encountered at: ", \
+                    "call='%s', Reason=%d:%s",     \
+                    #call,                         \
+                    status,                        \
+                    ncclGetErrorString(status));   \
+      throw raft::logic_error(msg);                \
+    }                                              \
   } while (0);
 
-#define NCCL_TRY_NO_THROW(call)                           \
-  do {                                                    \
-    ncclResult_t status = call;                           \
-    if (ncclSuccess != status) {                          \
-      printf("NCCL call='%s' failed. Reason:%s\n", #call, \
-             ncclGetErrorString(status));                 \
-    }                                                     \
+#define NCCL_TRY_NO_THROW(call)                                                        \
+  do {                                                                                 \
+    ncclResult_t status = call;                                                        \
+    if (ncclSuccess != status) {                                                       \
+      printf("NCCL call='%s' failed. Reason:%s\n", #call, ncclGetErrorString(status)); \
+    }                                                                                  \
   } while (0)
 
 namespace raft {
 namespace comms {
 
-constexpr size_t get_datatype_size(const datatype_t datatype) {
+constexpr size_t get_datatype_size(const datatype_t datatype)
+{
   switch (datatype) {
-    case datatype_t::CHAR:
-      return sizeof(char);
-    case datatype_t::UINT8:
-      return sizeof(uint8_t);
-    case datatype_t::INT32:
-      return sizeof(int);
-    case datatype_t::UINT32:
-      return sizeof(unsigned int);
-    case datatype_t::INT64:
-      return sizeof(int64_t);
-    case datatype_t::UINT64:
-      return sizeof(uint64_t);
-    case datatype_t::FLOAT32:
-      return sizeof(float);
-    case datatype_t::FLOAT64:
-      return sizeof(double);
-    default:
-      throw "Unsupported datatype";
+    case datatype_t::CHAR: return sizeof(char);
+    case datatype_t::UINT8: return sizeof(uint8_t);
+    case datatype_t::INT32: return sizeof(int);
+    case datatype_t::UINT32: return sizeof(unsigned int);
+    case datatype_t::INT64: return sizeof(int64_t);
+    case datatype_t::UINT64: return sizeof(uint64_t);
+    case datatype_t::FLOAT32: return sizeof(float);
+    case datatype_t::FLOAT64: return sizeof(double);
+    default: throw "Unsupported datatype";
   }
 }
 
-constexpr ncclDataType_t get_nccl_datatype(const datatype_t datatype) {
+constexpr ncclDataType_t get_nccl_datatype(const datatype_t datatype)
+{
   switch (datatype) {
-    case datatype_t::CHAR:
-      return ncclChar;
-    case datatype_t::UINT8:
-      return ncclUint8;
-    case datatype_t::INT32:
-      return ncclInt;
-    case datatype_t::UINT32:
-      return ncclUint32;
-    case datatype_t::INT64:
-      return ncclInt64;
-    case datatype_t::UINT64:
-      return ncclUint64;
-    case datatype_t::FLOAT32:
-      return ncclFloat;
-    case datatype_t::FLOAT64:
-      return ncclDouble;
-    default:
-      throw "Unsupported datatype";
+    case datatype_t::CHAR: return ncclChar;
+    case datatype_t::UINT8: return ncclUint8;
+    case datatype_t::INT32: return ncclInt;
+    case datatype_t::UINT32: return ncclUint32;
+    case datatype_t::INT64: return ncclInt64;
+    case datatype_t::UINT64: return ncclUint64;
+    case datatype_t::FLOAT32: return ncclFloat;
+    case datatype_t::FLOAT64: return ncclDouble;
+    default: throw "Unsupported datatype";
   }
 }
 
-constexpr ncclRedOp_t get_nccl_op(const op_t op) {
+constexpr ncclRedOp_t get_nccl_op(const op_t op)
+{
   switch (op) {
-    case op_t::SUM:
-      return ncclSum;
-    case op_t::PROD:
-      return ncclProd;
-    case op_t::MIN:
-      return ncclMin;
-    case op_t::MAX:
-      return ncclMax;
-    default:
-      throw "Unsupported datatype";
+    case op_t::SUM: return ncclSum;
+    case op_t::PROD: return ncclProd;
+    case op_t::MIN: return ncclMin;
+    case op_t::MAX: return ncclMax;
+    default: throw "Unsupported datatype";
   }
 }
 };  // namespace comms
diff --git a/cpp/include/raft/cuda_utils.cuh b/cpp/include/raft/cuda_utils.cuh
index 14274043f5..8a66eff242 100644
--- a/cpp/include/raft/cuda_utils.cuh
+++ b/cpp/include/raft/cuda_utils.cuh
@@ -36,16 +36,17 @@
 namespace raft {
 
 /** helper macro for device inlined functions */
-#define DI inline __device__
+#define DI  inline __device__
 #define HDI inline __host__ __device__
-#define HD __host__ __device__
+#define HD  __host__ __device__
 
 /**
  * @brief Provide a ceiling division operation ie. ceil(a / b)
  * @tparam IntType supposed to be only integers for now!
  */
 template <typename IntType>
-constexpr HDI IntType ceildiv(IntType a, IntType b) {
+constexpr HDI IntType ceildiv(IntType a, IntType b)
+{
   return (a + b - 1) / b;
 }
 
@@ -54,7 +55,8 @@ constexpr HDI IntType ceildiv(IntType a, IntType b) {
  * @tparam IntType supposed to be only integers for now!
  */
 template <typename IntType>
-constexpr HDI IntType alignTo(IntType a, IntType b) {
+constexpr HDI IntType alignTo(IntType a, IntType b)
+{
   return ceildiv(a, b) * b;
 }
 
@@ -63,7 +65,8 @@ constexpr HDI IntType alignTo(IntType a, IntType b) {
  * @tparam IntType supposed to be only integers for now!
  */
 template <typename IntType>
-constexpr HDI IntType alignDown(IntType a, IntType b) {
+constexpr HDI IntType alignDown(IntType a, IntType b)
+{
   return (a / b) * b;
 }
 
@@ -72,7 +75,8 @@ constexpr HDI IntType alignDown(IntType a, IntType b) {
  * @tparam IntType data type (checked only for integers)
  */
 template <typename IntType>
-constexpr HDI bool isPo2(IntType num) {
+constexpr HDI bool isPo2(IntType num)
+{
   return (num && !(num & (num - 1)));
 }
 
@@ -81,14 +85,16 @@ constexpr HDI bool isPo2(IntType num) {
  * @tparam IntType data type (checked only for integers)
  */
 template <typename IntType>
-constexpr HDI IntType log2(IntType num, IntType ret = IntType(0)) {
+constexpr HDI IntType log2(IntType num, IntType ret = IntType(0))
+{
   return num <= IntType(1) ? ret : log2(num >> IntType(1), ++ret);
 }
 
 /** Device function to apply the input lambda across threads in the grid */
 template <int ItemsPerThread, typename L>
-DI void forEach(int num, L lambda) {
-  int idx = (blockDim.x * blockIdx.x) + threadIdx.x;
+DI void forEach(int num, L lambda)
+{
+  int idx              = (blockDim.x * blockIdx.x) + threadIdx.x;
   const int numThreads = blockDim.x * gridDim.x;
 #pragma unroll
   for (int itr = 0; itr < ItemsPerThread; ++itr, idx += numThreads) {
@@ -100,7 +106,8 @@ DI void forEach(int num, L lambda) {
 static const int WarpSize = 32;
 
 /** get the laneId of the current thread */
-DI int laneId() {
+DI int laneId()
+{
   int id;
   asm("mov.s32 %0, %laneid;" : "=r"(id));
   return id;
@@ -113,15 +120,17 @@ DI int laneId() {
  * @param b second input
  */
 template <typename T>
-HDI void swapVals(T &a, T &b) {
+HDI void swapVals(T& a, T& b)
+{
   T tmp = a;
-  a = b;
-  b = tmp;
+  a     = b;
+  b     = tmp;
 }
 
 /** Device function to have atomic add support for older archs */
 template <typename Type>
-DI void myAtomicAdd(Type *address, Type val) {
+DI void myAtomicAdd(Type* address, Type val)
+{
   atomicAdd(address, val);
 }
 
@@ -129,105 +138,114 @@ DI void myAtomicAdd(Type *address, Type val) {
 // Ref:
 // http://on-demand.gputechconf.com/gtc/2013/presentations/S3101-Atomic-Memory-Operations.pdf
 template <>
-DI void myAtomicAdd(double *address, double val) {
-  unsigned long long int *address_as_ull = (unsigned long long int *)address;
-  unsigned long long int old = *address_as_ull, assumed;
+DI void myAtomicAdd(double* address, double val)
+{
+  unsigned long long int* address_as_ull = (unsigned long long int*)address;
+  unsigned long long int old             = *address_as_ull, assumed;
   do {
     assumed = old;
-    old = atomicCAS(address_as_ull, assumed,
-                    __double_as_longlong(val + __longlong_as_double(assumed)));
+    old =
+      atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
   } while (assumed != old);
 }
 #endif
 
 template <typename T, typename ReduceLambda>
-DI void myAtomicReduce(T *address, T val, ReduceLambda op);
+DI void myAtomicReduce(T* address, T val, ReduceLambda op);
 
 template <typename ReduceLambda>
-DI void myAtomicReduce(double *address, double val, ReduceLambda op) {
-  unsigned long long int *address_as_ull = (unsigned long long int *)address;
-  unsigned long long int old = *address_as_ull, assumed;
+DI void myAtomicReduce(double* address, double val, ReduceLambda op)
+{
+  unsigned long long int* address_as_ull = (unsigned long long int*)address;
+  unsigned long long int old             = *address_as_ull, assumed;
   do {
     assumed = old;
-    old =
-      atomicCAS(address_as_ull, assumed,
-                __double_as_longlong(op(val, __longlong_as_double(assumed))));
+    old     = atomicCAS(
+      address_as_ull, assumed, __double_as_longlong(op(val, __longlong_as_double(assumed))));
   } while (assumed != old);
 }
 
 template <typename ReduceLambda>
-DI void myAtomicReduce(float *address, float val, ReduceLambda op) {
-  unsigned int *address_as_uint = (unsigned int *)address;
-  unsigned int old = *address_as_uint, assumed;
+DI void myAtomicReduce(float* address, float val, ReduceLambda op)
+{
+  unsigned int* address_as_uint = (unsigned int*)address;
+  unsigned int old              = *address_as_uint, assumed;
   do {
     assumed = old;
-    old = atomicCAS(address_as_uint, assumed,
-                    __float_as_uint(op(val, __uint_as_float(assumed))));
+    old = atomicCAS(address_as_uint, assumed, __float_as_uint(op(val, __uint_as_float(assumed))));
   } while (assumed != old);
 }
 
 template <typename ReduceLambda>
-DI void myAtomicReduce(int *address, int val, ReduceLambda op) {
+DI void myAtomicReduce(int* address, int val, ReduceLambda op)
+{
   int old = *address, assumed;
   do {
     assumed = old;
-    old = atomicCAS(address, assumed, op(val, assumed));
+    old     = atomicCAS(address, assumed, op(val, assumed));
   } while (assumed != old);
 }
 
 template <typename ReduceLambda>
-DI void myAtomicReduce(long long *address, long long val, ReduceLambda op) {
+DI void myAtomicReduce(long long* address, long long val, ReduceLambda op)
+{
   long long old = *address, assumed;
   do {
     assumed = old;
-    old = atomicCAS(address, assumed, op(val, assumed));
+    old     = atomicCAS(address, assumed, op(val, assumed));
   } while (assumed != old);
 }
 
 template <typename ReduceLambda>
-DI void myAtomicReduce(unsigned long long *address, unsigned long long val,
-                       ReduceLambda op) {
+DI void myAtomicReduce(unsigned long long* address, unsigned long long val, ReduceLambda op)
+{
   unsigned long long old = *address, assumed;
   do {
     assumed = old;
-    old = atomicCAS(address, assumed, op(val, assumed));
+    old     = atomicCAS(address, assumed, op(val, assumed));
   } while (assumed != old);
 }
 
 /**
  * @brief Provide atomic min operation.
  * @tparam T: data type for input data (float or double).
- * @param[in] address: address to read old value from, and to atomically update w/ min(old value, val)
+ * @param[in] address: address to read old value from, and to atomically update w/ min(old value,
+ * val)
  * @param[in] val: new value to compare with old
  */
 template <typename T>
-DI T myAtomicMin(T *address, T val);
+DI T myAtomicMin(T* address, T val);
 
 /**
  * @brief Provide atomic max operation.
  * @tparam T: data type for input data (float or double).
- * @param[in] address: address to read old value from, and to atomically update w/ max(old value, val)
+ * @param[in] address: address to read old value from, and to atomically update w/ max(old value,
+ * val)
  * @param[in] val: new value to compare with old
  */
 template <typename T>
-DI T myAtomicMax(T *address, T val);
+DI T myAtomicMax(T* address, T val);
 
-DI float myAtomicMin(float *address, float val) {
+DI float myAtomicMin(float* address, float val)
+{
   myAtomicReduce(address, val, fminf);
   return *address;
 }
 
-DI float myAtomicMax(float *address, float val) {
+DI float myAtomicMax(float* address, float val)
+{
   myAtomicReduce(address, val, fmaxf);
   return *address;
 }
 
-DI double myAtomicMin(double *address, double val) {
+DI double myAtomicMin(double* address, double val)
+{
   myAtomicReduce<double(double, double)>(address, val, fmin);
   return *address;
 }
 
-DI double myAtomicMax(double *address, double val) {
+DI double myAtomicMax(double* address, double val)
+{
   myAtomicReduce<double(double, double)>(address, val, fmax);
   return *address;
 }
@@ -239,11 +257,13 @@ DI double myAtomicMax(double *address, double val) {
 template <typename T>
 HDI T myMax(T x, T y);
 template <>
-HDI float myMax<float>(float x, float y) {
+HDI float myMax<float>(float x, float y)
+{
   return fmaxf(x, y);
 }
 template <>
-HDI double myMax<double>(double x, double y) {
+HDI double myMax<double>(double x, double y)
+{
   return fmax(x, y);
 }
 /** @} */
@@ -255,11 +275,13 @@ HDI double myMax<double>(double x, double y) {
 template <typename T>
 HDI T myMin(T x, T y);
 template <>
-HDI float myMin<float>(float x, float y) {
+HDI float myMin<float>(float x, float y)
+{
   return fminf(x, y);
 }
 template <>
-HDI double myMin<double>(double x, double y) {
+HDI double myMin<double>(double x, double y)
+{
   return fmin(x, y);
 }
 /** @} */
@@ -267,11 +289,13 @@ HDI double myMin<double>(double x, double y) {
 /**
  * @brief Provide atomic min operation.
  * @tparam T: data type for input data (float or double).
- * @param[in] address: address to read old value from, and to atomically update w/ min(old value, val)
+ * @param[in] address: address to read old value from, and to atomically update w/ min(old value,
+ * val)
  * @param[in] val: new value to compare with old
  */
 template <typename T>
-DI T myAtomicMin(T *address, T val) {
+DI T myAtomicMin(T* address, T val)
+{
   myAtomicReduce(address, val, myMin<T>);
   return *address;
 }
@@ -279,11 +303,13 @@ DI T myAtomicMin(T *address, T val) {
 /**
  * @brief Provide atomic max operation.
  * @tparam T: data type for input data (float or double).
- * @param[in] address: address to read old value from, and to atomically update w/ max(old value, val)
+ * @param[in] address: address to read old value from, and to atomically update w/ max(old value,
+ * val)
  * @param[in] val: new value to compare with old
  */
 template <typename T>
-DI T myAtomicMax(T *address, T val) {
+DI T myAtomicMax(T* address, T val)
+{
   myAtomicReduce(address, val, myMax<T>);
   return *address;
 }
@@ -292,7 +318,8 @@ DI T myAtomicMax(T *address, T val) {
  * Sign function
  */
 template <typename T>
-HDI int sgn(const T val) {
+HDI int sgn(const T val)
+{
   return (T(0) < val) - (val < T(0));
 }
 
@@ -303,11 +330,13 @@ HDI int sgn(const T val) {
 template <typename T>
 HDI T myExp(T x);
 template <>
-HDI float myExp(float x) {
+HDI float myExp(float x)
+{
   return expf(x);
 }
 template <>
-HDI double myExp(double x) {
+HDI double myExp(double x)
+{
   return exp(x);
 }
 /** @} */
@@ -319,11 +348,13 @@ HDI double myExp(double x) {
 template <typename T>
 inline __device__ T myInf();
 template <>
-inline __device__ float myInf<float>() {
+inline __device__ float myInf<float>()
+{
   return CUDART_INF_F;
 }
 template <>
-inline __device__ double myInf<double>() {
+inline __device__ double myInf<double>()
+{
   return CUDART_INF;
 }
 /** @} */
@@ -335,11 +366,13 @@ inline __device__ double myInf<double>() {
 template <typename T>
 HDI T myLog(T x);
 template <>
-HDI float myLog(float x) {
+HDI float myLog(float x)
+{
   return logf(x);
 }
 template <>
-HDI double myLog(double x) {
+HDI double myLog(double x)
+{
   return log(x);
 }
 /** @} */
@@ -351,11 +384,13 @@ HDI double myLog(double x) {
 template <typename T>
 HDI T mySqrt(T x);
 template <>
-HDI float mySqrt(float x) {
+HDI float mySqrt(float x)
+{
   return sqrtf(x);
 }
 template <>
-HDI double mySqrt(double x) {
+HDI double mySqrt(double x)
+{
   return sqrt(x);
 }
 /** @} */
@@ -365,13 +400,15 @@ HDI double mySqrt(double x) {
  * @{
  */
 template <typename T>
-DI void mySinCos(T x, T &s, T &c);
+DI void mySinCos(T x, T& s, T& c);
 template <>
-DI void mySinCos(float x, float &s, float &c) {
+DI void mySinCos(float x, float& s, float& c)
+{
   sincosf(x, &s, &c);
 }
 template <>
-DI void mySinCos(double x, double &s, double &c) {
+DI void mySinCos(double x, double& s, double& c)
+{
   sincos(x, &s, &c);
 }
 /** @} */
@@ -383,11 +420,13 @@ DI void mySinCos(double x, double &s, double &c) {
 template <typename T>
 DI T mySin(T x);
 template <>
-DI float mySin(float x) {
+DI float mySin(float x)
+{
   return sinf(x);
 }
 template <>
-DI double mySin(double x) {
+DI double mySin(double x)
+{
   return sin(x);
 }
 /** @} */
@@ -397,15 +436,18 @@ DI double mySin(double x) {
  * @{
  */
 template <typename T>
-DI T myAbs(T x) {
+DI T myAbs(T x)
+{
   return x < 0 ? -x : x;
 }
 template <>
-DI float myAbs(float x) {
+DI float myAbs(float x)
+{
   return fabsf(x);
 }
 template <>
-DI double myAbs(double x) {
+DI double myAbs(double x)
+{
   return fabs(x);
 }
 /** @} */
@@ -417,11 +459,13 @@ DI double myAbs(double x) {
 template <typename T>
 HDI T myPow(T x, T power);
 template <>
-HDI float myPow(float x, float power) {
+HDI float myPow(float x, float power)
+{
   return powf(x, power);
 }
 template <>
-HDI double myPow(double x, double power) {
+HDI double myPow(double x, double power)
+{
   return pow(x, power);
 }
 /** @} */
@@ -433,11 +477,13 @@ HDI double myPow(double x, double power) {
 template <typename T>
 HDI T myTanh(T x);
 template <>
-HDI float myTanh(float x) {
+HDI float myTanh(float x)
+{
   return tanhf(x);
 }
 template <>
-HDI double myTanh(double x) {
+HDI double myTanh(double x)
+{
   return tanh(x);
 }
 /** @} */
@@ -449,11 +495,13 @@ HDI double myTanh(double x) {
 template <typename T>
 HDI T myATanh(T x);
 template <>
-HDI float myATanh(float x) {
+HDI float myATanh(float x)
+{
   return atanhf(x);
 }
 template <>
-HDI double myATanh(double x) {
+HDI double myATanh(double x)
+{
   return atanh(x);
 }
 /** @} */
@@ -492,15 +540,18 @@ struct Sum {
  * @{
  */
 template <typename T>
-DI T signPrim(T x) {
+DI T signPrim(T x)
+{
   return x < 0 ? -1 : +1;
 }
 template <>
-DI float signPrim(float x) {
+DI float signPrim(float x)
+{
   return signbit(x) == true ? -1.0f : +1.0f;
 }
 template <>
-DI double signPrim(double x) {
+DI double signPrim(double x)
+{
   return signbit(x) == true ? -1.0 : +1.0;
 }
 /** @} */
@@ -514,28 +565,33 @@ DI double signPrim(double x) {
  * @{
  */
 template <typename T>
-DI T maxPrim(T x, T y) {
+DI T maxPrim(T x, T y)
+{
   return x > y ? x : y;
 }
 template <>
-DI float maxPrim(float x, float y) {
+DI float maxPrim(float x, float y)
+{
   return fmaxf(x, y);
 }
 template <>
-DI double maxPrim(double x, double y) {
+DI double maxPrim(double x, double y)
+{
   return fmax(x, y);
 }
 /** @} */
 
 /** apply a warp-wide fence (useful from Volta+ archs) */
-DI void warpFence() {
+DI void warpFence()
+{
 #if __CUDA_ARCH__ >= 700
   __syncwarp();
 #endif
 }
 
 /** warp-wide any boolean aggregator */
-DI bool any(bool inFlag, uint32_t mask = 0xffffffffu) {
+DI bool any(bool inFlag, uint32_t mask = 0xffffffffu)
+{
 #if CUDART_VERSION >= 9000
   inFlag = __any_sync(mask, inFlag);
 #else
@@ -545,7 +601,8 @@ DI bool any(bool inFlag, uint32_t mask = 0xffffffffu) {
 }
 
 /** warp-wide all boolean aggregator */
-DI bool all(bool inFlag, uint32_t mask = 0xffffffffu) {
+DI bool all(bool inFlag, uint32_t mask = 0xffffffffu)
+{
 #if CUDART_VERSION >= 9000
   inFlag = __all_sync(mask, inFlag);
 #else
@@ -564,8 +621,8 @@ DI bool all(bool inFlag, uint32_t mask = 0xffffffffu) {
  * @return the shuffled data
  */
 template <typename T>
-DI T shfl(T val, int srcLane, int width = WarpSize,
-          uint32_t mask = 0xffffffffu) {
+DI T shfl(T val, int srcLane, int width = WarpSize, uint32_t mask = 0xffffffffu)
+{
 #if CUDART_VERSION >= 9000
   return __shfl_sync(mask, val, srcLane, width);
 #else
@@ -583,8 +640,8 @@ DI T shfl(T val, int srcLane, int width = WarpSize,
  * @return the shuffled data
  */
 template <typename T>
-DI T shfl_xor(T val, int laneMask, int width = WarpSize,
-              uint32_t mask = 0xffffffffu) {
+DI T shfl_xor(T val, int laneMask, int width = WarpSize, uint32_t mask = 0xffffffffu)
+{
 #if CUDART_VERSION >= 9000
   return __shfl_xor_sync(mask, val, laneMask, width);
 #else
@@ -602,7 +659,8 @@ DI T shfl_xor(T val, int laneMask, int width = WarpSize,
  * @todo Expand this to support arbitrary reduction ops
  */
 template <typename T>
-DI T warpReduce(T val) {
+DI T warpReduce(T val)
+{
 #pragma unroll
   for (int i = WarpSize / 2; i > 0; i >>= 1) {
     T tmp = shfl(val, laneId() + i);
@@ -623,12 +681,13 @@ DI T warpReduce(T val) {
  * @todo Expand this to support arbitrary reduction ops
  */
 template <typename T>
-DI T blockReduce(T val, char *smem) {
-  auto *sTemp = reinterpret_cast<T *>(smem);
-  int nWarps = (blockDim.x + WarpSize - 1) / WarpSize;
-  int lid = laneId();
-  int wid = threadIdx.x / WarpSize;
-  val = warpReduce(val);
+DI T blockReduce(T val, char* smem)
+{
+  auto* sTemp = reinterpret_cast<T*>(smem);
+  int nWarps  = (blockDim.x + WarpSize - 1) / WarpSize;
+  int lid     = laneId();
+  int wid     = threadIdx.x / WarpSize;
+  val         = warpReduce(val);
   if (lid == 0) sTemp[wid] = val;
   __syncthreads();
   val = lid < nWarps ? sTemp[lid] : T(0);
@@ -644,8 +703,10 @@ DI T blockReduce(T val, char *smem) {
  * @param idx the index for which to query the stream
  */
 inline cudaStream_t select_stream(cudaStream_t user_stream,
-                                  cudaStream_t *int_streams, int n_int_streams,
-                                  int idx) {
+                                  cudaStream_t* int_streams,
+                                  int n_int_streams,
+                                  int idx)
+{
   return n_int_streams > 0 ? int_streams[idx % n_int_streams] : user_stream;
 }
 
diff --git a/cpp/include/raft/cudart_utils.h b/cpp/include/raft/cudart_utils.h
index 486103dedb..cf06416a96 100644
--- a/cpp/include/raft/cudart_utils.h
+++ b/cpp/include/raft/cudart_utils.h
@@ -54,17 +54,20 @@ struct cuda_error : public raft::exception {
  *
  */
 #ifndef CUDA_TRY
-#define CUDA_TRY(call)                                                        \
-  do {                                                                        \
-    cudaError_t const status = call;                                          \
-    if (status != cudaSuccess) {                                              \
-      cudaGetLastError();                                                     \
-      std::string msg{};                                                      \
-      SET_ERROR_MSG(                                                          \
-        msg, "CUDA error encountered at: ", "call='%s', Reason=%s:%s", #call, \
-        cudaGetErrorName(status), cudaGetErrorString(status));                \
-      throw raft::cuda_error(msg);                                            \
-    }                                                                         \
+#define CUDA_TRY(call)                             \
+  do {                                             \
+    cudaError_t const status = call;               \
+    if (status != cudaSuccess) {                   \
+      cudaGetLastError();                          \
+      std::string msg{};                           \
+      SET_ERROR_MSG(msg,                           \
+                    "CUDA error encountered at: ", \
+                    "call='%s', Reason=%s:%s",     \
+                    #call,                         \
+                    cudaGetErrorName(status),      \
+                    cudaGetErrorString(status));   \
+      throw raft::cuda_error(msg);                 \
+    }                                              \
   } while (0)
 #endif
 /**
@@ -97,13 +100,16 @@ struct cuda_error : public raft::exception {
 //  *        exception.
 //  */
 #ifndef CUDA_CHECK_NO_THROW
-#define CUDA_CHECK_NO_THROW(call)                                         \
-  do {                                                                    \
-    cudaError_t const status = call;                                      \
-    if (cudaSuccess != status) {                                          \
-      printf("CUDA call='%s' at file=%s line=%d failed with %s\n", #call, \
-             __FILE__, __LINE__, cudaGetErrorString(status));             \
-    }                                                                     \
+#define CUDA_CHECK_NO_THROW(call)                                  \
+  do {                                                             \
+    cudaError_t const status = call;                               \
+    if (cudaSuccess != status) {                                   \
+      printf("CUDA call='%s' at file=%s line=%d failed with %s\n", \
+             #call,                                                \
+             __FILE__,                                             \
+             __LINE__,                                             \
+             cudaGetErrorString(status));                          \
+    }                                                              \
   } while (0)
 #endif
 
@@ -112,7 +118,7 @@ struct cuda_error : public raft::exception {
  * TODO: Rename original implementations in 22.04 to fix
  * https://github.com/rapidsai/raft/issues/128
  */
-#define RAFT_CUDA_CHECK(call) CUDA_CHECK(call)
+#define RAFT_CUDA_CHECK(call)          CUDA_CHECK(call)
 #define RAFT_CUDA_CHECK_NO_THROW(call) CUDA_CHECK_NO_THROW(call)
 
 namespace raft {
@@ -120,9 +126,7 @@ namespace raft {
 /** Helper method to get to know warp size in device code */
 __host__ __device__ constexpr inline int warp_size() { return 32; }
 
-__host__ __device__ constexpr inline unsigned int warp_full_mask() {
-  return 0xffffffff;
-}
+__host__ __device__ constexpr inline unsigned int warp_full_mask() { return 0xffffffff; }
 
 /**
  * @brief A kernel grid configuration construction gadget for simple one-dimensional mapping
@@ -134,20 +138,23 @@ class grid_1d_thread_t {
   int const num_blocks{0};
 
   /**
-         * @param overall_num_elements The number of elements the kernel needs to handle/process
-         * @param num_threads_per_block The grid block size, determined according to the kernel's
-         * specific features (amount of shared memory necessary, SM functional units use pattern etc.);
-         * this can't be determined generically/automatically (as opposed to the number of blocks)
-         * @param elements_per_thread Typically, a single kernel thread processes more than a single
-         * element; this affects the number of threads the grid must contain
-         */
-  grid_1d_thread_t(size_t overall_num_elements, size_t num_threads_per_block,
-                   size_t max_num_blocks_1d, size_t elements_per_thread = 1)
+   * @param overall_num_elements The number of elements the kernel needs to handle/process
+   * @param num_threads_per_block The grid block size, determined according to the kernel's
+   * specific features (amount of shared memory necessary, SM functional units use pattern etc.);
+   * this can't be determined generically/automatically (as opposed to the number of blocks)
+   * @param elements_per_thread Typically, a single kernel thread processes more than a single
+   * element; this affects the number of threads the grid must contain
+   */
+  grid_1d_thread_t(size_t overall_num_elements,
+                   size_t num_threads_per_block,
+                   size_t max_num_blocks_1d,
+                   size_t elements_per_thread = 1)
     : block_size(num_threads_per_block),
-      num_blocks(std::min((overall_num_elements +
-                           (elements_per_thread * num_threads_per_block) - 1) /
-                            (elements_per_thread * num_threads_per_block),
-                          max_num_blocks_1d)) {
+      num_blocks(
+        std::min((overall_num_elements + (elements_per_thread * num_threads_per_block) - 1) /
+                   (elements_per_thread * num_threads_per_block),
+                 max_num_blocks_1d))
+  {
     RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0");
     RAFT_EXPECTS(num_threads_per_block / warp_size() > 0,
                  "num_threads_per_block / warp_size() must be > 0");
@@ -165,18 +172,19 @@ class grid_1d_warp_t {
   int const num_blocks{0};
 
   /**
-         * @param overall_num_elements The number of elements the kernel needs to handle/process
-         * @param num_threads_per_block The grid block size, determined according to the kernel's
-         * specific features (amount of shared memory necessary, SM functional units use pattern etc.);
-         * this can't be determined generically/automatically (as opposed to the number of blocks)
-         */
-  grid_1d_warp_t(size_t overall_num_elements, size_t num_threads_per_block,
+   * @param overall_num_elements The number of elements the kernel needs to handle/process
+   * @param num_threads_per_block The grid block size, determined according to the kernel's
+   * specific features (amount of shared memory necessary, SM functional units use pattern etc.);
+   * this can't be determined generically/automatically (as opposed to the number of blocks)
+   */
+  grid_1d_warp_t(size_t overall_num_elements,
+                 size_t num_threads_per_block,
                  size_t max_num_blocks_1d)
     : block_size(num_threads_per_block),
-      num_blocks(std::min(
-        (overall_num_elements + (num_threads_per_block / warp_size()) - 1) /
-          (num_threads_per_block / warp_size()),
-        max_num_blocks_1d)) {
+      num_blocks(std::min((overall_num_elements + (num_threads_per_block / warp_size()) - 1) /
+                            (num_threads_per_block / warp_size()),
+                          max_num_blocks_1d))
+  {
     RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0");
     RAFT_EXPECTS(num_threads_per_block / warp_size() > 0,
                  "num_threads_per_block / warp_size() must be > 0");
@@ -193,15 +201,17 @@ class grid_1d_block_t {
   int const num_blocks{0};
 
   /**
-         * @param overall_num_elements The number of elements the kernel needs to handle/process
-         * @param num_threads_per_block The grid block size, determined according to the kernel's
-         * specific features (amount of shared memory necessary, SM functional units use pattern etc.);
-         * this can't be determined generically/automatically (as opposed to the number of blocks)
-         */
-  grid_1d_block_t(size_t overall_num_elements, size_t num_threads_per_block,
+   * @param overall_num_elements The number of elements the kernel needs to handle/process
+   * @param num_threads_per_block The grid block size, determined according to the kernel's
+   * specific features (amount of shared memory necessary, SM functional units use pattern etc.);
+   * this can't be determined generically/automatically (as opposed to the number of blocks)
+   */
+  grid_1d_block_t(size_t overall_num_elements,
+                  size_t num_threads_per_block,
                   size_t max_num_blocks_1d)
     : block_size(num_threads_per_block),
-      num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) {
+      num_blocks(std::min(overall_num_elements, max_num_blocks_1d))
+  {
     RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0");
     RAFT_EXPECTS(num_threads_per_block / warp_size() > 0,
                  "num_threads_per_block / warp_size() must be > 0");
@@ -217,10 +227,9 @@ class grid_1d_block_t {
  * @param stream cuda stream
  */
 template <typename Type>
-void copy(Type* dst, const Type* src, size_t len,
-          rmm::cuda_stream_view stream) {
-  CUDA_CHECK(
-    cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream));
+void copy(Type* dst, const Type* src, size_t len, rmm::cuda_stream_view stream)
+{
+  CUDA_CHECK(cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream));
 }
 
 /**
@@ -231,23 +240,22 @@ void copy(Type* dst, const Type* src, size_t len,
  */
 /** performs a host to device copy */
 template <typename Type>
-void update_device(Type* d_ptr, const Type* h_ptr, size_t len,
-                   rmm::cuda_stream_view stream) {
+void update_device(Type* d_ptr, const Type* h_ptr, size_t len, rmm::cuda_stream_view stream)
+{
   copy(d_ptr, h_ptr, len, stream);
 }
 
 /** performs a device to host copy */
 template <typename Type>
-void update_host(Type* h_ptr, const Type* d_ptr, size_t len,
-                 rmm::cuda_stream_view stream) {
+void update_host(Type* h_ptr, const Type* d_ptr, size_t len, rmm::cuda_stream_view stream)
+{
   copy(h_ptr, d_ptr, len, stream);
 }
 
 template <typename Type>
-void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len,
-                rmm::cuda_stream_view stream) {
-  CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type),
-                             cudaMemcpyDeviceToDevice, stream));
+void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len, rmm::cuda_stream_view stream)
+{
+  CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type), cudaMemcpyDeviceToDevice, stream));
 }
 /** @} */
 
@@ -256,8 +264,11 @@ void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len,
  * @{
  */
 template <class T, class OutStream>
-void print_host_vector(const char* variable_name, const T* host_mem,
-                       size_t componentsCount, OutStream& out) {
+void print_host_vector(const char* variable_name,
+                       const T* host_mem,
+                       size_t componentsCount,
+                       OutStream& out)
+{
   out << variable_name << "=[";
   for (size_t i = 0; i < componentsCount; ++i) {
     if (i != 0) out << ",";
@@ -267,11 +278,13 @@ void print_host_vector(const char* variable_name, const T* host_mem,
 }
 
 template <class T, class OutStream>
-void print_device_vector(const char* variable_name, const T* devMem,
-                         size_t componentsCount, OutStream& out) {
+void print_device_vector(const char* variable_name,
+                         const T* devMem,
+                         size_t componentsCount,
+                         OutStream& out)
+{
   T* host_mem = new T[componentsCount];
-  CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T),
-                        cudaMemcpyDeviceToHost));
+  CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T), cudaMemcpyDeviceToHost));
   print_host_vector(variable_name, host_mem, componentsCount, out);
   delete[] host_mem;
 }
@@ -281,10 +294,10 @@ static std::mutex mutex_;
 static std::unordered_map<void*, size_t> allocations;
 
 template <typename Type>
-void allocate(Type*& ptr, size_t len, rmm::cuda_stream_view stream,
-              bool setZero = false) {
+void allocate(Type*& ptr, size_t len, rmm::cuda_stream_view stream, bool setZero = false)
+{
   size_t size = len * sizeof(Type);
-  ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, stream);
+  ptr         = (Type*)rmm::mr::get_current_device_resource()->allocate(size, stream);
   if (setZero) CUDA_CHECK(cudaMemsetAsync((void*)ptr, 0, size, stream));
 
   std::lock_guard<std::mutex> _(mutex_);
@@ -292,17 +305,19 @@ void allocate(Type*& ptr, size_t len, rmm::cuda_stream_view stream,
 }
 
 template <typename Type>
-void deallocate(Type*& ptr, rmm::cuda_stream_view stream) {
+void deallocate(Type*& ptr, rmm::cuda_stream_view stream)
+{
   std::lock_guard<std::mutex> _(mutex_);
   size_t size = allocations[ptr];
   allocations.erase(ptr);
   rmm::mr::get_current_device_resource()->deallocate((void*)ptr, size, stream);
 }
 
-inline void deallocate_all(rmm::cuda_stream_view stream) {
+inline void deallocate_all(rmm::cuda_stream_view stream)
+{
   std::lock_guard<std::mutex> _(mutex_);
   for (auto& alloc : allocations) {
-    void* ptr = alloc.first;
+    void* ptr   = alloc.first;
     size_t size = alloc.second;
     rmm::mr::get_current_device_resource()->deallocate(ptr, size, stream);
   }
@@ -310,29 +325,29 @@ inline void deallocate_all(rmm::cuda_stream_view stream) {
 }
 
 /** helper method to get max usable shared mem per block parameter */
-inline int getSharedMemPerBlock() {
+inline int getSharedMemPerBlock()
+{
   int devId;
   CUDA_CHECK(cudaGetDevice(&devId));
   int smemPerBlk;
-  CUDA_CHECK(cudaDeviceGetAttribute(&smemPerBlk,
-                                    cudaDevAttrMaxSharedMemoryPerBlock, devId));
+  CUDA_CHECK(cudaDeviceGetAttribute(&smemPerBlk, cudaDevAttrMaxSharedMemoryPerBlock, devId));
   return smemPerBlk;
 }
 
 /** helper method to get multi-processor count parameter */
-inline int getMultiProcessorCount() {
+inline int getMultiProcessorCount()
+{
   int devId;
   CUDA_CHECK(cudaGetDevice(&devId));
   int mpCount;
-  CUDA_CHECK(
-    cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId));
+  CUDA_CHECK(cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId));
   return mpCount;
 }
 
 /** helper method to convert an array on device to a string on host */
 template <typename T>
-std::string arr2Str(const T* arr, int size, std::string name,
-                    cudaStream_t stream, int width = 4) {
+std::string arr2Str(const T* arr, int size, std::string name, cudaStream_t stream, int width = 4)
+{
   std::stringstream ss;
 
   T* arr_h = (T*)malloc(size * sizeof(T));
@@ -354,53 +369,54 @@ std::string arr2Str(const T* arr, int size, std::string name,
 
 /** this seems to be unused, but may be useful in the future */
 template <typename T>
-void ASSERT_DEVICE_MEM(T* ptr, std::string name) {
+void ASSERT_DEVICE_MEM(T* ptr, std::string name)
+{
   cudaPointerAttributes s_att;
   cudaError_t s_err = cudaPointerGetAttributes(&s_att, ptr);
 
   if (s_err != 0 || s_att.device == -1)
-    std::cout << "Invalid device pointer encountered in " << name
-              << ". device=" << s_att.device << ", err=" << s_err << std::endl;
+    std::cout << "Invalid device pointer encountered in " << name << ". device=" << s_att.device
+              << ", err=" << s_err << std::endl;
 }
 
-inline uint32_t curTimeMillis() {
-  auto now = std::chrono::high_resolution_clock::now();
+inline uint32_t curTimeMillis()
+{
+  auto now      = std::chrono::high_resolution_clock::now();
   auto duration = now.time_since_epoch();
-  return std::chrono::duration_cast<std::chrono::milliseconds>(duration)
-    .count();
+  return std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
 }
 
 /** Helper function to calculate need memory for allocate to store dense matrix.
-    * @param rows number of rows in matrix
-    * @param columns number of columns in matrix
-    * @return need number of items to allocate via allocate()
-    * @sa allocate()
-    */
-inline size_t allocLengthForMatrix(size_t rows, size_t columns) {
-  return rows * columns;
-}
+ * @param rows number of rows in matrix
+ * @param columns number of columns in matrix
+ * @return the number of items that must be allocated via allocate()
+ * @sa allocate()
+ */
+inline size_t allocLengthForMatrix(size_t rows, size_t columns) { return rows * columns; }
 
 /** Helper function to check alignment of pointer.
-    * @param ptr the pointer to check
-    * @param alignment to be checked for
-    * @return true if address in bytes is a multiple of alignment
-    */
+ * @param ptr the pointer to check
+ * @param alignment to be checked for
+ * @return true if address in bytes is a multiple of alignment
+ */
 template <typename Type>
-bool is_aligned(Type* ptr, size_t alignment) {
+bool is_aligned(Type* ptr, size_t alignment)
+{
   return reinterpret_cast<uintptr_t>(ptr) % alignment == 0;
 }
 
 /** calculate greatest common divisor of two numbers
-* @a integer
-* @b integer
-* @ return gcd of a and b
-*/
+ * @param a integer
+ * @param b integer
+ * @return gcd of a and b
+ */
 template <typename IntType>
-IntType gcd(IntType a, IntType b) {
+IntType gcd(IntType a, IntType b)
+{
   while (b != 0) {
     IntType tmp = b;
-    b = a % b;
-    a = tmp;
+    b           = a % b;
+    a           = tmp;
   }
   return a;
 }
diff --git a/cpp/include/raft/device_atomics.cuh b/cpp/include/raft/device_atomics.cuh
index a4ebcc9900..e3b324d030 100644
--- a/cpp/include/raft/device_atomics.cuh
+++ b/cpp/include/raft/device_atomics.cuh
@@ -39,9 +39,9 @@ namespace detail {
 
 /* @brief binary `sum` operator */
 struct DeviceSum {
-  template <typename T,
-            typename std::enable_if_t<std::is_arithmetic<T>::value>* = nullptr>
-  __device__ T operator()(const T& lhs, const T& rhs) {
+  template <typename T, typename std::enable_if_t<std::is_arithmetic<T>::value>* = nullptr>
+  __device__ T operator()(const T& lhs, const T& rhs)
+  {
     return lhs + rhs;
   }
 };
@@ -49,7 +49,8 @@ struct DeviceSum {
 /* @brief binary `min` operator */
 struct DeviceMin {
   template <typename T>
-  __device__ T operator()(const T& lhs, const T& rhs) {
+  __device__ T operator()(const T& lhs, const T& rhs)
+  {
     return lhs < rhs ? lhs : rhs;
   }
 };
@@ -57,43 +58,44 @@ struct DeviceMin {
 /* @brief binary `max` operator */
 struct DeviceMax {
   template <typename T>
-  __device__ T operator()(const T& lhs, const T& rhs) {
+  __device__ T operator()(const T& lhs, const T& rhs)
+  {
     return lhs > rhs ? lhs : rhs;
   }
 };
 
 /* @brief binary `product` operator */
 struct DeviceProduct {
-  template <typename T,
-            typename std::enable_if_t<std::is_arithmetic<T>::value>* = nullptr>
-  __device__ T operator()(const T& lhs, const T& rhs) {
+  template <typename T, typename std::enable_if_t<std::is_arithmetic<T>::value>* = nullptr>
+  __device__ T operator()(const T& lhs, const T& rhs)
+  {
     return lhs * rhs;
   }
 };
 
 /* @brief binary `and` operator */
 struct DeviceAnd {
-  template <typename T,
-            typename std::enable_if_t<std::is_integral<T>::value>* = nullptr>
-  __device__ T operator()(const T& lhs, const T& rhs) {
+  template <typename T, typename std::enable_if_t<std::is_integral<T>::value>* = nullptr>
+  __device__ T operator()(const T& lhs, const T& rhs)
+  {
     return (lhs & rhs);
   }
 };
 
 /* @brief binary `or` operator */
 struct DeviceOr {
-  template <typename T,
-            typename std::enable_if_t<std::is_integral<T>::value>* = nullptr>
-  __device__ T operator()(const T& lhs, const T& rhs) {
+  template <typename T, typename std::enable_if_t<std::is_integral<T>::value>* = nullptr>
+  __device__ T operator()(const T& lhs, const T& rhs)
+  {
     return (lhs | rhs);
   }
 };
 
 /* @brief binary `xor` operator */
 struct DeviceXor {
-  template <typename T,
-            typename std::enable_if_t<std::is_integral<T>::value>* = nullptr>
-  __device__ T operator()(const T& lhs, const T& rhs) {
+  template <typename T, typename std::enable_if_t<std::is_integral<T>::value>* = nullptr>
+  __device__ T operator()(const T& lhs, const T& rhs)
+  {
     return (lhs ^ rhs);
   }
 };
@@ -103,9 +105,9 @@ struct DeviceXor {
 #define errmsg_cast "size mismatch."
 
 template <typename T_output, typename T_input>
-__forceinline__ __device__ T_output type_reinterpret(T_input value) {
-  static_assert(sizeof(T_output) == sizeof(T_input),
-                "type_reinterpret for different size");
+__forceinline__ __device__ T_output type_reinterpret(T_input value)
+{
+  static_assert(sizeof(T_output) == sizeof(T_input), "type_reinterpret for different size");
   return *(reinterpret_cast<T_output*>(&value));
 }
 
@@ -118,25 +120,22 @@ struct genericAtomicOperationImpl;
 // single byte atomic operation
 template <typename T, typename Op>
 struct genericAtomicOperationImpl<T, Op, 1> {
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
-                                          Op op) {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op)
+  {
     using T_int = unsigned int;
 
-    T_int* address_uint32 =
-      reinterpret_cast<T_int*>(addr - (reinterpret_cast<size_t>(addr) & 3));
-    T_int shift = ((reinterpret_cast<size_t>(addr) & 3) * 8);
+    T_int* address_uint32 = reinterpret_cast<T_int*>(addr - (reinterpret_cast<size_t>(addr) & 3));
+    T_int shift           = ((reinterpret_cast<size_t>(addr) & 3) * 8);
 
     T_int old = *address_uint32;
     T_int assumed;
 
     do {
-      assumed = old;
-      T target_value = T((old >> shift) & 0xff);
-      uint8_t updating_value =
-        type_reinterpret<uint8_t, T>(op(target_value, update_value));
-      T_int new_value =
-        (old & ~(0x000000ff << shift)) | (T_int(updating_value) << shift);
-      old = atomicCAS(address_uint32, assumed, new_value);
+      assumed                = old;
+      T target_value         = T((old >> shift) & 0xff);
+      uint8_t updating_value = type_reinterpret<uint8_t, T>(op(target_value, update_value));
+      T_int new_value        = (old & ~(0x000000ff << shift)) | (T_int(updating_value) << shift);
+      old                    = atomicCAS(address_uint32, assumed, new_value);
     } while (assumed != old);
 
     return T((old >> shift) & 0xff);
@@ -146,26 +145,24 @@ struct genericAtomicOperationImpl<T, Op, 1> {
 // 2 bytes atomic operation
 template <typename T, typename Op>
 struct genericAtomicOperationImpl<T, Op, 2> {
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
-                                          Op op) {
-    using T_int = unsigned int;
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op)
+  {
+    using T_int      = unsigned int;
     bool is_32_align = (reinterpret_cast<size_t>(addr) & 2) ? false : true;
-    T_int* address_uint32 = reinterpret_cast<T_int*>(
-      reinterpret_cast<size_t>(addr) - (is_32_align ? 0 : 2));
+    T_int* address_uint32 =
+      reinterpret_cast<T_int*>(reinterpret_cast<size_t>(addr) - (is_32_align ? 0 : 2));
 
     T_int old = *address_uint32;
     T_int assumed;
 
     do {
-      assumed = old;
-      T target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16);
-      uint16_t updating_value =
-        type_reinterpret<uint16_t, T>(op(target_value, update_value));
-
-      T_int new_value = (is_32_align)
-                          ? (old & 0xffff0000) | updating_value
-                          : (old & 0xffff) | (T_int(updating_value) << 16);
-      old = atomicCAS(address_uint32, assumed, new_value);
+      assumed                 = old;
+      T target_value          = (is_32_align) ? T(old & 0xffff) : T(old >> 16);
+      uint16_t updating_value = type_reinterpret<uint16_t, T>(op(target_value, update_value));
+
+      T_int new_value = (is_32_align) ? (old & 0xffff0000) | updating_value
+                                      : (old & 0xffff) | (T_int(updating_value) << 16);
+      old             = atomicCAS(address_uint32, assumed, new_value);
     } while (assumed != old);
 
     return (is_32_align) ? T(old & 0xffff) : T(old >> 16);
@@ -176,20 +173,18 @@ struct genericAtomicOperationImpl<T, Op, 2> {
 // 4 bytes atomic operation
 template <typename T, typename Op>
 struct genericAtomicOperationImpl<T, Op, 4> {
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
-                                          Op op) {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op)
+  {
     using T_int = unsigned int;
     T old_value = *addr;
     T assumed{old_value};
 
     if constexpr (std::is_same<T, float>{} && (std::is_same<Op, DeviceMin>{})) {
-      if (isnan(update_value)) {
-        return old_value;
-      }
+      if (isnan(update_value)) { return old_value; }
     }
 
     do {
-      assumed = old_value;
+      assumed           = old_value;
       const T new_value = op(old_value, update_value);
 
       T_int ret = atomicCAS(reinterpret_cast<T_int*>(addr),
@@ -206,17 +201,13 @@ struct genericAtomicOperationImpl<T, Op, 4> {
 template <>
 struct genericAtomicOperationImpl<float, DeviceMax, 4> {
   using T = float;
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
-                                          DeviceMax op) {
-    if (isnan(update_value)) {
-      return *addr;
-    }
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMax op)
+  {
+    if (isnan(update_value)) { return *addr; }
 
-    T old =
-      (update_value >= 0)
-        ? __int_as_float(atomicMax((int*)addr, __float_as_int(update_value)))
-        : __uint_as_float(
-            atomicMin((unsigned int*)addr, __float_as_uint(update_value)));
+    T old = (update_value >= 0)
+              ? __int_as_float(atomicMax((int*)addr, __float_as_int(update_value)))
+              : __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(update_value)));
 
     return old;
   }
@@ -225,8 +216,8 @@ struct genericAtomicOperationImpl<float, DeviceMax, 4> {
 // 8 bytes atomic operation
 template <typename T, typename Op>
 struct genericAtomicOperationImpl<T, Op, 8> {
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
-                                          Op op) {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, Op op)
+  {
     using T_int = unsigned long long int;
     static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
 
@@ -234,7 +225,7 @@ struct genericAtomicOperationImpl<T, Op, 8> {
     T assumed{old_value};
 
     do {
-      assumed = old_value;
+      assumed           = old_value;
       const T new_value = op(old_value, update_value);
 
       T_int ret = atomicCAS(reinterpret_cast<T_int*>(addr),
@@ -250,8 +241,8 @@ struct genericAtomicOperationImpl<T, Op, 8> {
 
 // -------------------------------------------------------------------------------------------------
 // specialized functions for operators
-// `atomicAdd` supports int, unsigned int, unsigend long long int, float, double (long long int is not supproted.)
-// `atomicMin`, `atomicMax` support int, unsigned int, unsigned long long int
+// `atomicAdd` supports int, unsigned int, unsigned long long int, float, double (long long int is
+// not supported.) `atomicMin`, `atomicMax` support int, unsigned int, unsigned long long int
 // `atomicAnd`, `atomicOr`, `atomicXor` support int, unsigned int, unsigned long long int
 
 // CUDA natively supports `unsigned long long int` for `atomicAdd`,
@@ -264,12 +255,11 @@ struct genericAtomicOperationImpl<T, Op, 8> {
 template <>
 struct genericAtomicOperationImpl<long int, DeviceSum, 8> {
   using T = long int;
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
-                                          DeviceSum op) {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op)
+  {
     using T_int = unsigned long long int;
     static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
-    T_int ret = atomicAdd(reinterpret_cast<T_int*>(addr),
-                          type_reinterpret<T_int, T>(update_value));
+    T_int ret = atomicAdd(reinterpret_cast<T_int*>(addr), type_reinterpret<T_int, T>(update_value));
     return type_reinterpret<T, T_int>(ret);
   }
 };
@@ -277,12 +267,11 @@ struct genericAtomicOperationImpl<long int, DeviceSum, 8> {
 template <>
 struct genericAtomicOperationImpl<unsigned long int, DeviceSum, 8> {
   using T = unsigned long int;
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
-                                          DeviceSum op) {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op)
+  {
     using T_int = unsigned long long int;
     static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
-    T_int ret = atomicAdd(reinterpret_cast<T_int*>(addr),
-                          type_reinterpret<T_int, T>(update_value));
+    T_int ret = atomicAdd(reinterpret_cast<T_int*>(addr), type_reinterpret<T_int, T>(update_value));
     return type_reinterpret<T, T_int>(ret);
   }
 };
@@ -297,12 +286,11 @@ struct genericAtomicOperationImpl<unsigned long int, DeviceSum, 8> {
 template <>
 struct genericAtomicOperationImpl<long long int, DeviceSum, 8> {
   using T = long long int;
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
-                                          DeviceSum op) {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceSum op)
+  {
     using T_int = unsigned long long int;
     static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
-    T_int ret = atomicAdd(reinterpret_cast<T_int*>(addr),
-                          type_reinterpret<T_int, T>(update_value));
+    T_int ret = atomicAdd(reinterpret_cast<T_int*>(addr), type_reinterpret<T_int, T>(update_value));
     return type_reinterpret<T, T_int>(ret);
   }
 };
@@ -310,12 +298,11 @@ struct genericAtomicOperationImpl<long long int, DeviceSum, 8> {
 template <>
 struct genericAtomicOperationImpl<unsigned long int, DeviceMin, 8> {
   using T = unsigned long int;
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
-                                          DeviceMin op) {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMin op)
+  {
     using T_int = unsigned long long int;
     static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
-    T ret = atomicMin(reinterpret_cast<T_int*>(addr),
-                      type_reinterpret<T_int, T>(update_value));
+    T ret = atomicMin(reinterpret_cast<T_int*>(addr), type_reinterpret<T_int, T>(update_value));
     return type_reinterpret<T, T_int>(ret);
   }
 };
@@ -323,48 +310,44 @@ struct genericAtomicOperationImpl<unsigned long int, DeviceMin, 8> {
 template <>
 struct genericAtomicOperationImpl<unsigned long int, DeviceMax, 8> {
   using T = unsigned long int;
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
-                                          DeviceMax op) {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceMax op)
+  {
     using T_int = unsigned long long int;
     static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
-    T ret = atomicMax(reinterpret_cast<T_int*>(addr),
-                      type_reinterpret<T_int, T>(update_value));
+    T ret = atomicMax(reinterpret_cast<T_int*>(addr), type_reinterpret<T_int, T>(update_value));
     return type_reinterpret<T, T_int>(ret);
   }
 };
 
 template <typename T>
 struct genericAtomicOperationImpl<T, DeviceAnd, 8> {
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
-                                          DeviceAnd op) {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceAnd op)
+  {
     using T_int = unsigned long long int;
     static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
-    T_int ret = atomicAnd(reinterpret_cast<T_int*>(addr),
-                          type_reinterpret<T_int, T>(update_value));
+    T_int ret = atomicAnd(reinterpret_cast<T_int*>(addr), type_reinterpret<T_int, T>(update_value));
     return type_reinterpret<T, T_int>(ret);
   }
 };
 
 template <typename T>
 struct genericAtomicOperationImpl<T, DeviceOr, 8> {
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
-                                          DeviceOr op) {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceOr op)
+  {
     using T_int = unsigned long long int;
     static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
-    T_int ret = atomicOr(reinterpret_cast<T_int*>(addr),
-                         type_reinterpret<T_int, T>(update_value));
+    T_int ret = atomicOr(reinterpret_cast<T_int*>(addr), type_reinterpret<T_int, T>(update_value));
     return type_reinterpret<T, T_int>(ret);
   }
 };
 
 template <typename T>
 struct genericAtomicOperationImpl<T, DeviceXor, 8> {
-  __forceinline__ __device__ T operator()(T* addr, T const& update_value,
-                                          DeviceXor op) {
+  __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceXor op)
+  {
     using T_int = unsigned long long int;
     static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
-    T_int ret = atomicXor(reinterpret_cast<T_int*>(addr),
-                          type_reinterpret<T_int, T>(update_value));
+    T_int ret = atomicXor(reinterpret_cast<T_int*>(addr), type_reinterpret<T_int, T>(update_value));
     return type_reinterpret<T, T_int>(ret);
   }
 };
@@ -377,13 +360,12 @@ struct typesAtomicCASImpl;
 
 template <typename T>
 struct typesAtomicCASImpl<T, 1> {
-  __forceinline__ __device__ T operator()(T* addr, T const& compare,
-                                          T const& update_value) {
+  __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value)
+  {
     using T_int = unsigned int;
 
-    T_int shift = ((reinterpret_cast<size_t>(addr) & 3) * 8);
-    T_int* address_uint32 =
-      reinterpret_cast<T_int*>(addr - (reinterpret_cast<size_t>(addr) & 3));
+    T_int shift           = ((reinterpret_cast<size_t>(addr) & 3) * 8);
+    T_int* address_uint32 = reinterpret_cast<T_int*>(addr - (reinterpret_cast<size_t>(addr) & 3));
 
     // the 'target_value' in `old` can be different from `compare`
     // because other thread may update the value
@@ -394,15 +376,14 @@ struct typesAtomicCASImpl<T, 1> {
     uint8_t u_val = type_reinterpret<uint8_t, T>(update_value);
 
     do {
-      assumed = old;
+      assumed      = old;
       target_value = T((old >> shift) & 0xff);
       // have to compare `target_value` and `compare` before calling atomicCAS
       // the `target_value` in `old` can be different with `compare`
       if (target_value != compare) break;
 
-      T_int new_value =
-        (old & ~(0x000000ff << shift)) | (T_int(u_val) << shift);
-      old = atomicCAS(address_uint32, assumed, new_value);
+      T_int new_value = (old & ~(0x000000ff << shift)) | (T_int(u_val) << shift);
+      old             = atomicCAS(address_uint32, assumed, new_value);
     } while (assumed != old);
 
     return target_value;
@@ -411,13 +392,13 @@ struct typesAtomicCASImpl<T, 1> {
 
 template <typename T>
 struct typesAtomicCASImpl<T, 2> {
-  __forceinline__ __device__ T operator()(T* addr, T const& compare,
-                                          T const& update_value) {
+  __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value)
+  {
     using T_int = unsigned int;
 
     bool is_32_align = (reinterpret_cast<size_t>(addr) & 2) ? false : true;
-    T_int* address_uint32 = reinterpret_cast<T_int*>(
-      reinterpret_cast<size_t>(addr) - (is_32_align ? 0 : 2));
+    T_int* address_uint32 =
+      reinterpret_cast<T_int*>(reinterpret_cast<size_t>(addr) - (is_32_align ? 0 : 2));
 
     T_int old = *address_uint32;
     T_int assumed;
@@ -425,12 +406,12 @@ struct typesAtomicCASImpl<T, 2> {
     uint16_t u_val = type_reinterpret<uint16_t, T>(update_value);
 
     do {
-      assumed = old;
+      assumed      = old;
       target_value = (is_32_align) ? T(old & 0xffff) : T(old >> 16);
       if (target_value != compare) break;
 
-      T_int new_value = (is_32_align) ? (old & 0xffff0000) | u_val
-                                      : (old & 0xffff) | (T_int(u_val) << 16);
+      T_int new_value =
+        (is_32_align) ? (old & 0xffff0000) | u_val : (old & 0xffff) | (T_int(u_val) << 16);
       old = atomicCAS(address_uint32, assumed, new_value);
     } while (assumed != old);
 
@@ -440,8 +421,8 @@ struct typesAtomicCASImpl<T, 2> {
 
 template <typename T>
 struct typesAtomicCASImpl<T, 4> {
-  __forceinline__ __device__ T operator()(T* addr, T const& compare,
-                                          T const& update_value) {
+  __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value)
+  {
     using T_int = unsigned int;
 
     T_int ret = atomicCAS(reinterpret_cast<T_int*>(addr),
@@ -454,8 +435,8 @@ struct typesAtomicCASImpl<T, 4> {
 // 8 bytes atomic operation
 template <typename T>
 struct typesAtomicCASImpl<T, 8> {
-  __forceinline__ __device__ T operator()(T* addr, T const& compare,
-                                          T const& update_value) {
+  __forceinline__ __device__ T operator()(T* addr, T const& compare, T const& update_value)
+  {
     using T_int = unsigned long long int;
     static_assert(sizeof(T) == sizeof(T_int), errmsg_cast);
 
@@ -487,11 +468,10 @@ struct typesAtomicCASImpl<T, 8> {
  * @returns The old value at `address`
  * -------------------------------------------------------------------------**/
 template <typename T, typename BinaryOp>
-typename std::enable_if_t<std::is_arithmetic<T>::value, T> __forceinline__
-  __device__
-  genericAtomicOperation(T* address, T const& update_value, BinaryOp op) {
-  auto fun =
-    raft::device_atomics::detail::genericAtomicOperationImpl<T, BinaryOp>{};
+typename std::enable_if_t<std::is_arithmetic<T>::value, T> __forceinline__ __device__
+genericAtomicOperation(T* address, T const& update_value, BinaryOp op)
+{
+  auto fun = raft::device_atomics::detail::genericAtomicOperationImpl<T, BinaryOp>{};
   return T(fun(address, update_value, op));
 }
 
@@ -499,11 +479,11 @@ typename std::enable_if_t<std::is_arithmetic<T>::value, T> __forceinline__
 template <typename BinaryOp>
 __forceinline__ __device__ bool genericAtomicOperation(bool* address,
                                                        bool const& update_value,
-                                                       BinaryOp op) {
+                                                       BinaryOp op)
+{
   using T = bool;
   // don't use underlying type to apply operation for bool
-  auto fun =
-    raft::device_atomics::detail::genericAtomicOperationImpl<T, BinaryOp>{};
+  auto fun = raft::device_atomics::detail::genericAtomicOperationImpl<T, BinaryOp>{};
   return T(fun(address, update_value, op));
 }
 
@@ -525,9 +505,9 @@ __forceinline__ __device__ bool genericAtomicOperation(bool* address,
  * @returns The old value at `address`
  */
 template <typename T>
-__forceinline__ __device__ T atomicAdd(T* address, T val) {
-  return raft::genericAtomicOperation(
-    address, val, raft::device_atomics::detail::DeviceSum{});
+__forceinline__ __device__ T atomicAdd(T* address, T val)
+{
+  return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceSum{});
 }
 
 /**
@@ -546,9 +526,9 @@ __forceinline__ __device__ T atomicAdd(T* address, T val) {
  * @returns The old value at `address`
  */
 template <typename T>
-__forceinline__ __device__ T atomicMin(T* address, T val) {
-  return raft::genericAtomicOperation(
-    address, val, raft::device_atomics::detail::DeviceMin{});
+__forceinline__ __device__ T atomicMin(T* address, T val)
+{
+  return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceMin{});
 }
 
 /**
@@ -567,9 +547,9 @@ __forceinline__ __device__ T atomicMin(T* address, T val) {
  * @returns The old value at `address`
  */
 template <typename T>
-__forceinline__ __device__ T atomicMax(T* address, T val) {
-  return raft::genericAtomicOperation(
-    address, val, raft::device_atomics::detail::DeviceMax{});
+__forceinline__ __device__ T atomicMax(T* address, T val)
+{
+  return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceMax{});
 }
 
 /**
@@ -589,9 +569,9 @@ __forceinline__ __device__ T atomicMax(T* address, T val) {
  * @returns The old value at `address`
  */
 template <typename T>
-__forceinline__ __device__ T atomicCAS(T* address, T compare, T val) {
-  return raft::device_atomics::detail::typesAtomicCASImpl<T>()(address, compare,
-                                                               val);
+__forceinline__ __device__ T atomicCAS(T* address, T compare, T val)
+{
+  return raft::device_atomics::detail::typesAtomicCASImpl<T>()(address, compare, val);
 }
 
 /**
@@ -609,11 +589,10 @@ __forceinline__ __device__ T atomicCAS(T* address, T compare, T val) {
  *
  * @returns The old value at `address`
  */
-template <typename T,
-          typename std::enable_if_t<std::is_integral<T>::value, T>* = nullptr>
-__forceinline__ __device__ T atomicAnd(T* address, T val) {
-  return raft::genericAtomicOperation(
-    address, val, raft::device_atomics::detail::DeviceAnd{});
+template <typename T, typename std::enable_if_t<std::is_integral<T>::value, T>* = nullptr>
+__forceinline__ __device__ T atomicAnd(T* address, T val)
+{
+  return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceAnd{});
 }
 
 /**
@@ -631,11 +610,10 @@ __forceinline__ __device__ T atomicAnd(T* address, T val) {
  *
  * @returns The old value at `address`
  */
-template <typename T,
-          typename std::enable_if_t<std::is_integral<T>::value, T>* = nullptr>
-__forceinline__ __device__ T atomicOr(T* address, T val) {
-  return raft::genericAtomicOperation(address, val,
-                                      raft::device_atomics::detail::DeviceOr{});
+template <typename T, typename std::enable_if_t<std::is_integral<T>::value, T>* = nullptr>
+__forceinline__ __device__ T atomicOr(T* address, T val)
+{
+  return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceOr{});
 }
 
 /**
@@ -653,9 +631,8 @@ __forceinline__ __device__ T atomicOr(T* address, T val) {
  *
  * @returns The old value at `address`
  */
-template <typename T,
-          typename std::enable_if_t<std::is_integral<T>::value, T>* = nullptr>
-__forceinline__ __device__ T atomicXor(T* address, T val) {
-  return raft::genericAtomicOperation(
-    address, val, raft::device_atomics::detail::DeviceXor{});
+template <typename T, typename std::enable_if_t<std::is_integral<T>::value, T>* = nullptr>
+__forceinline__ __device__ T atomicXor(T* address, T val)
+{
+  return raft::genericAtomicOperation(address, val, raft::device_atomics::detail::DeviceXor{});
 }
diff --git a/cpp/include/raft/distance/detail/canberra.cuh b/cpp/include/raft/distance/detail/canberra.cuh
index c4c384c45f..46edf0bf47 100644
--- a/cpp/include/raft/distance/detail/canberra.cuh
+++ b/cpp/include/raft/distance/detail/canberra.cuh
@@ -45,75 +45,108 @@ namespace detail {
  * @param fin_op    the final gemm epilogue lambda
  * @param stream    cuda stream to launch work
  */
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          int VecLen, typename FinalLambda, bool isRowMajor>
-static void canberraImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
-                         IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput,
-                         FinalLambda fin_op, cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          int VecLen,
+          typename FinalLambda,
+          bool isRowMajor>
+static void canberraImpl(const DataT* x,
+                         const DataT* y,
+                         IdxT m,
+                         IdxT n,
+                         IdxT k,
+                         IdxT lda,
+                         IdxT ldb,
+                         IdxT ldd,
+                         OutT* dOutput,
+                         FinalLambda fin_op,
+                         cudaStream_t stream)
+{
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
 
-  typedef
-    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
 
   dim3 blk(KPolicy::Nthreads);
 
   // Accumulation operation lambda
   auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
     const auto diff = raft::L1Op<AccT, IdxT>()(x - y);
-    const auto add = raft::myAbs(x) + raft::myAbs(y);
+    const auto add  = raft::myAbs(x) + raft::myAbs(y);
     // deal with potential for 0 in denominator by
     // forcing 1/0 instead
     acc += ((add != 0) * diff / (add + (add == 0)));
   };
 
   // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [] __device__(
-                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
-                         IdxT gridStrideY) { return; };
+  auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                                     DataT * regxn,
+                                     DataT * regyn,
+                                     IdxT gridStrideX,
+                                     IdxT gridStrideY) { return; };
 
   if (isRowMajor) {
-    auto canberraRowMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, true>;
-    dim3 grid =
-      launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, canberraRowMajor);
+    auto canberraRowMajor = pairwiseDistanceMatKernel<false,
+                                                      DataT,
+                                                      AccT,
+                                                      OutT,
+                                                      IdxT,
+                                                      KPolicy,
+                                                      decltype(core_lambda),
+                                                      decltype(epilog_lambda),
+                                                      FinalLambda,
+                                                      true>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, canberraRowMajor);
 
     canberraRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-      epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   } else {
-    auto canberraColMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, false>;
-    dim3 grid =
-      launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, canberraColMajor);
+    auto canberraColMajor = pairwiseDistanceMatKernel<false,
+                                                      DataT,
+                                                      AccT,
+                                                      OutT,
+                                                      IdxT,
+                                                      KPolicy,
+                                                      decltype(core_lambda),
+                                                      decltype(epilog_lambda),
+                                                      FinalLambda,
+                                                      false>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, canberraColMajor);
     canberraColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-      epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   }
 
   CUDA_CHECK(cudaGetLastError());
 }
 
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          typename FinalLambda, bool isRowMajor>
-void canberra(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
-              const DataT *x, const DataT *y, OutT *dOutput, FinalLambda fin_op,
-              cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          typename FinalLambda,
+          bool isRowMajor>
+void canberra(IdxT m,
+              IdxT n,
+              IdxT k,
+              IdxT lda,
+              IdxT ldb,
+              IdxT ldd,
+              const DataT* x,
+              const DataT* y,
+              OutT* dOutput,
+              FinalLambda fin_op,
+              cudaStream_t stream)
+{
   size_t bytesA = sizeof(DataT) * lda;
   size_t bytesB = sizeof(DataT) * ldb;
   if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    canberraImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
-                 isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
-                             stream);
+    canberraImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
   } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    canberraImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda,
-                 isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
-                             stream);
+    canberraImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
   } else {
     canberraImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
       x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
@@ -138,16 +171,25 @@ void canberra(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
  * @param[in] stream cuda stream to launch work
  * @param[in] isRowMajor whether the input and output matrices are row major
  */
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_ = int>
-void canberraImpl(int m, int n, int k, const InType *pA, const InType *pB,
-                  OutType *pD, FinalLambda fin_op, cudaStream_t stream,
-                  bool isRowMajor) {
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_ = int>
+void canberraImpl(int m,
+                  int n,
+                  int k,
+                  const InType* pA,
+                  const InType* pB,
+                  OutType* pD,
+                  FinalLambda fin_op,
+                  cudaStream_t stream,
+                  bool isRowMajor)
+{
   typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type
-    canberraOutType;
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type canberraOutType;
   Index_ lda, ldb, ldd;
-  canberraOutType *pDcast = reinterpret_cast<canberraOutType *>(pD);
+  canberraOutType* pDcast = reinterpret_cast<canberraOutType*>(pD);
   if (isRowMajor) {
     lda = k, ldb = k, ldd = n;
     canberra<InType, AccType, canberraOutType, Index_, FinalLambda, true>(
diff --git a/cpp/include/raft/distance/detail/chebyshev.cuh b/cpp/include/raft/distance/detail/chebyshev.cuh
index 77fba28310..99b314bd08 100644
--- a/cpp/include/raft/distance/detail/chebyshev.cuh
+++ b/cpp/include/raft/distance/detail/chebyshev.cuh
@@ -44,72 +44,105 @@ namespace detail {
  * @param[in]       fin_op the final gemm epilogue lambda
  * @param[in]       stream cuda stream to launch work
  */
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          int VecLen, typename FinalLambda, bool isRowMajor>
-static void chebyshevImpl(const DataT *x, const DataT *y, IdxT m, IdxT n,
-                          IdxT k, IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput,
-                          FinalLambda fin_op, cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          int VecLen,
+          typename FinalLambda,
+          bool isRowMajor>
+static void chebyshevImpl(const DataT* x,
+                          const DataT* y,
+                          IdxT m,
+                          IdxT n,
+                          IdxT k,
+                          IdxT lda,
+                          IdxT ldb,
+                          IdxT ldd,
+                          OutT* dOutput,
+                          FinalLambda fin_op,
+                          cudaStream_t stream)
+{
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
 
-  typedef
-    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
 
   dim3 blk(KPolicy::Nthreads);
 
   // Accumulation operation lambda
   auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
     const auto diff = raft::L1Op<AccT, IdxT>()(x - y);
-    acc = raft::myMax(acc, diff);
+    acc             = raft::myMax(acc, diff);
   };
 
   // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [] __device__(
-                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
-                         IdxT gridStrideY) { return; };
+  auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                                     DataT * regxn,
+                                     DataT * regyn,
+                                     IdxT gridStrideX,
+                                     IdxT gridStrideY) { return; };
 
   if (isRowMajor) {
-    auto chebyshevRowMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, true>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
-                                               chebyshevRowMajor);
+    auto chebyshevRowMajor = pairwiseDistanceMatKernel<false,
+                                                       DataT,
+                                                       AccT,
+                                                       OutT,
+                                                       IdxT,
+                                                       KPolicy,
+                                                       decltype(core_lambda),
+                                                       decltype(epilog_lambda),
+                                                       FinalLambda,
+                                                       true>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, chebyshevRowMajor);
 
     chebyshevRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-      epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   } else {
-    auto chebyshevColMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, false>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
-                                               chebyshevColMajor);
+    auto chebyshevColMajor = pairwiseDistanceMatKernel<false,
+                                                       DataT,
+                                                       AccT,
+                                                       OutT,
+                                                       IdxT,
+                                                       KPolicy,
+                                                       decltype(core_lambda),
+                                                       decltype(epilog_lambda),
+                                                       FinalLambda,
+                                                       false>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, chebyshevColMajor);
     chebyshevColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-      epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   }
 
   CUDA_CHECK(cudaGetLastError());
 }
 
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          typename FinalLambda, bool isRowMajor>
-void chebyshev(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
-               const DataT *x, const DataT *y, OutT *dOutput,
-               FinalLambda fin_op, cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          typename FinalLambda,
+          bool isRowMajor>
+void chebyshev(IdxT m,
+               IdxT n,
+               IdxT k,
+               IdxT lda,
+               IdxT ldb,
+               IdxT ldd,
+               const DataT* x,
+               const DataT* y,
+               OutT* dOutput,
+               FinalLambda fin_op,
+               cudaStream_t stream)
+{
   size_t bytesA = sizeof(DataT) * lda;
   size_t bytesB = sizeof(DataT) * ldb;
   if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    chebyshevImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
-                  isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
-                              stream);
+    chebyshevImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
   } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    chebyshevImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda,
-                  isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
-                              stream);
+    chebyshevImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
   } else {
     chebyshevImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
       x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
@@ -134,16 +167,25 @@ void chebyshev(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
  * @param[in] stream cuda stream to launch work
  * @param[in] isRowMajor whether the input and output matrices are row major
  */
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_ = int>
-void chebyshevImpl(int m, int n, int k, const InType *pA, const InType *pB,
-                   OutType *pD, FinalLambda fin_op, cudaStream_t stream,
-                   bool isRowMajor) {
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_ = int>
+void chebyshevImpl(int m,
+                   int n,
+                   int k,
+                   const InType* pA,
+                   const InType* pB,
+                   OutType* pD,
+                   FinalLambda fin_op,
+                   cudaStream_t stream,
+                   bool isRowMajor)
+{
   typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type
-    chebyshevOutType;
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type chebyshevOutType;
   Index_ lda, ldb, ldd;
-  chebyshevOutType *pDcast = reinterpret_cast<chebyshevOutType *>(pD);
+  chebyshevOutType* pDcast = reinterpret_cast<chebyshevOutType*>(pD);
   if (isRowMajor) {
     lda = k, ldb = k, ldd = n;
     chebyshev<InType, AccType, chebyshevOutType, Index_, FinalLambda, true>(
diff --git a/cpp/include/raft/distance/detail/correlation.cuh b/cpp/include/raft/distance/detail/correlation.cuh
index cee986997a..159f9ab580 100644
--- a/cpp/include/raft/distance/detail/correlation.cuh
+++ b/cpp/include/raft/distance/detail/correlation.cuh
@@ -47,69 +47,81 @@ namespace detail {
  * @param[in]       fin_op the final gemm epilogue lambda
  * @param[in]       stream cuda stream to launch work
  */
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          int VecLen, typename FinalLambda, bool isRowMajor>
-static void correlationImpl(const DataT *x, const DataT *y, const DataT *xn,
-                            const DataT *yn, const DataT *x2n, const DataT *y2n,
-                            IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb,
-                            IdxT ldd, OutT *dOutput, FinalLambda fin_op,
-                            cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          int VecLen,
+          typename FinalLambda,
+          bool isRowMajor>
+static void correlationImpl(const DataT* x,
+                            const DataT* y,
+                            const DataT* xn,
+                            const DataT* yn,
+                            const DataT* x2n,
+                            const DataT* y2n,
+                            IdxT m,
+                            IdxT n,
+                            IdxT k,
+                            IdxT lda,
+                            IdxT ldb,
+                            IdxT ldd,
+                            OutT* dOutput,
+                            FinalLambda fin_op,
+                            cudaStream_t stream)
+{
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
 
-  typedef
-    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
 
   dim3 blk(KPolicy::Nthreads);
 
   // Accumulation operation lambda
-  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
-    acc += x * y;
-  };
+  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; };
 
   // epilogue operation lambda for final value calculation
   auto epilog_lambda = [x2n, y2n, m, n, k] __device__(
                          AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
+                         DataT * regxn,
+                         DataT * regyn,
+                         IdxT gridStrideX,
                          IdxT gridStrideY) {
     DataT regx2n[KPolicy::AccRowsPerTh], regy2n[KPolicy::AccColsPerTh];
 
     extern __shared__ char smem[];
-    DataT *sx2Norm =
-      (DataT *)(&smem[KPolicy::SmemSize +
-                      (KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)]);
-    DataT *sy2Norm = (&sx2Norm[KPolicy::Mblk]);
+    DataT* sx2Norm =
+      (DataT*)(&smem[KPolicy::SmemSize + (KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)]);
+    DataT* sy2Norm = (&sx2Norm[KPolicy::Mblk]);
 
     // Load x & y norms required by this threadblock in shmem buffer
     if (gridStrideX == blockIdx.x * KPolicy::Nblk) {
       for (int i = threadIdx.x; i < KPolicy::Mblk; i += KPolicy::Nthreads) {
-        auto idx = gridStrideY + i;
+        auto idx   = gridStrideY + i;
         sx2Norm[i] = idx < m ? x2n[idx] : 0;
       }
     }
 
     for (int i = threadIdx.x; i < KPolicy::Nblk; i += KPolicy::Nthreads) {
-      auto idx = gridStrideX + i;
+      auto idx   = gridStrideX + i;
       sy2Norm[i] = idx < n ? y2n[idx] : 0;
     }
     __syncthreads();
 
 #pragma unroll
     for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
-      regx2n[i] =
-        sx2Norm[i * KPolicy::AccThRows + (threadIdx.x / KPolicy::AccThCols)];
+      regx2n[i] = sx2Norm[i * KPolicy::AccThRows + (threadIdx.x / KPolicy::AccThCols)];
     }
 #pragma unroll
     for (int i = 0; i < KPolicy::AccColsPerTh; ++i) {
-      regy2n[i] =
-        sy2Norm[i * KPolicy::AccThCols + (threadIdx.x % KPolicy::AccThCols)];
+      regy2n[i] = sy2Norm[i * KPolicy::AccThCols + (threadIdx.x % KPolicy::AccThCols)];
     }
 
 #pragma unroll
     for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
 #pragma unroll
       for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
-        auto numer = k * acc[i][j] - (regxn[i] * regyn[j]);
+        auto numer   = k * acc[i][j] - (regxn[i] * regyn[j]);
         auto Q_denom = k * regx2n[i] - (regxn[i] * regxn[i]);
         auto R_denom = k * regy2n[j] - (regyn[j] * regyn[j]);
 
@@ -121,46 +133,68 @@ static void correlationImpl(const DataT *x, const DataT *y, const DataT *xn,
   constexpr size_t shmemSize =
     KPolicy::SmemSize + (2 * (KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT));
   if (isRowMajor) {
-    constexpr auto correlationRowMajor =
-      pairwiseDistanceMatKernel<true, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, true>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
-                                               correlationRowMajor);
+    constexpr auto correlationRowMajor = pairwiseDistanceMatKernel<true,
+                                                                   DataT,
+                                                                   AccT,
+                                                                   OutT,
+                                                                   IdxT,
+                                                                   KPolicy,
+                                                                   decltype(core_lambda),
+                                                                   decltype(epilog_lambda),
+                                                                   FinalLambda,
+                                                                   true>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, correlationRowMajor);
     correlationRowMajor<<<grid, blk, shmemSize, stream>>>(
-      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda,
-      fin_op);
+      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   } else {
-    constexpr auto correlationColMajor =
-      pairwiseDistanceMatKernel<true, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, false>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
-                                               correlationColMajor);
+    constexpr auto correlationColMajor = pairwiseDistanceMatKernel<true,
+                                                                   DataT,
+                                                                   AccT,
+                                                                   OutT,
+                                                                   IdxT,
+                                                                   KPolicy,
+                                                                   decltype(core_lambda),
+                                                                   decltype(epilog_lambda),
+                                                                   FinalLambda,
+                                                                   false>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, correlationColMajor);
     correlationColMajor<<<grid, blk, shmemSize, stream>>>(
-      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda,
-      fin_op);
+      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   }
 
   CUDA_CHECK(cudaGetLastError());
 }
 
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          typename FinalLambda, bool isRowMajor>
-void correlation(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
-                 const DataT *x, const DataT *y, const DataT *xn,
-                 const DataT *yn, const DataT *x2n, const DataT *y2n,
-                 OutT *dOutput, FinalLambda fin_op, cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          typename FinalLambda,
+          bool isRowMajor>
+void correlation(IdxT m,
+                 IdxT n,
+                 IdxT k,
+                 IdxT lda,
+                 IdxT ldb,
+                 IdxT ldd,
+                 const DataT* x,
+                 const DataT* y,
+                 const DataT* xn,
+                 const DataT* yn,
+                 const DataT* x2n,
+                 const DataT* y2n,
+                 OutT* dOutput,
+                 FinalLambda fin_op,
+                 cudaStream_t stream)
+{
   size_t bytesA = sizeof(DataT) * lda;
   size_t bytesB = sizeof(DataT) * ldb;
   if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    correlationImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
-                    isRowMajor>(x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd,
-                                dOutput, fin_op, stream);
+    correlationImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
   } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    correlationImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda,
-                    isRowMajor>(x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd,
-                                dOutput, fin_op, stream);
+    correlationImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
   } else {
     correlationImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
       x, y, xn, yn, x2n, y2n, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
@@ -185,63 +219,118 @@ void correlation(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
  * @param stream cuda stream where to launch work
  * @param isRowMajor whether the input and output matrices are row major
  */
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_ = int>
-void correlationImpl(int m, int n, int k, const InType *pA, const InType *pB,
-                     OutType *pD, AccType *workspace, size_t &worksize,
-                     FinalLambda fin_op, cudaStream_t stream, bool isRowMajor) {
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_ = int>
+void correlationImpl(int m,
+                     int n,
+                     int k,
+                     const InType* pA,
+                     const InType* pB,
+                     OutType* pD,
+                     AccType* workspace,
+                     size_t& worksize,
+                     FinalLambda fin_op,
+                     cudaStream_t stream,
+                     bool isRowMajor)
+{
   typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type
-    correlationOutType;
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type correlationOutType;
   Index_ lda, ldb, ldd;
-  correlationOutType *pDcast = reinterpret_cast<correlationOutType *>(pD);
+  correlationOutType* pDcast = reinterpret_cast<correlationOutType*>(pD);
 
   ASSERT(!(((pA != pB) && (worksize < 2 * (m + n) * sizeof(AccType))) ||
            (worksize < 2 * m * sizeof(AccType))),
          "workspace size error");
   ASSERT(workspace != nullptr, "workspace is null");
 
-  AccType *norm_col_vec = workspace;
-  AccType *norm_row_vec = workspace;
-  AccType *sq_norm_col_vec = workspace;
-  AccType *sq_norm_row_vec = workspace;
+  AccType* norm_col_vec    = workspace;
+  AccType* norm_row_vec    = workspace;
+  AccType* sq_norm_col_vec = workspace;
+  AccType* sq_norm_row_vec = workspace;
   if (pA != pB) {
     norm_row_vec += m;
 
-    raft::linalg::reduce(norm_col_vec, pA, k, m, (AccType)0, isRowMajor, true,
-                         stream, false, raft::Nop<InType>(),
+    raft::linalg::reduce(norm_col_vec,
+                         pA,
+                         k,
+                         m,
+                         (AccType)0,
+                         isRowMajor,
+                         true,
+                         stream,
+                         false,
+                         raft::Nop<InType>(),
                          raft::Sum<InType>());
-    raft::linalg::reduce(norm_row_vec, pB, k, n, (AccType)0, isRowMajor, true,
-                         stream, false, raft::Nop<InType>(),
+    raft::linalg::reduce(norm_row_vec,
+                         pB,
+                         k,
+                         n,
+                         (AccType)0,
+                         isRowMajor,
+                         true,
+                         stream,
+                         false,
+                         raft::Nop<InType>(),
                          raft::Sum<InType>());
 
     sq_norm_col_vec += (m + n);
     sq_norm_row_vec = sq_norm_col_vec + m;
-    raft::linalg::rowNorm(sq_norm_col_vec, pA, k, m, raft::linalg::L2Norm,
-                          isRowMajor, stream);
-    raft::linalg::rowNorm(sq_norm_row_vec, pB, k, n, raft::linalg::L2Norm,
-                          isRowMajor, stream);
+    raft::linalg::rowNorm(sq_norm_col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream);
+    raft::linalg::rowNorm(sq_norm_row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream);
   } else {
-    raft::linalg::reduce(norm_col_vec, pA, k, m, (AccType)0, isRowMajor, true,
-                         stream, false, raft::Nop<InType>(),
+    raft::linalg::reduce(norm_col_vec,
+                         pA,
+                         k,
+                         m,
+                         (AccType)0,
+                         isRowMajor,
+                         true,
+                         stream,
+                         false,
+                         raft::Nop<InType>(),
                          raft::Sum<InType>());
     sq_norm_col_vec += m;
     sq_norm_row_vec = sq_norm_col_vec;
-    raft::linalg::rowNorm(sq_norm_col_vec, pA, k, m, raft::linalg::L2Norm,
-                          isRowMajor, stream);
+    raft::linalg::rowNorm(sq_norm_col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream);
   }
 
   if (isRowMajor) {
     lda = k, ldb = k, ldd = n;
-    correlation<InType, AccType, correlationOutType, Index_, FinalLambda, true>(
-      m, n, k, lda, ldb, ldd, pA, pB, norm_col_vec, norm_row_vec,
-      sq_norm_col_vec, sq_norm_row_vec, pDcast, fin_op, stream);
+    correlation<InType, AccType, correlationOutType, Index_, FinalLambda, true>(m,
+                                                                                n,
+                                                                                k,
+                                                                                lda,
+                                                                                ldb,
+                                                                                ldd,
+                                                                                pA,
+                                                                                pB,
+                                                                                norm_col_vec,
+                                                                                norm_row_vec,
+                                                                                sq_norm_col_vec,
+                                                                                sq_norm_row_vec,
+                                                                                pDcast,
+                                                                                fin_op,
+                                                                                stream);
   } else {
     lda = n, ldb = m, ldd = m;
-    correlation<InType, AccType, correlationOutType, Index_, FinalLambda,
-                false>(n, m, k, lda, ldb, ldd, pB, pA, norm_row_vec,
-                       norm_col_vec, sq_norm_row_vec, sq_norm_col_vec, pDcast,
-                       fin_op, stream);
+    correlation<InType, AccType, correlationOutType, Index_, FinalLambda, false>(n,
+                                                                                 m,
+                                                                                 k,
+                                                                                 lda,
+                                                                                 ldb,
+                                                                                 ldd,
+                                                                                 pB,
+                                                                                 pA,
+                                                                                 norm_row_vec,
+                                                                                 norm_col_vec,
+                                                                                 sq_norm_row_vec,
+                                                                                 sq_norm_col_vec,
+                                                                                 pDcast,
+                                                                                 fin_op,
+                                                                                 stream);
   }
 }
 
diff --git a/cpp/include/raft/distance/detail/cosine.cuh b/cpp/include/raft/distance/detail/cosine.cuh
index 900e045edc..5684fd0a16 100644
--- a/cpp/include/raft/distance/detail/cosine.cuh
+++ b/cpp/include/raft/distance/detail/cosine.cuh
@@ -25,7 +25,7 @@ namespace detail {
 
 /**
  * @brief the cosine distance matrix calculation implementer
- *  It computes the following equation: 
+ *  It computes the following equation:
  *    C = 1 - op(A * B / sqrt(A^2) * sqrt(B^2)))
  * @tparam DataT input data-type (for A and B matrices)
  * @tparam AccT   accumulation data-type
@@ -50,30 +50,43 @@ namespace detail {
  * @param fin_op  the final gemm epilogue lambda
 *  @param stream  cuda stream to launch cuda operations.
  */
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          int VecLen, typename FinalLambda, bool isRowMajor>
-void cosineImpl(const DataT *x, const DataT *y, const DataT *xn,
-                const DataT *yn, IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb,
-                IdxT ldd, OutT *dOutput, FinalLambda fin_op,
-                cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          int VecLen,
+          typename FinalLambda,
+          bool isRowMajor>
+void cosineImpl(const DataT* x,
+                const DataT* y,
+                const DataT* xn,
+                const DataT* yn,
+                IdxT m,
+                IdxT n,
+                IdxT k,
+                IdxT lda,
+                IdxT ldb,
+                IdxT ldd,
+                OutT* dOutput,
+                FinalLambda fin_op,
+                cudaStream_t stream)
+{
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
 
-  typedef
-    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
 
   dim3 blk(KPolicy::Nthreads);
 
   // Accumulation operation lambda
-  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
-    acc += x * y;
-  };
+  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; };
 
   // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [] __device__(
-                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
-                         IdxT gridStrideY) {
+  auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                                     DataT * regxn,
+                                     DataT * regyn,
+                                     IdxT gridStrideX,
+                                     IdxT gridStrideY) {
 #pragma unroll
     for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
 #pragma unroll
@@ -86,43 +99,66 @@ void cosineImpl(const DataT *x, const DataT *y, const DataT *xn,
   constexpr size_t shmemSize =
     KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT));
   if (isRowMajor) {
-    auto cosineRowMajor =
-      pairwiseDistanceMatKernel<true, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, true>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, shmemSize, cosineRowMajor);
+    auto cosineRowMajor = pairwiseDistanceMatKernel<true,
+                                                    DataT,
+                                                    AccT,
+                                                    OutT,
+                                                    IdxT,
+                                                    KPolicy,
+                                                    decltype(core_lambda),
+                                                    decltype(epilog_lambda),
+                                                    FinalLambda,
+                                                    true>;
+    dim3 grid           = launchConfigGenerator<KPolicy>(m, n, shmemSize, cosineRowMajor);
     cosineRowMajor<<<grid, blk, shmemSize, stream>>>(
-      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda,
-      fin_op);
+      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   } else {
-    auto cosineColMajor =
-      pairwiseDistanceMatKernel<true, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, false>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, shmemSize, cosineColMajor);
+    auto cosineColMajor = pairwiseDistanceMatKernel<true,
+                                                    DataT,
+                                                    AccT,
+                                                    OutT,
+                                                    IdxT,
+                                                    KPolicy,
+                                                    decltype(core_lambda),
+                                                    decltype(epilog_lambda),
+                                                    FinalLambda,
+                                                    false>;
+    dim3 grid           = launchConfigGenerator<KPolicy>(m, n, shmemSize, cosineColMajor);
     cosineColMajor<<<grid, blk, shmemSize, stream>>>(
-      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda,
-      fin_op);
+      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   }
 
   CUDA_CHECK(cudaGetLastError());
 }
 
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          typename FinalLambda, bool isRowMajor>
-void cosine(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
-            const DataT *x, const DataT *y, const DataT *xn, const DataT *yn,
-            OutT *dOutput, FinalLambda fin_op, cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          typename FinalLambda,
+          bool isRowMajor>
+void cosine(IdxT m,
+            IdxT n,
+            IdxT k,
+            IdxT lda,
+            IdxT ldb,
+            IdxT ldd,
+            const DataT* x,
+            const DataT* y,
+            const DataT* xn,
+            const DataT* yn,
+            OutT* dOutput,
+            FinalLambda fin_op,
+            cudaStream_t stream)
+{
   size_t bytesA = sizeof(DataT) * lda;
   size_t bytesB = sizeof(DataT) * ldb;
   if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    cosineImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
-               isRowMajor>(x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput,
-                           fin_op, stream);
+    cosineImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
   } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    cosineImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda,
-               isRowMajor>(x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput,
-                           fin_op, stream);
+    cosineImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
   } else {
     cosineImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
       x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
@@ -131,7 +167,7 @@ void cosine(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
 
 /**
  * @brief the expanded cosine distance matrix calculation
- *  It computes the following equation: 
+ *  It computes the following equation:
  *              C = 1 - op(A * B / sqrt(A^2) * sqrt(B^2)))
  * @tparam IType input data-type (for A and B matrices)
  * @tparam AccType accumulation data-type
@@ -152,12 +188,23 @@ void cosine(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
  * @param stream cuda stream where to launch work
  * @param isRowMajor whether the input and output matrices are row major
  */
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_ = int>
-void cosineAlgo1(Index_ m, Index_ n, Index_ k, const InType *pA,
-                 const InType *pB, OutType *pD, AccType *workspace,
-                 size_t worksize, FinalLambda fin_op, cudaStream_t stream,
-                 bool isRowMajor) {
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_ = int>
+void cosineAlgo1(Index_ m,
+                 Index_ n,
+                 Index_ k,
+                 const InType* pA,
+                 const InType* pB,
+                 OutType* pD,
+                 AccType* workspace,
+                 size_t worksize,
+                 FinalLambda fin_op,
+                 cudaStream_t stream,
+                 bool isRowMajor)
+{
   auto norm_op = [] __device__(AccType in) { return raft::mySqrt(in); };
 
   // Wrap fin_op to allow computing 1 - pA before calling fin_op
@@ -166,39 +213,33 @@ void cosineAlgo1(Index_ m, Index_ n, Index_ k, const InType *pA,
   };
 
   typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type
-    CosOutType;
-  CosOutType *pDcast = reinterpret_cast<CosOutType *>(pD);
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type CosOutType;
+  CosOutType* pDcast = reinterpret_cast<CosOutType*>(pD);
 
-  ASSERT(!(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) ||
-           (worksize < m * sizeof(AccType))),
-         "workspace size error");
+  ASSERT(
+    !(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || (worksize < m * sizeof(AccType))),
+    "workspace size error");
   ASSERT(workspace != nullptr, "workspace is null");
 
   Index_ lda, ldb, ldd;
-  InType *col_vec = workspace;
-  InType *row_vec = workspace;
+  InType* col_vec = workspace;
+  InType* row_vec = workspace;
   if (pA != pB) {
     row_vec += m;
-    raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor,
-                          stream, norm_op);
-    raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor,
-                          stream, norm_op);
+    raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op);
+    raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream, norm_op);
   } else {
-    raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor,
-                          stream, norm_op);
+    raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op);
   }
 
   if (isRowMajor) {
     lda = k, ldb = k, ldd = n;
     cosine<InType, AccType, CosOutType, Index_, decltype(wrapped_fin_op), true>(
-      m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, pDcast, wrapped_fin_op,
-      stream);
+      m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, pDcast, wrapped_fin_op, stream);
   } else {
     lda = n, ldb = m, ldd = m;
-    cosine<InType, AccType, CosOutType, Index_, decltype(wrapped_fin_op),
-           false>(n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, pDcast,
-                  wrapped_fin_op, stream);
+    cosine<InType, AccType, CosOutType, Index_, decltype(wrapped_fin_op), false>(
+      n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, pDcast, wrapped_fin_op, stream);
   }
 }
 
diff --git a/cpp/include/raft/distance/detail/distance.cuh b/cpp/include/raft/distance/detail/distance.cuh
index 199dc73fb6..91838e8bfa 100644
--- a/cpp/include/raft/distance/detail/distance.cuh
+++ b/cpp/include/raft/distance/detail/distance.cuh
@@ -85,211 +85,461 @@ enum DistanceType : unsigned short {
 };
 
 namespace {
-template <raft::distance::DistanceType distanceType, typename InType,
-          typename AccType, typename OutType, typename FinalLambda,
+template <raft::distance::DistanceType distanceType,
+          typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
           typename Index_>
 struct DistanceImpl {
-  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
-           Index_ k, void *workspace, size_t worksize, FinalLambda fin_op,
-           cudaStream_t stream, bool isRowMajor, InType metric_arg = 2.0f) {}
+  void run(const InType* x,
+           const InType* y,
+           OutType* dist,
+           Index_ m,
+           Index_ n,
+           Index_ k,
+           void* workspace,
+           size_t worksize,
+           FinalLambda fin_op,
+           cudaStream_t stream,
+           bool isRowMajor,
+           InType metric_arg = 2.0f)
+  {
+  }
 };
 
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::L2Expanded, InType, AccType,
-                    OutType, FinalLambda, Index_> {
-  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
-           Index_ k, void *workspace, size_t worksize, FinalLambda fin_op,
-           cudaStream_t stream, bool isRowMajor, InType) {
-    raft::distance::detail::euclideanAlgo1<InType, AccType, OutType,
-                                           FinalLambda, Index_>(
-      m, n, k, x, y, dist, false, (AccType *)workspace, worksize, fin_op,
-      stream, isRowMajor);
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::L2Expanded,
+                    InType,
+                    AccType,
+                    OutType,
+                    FinalLambda,
+                    Index_> {
+  void run(const InType* x,
+           const InType* y,
+           OutType* dist,
+           Index_ m,
+           Index_ n,
+           Index_ k,
+           void* workspace,
+           size_t worksize,
+           FinalLambda fin_op,
+           cudaStream_t stream,
+           bool isRowMajor,
+           InType)
+  {
+    raft::distance::detail::euclideanAlgo1<InType, AccType, OutType, FinalLambda, Index_>(
+      m, n, k, x, y, dist, false, (AccType*)workspace, worksize, fin_op, stream, isRowMajor);
   }
 };
 
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::L2SqrtExpanded, InType,
-                    AccType, OutType, FinalLambda, Index_> {
-  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
-           Index_ k, void *workspace, size_t worksize, FinalLambda fin_op,
-           cudaStream_t stream, bool isRowMajor, InType) {
-    raft::distance::detail::euclideanAlgo1<InType, AccType, OutType,
-                                           FinalLambda, Index_>(
-      m, n, k, x, y, dist, true, (AccType *)workspace, worksize, fin_op, stream,
-      isRowMajor);
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::L2SqrtExpanded,
+                    InType,
+                    AccType,
+                    OutType,
+                    FinalLambda,
+                    Index_> {
+  void run(const InType* x,
+           const InType* y,
+           OutType* dist,
+           Index_ m,
+           Index_ n,
+           Index_ k,
+           void* workspace,
+           size_t worksize,
+           FinalLambda fin_op,
+           cudaStream_t stream,
+           bool isRowMajor,
+           InType)
+  {
+    raft::distance::detail::euclideanAlgo1<InType, AccType, OutType, FinalLambda, Index_>(
+      m, n, k, x, y, dist, true, (AccType*)workspace, worksize, fin_op, stream, isRowMajor);
   }
 };
 
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::CosineExpanded, InType,
-                    AccType, OutType, FinalLambda, Index_> {
-  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
-           Index_ k, void *workspace, size_t worksize, FinalLambda fin_op,
-           cudaStream_t stream, bool isRowMajor, InType) {
-    raft::distance::detail::cosineAlgo1<InType, AccType, OutType, FinalLambda,
-                                        Index_>(m, n, k, x, y, dist,
-                                                (AccType *)workspace, worksize,
-                                                fin_op, stream, isRowMajor);
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::CosineExpanded,
+                    InType,
+                    AccType,
+                    OutType,
+                    FinalLambda,
+                    Index_> {
+  void run(const InType* x,
+           const InType* y,
+           OutType* dist,
+           Index_ m,
+           Index_ n,
+           Index_ k,
+           void* workspace,
+           size_t worksize,
+           FinalLambda fin_op,
+           cudaStream_t stream,
+           bool isRowMajor,
+           InType)
+  {
+    raft::distance::detail::cosineAlgo1<InType, AccType, OutType, FinalLambda, Index_>(
+      m, n, k, x, y, dist, (AccType*)workspace, worksize, fin_op, stream, isRowMajor);
   }
 };
 
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::L2Unexpanded, InType, AccType,
-                    OutType, FinalLambda, Index_> {
-  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
-           Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream,
-           bool isRowMajor, InType) {
-    raft::distance::detail::euclideanAlgo2<InType, AccType, OutType,
-                                           FinalLambda, Index_>(
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::L2Unexpanded,
+                    InType,
+                    AccType,
+                    OutType,
+                    FinalLambda,
+                    Index_> {
+  void run(const InType* x,
+           const InType* y,
+           OutType* dist,
+           Index_ m,
+           Index_ n,
+           Index_ k,
+           void*,
+           size_t,
+           FinalLambda fin_op,
+           cudaStream_t stream,
+           bool isRowMajor,
+           InType)
+  {
+    raft::distance::detail::euclideanAlgo2<InType, AccType, OutType, FinalLambda, Index_>(
       m, n, k, x, y, dist, false, fin_op, stream, isRowMajor);
   }
 };
 
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::L2SqrtUnexpanded, InType,
-                    AccType, OutType, FinalLambda, Index_> {
-  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
-           Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream,
-           bool isRowMajor, InType) {
-    raft::distance::detail::euclideanAlgo2<InType, AccType, OutType,
-                                           FinalLambda, Index_>(
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::L2SqrtUnexpanded,
+                    InType,
+                    AccType,
+                    OutType,
+                    FinalLambda,
+                    Index_> {
+  void run(const InType* x,
+           const InType* y,
+           OutType* dist,
+           Index_ m,
+           Index_ n,
+           Index_ k,
+           void*,
+           size_t,
+           FinalLambda fin_op,
+           cudaStream_t stream,
+           bool isRowMajor,
+           InType)
+  {
+    raft::distance::detail::euclideanAlgo2<InType, AccType, OutType, FinalLambda, Index_>(
       m, n, k, x, y, dist, true, fin_op, stream, isRowMajor);
   }
 };
 
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::L1, InType, AccType, OutType,
-                    FinalLambda, Index_> {
-  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
-           Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream,
-           bool isRowMajor, InType) {
-    raft::distance::detail::l1Impl<InType, AccType, OutType, FinalLambda,
-                                   Index_>(m, n, k, x, y, dist, fin_op, stream,
-                                           isRowMajor);
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::L1,
+                    InType,
+                    AccType,
+                    OutType,
+                    FinalLambda,
+                    Index_> {
+  void run(const InType* x,
+           const InType* y,
+           OutType* dist,
+           Index_ m,
+           Index_ n,
+           Index_ k,
+           void*,
+           size_t,
+           FinalLambda fin_op,
+           cudaStream_t stream,
+           bool isRowMajor,
+           InType)
+  {
+    raft::distance::detail::l1Impl<InType, AccType, OutType, FinalLambda, Index_>(
+      m, n, k, x, y, dist, fin_op, stream, isRowMajor);
   }
 };
 
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::Linf, InType, AccType,
-                    OutType, FinalLambda, Index_> {
-  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
-           Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream,
-           bool isRowMajor, InType) {
-    raft::distance::detail::chebyshevImpl<InType, AccType, OutType, FinalLambda,
-                                          Index_>(m, n, k, x, y, dist, fin_op,
-                                                  stream, isRowMajor);
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::Linf,
+                    InType,
+                    AccType,
+                    OutType,
+                    FinalLambda,
+                    Index_> {
+  void run(const InType* x,
+           const InType* y,
+           OutType* dist,
+           Index_ m,
+           Index_ n,
+           Index_ k,
+           void*,
+           size_t,
+           FinalLambda fin_op,
+           cudaStream_t stream,
+           bool isRowMajor,
+           InType)
+  {
+    raft::distance::detail::chebyshevImpl<InType, AccType, OutType, FinalLambda, Index_>(
+      m, n, k, x, y, dist, fin_op, stream, isRowMajor);
   }
 };
 
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::HellingerExpanded, InType,
-                    AccType, OutType, FinalLambda, Index_> {
-  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
-           Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream,
-           bool isRowMajor, InType) {
-    raft::distance::detail::hellingerImpl<InType, AccType, OutType, FinalLambda,
-                                          Index_>(m, n, k, x, y, dist, fin_op,
-                                                  stream, isRowMajor);
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::HellingerExpanded,
+                    InType,
+                    AccType,
+                    OutType,
+                    FinalLambda,
+                    Index_> {
+  void run(const InType* x,
+           const InType* y,
+           OutType* dist,
+           Index_ m,
+           Index_ n,
+           Index_ k,
+           void*,
+           size_t,
+           FinalLambda fin_op,
+           cudaStream_t stream,
+           bool isRowMajor,
+           InType)
+  {
+    raft::distance::detail::hellingerImpl<InType, AccType, OutType, FinalLambda, Index_>(
+      m, n, k, x, y, dist, fin_op, stream, isRowMajor);
   }
 };
 
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::LpUnexpanded, InType, AccType,
-                    OutType, FinalLambda, Index_> {
-  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
-           Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream,
-           bool isRowMajor, InType metric_arg) {
-    raft::distance::detail::minkowskiImpl<InType, AccType, OutType, FinalLambda,
-                                          Index_>(
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::LpUnexpanded,
+                    InType,
+                    AccType,
+                    OutType,
+                    FinalLambda,
+                    Index_> {
+  void run(const InType* x,
+           const InType* y,
+           OutType* dist,
+           Index_ m,
+           Index_ n,
+           Index_ k,
+           void*,
+           size_t,
+           FinalLambda fin_op,
+           cudaStream_t stream,
+           bool isRowMajor,
+           InType metric_arg)
+  {
+    raft::distance::detail::minkowskiImpl<InType, AccType, OutType, FinalLambda, Index_>(
       m, n, k, x, y, dist, fin_op, stream, isRowMajor, metric_arg);
   }
 };
 
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::Canberra, InType, AccType,
-                    OutType, FinalLambda, Index_> {
-  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
-           Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream,
-           bool isRowMajor, InType) {
-    raft::distance::detail::canberraImpl<InType, AccType, OutType, FinalLambda,
-                                         Index_>(m, n, k, x, y, dist, fin_op,
-                                                 stream, isRowMajor);
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::Canberra,
+                    InType,
+                    AccType,
+                    OutType,
+                    FinalLambda,
+                    Index_> {
+  void run(const InType* x,
+           const InType* y,
+           OutType* dist,
+           Index_ m,
+           Index_ n,
+           Index_ k,
+           void*,
+           size_t,
+           FinalLambda fin_op,
+           cudaStream_t stream,
+           bool isRowMajor,
+           InType)
+  {
+    raft::distance::detail::canberraImpl<InType, AccType, OutType, FinalLambda, Index_>(
+      m, n, k, x, y, dist, fin_op, stream, isRowMajor);
   }
 };
 
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::HammingUnexpanded, InType,
-                    AccType, OutType, FinalLambda, Index_> {
-  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
-           Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream,
-           bool isRowMajor, InType) {
-    raft::distance::detail::hammingUnexpandedImpl<InType, AccType, OutType,
-                                                  FinalLambda, Index_>(
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::HammingUnexpanded,
+                    InType,
+                    AccType,
+                    OutType,
+                    FinalLambda,
+                    Index_> {
+  void run(const InType* x,
+           const InType* y,
+           OutType* dist,
+           Index_ m,
+           Index_ n,
+           Index_ k,
+           void*,
+           size_t,
+           FinalLambda fin_op,
+           cudaStream_t stream,
+           bool isRowMajor,
+           InType)
+  {
+    raft::distance::detail::hammingUnexpandedImpl<InType, AccType, OutType, FinalLambda, Index_>(
       m, n, k, x, y, dist, fin_op, stream, isRowMajor);
   }
 };
 
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::JensenShannon, InType,
-                    AccType, OutType, FinalLambda, Index_> {
-  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
-           Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream,
-           bool isRowMajor, InType) {
-    raft::distance::detail::jensenShannonImpl<InType, AccType, OutType,
-                                              FinalLambda, Index_>(
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::JensenShannon,
+                    InType,
+                    AccType,
+                    OutType,
+                    FinalLambda,
+                    Index_> {
+  void run(const InType* x,
+           const InType* y,
+           OutType* dist,
+           Index_ m,
+           Index_ n,
+           Index_ k,
+           void*,
+           size_t,
+           FinalLambda fin_op,
+           cudaStream_t stream,
+           bool isRowMajor,
+           InType)
+  {
+    raft::distance::detail::jensenShannonImpl<InType, AccType, OutType, FinalLambda, Index_>(
       m, n, k, x, y, dist, fin_op, stream, isRowMajor);
   }
 };
 
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::RusselRaoExpanded, InType,
-                    AccType, OutType, FinalLambda, Index_> {
-  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
-           Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream,
-           bool isRowMajor, InType) {
-    raft::distance::detail::russellRaoImpl<InType, AccType, OutType,
-                                           FinalLambda, Index_>(
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::RusselRaoExpanded,
+                    InType,
+                    AccType,
+                    OutType,
+                    FinalLambda,
+                    Index_> {
+  void run(const InType* x,
+           const InType* y,
+           OutType* dist,
+           Index_ m,
+           Index_ n,
+           Index_ k,
+           void*,
+           size_t,
+           FinalLambda fin_op,
+           cudaStream_t stream,
+           bool isRowMajor,
+           InType)
+  {
+    raft::distance::detail::russellRaoImpl<InType, AccType, OutType, FinalLambda, Index_>(
       m, n, k, x, y, dist, fin_op, stream, isRowMajor);
   }
 };
 
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::KLDivergence, InType, AccType,
-                    OutType, FinalLambda, Index_> {
-  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
-           Index_ k, void *, size_t, FinalLambda fin_op, cudaStream_t stream,
-           bool isRowMajor, InType) {
-    raft::distance::detail::klDivergenceImpl<InType, AccType, OutType,
-                                             FinalLambda, Index_>(
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::KLDivergence,
+                    InType,
+                    AccType,
+                    OutType,
+                    FinalLambda,
+                    Index_> {
+  void run(const InType* x,
+           const InType* y,
+           OutType* dist,
+           Index_ m,
+           Index_ n,
+           Index_ k,
+           void*,
+           size_t,
+           FinalLambda fin_op,
+           cudaStream_t stream,
+           bool isRowMajor,
+           InType)
+  {
+    raft::distance::detail::klDivergenceImpl<InType, AccType, OutType, FinalLambda, Index_>(
       m, n, k, x, y, dist, fin_op, stream, isRowMajor);
   }
 };
 
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_>
-struct DistanceImpl<raft::distance::DistanceType::CorrelationExpanded, InType,
-                    AccType, OutType, FinalLambda, Index_> {
-  void run(const InType *x, const InType *y, OutType *dist, Index_ m, Index_ n,
-           Index_ k, void *workspace, size_t worksize, FinalLambda fin_op,
-           cudaStream_t stream, bool isRowMajor, InType) {
-    raft::distance::detail::correlationImpl<InType, AccType, OutType,
-                                            FinalLambda, Index_>(
-      m, n, k, x, y, dist, (AccType *)workspace, worksize, fin_op, stream,
-      isRowMajor);
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_>
+struct DistanceImpl<raft::distance::DistanceType::CorrelationExpanded,
+                    InType,
+                    AccType,
+                    OutType,
+                    FinalLambda,
+                    Index_> {
+  void run(const InType* x,
+           const InType* y,
+           OutType* dist,
+           Index_ m,
+           Index_ n,
+           Index_ k,
+           void* workspace,
+           size_t worksize,
+           FinalLambda fin_op,
+           cudaStream_t stream,
+           bool isRowMajor,
+           InType)
+  {
+    raft::distance::detail::correlationImpl<InType, AccType, OutType, FinalLambda, Index_>(
+      m, n, k, x, y, dist, (AccType*)workspace, worksize, fin_op, stream, isRowMajor);
   }
 };
 
@@ -320,53 +570,71 @@ struct DistanceImpl<raft::distance::DistanceType::CorrelationExpanded, InType,
  * as follows:  <pre>OutType fin_op(AccType in, int g_idx);</pre>. If one needs
  * any other parameters, feel free to pass them via closure.
  */
-template <raft::distance::DistanceType distanceType, typename InType,
-          typename AccType, typename OutType, typename FinalLambda,
+template <raft::distance::DistanceType distanceType,
+          typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
           typename Index_ = int>
-void distance(const InType *x, const InType *y, OutType *dist, Index_ m,
-              Index_ n, Index_ k, void *workspace, size_t worksize,
-              FinalLambda fin_op, cudaStream_t stream, bool isRowMajor = true,
-              InType metric_arg = 2.0f) {
-  DistanceImpl<distanceType, InType, AccType, OutType, FinalLambda, Index_>
-    distImpl;
-  distImpl.run(x, y, dist, m, n, k, workspace, worksize, fin_op, stream,
-               isRowMajor, metric_arg);
+void distance(const InType* x,
+              const InType* y,
+              OutType* dist,
+              Index_ m,
+              Index_ n,
+              Index_ k,
+              void* workspace,
+              size_t worksize,
+              FinalLambda fin_op,
+              cudaStream_t stream,
+              bool isRowMajor   = true,
+              InType metric_arg = 2.0f)
+{
+  DistanceImpl<distanceType, InType, AccType, OutType, FinalLambda, Index_> distImpl;
+  distImpl.run(x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, metric_arg);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
 /**
-         * @brief Evaluate pairwise distances for the simple use case
-         * @tparam DistanceType which distance to evaluate
-         * @tparam InType input argument type
-         * @tparam AccType accumulation type
-         * @tparam OutType output type
-         * @tparam Index_ Index type
-         * @param x first set of points
-         * @param y second set of points
-         * @param dist output distance matrix
-         * @param m number of points in x
-         * @param n number of points in y
-         * @param k dimensionality
-         * @param workspace temporary workspace needed for computations
-         * @param worksize number of bytes of the workspace
-         * @param stream cuda stream
-         * @param isRowMajor whether the matrices are row-major or col-major
-         *
-         * @note if workspace is passed as nullptr, this will return in
-         *  worksize, the number of bytes of workspace required
-         */
-template <raft::distance::DistanceType distanceType, typename InType,
-          typename AccType, typename OutType, typename Index_ = int>
-void distance(const InType *x, const InType *y, OutType *dist, Index_ m,
-              Index_ n, Index_ k, void *workspace, size_t worksize,
-              cudaStream_t stream, bool isRowMajor = true,
-              InType metric_arg = 2.0f) {
-  auto default_fin_op = [] __device__(AccType d_val, Index_ g_d_idx) {
-    return d_val;
-  };
-  distance<distanceType, InType, AccType, OutType, decltype(default_fin_op),
-           Index_>(x, y, dist, m, n, k, workspace, worksize, default_fin_op,
-                   stream, isRowMajor, metric_arg);
+ * @brief Evaluate pairwise distances for the simple use case
+ * @tparam DistanceType which distance to evaluate
+ * @tparam InType input argument type
+ * @tparam AccType accumulation type
+ * @tparam OutType output type
+ * @tparam Index_ Index type
+ * @param x first set of points
+ * @param y second set of points
+ * @param dist output distance matrix
+ * @param m number of points in x
+ * @param n number of points in y
+ * @param k dimensionality
+ * @param workspace temporary workspace needed for computations
+ * @param worksize number of bytes of the workspace
+ * @param stream cuda stream
+ * @param isRowMajor whether the matrices are row-major or col-major
+ *
+ * @note if workspace is passed as nullptr, this will return in
+ *  worksize, the number of bytes of workspace required
+ */
+template <raft::distance::DistanceType distanceType,
+          typename InType,
+          typename AccType,
+          typename OutType,
+          typename Index_ = int>
+void distance(const InType* x,
+              const InType* y,
+              OutType* dist,
+              Index_ m,
+              Index_ n,
+              Index_ k,
+              void* workspace,
+              size_t worksize,
+              cudaStream_t stream,
+              bool isRowMajor   = true,
+              InType metric_arg = 2.0f)
+{
+  auto default_fin_op = [] __device__(AccType d_val, Index_ g_d_idx) { return d_val; };
+  distance<distanceType, InType, AccType, OutType, decltype(default_fin_op), Index_>(
+    x, y, dist, m, n, k, workspace, worksize, default_fin_op, stream, isRowMajor, metric_arg);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -386,14 +654,16 @@ void distance(const InType *x, const InType *y, OutType *dist, Index_ m,
  * @note If the specifed distanceType doesn't need the workspace at all, it
  * returns 0.
  */
-template <raft::distance::DistanceType distanceType, typename InType,
-          typename AccType, typename OutType, typename Index_ = int>
-size_t getWorkspaceSize(const InType *x, const InType *y, Index_ m, Index_ n,
-                        Index_ k) {
-  size_t worksize = 0;
-  constexpr bool is_allocated =
-    (distanceType <= raft::distance::DistanceType::CosineExpanded) ||
-    (distanceType == raft::distance::DistanceType::CorrelationExpanded);
+template <raft::distance::DistanceType distanceType,
+          typename InType,
+          typename AccType,
+          typename OutType,
+          typename Index_ = int>
+size_t getWorkspaceSize(const InType* x, const InType* y, Index_ m, Index_ n, Index_ k)
+{
+  size_t worksize             = 0;
+  constexpr bool is_allocated = (distanceType <= raft::distance::DistanceType::CosineExpanded) ||
+                                (distanceType == raft::distance::DistanceType::CorrelationExpanded);
   constexpr int numOfBuffers =
     (distanceType == raft::distance::DistanceType::CorrelationExpanded) ? 2 : 1;
 
@@ -425,17 +695,21 @@ size_t getWorkspaceSize(const InType *x, const InType *y, Index_ m, Index_ n,
  * @param isRowMajor whether the matrices are row-major or col-major
  */
 template <typename Type, typename Index_, raft::distance::DistanceType DistType>
-void pairwise_distance_impl(const Type *x, const Type *y, Type *dist, Index_ m,
-                            Index_ n, Index_ k,
-                            rmm::device_uvector<char> &workspace,
-                            cudaStream_t stream, bool isRowMajor,
-                            Type metric_arg = 2.0f) {
-  auto worksize =
-    getWorkspaceSize<DistType, Type, Type, Type, Index_>(x, y, m, n, k);
+void pairwise_distance_impl(const Type* x,
+                            const Type* y,
+                            Type* dist,
+                            Index_ m,
+                            Index_ n,
+                            Index_ k,
+                            rmm::device_uvector<char>& workspace,
+                            cudaStream_t stream,
+                            bool isRowMajor,
+                            Type metric_arg = 2.0f)
+{
+  auto worksize = getWorkspaceSize<DistType, Type, Type, Type, Index_>(x, y, m, n, k);
   workspace.resize(worksize, stream);
-  distance<DistType, Type, Type, Type, Index_>(x, y, dist, m, n, k,
-                                               workspace.data(), worksize,
-                                               stream, isRowMajor, metric_arg);
+  distance<DistType, Type, Type, Type, Index_>(
+    x, y, dist, m, n, k, workspace.data(), worksize, stream, isRowMajor, metric_arg);
 }
 /** @} */
 };  // namespace detail
diff --git a/cpp/include/raft/distance/detail/euclidean.cuh b/cpp/include/raft/distance/detail/euclidean.cuh
index 8b8882c244..1166543f8c 100644
--- a/cpp/include/raft/distance/detail/euclidean.cuh
+++ b/cpp/include/raft/distance/detail/euclidean.cuh
@@ -49,30 +49,44 @@ namespace detail {
  * @param fin_op  the final gemm epilogue lambda
 *  @param stream  cuda stream to launch cuda operations.
  */
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          int VecLen, typename FinalLambda, bool isRowMajor>
-void euclideanExpImpl(const DataT *x, const DataT *y, const DataT *xn,
-                      const DataT *yn, IdxT m, IdxT n, IdxT k, IdxT lda,
-                      IdxT ldb, IdxT ldd, bool sqrt, OutT *dOutput,
-                      FinalLambda fin_op, cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          int VecLen,
+          typename FinalLambda,
+          bool isRowMajor>
+void euclideanExpImpl(const DataT* x,
+                      const DataT* y,
+                      const DataT* xn,
+                      const DataT* yn,
+                      IdxT m,
+                      IdxT n,
+                      IdxT k,
+                      IdxT lda,
+                      IdxT ldb,
+                      IdxT ldd,
+                      bool sqrt,
+                      OutT* dOutput,
+                      FinalLambda fin_op,
+                      cudaStream_t stream)
+{
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
 
-  typedef
-    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
 
   dim3 blk(KPolicy::Nthreads);
 
   // Accumulation operation lambda
-  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
-    acc += x * y;
-  };
+  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; };
 
   // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [sqrt] __device__(
-                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
-                         IdxT gridStrideY) {
+  auto epilog_lambda = [sqrt] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                                         DataT * regxn,
+                                         DataT * regyn,
+                                         IdxT gridStrideX,
+                                         IdxT gridStrideY) {
 #pragma unroll
     for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
 #pragma unroll
@@ -94,47 +108,68 @@ void euclideanExpImpl(const DataT *x, const DataT *y, const DataT *xn,
   constexpr size_t shmemSize =
     KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT));
   if (isRowMajor) {
-    auto euclideanExpRowMajor =
-      pairwiseDistanceMatKernel<true, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, true>;
-    dim3 grid =
-      launchConfigGenerator<KPolicy>(m, n, shmemSize, euclideanExpRowMajor);
+    auto euclideanExpRowMajor = pairwiseDistanceMatKernel<true,
+                                                          DataT,
+                                                          AccT,
+                                                          OutT,
+                                                          IdxT,
+                                                          KPolicy,
+                                                          decltype(core_lambda),
+                                                          decltype(epilog_lambda),
+                                                          FinalLambda,
+                                                          true>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, shmemSize, euclideanExpRowMajor);
 
     euclideanExpRowMajor<<<grid, blk, shmemSize, stream>>>(
-      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda,
-      fin_op);
+      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   } else {
-    auto euclideanExpColMajor =
-      pairwiseDistanceMatKernel<true, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, false>;
-    dim3 grid =
-      launchConfigGenerator<KPolicy>(m, n, shmemSize, euclideanExpColMajor);
+    auto euclideanExpColMajor = pairwiseDistanceMatKernel<true,
+                                                          DataT,
+                                                          AccT,
+                                                          OutT,
+                                                          IdxT,
+                                                          KPolicy,
+                                                          decltype(core_lambda),
+                                                          decltype(epilog_lambda),
+                                                          FinalLambda,
+                                                          false>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, shmemSize, euclideanExpColMajor);
     euclideanExpColMajor<<<grid, blk, shmemSize, stream>>>(
-      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda,
-      fin_op);
+      x, y, xn, yn, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   }
 
   CUDA_CHECK(cudaGetLastError());
 }
 
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          typename FinalLambda, bool isRowMajor>
-void euclideanExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
-                  const DataT *x, const DataT *y, const DataT *xn,
-                  const DataT *yn, bool sqrt, OutT *dOutput, FinalLambda fin_op,
-                  cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          typename FinalLambda,
+          bool isRowMajor>
+void euclideanExp(IdxT m,
+                  IdxT n,
+                  IdxT k,
+                  IdxT lda,
+                  IdxT ldb,
+                  IdxT ldd,
+                  const DataT* x,
+                  const DataT* y,
+                  const DataT* xn,
+                  const DataT* yn,
+                  bool sqrt,
+                  OutT* dOutput,
+                  FinalLambda fin_op,
+                  cudaStream_t stream)
+{
   size_t bytesA = sizeof(DataT) * lda;
   size_t bytesB = sizeof(DataT) * ldb;
   if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    euclideanExpImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
-                     isRowMajor>(x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt,
-                                 dOutput, fin_op, stream);
+    euclideanExpImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream);
   } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    euclideanExpImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda,
-                     isRowMajor>(x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt,
-                                 dOutput, fin_op, stream);
+    euclideanExpImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream);
   } else {
     euclideanExpImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
       x, y, xn, yn, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream);
@@ -162,53 +197,59 @@ void euclideanExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
  * @param stream cuda stream where to launch work
  * @param isRowMajor whether the input and output matrices are row major
  */
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_ = int>
-void euclideanAlgo1(Index_ m, Index_ n, Index_ k, const InType *pA,
-                    const InType *pB, OutType *pD, bool enable_sqrt,
-                    AccType *workspace, size_t &worksize, FinalLambda fin_op,
-                    cudaStream_t stream, bool isRowMajor) {
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_ = int>
+void euclideanAlgo1(Index_ m,
+                    Index_ n,
+                    Index_ k,
+                    const InType* pA,
+                    const InType* pB,
+                    OutType* pD,
+                    bool enable_sqrt,
+                    AccType* workspace,
+                    size_t& worksize,
+                    FinalLambda fin_op,
+                    cudaStream_t stream,
+                    bool isRowMajor)
+{
   auto norm_op = [] __device__(InType in) { return in; };
 
   typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type
-    ExpOutType;
-  ExpOutType *pDcast = reinterpret_cast<ExpOutType *>(pD);
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type ExpOutType;
+  ExpOutType* pDcast = reinterpret_cast<ExpOutType*>(pD);
 
-  ASSERT(!(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) ||
-           (worksize < m * sizeof(AccType))),
-         "workspace size error");
+  ASSERT(
+    !(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) || (worksize < m * sizeof(AccType))),
+    "workspace size error");
   ASSERT(workspace != nullptr, "workspace is null");
 
   Index_ lda, ldb, ldd;
-  InType *col_vec = workspace;
-  InType *row_vec = workspace;
+  InType* col_vec = workspace;
+  InType* row_vec = workspace;
   if (pA != pB) {
     row_vec += m;
-    raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor,
-                          stream, norm_op);
-    raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor,
-                          stream, norm_op);
+    raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op);
+    raft::linalg::rowNorm(row_vec, pB, k, n, raft::linalg::L2Norm, isRowMajor, stream, norm_op);
   } else {
-    raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor,
-                          stream, norm_op);
+    raft::linalg::rowNorm(col_vec, pA, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op);
   }
 
   if (isRowMajor) {
     lda = k, ldb = k, ldd = n;
     euclideanExp<InType, AccType, ExpOutType, Index_, FinalLambda, true>(
-      m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, enable_sqrt, pDcast,
-      fin_op, stream);
+      m, n, k, lda, ldb, ldd, pA, pB, col_vec, row_vec, enable_sqrt, pDcast, fin_op, stream);
   } else {
     lda = n, ldb = m, ldd = m;
     euclideanExp<InType, AccType, ExpOutType, Index_, FinalLambda, false>(
-      n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, enable_sqrt, pDcast,
-      fin_op, stream);
+      n, m, k, lda, ldb, ldd, pB, pA, row_vec, col_vec, enable_sqrt, pDcast, fin_op, stream);
   }
 }
 
 /**
- * @brief the unexpanded euclidean distance matrix calculation 
+ * @brief the unexpanded euclidean distance matrix calculation
  *  It computes the following equation: cij = op((ai-bj)^2)
  * @tparam DataT          input data-type (for A and B matrices)
  * @tparam AccT           accumulation data-type
@@ -228,16 +269,30 @@ void euclideanAlgo1(Index_ m, Index_ n, Index_ k, const InType *pA,
  * @param[output]   pD output matrix
  * @param fin_op    the final gemm epilogue lambda
  */
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          int VecLen, typename FinalLambda, bool isRowMajor>
-void euclideanUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
-                        IdxT lda, IdxT ldb, IdxT ldd, bool sqrt, OutT *dOutput,
-                        FinalLambda fin_op, cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          int VecLen,
+          typename FinalLambda,
+          bool isRowMajor>
+void euclideanUnExpImpl(const DataT* x,
+                        const DataT* y,
+                        IdxT m,
+                        IdxT n,
+                        IdxT k,
+                        IdxT lda,
+                        IdxT ldb,
+                        IdxT ldd,
+                        bool sqrt,
+                        OutT* dOutput,
+                        FinalLambda fin_op,
+                        cudaStream_t stream)
+{
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
 
-  typedef
-    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
 
   dim3 blk(KPolicy::Nthreads);
 
@@ -248,10 +303,11 @@ void euclideanUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
   };
 
   // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [sqrt] __device__(
-                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
-                         IdxT gridStrideY) {
+  auto epilog_lambda = [sqrt] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                                         DataT * regxn,
+                                         DataT * regyn,
+                                         IdxT gridStrideX,
+                                         IdxT gridStrideY) {
     if (sqrt) {
 #pragma unroll
       for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
@@ -264,48 +320,68 @@ void euclideanUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
   };
 
   if (isRowMajor) {
-    auto euclideanUnExpRowMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, true>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
-                                               euclideanUnExpRowMajor);
+    auto euclideanUnExpRowMajor = pairwiseDistanceMatKernel<false,
+                                                            DataT,
+                                                            AccT,
+                                                            OutT,
+                                                            IdxT,
+                                                            KPolicy,
+                                                            decltype(core_lambda),
+                                                            decltype(epilog_lambda),
+                                                            FinalLambda,
+                                                            true>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, euclideanUnExpRowMajor);
 
     euclideanUnExpRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-      epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
 
   } else {
-    auto euclideanUnExpColMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, false>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
-                                               euclideanUnExpColMajor);
+    auto euclideanUnExpColMajor = pairwiseDistanceMatKernel<false,
+                                                            DataT,
+                                                            AccT,
+                                                            OutT,
+                                                            IdxT,
+                                                            KPolicy,
+                                                            decltype(core_lambda),
+                                                            decltype(epilog_lambda),
+                                                            FinalLambda,
+                                                            false>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, euclideanUnExpColMajor);
 
     euclideanUnExpColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-      epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   }
 
   CUDA_CHECK(cudaGetLastError());
 }
 
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          typename FinalLambda, bool isRowMajor>
-void euclideanUnExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
-                    const DataT *x, const DataT *y, bool sqrt, OutT *dOutput,
-                    FinalLambda fin_op, cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          typename FinalLambda,
+          bool isRowMajor>
+void euclideanUnExp(IdxT m,
+                    IdxT n,
+                    IdxT k,
+                    IdxT lda,
+                    IdxT ldb,
+                    IdxT ldd,
+                    const DataT* x,
+                    const DataT* y,
+                    bool sqrt,
+                    OutT* dOutput,
+                    FinalLambda fin_op,
+                    cudaStream_t stream)
+{
   size_t bytesA = sizeof(DataT) * lda;
   size_t bytesB = sizeof(DataT) * ldb;
   if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    euclideanUnExpImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
-                       isRowMajor>(x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput,
-                                   fin_op, stream);
+    euclideanUnExpImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream);
   } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    euclideanUnExpImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda,
-                       isRowMajor>(x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput,
-                                   fin_op, stream);
+    euclideanUnExpImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream);
   } else {
     euclideanUnExpImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
       x, y, m, n, k, lda, ldb, ldd, sqrt, dOutput, fin_op, stream);
@@ -331,15 +407,25 @@ void euclideanUnExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
  * @param stream cuda stream where to launch work
  * @param isRowMajor whether the input and output matrices are row major
  */
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_ = int>
-void euclideanAlgo2(Index_ m, Index_ n, Index_ k, const InType *pA,
-                    const InType *pB, OutType *pD, bool enable_sqrt,
-                    FinalLambda fin_op, cudaStream_t stream, bool isRowMajor) {
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_ = int>
+void euclideanAlgo2(Index_ m,
+                    Index_ n,
+                    Index_ k,
+                    const InType* pA,
+                    const InType* pB,
+                    OutType* pD,
+                    bool enable_sqrt,
+                    FinalLambda fin_op,
+                    cudaStream_t stream,
+                    bool isRowMajor)
+{
   typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type
-    UnExpOutType;
-  UnExpOutType *pDcast = reinterpret_cast<UnExpOutType *>(pD);
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type UnExpOutType;
+  UnExpOutType* pDcast = reinterpret_cast<UnExpOutType*>(pD);
   Index_ lda, ldb, ldd;
 
   if (isRowMajor) {
diff --git a/cpp/include/raft/distance/detail/fused_l2_nn.cuh b/cpp/include/raft/distance/detail/fused_l2_nn.cuh
index ca8f729a68..9373992ada 100644
--- a/cpp/include/raft/distance/detail/fused_l2_nn.cuh
+++ b/cpp/include/raft/distance/detail/fused_l2_nn.cuh
@@ -36,24 +36,24 @@ template <typename LabelT, typename DataT>
 struct KVPMinReduceImpl {
   typedef cub::KeyValuePair<LabelT, DataT> KVP;
 
-  DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) {
-    return b.value < a.value ? b : a;
-  }
+  DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { return b.value < a.value ? b : a; }
 
 };  // KVPMinReduce
 
 template <typename LabelT, typename DataT>
 struct MinAndDistanceReduceOpImpl {
   typedef typename cub::KeyValuePair<LabelT, DataT> KVP;
-  DI void operator()(LabelT rid, KVP* out, const KVP& other) {
+  DI void operator()(LabelT rid, KVP* out, const KVP& other)
+  {
     if (other.value < out->value) {
-      out->key = other.key;
+      out->key   = other.key;
       out->value = other.value;
     }
   }
 
-  DI void init(KVP* out, DataT maxVal) {
-    out->key = -1;
+  DI void init(KVP* out, DataT maxVal)
+  {
+    out->key   = -1;
     out->value = maxVal;
   }
 };
@@ -61,38 +61,35 @@ struct MinAndDistanceReduceOpImpl {
 template <typename LabelT, typename DataT>
 struct MinReduceOpImpl {
   typedef typename cub::KeyValuePair<LabelT, DataT> KVP;
-  DI void operator()(LabelT rid, DataT* out, const KVP& other) {
-    if (other.value < *out) {
-      *out = other.value;
-    }
+  DI void operator()(LabelT rid, DataT* out, const KVP& other)
+  {
+    if (other.value < *out) { *out = other.value; }
   }
 
   DI void init(DataT* out, DataT maxVal) { *out = maxVal; }
 };
 
 template <typename DataT, typename OutT, typename IdxT, typename ReduceOpT>
-__global__ void initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp) {
+__global__ void initKernel(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp)
+{
   auto tid = IdxT(blockIdx.x) * blockDim.x + threadIdx.x;
-  if (tid < m) {
-    redOp.init(min + tid, maxVal);
-  }
+  if (tid < m) { redOp.init(min + tid, maxVal); }
 }
 
 template <typename DataT, typename OutT, typename IdxT, typename ReduceOpT>
-void initialize(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp,
-                cudaStream_t stream) {
+void initialize(OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp, cudaStream_t stream)
+{
   auto blks = raft::ceildiv(m, 256);
-  initKernel<DataT, OutT, IdxT>
-    <<<blks, 256, 0, stream>>>(min, m, maxVal, redOp);
+  initKernel<DataT, OutT, IdxT><<<blks, 256, 0, stream>>>(min, m, maxVal, redOp);
 }
 
 // TODO: specialize this function for MinAndDistanceReduceOp<int, float>
 // with atomicCAS of 64 bit which will eliminate mutex and shfls
-template <typename P, typename OutT, typename IdxT, typename KVPair,
-          typename ReduceOpT>
-DI void updateReducedVal(int* mutex, OutT* min, KVPair* val, ReduceOpT red_op,
-                         IdxT m, IdxT gridStrideY) {
-  const auto lid = threadIdx.x % raft::WarpSize;
+template <typename P, typename OutT, typename IdxT, typename KVPair, typename ReduceOpT>
+DI void updateReducedVal(
+  int* mutex, OutT* min, KVPair* val, ReduceOpT red_op, IdxT m, IdxT gridStrideY)
+{
+  const auto lid      = threadIdx.x % raft::WarpSize;
   const auto accrowid = threadIdx.x / P::AccThCols;
 
   // for now have first lane from each warp update a unique output row. This
@@ -117,21 +114,38 @@ DI void updateReducedVal(int* mutex, OutT* min, KVPair* val, ReduceOpT red_op,
     if (j < (raft::WarpSize / P::AccThCols) - 1) {
 #pragma unroll
       for (int i = 0; i < P::AccRowsPerTh; ++i) {
-        auto tmpkey = raft::shfl(val[i].key, (j + 1) * P::AccThCols);
+        auto tmpkey   = raft::shfl(val[i].key, (j + 1) * P::AccThCols);
         auto tmpvalue = raft::shfl(val[i].value, (j + 1) * P::AccThCols);
-        val[i] = {tmpkey, tmpvalue};
+        val[i]        = {tmpkey, tmpvalue};
       }
     }
   }
 }
 
-template <typename DataT, typename OutT, typename IdxT, bool Sqrt, typename P,
-          typename ReduceOpT, typename KVPReduceOpT, typename CoreLambda,
+template <typename DataT,
+          typename OutT,
+          typename IdxT,
+          bool Sqrt,
+          typename P,
+          typename ReduceOpT,
+          typename KVPReduceOpT,
+          typename CoreLambda,
           typename FinalLambda>
-__global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(
-  OutT* min, const DataT* x, const DataT* y, const DataT* xn, const DataT* yn,
-  IdxT m, IdxT n, IdxT k, DataT maxVal, int* mutex, ReduceOpT redOp,
-  KVPReduceOpT pairRedOp, CoreLambda core_op, FinalLambda fin_op) {
+__global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(OutT* min,
+                                                                  const DataT* x,
+                                                                  const DataT* y,
+                                                                  const DataT* xn,
+                                                                  const DataT* yn,
+                                                                  IdxT m,
+                                                                  IdxT n,
+                                                                  IdxT k,
+                                                                  DataT maxVal,
+                                                                  int* mutex,
+                                                                  ReduceOpT redOp,
+                                                                  KVPReduceOpT pairRedOp,
+                                                                  CoreLambda core_op,
+                                                                  FinalLambda fin_op)
+{
   extern __shared__ char smem[];
 
   typedef cub::KeyValuePair<IdxT, DataT> KVPair;
@@ -144,7 +158,9 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(
   // epilogue operation lambda for final value calculation
   auto epilog_lambda = [n, pairRedOp, &val, maxVal] __device__(
                          DataT acc[P::AccRowsPerTh][P::AccColsPerTh],
-                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
+                         DataT * regxn,
+                         DataT * regyn,
+                         IdxT gridStrideX,
                          IdxT gridStrideY) {
     KVPReduceOpT pairRed_op(pairRedOp);
 
@@ -173,72 +189,105 @@ __global__ __launch_bounds__(P::Nthreads, 2) void fusedL2NNkernel(
 #pragma unroll
       for (int j = 0; j < P::AccColsPerTh; ++j) {
         auto tmpkey = acccolid + j * P::AccThCols + gridStrideX;
-        KVPair tmp = {tmpkey, acc[i][j]};
+        KVPair tmp  = {tmpkey, acc[i][j]};
         if (tmpkey < n) {
-          val[i] =
-            pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]);
+          val[i] = pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]);
         }
       }
     }
   };
 
-  auto rowEpilog_lambda = [m, mutex, min, pairRedOp, redOp, &val,
-                           maxVal] __device__(IdxT gridStrideY) {
-    KVPReduceOpT pairRed_op(pairRedOp);
-    ReduceOpT red_op(redOp);
+  auto rowEpilog_lambda =
+    [m, mutex, min, pairRedOp, redOp, &val, maxVal] __device__(IdxT gridStrideY) {
+      KVPReduceOpT pairRed_op(pairRedOp);
+      ReduceOpT red_op(redOp);
 
-    const auto accrowid = threadIdx.x / P::AccThCols;
-    const auto lid = raft::laneId();
+      const auto accrowid = threadIdx.x / P::AccThCols;
+      const auto lid      = raft::laneId();
 
     // reduce
 #pragma unroll
-    for (int i = 0; i < P::AccRowsPerTh; ++i) {
+      for (int i = 0; i < P::AccRowsPerTh; ++i) {
 #pragma unroll
-      for (int j = P::AccThCols / 2; j > 0; j >>= 1) {
-        auto tmpkey = raft::shfl(val[i].key, lid + j);
-        auto tmpvalue = raft::shfl(val[i].value, lid + j);
-        KVPair tmp = {tmpkey, tmpvalue};
-        val[i] =
-          pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]);
+        for (int j = P::AccThCols / 2; j > 0; j >>= 1) {
+          auto tmpkey   = raft::shfl(val[i].key, lid + j);
+          auto tmpvalue = raft::shfl(val[i].value, lid + j);
+          KVPair tmp    = {tmpkey, tmpvalue};
+          val[i]        = pairRed_op(accrowid + i * P::AccThRows + gridStrideY, tmp, val[i]);
+        }
       }
-    }
 
-    updateReducedVal<P, OutT, IdxT, KVPair, ReduceOpT>(mutex, min, val, red_op,
-                                                       m, gridStrideY);
+      updateReducedVal<P, OutT, IdxT, KVPair, ReduceOpT>(mutex, min, val, red_op, m, gridStrideY);
 
     // reset the val array.
 #pragma unroll
-    for (int i = 0; i < P::AccRowsPerTh; ++i) {
-      val[i] = {-1, maxVal};
-    }
-  };
+      for (int i = 0; i < P::AccRowsPerTh; ++i) {
+        val[i] = {-1, maxVal};
+      }
+    };
 
   IdxT lda = k, ldb = k, ldd = n;
-  PairwiseDistances<true, DataT, DataT, DataT, IdxT, P, CoreLambda,
-                    decltype(epilog_lambda), FinalLambda,
-                    decltype(rowEpilog_lambda), true, false>
-    obj(x, y, m, n, k, lda, ldb, ldd, xn, yn, nullptr, smem, core_op,
-        epilog_lambda, fin_op, rowEpilog_lambda);
+  PairwiseDistances<true,
+                    DataT,
+                    DataT,
+                    DataT,
+                    IdxT,
+                    P,
+                    CoreLambda,
+                    decltype(epilog_lambda),
+                    FinalLambda,
+                    decltype(rowEpilog_lambda),
+                    true,
+                    false>
+    obj(x,
+        y,
+        m,
+        n,
+        k,
+        lda,
+        ldb,
+        ldd,
+        xn,
+        yn,
+        nullptr,
+        smem,
+        core_op,
+        epilog_lambda,
+        fin_op,
+        rowEpilog_lambda);
   obj.run();
 }
 
-template <typename DataT, typename OutT, typename IdxT, int VecLen,
-          typename ReduceOpT, typename KVPReduceOpT>
-void fusedL2NNImpl(OutT* min, const DataT* x, const DataT* y, const DataT* xn,
-                   const DataT* yn, IdxT m, IdxT n, IdxT k, int* workspace,
-                   ReduceOpT redOp, KVPReduceOpT pairRedOp, bool sqrt,
-                   bool initOutBuffer, cudaStream_t stream) {
+template <typename DataT,
+          typename OutT,
+          typename IdxT,
+          int VecLen,
+          typename ReduceOpT,
+          typename KVPReduceOpT>
+void fusedL2NNImpl(OutT* min,
+                   const DataT* x,
+                   const DataT* y,
+                   const DataT* xn,
+                   const DataT* yn,
+                   IdxT m,
+                   IdxT n,
+                   IdxT k,
+                   int* workspace,
+                   ReduceOpT redOp,
+                   KVPReduceOpT pairRedOp,
+                   bool sqrt,
+                   bool initOutBuffer,
+                   cudaStream_t stream)
+{
   typedef typename linalg::Policy4x4<DataT, VecLen>::Policy P;
 
   dim3 blk(P::Nthreads);
-  auto nblks = raft::ceildiv<int>(m, P::Nthreads);
+  auto nblks            = raft::ceildiv<int>(m, P::Nthreads);
   constexpr auto maxVal = std::numeric_limits<DataT>::max();
   typedef cub::KeyValuePair<IdxT, DataT> KVPair;
 
   // Accumulation operation lambda
-  auto core_lambda = [] __device__(DataT & acc, DataT & x, DataT & y) {
-    acc += x * y;
-  };
+  auto core_lambda = [] __device__(DataT & acc, DataT & x, DataT & y) { acc += x * y; };
 
   CUDA_CHECK(cudaMemsetAsync(workspace, 0, sizeof(int) * m, stream));
   if (initOutBuffer) {
@@ -249,25 +298,34 @@ void fusedL2NNImpl(OutT* min, const DataT* x, const DataT* y, const DataT* xn,
 
   auto fin_op = [] __device__(DataT d_val, int g_d_idx) { return d_val; };
 
-  constexpr size_t shmemSize =
-    P::SmemSize + ((P::Mblk + P::Nblk) * sizeof(DataT));
+  constexpr size_t shmemSize = P::SmemSize + ((P::Mblk + P::Nblk) * sizeof(DataT));
   if (sqrt) {
-    auto fusedL2NNSqrt =
-      fusedL2NNkernel<DataT, OutT, IdxT, true, P, ReduceOpT, KVPReduceOpT,
-                      decltype(core_lambda), decltype(fin_op)>;
-    dim3 grid = launchConfigGenerator<P>(m, n, shmemSize, fusedL2NNSqrt);
+    auto fusedL2NNSqrt = fusedL2NNkernel<DataT,
+                                         OutT,
+                                         IdxT,
+                                         true,
+                                         P,
+                                         ReduceOpT,
+                                         KVPReduceOpT,
+                                         decltype(core_lambda),
+                                         decltype(fin_op)>;
+    dim3 grid          = launchConfigGenerator<P>(m, n, shmemSize, fusedL2NNSqrt);
 
     fusedL2NNSqrt<<<grid, blk, shmemSize, stream>>>(
-      min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp,
-      core_lambda, fin_op);
+      min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, core_lambda, fin_op);
   } else {
-    auto fusedL2NN =
-      fusedL2NNkernel<DataT, OutT, IdxT, false, P, ReduceOpT, KVPReduceOpT,
-                      decltype(core_lambda), decltype(fin_op)>;
-    dim3 grid = launchConfigGenerator<P>(m, n, shmemSize, fusedL2NN);
-    fusedL2NN<<<grid, blk, shmemSize, stream>>>(min, x, y, xn, yn, m, n, k,
-                                                maxVal, workspace, redOp,
-                                                pairRedOp, core_lambda, fin_op);
+    auto fusedL2NN = fusedL2NNkernel<DataT,
+                                     OutT,
+                                     IdxT,
+                                     false,
+                                     P,
+                                     ReduceOpT,
+                                     KVPReduceOpT,
+                                     decltype(core_lambda),
+                                     decltype(fin_op)>;
+    dim3 grid      = launchConfigGenerator<P>(m, n, shmemSize, fusedL2NN);
+    fusedL2NN<<<grid, blk, shmemSize, stream>>>(
+      min, x, y, xn, yn, m, n, k, maxVal, workspace, redOp, pairRedOp, core_lambda, fin_op);
   }
 
   CUDA_CHECK(cudaGetLastError());
diff --git a/cpp/include/raft/distance/detail/hamming.cuh b/cpp/include/raft/distance/detail/hamming.cuh
index 0169ba33a2..886b9d1426 100644
--- a/cpp/include/raft/distance/detail/hamming.cuh
+++ b/cpp/include/raft/distance/detail/hamming.cuh
@@ -23,7 +23,7 @@ namespace detail {
 
 /**
  * @brief the Hamming distance matrix using the unexpanded form:
- *  It computes the following equation: 
+ *  It computes the following equation:
     Cij = sum(x_i != y_i) / k
  *
  * @tparam DataT          input data-type (for A and B matrices)
@@ -47,30 +47,41 @@ namespace detail {
  * @param[in]       fin_op the final gemm epilogue lambda
  * @param[in]       stream cuda stream to launch work
  */
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          int VecLen, typename FinalLambda, bool isRowMajor>
-static void hammingUnexpandedImpl(const DataT *x, const DataT *y, IdxT m,
-                                  IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
-                                  OutT *dOutput, FinalLambda fin_op,
-                                  cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          int VecLen,
+          typename FinalLambda,
+          bool isRowMajor>
+static void hammingUnexpandedImpl(const DataT* x,
+                                  const DataT* y,
+                                  IdxT m,
+                                  IdxT n,
+                                  IdxT k,
+                                  IdxT lda,
+                                  IdxT ldb,
+                                  IdxT ldd,
+                                  OutT* dOutput,
+                                  FinalLambda fin_op,
+                                  cudaStream_t stream)
+{
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
 
-  typedef
-    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
 
   dim3 blk(KPolicy::Nthreads);
 
   // Accumulation operation lambda
-  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
-    acc += (x != y);
-  };
+  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += (x != y); };
 
   // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [k] __device__(
-                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
-                         IdxT gridStrideY) {
+  auto epilog_lambda = [k] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                                      DataT * regxn,
+                                      DataT * regyn,
+                                      IdxT gridStrideX,
+                                      IdxT gridStrideY) {
     const DataT one_over_k = DataT(1.0) / k;
 #pragma unroll
     for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
@@ -82,46 +93,65 @@ static void hammingUnexpandedImpl(const DataT *x, const DataT *y, IdxT m,
   };
 
   if (isRowMajor) {
-    auto hammingUnexpandedRowMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, true>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
-                                               hammingUnexpandedRowMajor);
+    auto hammingUnexpandedRowMajor = pairwiseDistanceMatKernel<false,
+                                                               DataT,
+                                                               AccT,
+                                                               OutT,
+                                                               IdxT,
+                                                               KPolicy,
+                                                               decltype(core_lambda),
+                                                               decltype(epilog_lambda),
+                                                               FinalLambda,
+                                                               true>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, hammingUnexpandedRowMajor);
 
     hammingUnexpandedRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-      epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   } else {
-    auto hammingUnexpandedColMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, false>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
-                                               hammingUnexpandedColMajor);
+    auto hammingUnexpandedColMajor = pairwiseDistanceMatKernel<false,
+                                                               DataT,
+                                                               AccT,
+                                                               OutT,
+                                                               IdxT,
+                                                               KPolicy,
+                                                               decltype(core_lambda),
+                                                               decltype(epilog_lambda),
+                                                               FinalLambda,
+                                                               false>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, hammingUnexpandedColMajor);
     hammingUnexpandedColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-      epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   }
 
   CUDA_CHECK(cudaGetLastError());
 }
 
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          typename FinalLambda, bool isRowMajor>
-void hammingUnexpanded(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
-                       const DataT *x, const DataT *y, OutT *dOutput,
-                       FinalLambda fin_op, cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          typename FinalLambda,
+          bool isRowMajor>
+void hammingUnexpanded(IdxT m,
+                       IdxT n,
+                       IdxT k,
+                       IdxT lda,
+                       IdxT ldb,
+                       IdxT ldd,
+                       const DataT* x,
+                       const DataT* y,
+                       OutT* dOutput,
+                       FinalLambda fin_op,
+                       cudaStream_t stream)
+{
   size_t bytesA = sizeof(DataT) * lda;
   size_t bytesB = sizeof(DataT) * ldb;
   if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    hammingUnexpandedImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT),
-                          FinalLambda, isRowMajor>(x, y, m, n, k, lda, ldb, ldd,
-                                                   dOutput, fin_op, stream);
+    hammingUnexpandedImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
   } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    hammingUnexpandedImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT),
-                          FinalLambda, isRowMajor>(x, y, m, n, k, lda, ldb, ldd,
-                                                   dOutput, fin_op, stream);
+    hammingUnexpandedImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
   } else {
     hammingUnexpandedImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
       x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
@@ -130,7 +160,7 @@ void hammingUnexpanded(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
 
 /**
  * @brief the Hamming Unexpanded distance matrix calculation
- *  It computes the following equation: 
+ *  It computes the following equation:
     Cij = sum(x_i != y_i) / k
  *
  * @tparam InType input data-type (for A and B matrices)
@@ -148,28 +178,35 @@ void hammingUnexpanded(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
  * @param stream cuda stream where to launch work
  * @param isRowMajor whether the input and output matrices are row major
  */
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_ = int>
-void hammingUnexpandedImpl(int m, int n, int k, const InType *pA,
-                           const InType *pB, OutType *pD, FinalLambda fin_op,
-                           cudaStream_t stream, bool isRowMajor) {
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_ = int>
+void hammingUnexpandedImpl(int m,
+                           int n,
+                           int k,
+                           const InType* pA,
+                           const InType* pB,
+                           OutType* pD,
+                           FinalLambda fin_op,
+                           cudaStream_t stream,
+                           bool isRowMajor)
+{
   typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type
-    hammingUnexpandedOutType;
+  typedef
+    typename std::conditional<is_bool::value, OutType, AccType>::type hammingUnexpandedOutType;
   Index_ lda, ldb, ldd;
-  hammingUnexpandedOutType *pDcast =
-    reinterpret_cast<hammingUnexpandedOutType *>(pD);
+  hammingUnexpandedOutType* pDcast = reinterpret_cast<hammingUnexpandedOutType*>(pD);
   if (isRowMajor) {
     lda = k, ldb = k, ldd = n;
-    hammingUnexpanded<InType, AccType, hammingUnexpandedOutType, Index_,
-                      FinalLambda, true>(m, n, k, lda, ldb, ldd, pA, pB, pDcast,
-                                         fin_op, stream);
+    hammingUnexpanded<InType, AccType, hammingUnexpandedOutType, Index_, FinalLambda, true>(
+      m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream);
 
   } else {
     lda = n, ldb = m, ldd = m;
-    hammingUnexpanded<InType, AccType, hammingUnexpandedOutType, Index_,
-                      FinalLambda, false>(n, m, k, lda, ldb, ldd, pB, pA,
-                                          pDcast, fin_op, stream);
+    hammingUnexpanded<InType, AccType, hammingUnexpandedOutType, Index_, FinalLambda, false>(
+      n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream);
   }
 }
 
diff --git a/cpp/include/raft/distance/detail/hellinger.cuh b/cpp/include/raft/distance/detail/hellinger.cuh
index 933d850dbf..189bbed491 100644
--- a/cpp/include/raft/distance/detail/hellinger.cuh
+++ b/cpp/include/raft/distance/detail/hellinger.cuh
@@ -24,7 +24,7 @@ namespace detail {
 
 /**
  * @brief the Hellinger distance matrix using the expanded form:
- *  It computes the following equation: 
+ *  It computes the following equation:
     cij = sqrt(1 - sum(sqrt(x_k * y_k)))
  * This distance computation modifies A and B by computing a sqrt
  * and then performing a `pow(x, 2)` to convert it back. Because of this,
@@ -52,29 +52,40 @@ namespace detail {
  * @param[in]       fin_op the final gemm epilogue lambda
  * @param[in]       stream cuda stream to launch work
  */
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          int VecLen, typename FinalLambda, bool isRowMajor>
-static void hellingerImpl(const DataT *x, const DataT *y, IdxT m, IdxT n,
-                          IdxT k, IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput,
-                          FinalLambda fin_op, cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          int VecLen,
+          typename FinalLambda,
+          bool isRowMajor>
+static void hellingerImpl(const DataT* x,
+                          const DataT* y,
+                          IdxT m,
+                          IdxT n,
+                          IdxT k,
+                          IdxT lda,
+                          IdxT ldb,
+                          IdxT ldd,
+                          OutT* dOutput,
+                          FinalLambda fin_op,
+                          cudaStream_t stream)
+{
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
 
-  typedef
-    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
 
   dim3 blk(KPolicy::Nthreads);
 
-  auto unaryOp_lambda = [] __device__(DataT input) {
-    return raft::mySqrt(input);
-  };
+  auto unaryOp_lambda = [] __device__(DataT input) { return raft::mySqrt(input); };
   // First sqrt x and y
   raft::linalg::unaryOp<DataT, decltype(unaryOp_lambda), IdxT>(
-    (DataT *)x, x, m * k, unaryOp_lambda, stream);
+    (DataT*)x, x, m * k, unaryOp_lambda, stream);
 
   if (x != y) {
     raft::linalg::unaryOp<DataT, decltype(unaryOp_lambda), IdxT>(
-      (DataT *)y, y, n * k, unaryOp_lambda, stream);
+      (DataT*)y, y, n * k, unaryOp_lambda, stream);
   }
 
   // Accumulation operation lambda
@@ -85,71 +96,91 @@ static void hellingerImpl(const DataT *x, const DataT *y, IdxT m, IdxT n,
   };
 
   // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [] __device__(
-                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
-                         IdxT gridStrideY) {
+  auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                                     DataT * regxn,
+                                     DataT * regyn,
+                                     IdxT gridStrideX,
+                                     IdxT gridStrideY) {
 #pragma unroll
     for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
 #pragma unroll
       for (int j = 0; j < KPolicy::AccColsPerTh; ++j) {
         // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative
-        const auto finalVal = (1 - acc[i][j]);
+        const auto finalVal  = (1 - acc[i][j]);
         const auto rectifier = (!signbit(finalVal));
-        acc[i][j] = raft::mySqrt(rectifier * finalVal);
+        acc[i][j]            = raft::mySqrt(rectifier * finalVal);
       }
     }
   };
 
   if (isRowMajor) {
-    auto hellingerRowMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, true>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
-                                               hellingerRowMajor);
+    auto hellingerRowMajor = pairwiseDistanceMatKernel<false,
+                                                       DataT,
+                                                       AccT,
+                                                       OutT,
+                                                       IdxT,
+                                                       KPolicy,
+                                                       decltype(core_lambda),
+                                                       decltype(epilog_lambda),
+                                                       FinalLambda,
+                                                       true>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, hellingerRowMajor);
 
     hellingerRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-      epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   } else {
-    auto hellingerColMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, false>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
-                                               hellingerColMajor);
+    auto hellingerColMajor = pairwiseDistanceMatKernel<false,
+                                                       DataT,
+                                                       AccT,
+                                                       OutT,
+                                                       IdxT,
+                                                       KPolicy,
+                                                       decltype(core_lambda),
+                                                       decltype(epilog_lambda),
+                                                       FinalLambda,
+                                                       false>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, hellingerColMajor);
     hellingerColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-      epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   }
 
   // Revert sqrt of x and y
   raft::linalg::unaryOp<DataT, decltype(unaryOp_lambda), IdxT>(
-    (DataT *)x, x, m * k, unaryOp_lambda, stream);
+    (DataT*)x, x, m * k, unaryOp_lambda, stream);
   if (x != y) {
     raft::linalg::unaryOp<DataT, decltype(unaryOp_lambda), IdxT>(
-      (DataT *)y, y, n * k, unaryOp_lambda, stream);
+      (DataT*)y, y, n * k, unaryOp_lambda, stream);
   }
 
   CUDA_CHECK(cudaGetLastError());
 }
 
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          typename FinalLambda, bool isRowMajor>
-void hellinger(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
-               const DataT *x, const DataT *y, OutT *dOutput,
-               FinalLambda fin_op, cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          typename FinalLambda,
+          bool isRowMajor>
+void hellinger(IdxT m,
+               IdxT n,
+               IdxT k,
+               IdxT lda,
+               IdxT ldb,
+               IdxT ldd,
+               const DataT* x,
+               const DataT* y,
+               OutT* dOutput,
+               FinalLambda fin_op,
+               cudaStream_t stream)
+{
   size_t bytesA = sizeof(DataT) * lda;
   size_t bytesB = sizeof(DataT) * ldb;
   if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    hellingerImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
-                  isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
-                              stream);
+    hellingerImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
   } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    hellingerImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda,
-                  isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
-                              stream);
+    hellingerImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
   } else {
     hellingerImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
       x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
@@ -158,7 +189,7 @@ void hellinger(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
 
 /**
  * @brief the Hellinger distance matrix calculation
- *  It computes the following equation: 
+ *  It computes the following equation:
     sqrt(1 - sum(sqrt(x_k * y_k))
  * This distance computation modifies A and B by computing a sqrt
  * and then performing a `pow(x, 2)` to convert it back. Because of this,
@@ -180,16 +211,25 @@ void hellinger(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
  * @param stream cuda stream where to launch work
  * @param isRowMajor whether the input and output matrices are row major
  */
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_ = int>
-void hellingerImpl(int m, int n, int k, const InType *pA, const InType *pB,
-                   OutType *pD, FinalLambda fin_op, cudaStream_t stream,
-                   bool isRowMajor) {
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_ = int>
+void hellingerImpl(int m,
+                   int n,
+                   int k,
+                   const InType* pA,
+                   const InType* pB,
+                   OutType* pD,
+                   FinalLambda fin_op,
+                   cudaStream_t stream,
+                   bool isRowMajor)
+{
   typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type
-    hellingerOutType;
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type hellingerOutType;
   Index_ lda, ldb, ldd;
-  hellingerOutType *pDcast = reinterpret_cast<hellingerOutType *>(pD);
+  hellingerOutType* pDcast = reinterpret_cast<hellingerOutType*>(pD);
   if (isRowMajor) {
     lda = k, ldb = k, ldd = n;
     hellinger<InType, AccType, hellingerOutType, Index_, FinalLambda, true>(
diff --git a/cpp/include/raft/distance/detail/jensen_shannon.cuh b/cpp/include/raft/distance/detail/jensen_shannon.cuh
index 1e39f39682..b3240fe398 100644
--- a/cpp/include/raft/distance/detail/jensen_shannon.cuh
+++ b/cpp/include/raft/distance/detail/jensen_shannon.cuh
@@ -23,7 +23,7 @@ namespace detail {
 
 /**
  * @brief the Jensen Shannon distance matrix:
- *  It computes the following equation: 
+ *  It computes the following equation:
     Cij = sqrt(0.5 * sum( -x_i * (log(0.5 * (x_i + y_i)) - log(x_i))
             + (-y_i * (log(0.5 * (x_i + y_i)) - log(y_i)))))
  *
@@ -48,37 +48,49 @@ namespace detail {
  * @param[in]       fin_op the final gemm epilogue lambda
  * @param[in]       stream cuda stream to launch work
  */
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          int VecLen, typename FinalLambda, bool isRowMajor>
-static void jensenShannonImpl(const DataT *x, const DataT *y, IdxT m, IdxT n,
-                              IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
-                              OutT *dOutput, FinalLambda fin_op,
-                              cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          int VecLen,
+          typename FinalLambda,
+          bool isRowMajor>
+static void jensenShannonImpl(const DataT* x,
+                              const DataT* y,
+                              IdxT m,
+                              IdxT n,
+                              IdxT k,
+                              IdxT lda,
+                              IdxT ldb,
+                              IdxT ldd,
+                              OutT* dOutput,
+                              FinalLambda fin_op,
+                              cudaStream_t stream)
+{
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
 
-  typedef
-    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
 
   dim3 blk(KPolicy::Nthreads);
 
   // Accumulation operation lambda
   auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
-    const DataT m = 0.5f * (x + y);
+    const DataT m     = 0.5f * (x + y);
     const bool m_zero = (m == 0);
-    const auto logM = (!m_zero) * raft::myLog(m + m_zero);
+    const auto logM   = (!m_zero) * raft::myLog(m + m_zero);
 
     const bool x_zero = (x == 0);
     const bool y_zero = (y == 0);
-    acc += (-x * (logM - raft::myLog(x + x_zero))) +
-           (-y * (logM - raft::myLog(y + y_zero)));
+    acc += (-x * (logM - raft::myLog(x + x_zero))) + (-y * (logM - raft::myLog(y + y_zero)));
   };
 
   // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [] __device__(
-                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
-                         IdxT gridStrideY) {
+  auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                                     DataT * regxn,
+                                     DataT * regyn,
+                                     IdxT gridStrideX,
+                                     IdxT gridStrideY) {
 #pragma unroll
     for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
 #pragma unroll
@@ -89,46 +101,65 @@ static void jensenShannonImpl(const DataT *x, const DataT *y, IdxT m, IdxT n,
   };
 
   if (isRowMajor) {
-    auto jensenShannonRowMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, true>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
-                                               jensenShannonRowMajor);
+    auto jensenShannonRowMajor = pairwiseDistanceMatKernel<false,
+                                                           DataT,
+                                                           AccT,
+                                                           OutT,
+                                                           IdxT,
+                                                           KPolicy,
+                                                           decltype(core_lambda),
+                                                           decltype(epilog_lambda),
+                                                           FinalLambda,
+                                                           true>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, jensenShannonRowMajor);
 
     jensenShannonRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-      epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   } else {
-    auto jensenShannonColMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, false>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
-                                               jensenShannonColMajor);
+    auto jensenShannonColMajor = pairwiseDistanceMatKernel<false,
+                                                           DataT,
+                                                           AccT,
+                                                           OutT,
+                                                           IdxT,
+                                                           KPolicy,
+                                                           decltype(core_lambda),
+                                                           decltype(epilog_lambda),
+                                                           FinalLambda,
+                                                           false>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, jensenShannonColMajor);
     jensenShannonColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-      epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   }
 
   CUDA_CHECK(cudaGetLastError());
 }
 
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          typename FinalLambda, bool isRowMajor>
-void jensenShannon(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
-                   const DataT *x, const DataT *y, OutT *dOutput,
-                   FinalLambda fin_op, cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          typename FinalLambda,
+          bool isRowMajor>
+void jensenShannon(IdxT m,
+                   IdxT n,
+                   IdxT k,
+                   IdxT lda,
+                   IdxT ldb,
+                   IdxT ldd,
+                   const DataT* x,
+                   const DataT* y,
+                   OutT* dOutput,
+                   FinalLambda fin_op,
+                   cudaStream_t stream)
+{
   size_t bytesA = sizeof(DataT) * lda;
   size_t bytesB = sizeof(DataT) * ldb;
   if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    jensenShannonImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
-                      isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
-                                  stream);
+    jensenShannonImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
   } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    jensenShannonImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda,
-                      isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
-                                  stream);
+    jensenShannonImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
   } else {
     jensenShannonImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
       x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
@@ -137,7 +168,7 @@ void jensenShannon(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
 
 /**
  * @brief the Jensen Shannon distance matrix calculation
- *  It computes the following equation: 
+ *  It computes the following equation:
     Cij = sqrt(0.5 * sum( -x_i * (log(0.5 * (x_i + y_i)) - log(x_i))
             + (-y_i * (log(0.5 * (x_i + y_i)) - log(y_i)))))
  *
@@ -156,26 +187,34 @@ void jensenShannon(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
  * @param stream cuda stream where to launch work
  * @param isRowMajor whether the input and output matrices are row major
  */
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_ = int>
-void jensenShannonImpl(int m, int n, int k, const InType *pA, const InType *pB,
-                       OutType *pD, FinalLambda fin_op, cudaStream_t stream,
-                       bool isRowMajor) {
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_ = int>
+void jensenShannonImpl(int m,
+                       int n,
+                       int k,
+                       const InType* pA,
+                       const InType* pB,
+                       OutType* pD,
+                       FinalLambda fin_op,
+                       cudaStream_t stream,
+                       bool isRowMajor)
+{
   typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type
-    jensenShannonOutType;
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type jensenShannonOutType;
   Index_ lda, ldb, ldd;
-  jensenShannonOutType *pDcast = reinterpret_cast<jensenShannonOutType *>(pD);
+  jensenShannonOutType* pDcast = reinterpret_cast<jensenShannonOutType*>(pD);
   if (isRowMajor) {
     lda = k, ldb = k, ldd = n;
-    jensenShannon<InType, AccType, jensenShannonOutType, Index_, FinalLambda,
-                  true>(m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream);
+    jensenShannon<InType, AccType, jensenShannonOutType, Index_, FinalLambda, true>(
+      m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream);
 
   } else {
     lda = n, ldb = m, ldd = m;
-    jensenShannon<InType, AccType, jensenShannonOutType, Index_, FinalLambda,
-                  false>(n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op,
-                         stream);
+    jensenShannon<InType, AccType, jensenShannonOutType, Index_, FinalLambda, false>(
+      n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream);
   }
 }
 }  // namespace detail
diff --git a/cpp/include/raft/distance/detail/kl_divergence.cuh b/cpp/include/raft/distance/detail/kl_divergence.cuh
index 5a18ba1670..31127a4d8d 100644
--- a/cpp/include/raft/distance/detail/kl_divergence.cuh
+++ b/cpp/include/raft/distance/detail/kl_divergence.cuh
@@ -23,7 +23,7 @@ namespace detail {
 
 /**
  * @brief the KL Divergence distance matrix:
- *  It computes the following equation: 
+ *  It computes the following equation:
     Cij = 0.5 * sum(x * log (x / y));
  * This distance computation modifies A or B by computing a log(x)
  * and then performing a `pow(e, log(x))` to convert it back. Because of this,
@@ -51,17 +51,29 @@ namespace detail {
  * @param[in]       fin_op the final gemm epilogue lambda
  * @param[in]       stream cuda stream to launch work
  */
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          int VecLen, typename FinalLambda, bool isRowMajor>
-static void klDivergenceImpl(const DataT *x, const DataT *y, IdxT m, IdxT n,
-                             IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
-                             OutT *dOutput, FinalLambda fin_op,
-                             cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          int VecLen,
+          typename FinalLambda,
+          bool isRowMajor>
+static void klDivergenceImpl(const DataT* x,
+                             const DataT* y,
+                             IdxT m,
+                             IdxT n,
+                             IdxT k,
+                             IdxT lda,
+                             IdxT ldb,
+                             IdxT ldd,
+                             OutT* dOutput,
+                             FinalLambda fin_op,
+                             cudaStream_t stream)
+{
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
 
-  typedef
-    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
 
   dim3 blk(KPolicy::Nthreads);
 
@@ -80,13 +92,11 @@ static void klDivergenceImpl(const DataT *x, const DataT *y, IdxT m, IdxT n,
     if (isRowMajor) {
       const bool x_zero = (x == 0);
       const bool y_zero = (y == 0);
-      acc +=
-        x * (raft::myLog(x + x_zero) - (!y_zero) * raft::myLog(y + y_zero));
+      acc += x * (raft::myLog(x + x_zero) - (!y_zero) * raft::myLog(y + y_zero));
     } else {
       const bool y_zero = (y == 0);
       const bool x_zero = (x == 0);
-      acc +=
-        y * (raft::myLog(y + y_zero) - (!x_zero) * raft::myLog(x + x_zero));
+      acc += y * (raft::myLog(y + y_zero) - (!x_zero) * raft::myLog(x + x_zero));
     }
   };
 
@@ -102,10 +112,11 @@ static void klDivergenceImpl(const DataT *x, const DataT *y, IdxT m, IdxT n,
   };
 
   // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [] __device__(
-                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
-                         IdxT gridStrideY) {
+  auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                                     DataT * regxn,
+                                     DataT * regyn,
+                                     IdxT gridStrideX,
+                                     IdxT gridStrideY) {
 #pragma unroll
     for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
 #pragma unroll
@@ -116,79 +127,158 @@ static void klDivergenceImpl(const DataT *x, const DataT *y, IdxT m, IdxT n,
   };
 
   if (isRowMajor) {
-    constexpr auto klDivergenceRowMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, true>;
+    constexpr auto klDivergenceRowMajor = pairwiseDistanceMatKernel<false,
+                                                                    DataT,
+                                                                    AccT,
+                                                                    OutT,
+                                                                    IdxT,
+                                                                    KPolicy,
+                                                                    decltype(core_lambda),
+                                                                    decltype(epilog_lambda),
+                                                                    FinalLambda,
+                                                                    true>;
     constexpr auto klDivergenceRowMajorXequalY =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
+      pairwiseDistanceMatKernel<false,
+                                DataT,
+                                AccT,
+                                OutT,
+                                IdxT,
+                                KPolicy,
                                 decltype(core_lambda_x_equal_y),
-                                decltype(epilog_lambda), FinalLambda, true>;
+                                decltype(epilog_lambda),
+                                FinalLambda,
+                                true>;
     if (x != y) {
       raft::linalg::unaryOp<DataT, decltype(unaryOp_lambda), IdxT>(
-        (DataT *)y, y, n * k, unaryOp_lambda, stream);
-      dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
-                                                 klDivergenceRowMajor);
-      klDivergenceRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-        x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-        epilog_lambda, fin_op);
+        (DataT*)y, y, n * k, unaryOp_lambda, stream);
+      dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, klDivergenceRowMajor);
+      klDivergenceRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(x,
+                                                                     y,
+                                                                     nullptr,
+                                                                     nullptr,
+                                                                     m,
+                                                                     n,
+                                                                     k,
+                                                                     lda,
+                                                                     ldb,
+                                                                     ldd,
+                                                                     dOutput,
+                                                                     core_lambda,
+                                                                     epilog_lambda,
+                                                                     fin_op);
       // Now reverse previous log (x) back to x using (e ^ log(x))
       raft::linalg::unaryOp<DataT, decltype(unaryOp_lambda_reverse), IdxT>(
-        (DataT *)y, y, n * k, unaryOp_lambda_reverse, stream);
+        (DataT*)y, y, n * k, unaryOp_lambda_reverse, stream);
     } else {
-      dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
-                                                 klDivergenceRowMajorXequalY);
-      klDivergenceRowMajorXequalY<<<grid, blk, KPolicy::SmemSize, stream>>>(
-        x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput,
-        core_lambda_x_equal_y, epilog_lambda, fin_op);
+      dim3 grid =
+        launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, klDivergenceRowMajorXequalY);
+      klDivergenceRowMajorXequalY<<<grid, blk, KPolicy::SmemSize, stream>>>(x,
+                                                                            y,
+                                                                            nullptr,
+                                                                            nullptr,
+                                                                            m,
+                                                                            n,
+                                                                            k,
+                                                                            lda,
+                                                                            ldb,
+                                                                            ldd,
+                                                                            dOutput,
+                                                                            core_lambda_x_equal_y,
+                                                                            epilog_lambda,
+                                                                            fin_op);
     }
   } else {
-    constexpr auto klDivergenceColMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, false>;
+    constexpr auto klDivergenceColMajor = pairwiseDistanceMatKernel<false,
+                                                                    DataT,
+                                                                    AccT,
+                                                                    OutT,
+                                                                    IdxT,
+                                                                    KPolicy,
+                                                                    decltype(core_lambda),
+                                                                    decltype(epilog_lambda),
+                                                                    FinalLambda,
+                                                                    false>;
     constexpr auto klDivergenceColMajorXequalY =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
+      pairwiseDistanceMatKernel<false,
+                                DataT,
+                                AccT,
+                                OutT,
+                                IdxT,
+                                KPolicy,
                                 decltype(core_lambda_x_equal_y),
-                                decltype(epilog_lambda), FinalLambda, false>;
+                                decltype(epilog_lambda),
+                                FinalLambda,
+                                false>;
     if (x != y) {
       raft::linalg::unaryOp<DataT, decltype(unaryOp_lambda), IdxT>(
-        (DataT *)x, x, m * k, unaryOp_lambda, stream);
-      dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
-                                                 klDivergenceColMajor);
-      klDivergenceColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-        x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-        epilog_lambda, fin_op);
+        (DataT*)x, x, m * k, unaryOp_lambda, stream);
+      dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, klDivergenceColMajor);
+      klDivergenceColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(x,
+                                                                     y,
+                                                                     nullptr,
+                                                                     nullptr,
+                                                                     m,
+                                                                     n,
+                                                                     k,
+                                                                     lda,
+                                                                     ldb,
+                                                                     ldd,
+                                                                     dOutput,
+                                                                     core_lambda,
+                                                                     epilog_lambda,
+                                                                     fin_op);
       // Now reverse previous log (x) back to x using (e ^ log(x))
       raft::linalg::unaryOp<DataT, decltype(unaryOp_lambda_reverse), IdxT>(
-        (DataT *)x, x, m * k, unaryOp_lambda_reverse, stream);
+        (DataT*)x, x, m * k, unaryOp_lambda_reverse, stream);
     } else {
-      dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
-                                                 klDivergenceColMajorXequalY);
-      klDivergenceColMajorXequalY<<<grid, blk, KPolicy::SmemSize, stream>>>(
-        x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput,
-        core_lambda_x_equal_y, epilog_lambda, fin_op);
+      dim3 grid =
+        launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, klDivergenceColMajorXequalY);
+      klDivergenceColMajorXequalY<<<grid, blk, KPolicy::SmemSize, stream>>>(x,
+                                                                            y,
+                                                                            nullptr,
+                                                                            nullptr,
+                                                                            m,
+                                                                            n,
+                                                                            k,
+                                                                            lda,
+                                                                            ldb,
+                                                                            ldd,
+                                                                            dOutput,
+                                                                            core_lambda_x_equal_y,
+                                                                            epilog_lambda,
+                                                                            fin_op);
     }
   }
 
   CUDA_CHECK(cudaGetLastError());
 }
 
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          typename FinalLambda, bool isRowMajor>
-void klDivergence(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
-                  const DataT *x, const DataT *y, OutT *dOutput,
-                  FinalLambda fin_op, cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          typename FinalLambda,
+          bool isRowMajor>
+void klDivergence(IdxT m,
+                  IdxT n,
+                  IdxT k,
+                  IdxT lda,
+                  IdxT ldb,
+                  IdxT ldd,
+                  const DataT* x,
+                  const DataT* y,
+                  OutT* dOutput,
+                  FinalLambda fin_op,
+                  cudaStream_t stream)
+{
   size_t bytesA = sizeof(DataT) * lda;
   size_t bytesB = sizeof(DataT) * ldb;
   if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    klDivergenceImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
-                     isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
-                                 stream);
+    klDivergenceImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
   } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    klDivergenceImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda,
-                     isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
-                                 stream);
+    klDivergenceImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
   } else {
     klDivergenceImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
       x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
@@ -197,7 +287,7 @@ void klDivergence(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
 
 /**
  * @brief the KL Divergence distance matrix calculation
- *  It computes the following equation: 
+ *  It computes the following equation:
       Cij = 0.5 * sum(x * log (x / y));
  * This distance computation modifies A or B by computing a log(x)
  * and then performing a `pow(e, log(x))` to convert it back. Because of this,
@@ -218,25 +308,34 @@ void klDivergence(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
  * @param stream cuda stream where to launch work
  * @param isRowMajor whether the input and output matrices are row major
  */
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_ = int>
-void klDivergenceImpl(int m, int n, int k, const InType *pA, const InType *pB,
-                      OutType *pD, FinalLambda fin_op, cudaStream_t stream,
-                      bool isRowMajor) {
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_ = int>
+void klDivergenceImpl(int m,
+                      int n,
+                      int k,
+                      const InType* pA,
+                      const InType* pB,
+                      OutType* pD,
+                      FinalLambda fin_op,
+                      cudaStream_t stream,
+                      bool isRowMajor)
+{
   typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type
-    klDivergenceOutType;
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type klDivergenceOutType;
   Index_ lda, ldb, ldd;
-  klDivergenceOutType *pDcast = reinterpret_cast<klDivergenceOutType *>(pD);
+  klDivergenceOutType* pDcast = reinterpret_cast<klDivergenceOutType*>(pD);
   if (isRowMajor) {
     lda = k, ldb = k, ldd = n;
-    klDivergence<InType, AccType, klDivergenceOutType, Index_, FinalLambda,
-                 true>(m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream);
+    klDivergence<InType, AccType, klDivergenceOutType, Index_, FinalLambda, true>(
+      m, n, k, lda, ldb, ldd, pA, pB, pDcast, fin_op, stream);
 
   } else {
     lda = n, ldb = m, ldd = m;
-    klDivergence<InType, AccType, klDivergenceOutType, Index_, FinalLambda,
-                 false>(n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream);
+    klDivergence<InType, AccType, klDivergenceOutType, Index_, FinalLambda, false>(
+      n, m, k, lda, ldb, ldd, pB, pA, pDcast, fin_op, stream);
   }
 }
 }  // namespace detail
diff --git a/cpp/include/raft/distance/detail/l1.cuh b/cpp/include/raft/distance/detail/l1.cuh
index 33e9bae206..e444e65d1f 100644
--- a/cpp/include/raft/distance/detail/l1.cuh
+++ b/cpp/include/raft/distance/detail/l1.cuh
@@ -43,16 +43,29 @@ namespace detail {
  * @param[output]   pD output matrix
  * @param fin_op    the final gemm epilogue lambda
  */
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          int VecLen, typename FinalLambda, bool isRowMajor>
-static void l1Impl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
-                   IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput,
-                   FinalLambda fin_op, cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          int VecLen,
+          typename FinalLambda,
+          bool isRowMajor>
+static void l1Impl(const DataT* x,
+                   const DataT* y,
+                   IdxT m,
+                   IdxT n,
+                   IdxT k,
+                   IdxT lda,
+                   IdxT ldb,
+                   IdxT ldd,
+                   OutT* dOutput,
+                   FinalLambda fin_op,
+                   cudaStream_t stream)
+{
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
 
-  typedef
-    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
 
   dim3 blk(KPolicy::Nthreads);
 
@@ -63,47 +76,69 @@ static void l1Impl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
   };
 
   // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [] __device__(
-                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
-                         IdxT gridStrideY) { return; };
+  auto epilog_lambda = [] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                                     DataT * regxn,
+                                     DataT * regyn,
+                                     IdxT gridStrideX,
+                                     IdxT gridStrideY) { return; };
 
   if (isRowMajor) {
-    auto l1RowMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, true>;
-    dim3 grid =
-      launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, l1RowMajor);
+    auto l1RowMajor = pairwiseDistanceMatKernel<false,
+                                                DataT,
+                                                AccT,
+                                                OutT,
+                                                IdxT,
+                                                KPolicy,
+                                                decltype(core_lambda),
+                                                decltype(epilog_lambda),
+                                                FinalLambda,
+                                                true>;
+    dim3 grid       = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, l1RowMajor);
 
     l1RowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-      epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   } else {
-    auto l1ColMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, false>;
-    dim3 grid =
-      launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, l1ColMajor);
+    auto l1ColMajor = pairwiseDistanceMatKernel<false,
+                                                DataT,
+                                                AccT,
+                                                OutT,
+                                                IdxT,
+                                                KPolicy,
+                                                decltype(core_lambda),
+                                                decltype(epilog_lambda),
+                                                FinalLambda,
+                                                false>;
+    dim3 grid       = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, l1ColMajor);
     l1ColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-      epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   }
 
   CUDA_CHECK(cudaGetLastError());
 }
 
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          typename FinalLambda, bool isRowMajor>
-void l1(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, const DataT *x,
-        const DataT *y, OutT *dOutput, FinalLambda fin_op,
-        cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          typename FinalLambda,
+          bool isRowMajor>
+void l1(IdxT m,
+        IdxT n,
+        IdxT k,
+        IdxT lda,
+        IdxT ldb,
+        IdxT ldd,
+        const DataT* x,
+        const DataT* y,
+        OutT* dOutput,
+        FinalLambda fin_op,
+        cudaStream_t stream)
+{
   size_t bytesA = sizeof(DataT) * lda;
   size_t bytesB = sizeof(DataT) * ldb;
   if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    l1Impl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
-           isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
+    l1Impl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
   } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
     l1Impl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
       x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
@@ -131,16 +166,25 @@ void l1(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd, const DataT *x,
  * @param stream cuda stream where to launch work
  * @param isRowMajor whether the input and output matrices are row major
  */
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_ = int>
-void l1Impl(int m, int n, int k, const InType *pA, const InType *pB,
-            OutType *pD, FinalLambda fin_op, cudaStream_t stream,
-            bool isRowMajor) {
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_ = int>
+void l1Impl(int m,
+            int n,
+            int k,
+            const InType* pA,
+            const InType* pB,
+            OutType* pD,
+            FinalLambda fin_op,
+            cudaStream_t stream,
+            bool isRowMajor)
+{
   typedef std::is_same<OutType, bool> is_bool;
-  typedef
-    typename std::conditional<is_bool::value, OutType, AccType>::type L1OutType;
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type L1OutType;
   Index_ lda, ldb, ldd;
-  L1OutType *pDcast = reinterpret_cast<L1OutType *>(pD);
+  L1OutType* pDcast = reinterpret_cast<L1OutType*>(pD);
   if (isRowMajor) {
     lda = k, ldb = k, ldd = n;
     l1<InType, AccType, L1OutType, Index_, FinalLambda, true>(
diff --git a/cpp/include/raft/distance/detail/minkowski.cuh b/cpp/include/raft/distance/detail/minkowski.cuh
index 8bd3deb08f..22a183c22c 100644
--- a/cpp/include/raft/distance/detail/minkowski.cuh
+++ b/cpp/include/raft/distance/detail/minkowski.cuh
@@ -22,7 +22,7 @@ namespace distance {
 namespace detail {
 
 /**
- * @brief the unexpanded Minkowski distance matrix calculation 
+ * @brief the unexpanded Minkowski distance matrix calculation
  *  It computes the following equation: cij = sum(|x - y|^p)^(1/p)
  * @tparam DataT          input data-type (for A and B matrices)
  * @tparam AccT           accumulation data-type
@@ -45,16 +45,30 @@ namespace detail {
  * @param[in]       stream cuda stream to launch work
  * @param[in]       the value of `p` for Minkowski (l-p) distances.
  */
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          int VecLen, typename FinalLambda, bool isRowMajor>
-void minkowskiUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
-                        IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput,
-                        FinalLambda fin_op, cudaStream_t stream, DataT p) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          int VecLen,
+          typename FinalLambda,
+          bool isRowMajor>
+void minkowskiUnExpImpl(const DataT* x,
+                        const DataT* y,
+                        IdxT m,
+                        IdxT n,
+                        IdxT k,
+                        IdxT lda,
+                        IdxT ldb,
+                        IdxT ldd,
+                        OutT* dOutput,
+                        FinalLambda fin_op,
+                        cudaStream_t stream,
+                        DataT p)
+{
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
 
-  typedef
-    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
 
   dim3 blk(KPolicy::Nthreads);
 
@@ -65,10 +79,11 @@ void minkowskiUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
   };
 
   // epilogue operation lambda for final value calculation
-  auto epilog_lambda = [p] __device__(
-                         AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
-                         IdxT gridStrideY) {
+  auto epilog_lambda = [p] __device__(AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
+                                      DataT * regxn,
+                                      DataT * regyn,
+                                      IdxT gridStrideX,
+                                      IdxT gridStrideY) {
     const auto one_over_p = 1.0f / p;
 #pragma unroll
     for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
@@ -80,48 +95,68 @@ void minkowskiUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
   };
 
   if (isRowMajor) {
-    auto minkowskiUnExpRowMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, true>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
-                                               minkowskiUnExpRowMajor);
+    auto minkowskiUnExpRowMajor = pairwiseDistanceMatKernel<false,
+                                                            DataT,
+                                                            AccT,
+                                                            OutT,
+                                                            IdxT,
+                                                            KPolicy,
+                                                            decltype(core_lambda),
+                                                            decltype(epilog_lambda),
+                                                            FinalLambda,
+                                                            true>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, minkowskiUnExpRowMajor);
 
     minkowskiUnExpRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-      epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
 
   } else {
-    auto minkowskiUnExpColMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, false>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
-                                               minkowskiUnExpColMajor);
+    auto minkowskiUnExpColMajor = pairwiseDistanceMatKernel<false,
+                                                            DataT,
+                                                            AccT,
+                                                            OutT,
+                                                            IdxT,
+                                                            KPolicy,
+                                                            decltype(core_lambda),
+                                                            decltype(epilog_lambda),
+                                                            FinalLambda,
+                                                            false>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, minkowskiUnExpColMajor);
 
     minkowskiUnExpColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-      epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   }
 
   CUDA_CHECK(cudaGetLastError());
 }
 
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          typename FinalLambda, bool isRowMajor>
-void minkowskiUnExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
-                    const DataT *x, const DataT *y, OutT *dOutput,
-                    FinalLambda fin_op, cudaStream_t stream, DataT metric_arg) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          typename FinalLambda,
+          bool isRowMajor>
+void minkowskiUnExp(IdxT m,
+                    IdxT n,
+                    IdxT k,
+                    IdxT lda,
+                    IdxT ldb,
+                    IdxT ldd,
+                    const DataT* x,
+                    const DataT* y,
+                    OutT* dOutput,
+                    FinalLambda fin_op,
+                    cudaStream_t stream,
+                    DataT metric_arg)
+{
   size_t bytesA = sizeof(DataT) * lda;
   size_t bytesB = sizeof(DataT) * ldb;
   if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    minkowskiUnExpImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
-                       isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput,
-                                   fin_op, stream, metric_arg);
+    minkowskiUnExpImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg);
   } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    minkowskiUnExpImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda,
-                       isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput,
-                                   fin_op, stream, metric_arg);
+    minkowskiUnExpImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg);
   } else {
     minkowskiUnExpImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
       x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream, metric_arg);
@@ -147,15 +182,25 @@ void minkowskiUnExp(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
  * @param[in] isRowMajor whether the input and output matrices are row major
  * @param[in] metric_arg the value of `p` for Minkowski (l-p) distances.
  */
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_ = int>
-void minkowskiImpl(Index_ m, Index_ n, Index_ k, const InType *pA,
-                   const InType *pB, OutType *pD, FinalLambda fin_op,
-                   cudaStream_t stream, bool isRowMajor, InType metric_arg) {
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_ = int>
+void minkowskiImpl(Index_ m,
+                   Index_ n,
+                   Index_ k,
+                   const InType* pA,
+                   const InType* pB,
+                   OutType* pD,
+                   FinalLambda fin_op,
+                   cudaStream_t stream,
+                   bool isRowMajor,
+                   InType metric_arg)
+{
   typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type
-    LpUnexpOutType;
-  LpUnexpOutType *pDcast = reinterpret_cast<LpUnexpOutType *>(pD);
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type LpUnexpOutType;
+  LpUnexpOutType* pDcast = reinterpret_cast<LpUnexpOutType*>(pD);
   Index_ lda, ldb, ldd;
 
   if (isRowMajor) {
diff --git a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh
index a98bda1541..8fa7801c70 100644
--- a/cpp/include/raft/distance/detail/pairwise_distance_base.cuh
+++ b/cpp/include/raft/distance/detail/pairwise_distance_base.cuh
@@ -34,11 +34,11 @@ namespace detail {
  * @tparam OutT           output data-type (for C and D matrices)
  * @tparam IdxT           index data-type
  * @tparam Policy         struct which tunes the Contraction kernel
- * @tparam CoreLambda     tells how to accumulate an x and y into 
+ * @tparam CoreLambda     tells how to accumulate an x and y into
                           acc. its signature:
     template <typename AccT, typename DataT> void core_lambda(AccT& acc,
       const DataT& x, const DataT& y)
- * @tparam EpilogueLambda applies an elementwise function to compute final 
+ * @tparam EpilogueLambda applies an elementwise function to compute final
     values. Its signature is:
     template <typename AccT, typename DataT> void epilogue_lambda
     (AccT acc[][], DataT* regxn, DataT* regyn);
@@ -60,21 +60,27 @@ namespace detail {
  * @param fin_op the final gemm epilogue lambda
  */
 
-template <bool useNorms, typename DataT, typename AccT, typename OutT,
-          typename IdxT, typename Policy, typename CoreLambda,
-          typename EpilogueLambda, typename FinalLambda,
-          typename rowEpilogueLambda, bool isRowMajor = true,
-          bool writeOut = true,
-          typename BaseClass =
-            raft::linalg::Contractions_NT<DataT, IdxT, Policy, isRowMajor>>
+template <bool useNorms,
+          typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          typename Policy,
+          typename CoreLambda,
+          typename EpilogueLambda,
+          typename FinalLambda,
+          typename rowEpilogueLambda,
+          bool isRowMajor    = true,
+          bool writeOut      = true,
+          typename BaseClass = raft::linalg::Contractions_NT<DataT, IdxT, Policy, isRowMajor>>
 struct PairwiseDistances : public BaseClass {
  private:
   typedef Policy P;
-  const DataT *xn;
-  const DataT *yn;
-  const DataT *const yBase;
-  OutT *dOutput;
-  char *smem;
+  const DataT* xn;
+  const DataT* yn;
+  const DataT* const yBase;
+  OutT* dOutput;
+  char* smem;
   CoreLambda core_op;
   EpilogueLambda epilog_op;
   FinalLambda fin_op;
@@ -84,11 +90,21 @@ struct PairwiseDistances : public BaseClass {
 
  public:
   // Constructor
-  DI PairwiseDistances(const DataT *_x, const DataT *_y, IdxT _m, IdxT _n,
-                       IdxT _k, IdxT _lda, IdxT _ldb, IdxT _ldd,
-                       const DataT *_xn, const DataT *_yn, OutT *_dOutput,
-                       char *_smem, CoreLambda _core_op,
-                       EpilogueLambda _epilog_op, FinalLambda _fin_op,
+  DI PairwiseDistances(const DataT* _x,
+                       const DataT* _y,
+                       IdxT _m,
+                       IdxT _n,
+                       IdxT _k,
+                       IdxT _lda,
+                       IdxT _ldb,
+                       IdxT _ldd,
+                       const DataT* _xn,
+                       const DataT* _yn,
+                       OutT* _dOutput,
+                       char* _smem,
+                       CoreLambda _core_op,
+                       EpilogueLambda _epilog_op,
+                       FinalLambda _fin_op,
                        rowEpilogueLambda _rowEpilog_op)
     : BaseClass(_x, _y, _m, _n, _k, _lda, _ldb, _ldd, _smem),
       xn(_xn),
@@ -99,9 +115,12 @@ struct PairwiseDistances : public BaseClass {
       core_op(_core_op),
       epilog_op(_epilog_op),
       fin_op(_fin_op),
-      rowEpilog_op(_rowEpilog_op) {}
+      rowEpilog_op(_rowEpilog_op)
+  {
+  }
 
-  DI void run() {
+  DI void run()
+  {
     for (auto gridStrideY = blockIdx.y * P::Mblk; gridStrideY < this->m;
          gridStrideY += P::Mblk * gridDim.y) {
       for (auto gridStrideX = blockIdx.x * P::Nblk; gridStrideX < this->n;
@@ -115,7 +134,8 @@ struct PairwiseDistances : public BaseClass {
   }
 
  private:
-  DI void updateIndicesY() {
+  DI void updateIndicesY()
+  {
     const auto stride = P::Nblk * gridDim.x;
     if (isRowMajor) {
       this->y += stride * this->ldb;
@@ -125,21 +145,23 @@ struct PairwiseDistances : public BaseClass {
     this->yrowid += stride;
   }
 
-  DI void updateIndicesXY() {
+  DI void updateIndicesXY()
+  {
     const auto stride = P::Mblk * gridDim.y;
     if (isRowMajor) {
       this->x += stride * this->lda;
       this->yrowid = IdxT(blockIdx.x) * P::Nblk + this->srowid;
-      this->y = yBase + this->yrowid * this->ldb;
+      this->y      = yBase + this->yrowid * this->ldb;
     } else {
       this->x += stride;
       this->yrowid = IdxT(blockIdx.x) * P::Nblk;
-      this->y = yBase + this->yrowid + this->srowid * this->ldb;
+      this->y      = yBase + this->yrowid + this->srowid * this->ldb;
     }
     this->xrowid += stride;
   }
 
-  DI void ldgNextGridStride(IdxT gridStrideX, IdxT gridStrideY) {
+  DI void ldgNextGridStride(IdxT gridStrideX, IdxT gridStrideY)
+  {
     // Fetch next grid stride ldg if within range
     if ((gridStrideX + gridDim.x * P::Nblk) < this->n) {
       updateIndicesY();
@@ -150,10 +172,9 @@ struct PairwiseDistances : public BaseClass {
     }
   }
 
-  DI void prolog(IdxT gridStrideX, IdxT gridStrideY) {
-    if (gridStrideX == blockIdx.x * P::Nblk) {
-      this->ldgXY(0);
-    }
+  DI void prolog(IdxT gridStrideX, IdxT gridStrideY)
+  {
+    if (gridStrideX == blockIdx.x * P::Nblk) { this->ldgXY(0); }
 
 #pragma unroll
     for (int i = 0; i < P::AccRowsPerTh; ++i) {
@@ -168,7 +189,8 @@ struct PairwiseDistances : public BaseClass {
     this->pageWr ^= 1;
   }
 
-  DI void loop() {
+  DI void loop()
+  {
     for (int kidx = P::Kblk; kidx < this->k; kidx += P::Kblk) {
       this->ldgXY(kidx);
       accumulate();  // on the previous k-block
@@ -185,7 +207,8 @@ struct PairwiseDistances : public BaseClass {
     this->pageRd ^= 1;
   }
 
-  DI void accumulate() {
+  DI void accumulate()
+  {
 #pragma unroll
     for (int ki = 0; ki < P::Kblk; ki += P::Veclen) {
       this->ldsXY(ki);
@@ -202,21 +225,22 @@ struct PairwiseDistances : public BaseClass {
     }
   }
 
-  DI void epilog(IdxT gridStrideX, IdxT gridStrideY) {
+  DI void epilog(IdxT gridStrideX, IdxT gridStrideY)
+  {
     if (useNorms) {
-      DataT *sxNorm = (DataT *)(&smem[P::SmemSize]);
-      DataT *syNorm = (&sxNorm[P::Mblk]);
+      DataT* sxNorm = (DataT*)(&smem[P::SmemSize]);
+      DataT* syNorm = (&sxNorm[P::Mblk]);
 
       // Load x & y norms required by this threadblock in shmem buffer
       if (gridStrideX == blockIdx.x * P::Nblk) {
         for (int i = threadIdx.x; i < P::Mblk; i += P::Nthreads) {
-          auto idx = gridStrideY + i;
+          auto idx  = gridStrideY + i;
           sxNorm[i] = idx < this->m ? xn[idx] : 0;
         }
       }
 
       for (int i = threadIdx.x; i < P::Nblk; i += P::Nthreads) {
-        auto idx = gridStrideX + i;
+        auto idx  = gridStrideX + i;
         syNorm[i] = idx < this->n ? yn[idx] : 0;
       }
 
@@ -291,41 +315,68 @@ struct PairwiseDistances : public BaseClass {
  * @param fin_op    the final gemm epilogue lambda
  */
 
-template <bool useNorms, typename DataT, typename AccT, typename OutT,
-          typename IdxT, typename Policy, typename CoreLambda,
-          typename EpilogueLambda, typename FinalLambda, bool isRowMajor = true,
-          bool writeOut = true>
+template <bool useNorms,
+          typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          typename Policy,
+          typename CoreLambda,
+          typename EpilogueLambda,
+          typename FinalLambda,
+          bool isRowMajor = true,
+          bool writeOut   = true>
 __global__ __launch_bounds__(Policy::Nthreads, 2)
 
-  void pairwiseDistanceMatKernel(const DataT *x, const DataT *y,
-                                 const DataT *_xn, const DataT *_yn, IdxT m,
-                                 IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
-                                 OutT *dOutput, CoreLambda core_op,
-                                 EpilogueLambda epilog_op, FinalLambda fin_op) {
+  void pairwiseDistanceMatKernel(const DataT* x,
+                                 const DataT* y,
+                                 const DataT* _xn,
+                                 const DataT* _yn,
+                                 IdxT m,
+                                 IdxT n,
+                                 IdxT k,
+                                 IdxT lda,
+                                 IdxT ldb,
+                                 IdxT ldd,
+                                 OutT* dOutput,
+                                 CoreLambda core_op,
+                                 EpilogueLambda epilog_op,
+                                 FinalLambda fin_op)
+{
   extern __shared__ char smem[];
   auto rowEpilog = [] __device__(IdxT starty) { return; };
 
-  PairwiseDistances<useNorms, DataT, AccT, OutT, IdxT, Policy, CoreLambda,
-                    EpilogueLambda, FinalLambda, decltype(rowEpilog),
-                    isRowMajor, writeOut>
-    obj(x, y, m, n, k, lda, ldb, ldd, _xn, _yn, dOutput, smem, core_op,
-        epilog_op, fin_op, rowEpilog);
+  PairwiseDistances<useNorms,
+                    DataT,
+                    AccT,
+                    OutT,
+                    IdxT,
+                    Policy,
+                    CoreLambda,
+                    EpilogueLambda,
+                    FinalLambda,
+                    decltype(rowEpilog),
+                    isRowMajor,
+                    writeOut>
+    obj(
+      x, y, m, n, k, lda, ldb, ldd, _xn, _yn, dOutput, smem, core_op, epilog_op, fin_op, rowEpilog);
   obj.run();
 }
 
 template <typename P, typename IdxT, typename T>
-dim3 launchConfigGenerator(IdxT m, IdxT n, std::size_t sMemSize, T func) {
-  const auto numSMs = raft::getMultiProcessorCount();
+dim3 launchConfigGenerator(IdxT m, IdxT n, std::size_t sMemSize, T func)
+{
+  const auto numSMs  = raft::getMultiProcessorCount();
   int numBlocksPerSm = 0;
   dim3 grid;
 
-  CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-    &numBlocksPerSm, func, P::Nthreads, sMemSize));
+  CUDA_CHECK(
+    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, func, P::Nthreads, sMemSize));
   std::size_t minGridSize = numSMs * numBlocksPerSm;
-  std::size_t yChunks = raft::ceildiv<int>(m, P::Mblk);
-  std::size_t xChunks = raft::ceildiv<int>(n, P::Nblk);
-  grid.y = yChunks > minGridSize ? minGridSize : yChunks;
-  grid.x = (minGridSize - grid.y) <= 0 ? 1 : xChunks;
+  std::size_t yChunks     = raft::ceildiv<int>(m, P::Mblk);
+  std::size_t xChunks     = raft::ceildiv<int>(n, P::Nblk);
+  grid.y                  = yChunks > minGridSize ? minGridSize : yChunks;
+  grid.x                  = (minGridSize - grid.y) <= 0 ? 1 : xChunks;
   if (grid.x != 1) {
     std::size_t i = 1;
     while (grid.y * i < minGridSize) {
diff --git a/cpp/include/raft/distance/detail/russell_rao.cuh b/cpp/include/raft/distance/detail/russell_rao.cuh
index 8e4c4824c3..d4fbb039e7 100644
--- a/cpp/include/raft/distance/detail/russell_rao.cuh
+++ b/cpp/include/raft/distance/detail/russell_rao.cuh
@@ -23,7 +23,7 @@ namespace detail {
 
 /**
  * @brief the Russell Rao distance matrix:
- *  It computes the following equation: 
+ *  It computes the following equation:
     Cij = (k - sum(x_i * y_i)) / k
  *
  * @tparam DataT          input data-type (for A and B matrices)
@@ -47,29 +47,42 @@ namespace detail {
  * @param[in]       fin_op the final gemm epilogue lambda
  * @param[in]       stream cuda stream to launch work
  */
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          int VecLen, typename FinalLambda, bool isRowMajor>
-static void russellRaoImpl(const DataT *x, const DataT *y, IdxT m, IdxT n,
-                           IdxT k, IdxT lda, IdxT ldb, IdxT ldd, OutT *dOutput,
-                           FinalLambda fin_op, cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          int VecLen,
+          typename FinalLambda,
+          bool isRowMajor>
+static void russellRaoImpl(const DataT* x,
+                           const DataT* y,
+                           IdxT m,
+                           IdxT n,
+                           IdxT k,
+                           IdxT lda,
+                           IdxT ldb,
+                           IdxT ldd,
+                           OutT* dOutput,
+                           FinalLambda fin_op,
+                           cudaStream_t stream)
+{
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::Policy RowPolicy;
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
 
-  typedef
-    typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
+  typedef typename std::conditional<isRowMajor, RowPolicy, ColPolicy>::type KPolicy;
 
   dim3 blk(KPolicy::Nthreads);
 
   // Accumulation operation lambda
-  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
-    acc += x * y;
-  };
+  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; };
 
   const float one_over_k = 1.0 / k;
   // epilogue operation lambda for final value calculation
   auto epilog_lambda = [k, one_over_k] __device__(
                          AccT acc[KPolicy::AccRowsPerTh][KPolicy::AccColsPerTh],
-                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
+                         DataT * regxn,
+                         DataT * regyn,
+                         IdxT gridStrideX,
                          IdxT gridStrideY) {
 #pragma unroll
     for (int i = 0; i < KPolicy::AccRowsPerTh; ++i) {
@@ -81,46 +94,65 @@ static void russellRaoImpl(const DataT *x, const DataT *y, IdxT m, IdxT n,
   };
 
   if (isRowMajor) {
-    constexpr auto russellRaoRowMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, true>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
-                                               russellRaoRowMajor);
+    constexpr auto russellRaoRowMajor = pairwiseDistanceMatKernel<false,
+                                                                  DataT,
+                                                                  AccT,
+                                                                  OutT,
+                                                                  IdxT,
+                                                                  KPolicy,
+                                                                  decltype(core_lambda),
+                                                                  decltype(epilog_lambda),
+                                                                  FinalLambda,
+                                                                  true>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, russellRaoRowMajor);
 
     russellRaoRowMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-      epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   } else {
-    constexpr auto russellRaoColMajor =
-      pairwiseDistanceMatKernel<false, DataT, AccT, OutT, IdxT, KPolicy,
-                                decltype(core_lambda), decltype(epilog_lambda),
-                                FinalLambda, false>;
-    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize,
-                                               russellRaoColMajor);
+    constexpr auto russellRaoColMajor = pairwiseDistanceMatKernel<false,
+                                                                  DataT,
+                                                                  AccT,
+                                                                  OutT,
+                                                                  IdxT,
+                                                                  KPolicy,
+                                                                  decltype(core_lambda),
+                                                                  decltype(epilog_lambda),
+                                                                  FinalLambda,
+                                                                  false>;
+    dim3 grid = launchConfigGenerator<KPolicy>(m, n, KPolicy::SmemSize, russellRaoColMajor);
     russellRaoColMajor<<<grid, blk, KPolicy::SmemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda,
-      epilog_lambda, fin_op);
+      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, dOutput, core_lambda, epilog_lambda, fin_op);
   }
 
   CUDA_CHECK(cudaGetLastError());
 }
 
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          typename FinalLambda, bool isRowMajor>
-void russellRao(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
-                const DataT *x, const DataT *y, OutT *dOutput,
-                FinalLambda fin_op, cudaStream_t stream) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          typename FinalLambda,
+          bool isRowMajor>
+void russellRao(IdxT m,
+                IdxT n,
+                IdxT k,
+                IdxT lda,
+                IdxT ldb,
+                IdxT ldd,
+                const DataT* x,
+                const DataT* y,
+                OutT* dOutput,
+                FinalLambda fin_op,
+                cudaStream_t stream)
+{
   size_t bytesA = sizeof(DataT) * lda;
   size_t bytesB = sizeof(DataT) * ldb;
   if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    russellRaoImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda,
-                   isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
-                               stream);
+    russellRaoImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
   } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    russellRaoImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda,
-                   isRowMajor>(x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op,
-                               stream);
+    russellRaoImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), FinalLambda, isRowMajor>(
+      x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
   } else {
     russellRaoImpl<DataT, AccT, OutT, IdxT, 1, FinalLambda, isRowMajor>(
       x, y, m, n, k, lda, ldb, ldd, dOutput, fin_op, stream);
@@ -129,7 +161,7 @@ void russellRao(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
 
 /**
  * @brief the Russell Rao distance matrix calculation
- *  It computes the following equation: 
+ *  It computes the following equation:
     Cij = (k - sum(x_i * y_i)) / k
  *
  * @tparam InType input data-type (for A and B matrices)
@@ -147,16 +179,25 @@ void russellRao(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
  * @param stream cuda stream where to launch work
  * @param isRowMajor whether the input and output matrices are row major
  */
-template <typename InType, typename AccType, typename OutType,
-          typename FinalLambda, typename Index_ = int>
-void russellRaoImpl(int m, int n, int k, const InType *pA, const InType *pB,
-                    OutType *pD, FinalLambda fin_op, cudaStream_t stream,
-                    bool isRowMajor) {
+template <typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
+          typename Index_ = int>
+void russellRaoImpl(int m,
+                    int n,
+                    int k,
+                    const InType* pA,
+                    const InType* pB,
+                    OutType* pD,
+                    FinalLambda fin_op,
+                    cudaStream_t stream,
+                    bool isRowMajor)
+{
   typedef std::is_same<OutType, bool> is_bool;
-  typedef typename std::conditional<is_bool::value, OutType, AccType>::type
-    russellRaoOutType;
+  typedef typename std::conditional<is_bool::value, OutType, AccType>::type russellRaoOutType;
   Index_ lda, ldb, ldd;
-  russellRaoOutType *pDcast = reinterpret_cast<russellRaoOutType *>(pD);
+  russellRaoOutType* pDcast = reinterpret_cast<russellRaoOutType*>(pD);
   if (isRowMajor) {
     lda = k, ldb = k, ldd = n;
     russellRao<InType, AccType, russellRaoOutType, Index_, FinalLambda, true>(
diff --git a/cpp/include/raft/distance/distance.hpp b/cpp/include/raft/distance/distance.hpp
index 8b55543ff8..66832c12d2 100644
--- a/cpp/include/raft/distance/distance.hpp
+++ b/cpp/include/raft/distance/distance.hpp
@@ -25,132 +25,163 @@ namespace raft {
 namespace distance {
 
 /**
-* @brief Evaluate pairwise distances with the user epilogue lamba allowed
-* @tparam DistanceType which distance to evaluate
-* @tparam InType input argument type
-* @tparam AccType accumulation type
-* @tparam OutType output type
-* @tparam FinalLambda user-defined epilogue lamba
-* @tparam Index_ Index type
-* @param x first set of points
-* @param y second set of points
-* @param dist output distance matrix
-* @param m number of points in x
-* @param n number of points in y
-* @param k dimensionality
-* @param workspace temporary workspace needed for computations
-* @param worksize number of bytes of the workspace
-* @param fin_op the final gemm epilogue lambda
-* @param stream cuda stream
-* @param isRowMajor whether the matrices are row-major or col-major
-* @param metric_arg metric argument (used for Minkowski distance)
-*
-* @note fin_op: This is a device lambda which is supposed to operate upon the
-* input which is AccType and returns the output in OutType. It's signature is
-* as follows:  <pre>OutType fin_op(AccType in, int g_idx);</pre>. If one needs
-* any other parameters, feel free to pass them via closure.
-*/
-template <raft::distance::DistanceType distanceType, typename InType,
-          typename AccType, typename OutType, typename FinalLambda,
+ * @brief Evaluate pairwise distances with the user epilogue lamba allowed
+ * @tparam DistanceType which distance to evaluate
+ * @tparam InType input argument type
+ * @tparam AccType accumulation type
+ * @tparam OutType output type
+ * @tparam FinalLambda user-defined epilogue lamba
+ * @tparam Index_ Index type
+ * @param x first set of points
+ * @param y second set of points
+ * @param dist output distance matrix
+ * @param m number of points in x
+ * @param n number of points in y
+ * @param k dimensionality
+ * @param workspace temporary workspace needed for computations
+ * @param worksize number of bytes of the workspace
+ * @param fin_op the final gemm epilogue lambda
+ * @param stream cuda stream
+ * @param isRowMajor whether the matrices are row-major or col-major
+ * @param metric_arg metric argument (used for Minkowski distance)
+ *
+ * @note fin_op: This is a device lambda which is supposed to operate upon the
+ * input which is AccType and returns the output in OutType. It's signature is
+ * as follows:  <pre>OutType fin_op(AccType in, int g_idx);</pre>. If one needs
+ * any other parameters, feel free to pass them via closure.
+ */
+template <raft::distance::DistanceType distanceType,
+          typename InType,
+          typename AccType,
+          typename OutType,
+          typename FinalLambda,
           typename Index_ = int>
-void distance(const InType *x, const InType *y, OutType *dist, Index_ m,
-              Index_ n, Index_ k, void *workspace, size_t worksize,
-              FinalLambda fin_op, cudaStream_t stream, bool isRowMajor = true,
-              InType metric_arg = 2.0f) {
+void distance(const InType* x,
+              const InType* y,
+              OutType* dist,
+              Index_ m,
+              Index_ n,
+              Index_ k,
+              void* workspace,
+              size_t worksize,
+              FinalLambda fin_op,
+              cudaStream_t stream,
+              bool isRowMajor   = true,
+              InType metric_arg = 2.0f)
+{
   detail::distance<distanceType, InType, AccType, OutType, FinalLambda, Index_>(
-    x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor,
-    metric_arg);
+    x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, metric_arg);
 }
 
 /**
-* @brief Evaluate pairwise distances for the simple use case
-* @tparam DistanceType which distance to evaluate
-* @tparam InType input argument type
-* @tparam AccType accumulation type
-* @tparam OutType output type
-* @tparam Index_ Index type
-* @param x first set of points
-* @param y second set of points
-* @param dist output distance matrix
-* @param m number of points in x
-* @param n number of points in y
-* @param k dimensionality
-* @param workspace temporary workspace needed for computations
-* @param worksize number of bytes of the workspace
-* @param stream cuda stream
-* @param isRowMajor whether the matrices are row-major or col-major
-* @param metric_arg metric argument (used for Minkowski distance)
-*
-* @note if workspace is passed as nullptr, this will return in
-*  worksize, the number of bytes of workspace required
-*/
-template <raft::distance::DistanceType distanceType, typename InType,
-          typename AccType, typename OutType, typename Index_ = int>
-void distance(const InType *x, const InType *y, OutType *dist, Index_ m,
-              Index_ n, Index_ k, void *workspace, size_t worksize,
-              cudaStream_t stream, bool isRowMajor = true,
-              InType metric_arg = 2.0f) {
+ * @brief Evaluate pairwise distances for the simple use case
+ * @tparam DistanceType which distance to evaluate
+ * @tparam InType input argument type
+ * @tparam AccType accumulation type
+ * @tparam OutType output type
+ * @tparam Index_ Index type
+ * @param x first set of points
+ * @param y second set of points
+ * @param dist output distance matrix
+ * @param m number of points in x
+ * @param n number of points in y
+ * @param k dimensionality
+ * @param workspace temporary workspace needed for computations
+ * @param worksize number of bytes of the workspace
+ * @param stream cuda stream
+ * @param isRowMajor whether the matrices are row-major or col-major
+ * @param metric_arg metric argument (used for Minkowski distance)
+ *
+ * @note if workspace is passed as nullptr, this will return in
+ *  worksize, the number of bytes of workspace required
+ */
+template <raft::distance::DistanceType distanceType,
+          typename InType,
+          typename AccType,
+          typename OutType,
+          typename Index_ = int>
+void distance(const InType* x,
+              const InType* y,
+              OutType* dist,
+              Index_ m,
+              Index_ n,
+              Index_ k,
+              void* workspace,
+              size_t worksize,
+              cudaStream_t stream,
+              bool isRowMajor   = true,
+              InType metric_arg = 2.0f)
+{
   detail::distance<distanceType, InType, AccType, OutType, Index_>(
     x, y, dist, m, n, k, workspace, worksize, stream, isRowMajor, metric_arg);
 }
 
 /**
-* @brief Return the exact workspace size to compute the distance
-* @tparam DistanceType which distance to evaluate
-* @tparam InType input argument type
-* @tparam AccType accumulation type
-* @tparam OutType output type
-* @tparam Index_ Index type
-* @param x first set of points
-* @param y second set of points
-* @param m number of points in x
-* @param n number of points in y
-* @param k dimensionality
-*
-* @note If the specified distanceType doesn't need the workspace at all, it
-* returns 0.
-*/
-template <raft::distance::DistanceType distanceType, typename InType,
-          typename AccType, typename OutType, typename Index_ = int>
-size_t getWorkspaceSize(const InType *x, const InType *y, Index_ m, Index_ n,
-                        Index_ k) {
-  return detail::getWorkspaceSize<distanceType, InType, AccType, OutType,
-                                  Index_>(x, y, m, n, k);
+ * @brief Return the exact workspace size to compute the distance
+ * @tparam DistanceType which distance to evaluate
+ * @tparam InType input argument type
+ * @tparam AccType accumulation type
+ * @tparam OutType output type
+ * @tparam Index_ Index type
+ * @param x first set of points
+ * @param y second set of points
+ * @param m number of points in x
+ * @param n number of points in y
+ * @param k dimensionality
+ *
+ * @note If the specified distanceType doesn't need the workspace at all, it
+ * returns 0.
+ */
+template <raft::distance::DistanceType distanceType,
+          typename InType,
+          typename AccType,
+          typename OutType,
+          typename Index_ = int>
+size_t getWorkspaceSize(const InType* x, const InType* y, Index_ m, Index_ n, Index_ k)
+{
+  return detail::getWorkspaceSize<distanceType, InType, AccType, OutType, Index_>(x, y, m, n, k);
 }
 
 /**
-* @brief Evaluate pairwise distances for the simple use case
-* @tparam DistanceType which distance to evaluate
-* @tparam InType input argument type
-* @tparam AccType accumulation type
-* @tparam OutType output type
-* @tparam Index_ Index type
-* @param x first set of points
-* @param y second set of points
-* @param dist output distance matrix
-* @param m number of points in x
-* @param n number of points in y
-* @param k dimensionality
-* @param stream cuda stream
-* @param isRowMajor whether the matrices are row-major or col-major
-* @param metric_arg metric argument (used for Minkowski distance)
-*
-* @note if workspace is passed as nullptr, this will return in
-*  worksize, the number of bytes of workspace required
-*/
-template <raft::distance::DistanceType distanceType, typename InType,
-          typename AccType, typename OutType, typename Index_ = int>
-void distance(const InType *x, const InType *y, OutType *dist, Index_ m,
-              Index_ n, Index_ k, cudaStream_t stream, bool isRowMajor = true,
-              InType metric_arg = 2.0f) {
+ * @brief Evaluate pairwise distances for the simple use case
+ * @tparam DistanceType which distance to evaluate
+ * @tparam InType input argument type
+ * @tparam AccType accumulation type
+ * @tparam OutType output type
+ * @tparam Index_ Index type
+ * @param x first set of points
+ * @param y second set of points
+ * @param dist output distance matrix
+ * @param m number of points in x
+ * @param n number of points in y
+ * @param k dimensionality
+ * @param stream cuda stream
+ * @param isRowMajor whether the matrices are row-major or col-major
+ * @param metric_arg metric argument (used for Minkowski distance)
+ *
+ * @note if workspace is passed as nullptr, this will return in
+ *  worksize, the number of bytes of workspace required
+ */
+template <raft::distance::DistanceType distanceType,
+          typename InType,
+          typename AccType,
+          typename OutType,
+          typename Index_ = int>
+void distance(const InType* x,
+              const InType* y,
+              OutType* dist,
+              Index_ m,
+              Index_ n,
+              Index_ k,
+              cudaStream_t stream,
+              bool isRowMajor   = true,
+              InType metric_arg = 2.0f)
+{
   rmm::device_uvector<char> workspace(0, stream);
-  auto worksize =
-    getWorkspaceSize<distanceType, InType, AccType, OutType, Index_>(x, y, m, n,
-                                                                     k);
+  auto worksize = getWorkspaceSize<distanceType, InType, AccType, OutType, Index_>(x, y, m, n, k);
   workspace.resize(worksize, stream);
   detail::distance<distanceType, InType, AccType, OutType, Index_>(
-    x, y, dist, m, n, k, workspace.data(), worksize, stream, isRowMajor,
-    metric_arg);
+    x, y, dist, m, n, k, workspace.data(), worksize, stream, isRowMajor, metric_arg);
 }
 
 /**
@@ -173,119 +204,117 @@ void distance(const InType *x, const InType *y, OutType *dist, Index_ m,
  * @param isRowMajor whether the matrices are row-major or col-major
  */
 template <typename Type, typename Index_ = int>
-void pairwise_distance(const raft::handle_t &handle, const Type *x,
-                       const Type *y, Type *dist, Index_ m, Index_ n, Index_ k,
-                       rmm::device_uvector<char> &workspace,
+void pairwise_distance(const raft::handle_t& handle,
+                       const Type* x,
+                       const Type* y,
+                       Type* dist,
+                       Index_ m,
+                       Index_ n,
+                       Index_ k,
+                       rmm::device_uvector<char>& workspace,
                        raft::distance::DistanceType metric,
-                       bool isRowMajor = true, Type metric_arg = 2.0f) {
+                       bool isRowMajor = true,
+                       Type metric_arg = 2.0f)
+{
   switch (metric) {
     case raft::distance::DistanceType::L2Expanded:
-      detail::pairwise_distance_impl<Type, Index_,
-                                     raft::distance::DistanceType::L2Expanded>(
+      detail::pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::L2Expanded>(
         x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
       break;
     case raft::distance::DistanceType::L2SqrtExpanded:
-      detail::pairwise_distance_impl<
-        Type, Index_, raft::distance::DistanceType::L2SqrtExpanded>(
+      detail::pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::L2SqrtExpanded>(
         x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
       break;
     case raft::distance::DistanceType::CosineExpanded:
-      detail::pairwise_distance_impl<
-        Type, Index_, raft::distance::DistanceType::CosineExpanded>(
+      detail::pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::CosineExpanded>(
         x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
       break;
     case raft::distance::DistanceType::L1:
-      detail::pairwise_distance_impl<Type, Index_,
-                                     raft::distance::DistanceType::L1>(
+      detail::pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::L1>(
         x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
       break;
     case raft::distance::DistanceType::L2Unexpanded:
-      detail::pairwise_distance_impl<
-        Type, Index_, raft::distance::DistanceType::L2Unexpanded>(
+      detail::pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::L2Unexpanded>(
         x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
       break;
     case raft::distance::DistanceType::L2SqrtUnexpanded:
-      detail::pairwise_distance_impl<
-        Type, Index_, raft::distance::DistanceType::L2SqrtUnexpanded>(
+      detail::pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::L2SqrtUnexpanded>(
         x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
       break;
     case raft::distance::DistanceType::Linf:
-      detail::pairwise_distance_impl<Type, Index_,
-                                     raft::distance::DistanceType::Linf>(
+      detail::pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::Linf>(
         x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
       break;
     case raft::distance::DistanceType::HellingerExpanded:
-      detail::pairwise_distance_impl<
-        Type, Index_, raft::distance::DistanceType::HellingerExpanded>(
+      detail::pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::HellingerExpanded>(
         x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
       break;
     case raft::distance::DistanceType::LpUnexpanded:
-      detail::pairwise_distance_impl<
-        Type, Index_, raft::distance::DistanceType::LpUnexpanded>(
-        x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor,
-        metric_arg);
+      detail::pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::LpUnexpanded>(
+        x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor, metric_arg);
       break;
     case raft::distance::DistanceType::Canberra:
-      detail::pairwise_distance_impl<Type, Index_,
-                                     raft::distance::DistanceType::Canberra>(
+      detail::pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::Canberra>(
         x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
       break;
     case raft::distance::DistanceType::HammingUnexpanded:
-      detail::pairwise_distance_impl<
-        Type, Index_, raft::distance::DistanceType::HammingUnexpanded>(
+      detail::pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::HammingUnexpanded>(
         x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
       break;
     case raft::distance::DistanceType::JensenShannon:
-      detail::pairwise_distance_impl<
-        Type, Index_, raft::distance::DistanceType::JensenShannon>(
+      detail::pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::JensenShannon>(
         x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
       break;
     case raft::distance::DistanceType::RusselRaoExpanded:
-      detail::pairwise_distance_impl<
-        Type, Index_, raft::distance::DistanceType::RusselRaoExpanded>(
+      detail::pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::RusselRaoExpanded>(
         x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
       break;
     case raft::distance::DistanceType::KLDivergence:
-      detail::pairwise_distance_impl<
-        Type, Index_, raft::distance::DistanceType::KLDivergence>(
+      detail::pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::KLDivergence>(
         x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
       break;
     case raft::distance::DistanceType::CorrelationExpanded:
-      detail::pairwise_distance_impl<
-        Type, Index_, raft::distance::DistanceType::CorrelationExpanded>(
-        x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
+      detail::
+        pairwise_distance_impl<Type, Index_, raft::distance::DistanceType::CorrelationExpanded>(
+          x, y, dist, m, n, k, workspace, handle.get_stream(), isRowMajor);
       break;
-    default:
-      THROW("Unknown or unsupported distance metric '%d'!", (int)metric);
+    default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric);
   };
 }
 /** @} */
 
 /**
-     * @defgroup pairwise_distance pairwise distance prims
-     * @{
-     * @brief Convenience wrapper around 'distance' prim to convert runtime metric
-     * into compile time for the purpose of dispatch
-     * @tparam Type input/accumulation/output data-type
-     * @tparam Index_ indexing type
-     * @param x first set of points
-     * @param y second set of points
-     * @param dist output distance matrix
-     * @param m number of points in x
-     * @param n number of points in y
-     * @param k dimensionality
-     * @param metric distance metric
-     * @param stream cuda stream
-     * @param isRowMajor whether the matrices are row-major or col-major
-     */
+ * @defgroup pairwise_distance pairwise distance prims
+ * @{
+ * @brief Convenience wrapper around 'distance' prim to convert runtime metric
+ * into compile time for the purpose of dispatch
+ * @tparam Type input/accumulation/output data-type
+ * @tparam Index_ indexing type
+ * @param x first set of points
+ * @param y second set of points
+ * @param dist output distance matrix
+ * @param m number of points in x
+ * @param n number of points in y
+ * @param k dimensionality
+ * @param metric distance metric
+ * @param stream cuda stream
+ * @param isRowMajor whether the matrices are row-major or col-major
+ */
 template <typename Type, typename Index_ = int>
-void pairwise_distance(const raft::handle_t &handle, const Type *x,
-                       const Type *y, Type *dist, Index_ m, Index_ n, Index_ k,
+void pairwise_distance(const raft::handle_t& handle,
+                       const Type* x,
+                       const Type* y,
+                       Type* dist,
+                       Index_ m,
+                       Index_ n,
+                       Index_ k,
                        raft::distance::DistanceType metric,
-                       bool isRowMajor = true, Type metric_arg = 2.0f) {
+                       bool isRowMajor = true,
+                       Type metric_arg = 2.0f)
+{
   rmm::device_uvector<char> workspace(0, handle.get_stream());
-  pairwise_distance<Type, Index_>(handle, x, y, dist, m, n, k, workspace,
-                                  metric, isRowMajor, metric_arg);
+  pairwise_distance<Type, Index_>(
+    handle, x, y, dist, m, n, k, workspace, metric, isRowMajor, metric_arg);
 }
 
 };  // namespace distance
diff --git a/cpp/include/raft/distance/fused_l2_nn.hpp b/cpp/include/raft/distance/fused_l2_nn.hpp
index 0a730506c8..d924ef217c 100644
--- a/cpp/include/raft/distance/fused_l2_nn.hpp
+++ b/cpp/include/raft/distance/fused_l2_nn.hpp
@@ -30,8 +30,7 @@ template <typename LabelT, typename DataT>
 using KVPMinReduce = detail::KVPMinReduceImpl<LabelT, DataT>;
 
 template <typename LabelT, typename DataT>
-using MinAndDistanceReduceOp =
-  detail::MinAndDistanceReduceOpImpl<LabelT, DataT>;
+using MinAndDistanceReduceOp = detail::MinAndDistanceReduceOpImpl<LabelT, DataT>;
 
 template <typename LabelT, typename DataT>
 using MinReduceOp = detail::MinReduceOpImpl<LabelT, DataT>;
@@ -40,10 +39,9 @@ using MinReduceOp = detail::MinReduceOpImpl<LabelT, DataT>;
  * Initialize array using init value from reduction op
  */
 template <typename DataT, typename OutT, typename IdxT, typename ReduceOpT>
-void initialize(const raft::handle_t& handle, OutT* min, IdxT m, DataT maxVal,
-                ReduceOpT redOp) {
-  detail::initialize<DataT, OutT, IdxT, ReduceOpT>(min, m, maxVal, redOp,
-                                                   handle.get_stream());
+void initialize(const raft::handle_t& handle, OutT* min, IdxT m, DataT maxVal, ReduceOpT redOp)
+{
+  detail::initialize<DataT, OutT, IdxT, ReduceOpT>(min, m, maxVal, redOp, handle.get_stream());
 }
 
 /**
@@ -82,25 +80,32 @@ void initialize(const raft::handle_t& handle, OutT* min, IdxT m, DataT maxVal,
  *                           main kernel launch
  * @param[in]  stream        cuda stream
  */
-template <typename DataT, typename OutT, typename IdxT, typename ReduceOpT,
-          typename KVPReduceOpT>
-void fusedL2NN(OutT* min, const DataT* x, const DataT* y, const DataT* xn,
-               const DataT* yn, IdxT m, IdxT n, IdxT k, void* workspace,
-               ReduceOpT redOp, KVPReduceOpT pairRedOp, bool sqrt,
-               bool initOutBuffer, cudaStream_t stream) {
+template <typename DataT, typename OutT, typename IdxT, typename ReduceOpT, typename KVPReduceOpT>
+void fusedL2NN(OutT* min,
+               const DataT* x,
+               const DataT* y,
+               const DataT* xn,
+               const DataT* yn,
+               IdxT m,
+               IdxT n,
+               IdxT k,
+               void* workspace,
+               ReduceOpT redOp,
+               KVPReduceOpT pairRedOp,
+               bool sqrt,
+               bool initOutBuffer,
+               cudaStream_t stream)
+{
   size_t bytes = sizeof(DataT) * k;
   if (16 % sizeof(DataT) == 0 && bytes % 16 == 0) {
     detail::fusedL2NNImpl<DataT, OutT, IdxT, 16 / sizeof(DataT), ReduceOpT>(
-      min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt,
-      initOutBuffer, stream);
+      min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream);
   } else if (8 % sizeof(DataT) == 0 && bytes % 8 == 0) {
     detail::fusedL2NNImpl<DataT, OutT, IdxT, 8 / sizeof(DataT), ReduceOpT>(
-      min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt,
-      initOutBuffer, stream);
+      min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream);
   } else {
     detail::fusedL2NNImpl<DataT, OutT, IdxT, 1, ReduceOpT>(
-      min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt,
-      initOutBuffer, stream);
+      min, x, y, xn, yn, m, n, k, (int*)workspace, redOp, pairRedOp, sqrt, initOutBuffer, stream);
   }
 }
 
diff --git a/cpp/include/raft/error.hpp b/cpp/include/raft/error.hpp
index c62f2e5f79..773b83ab13 100644
--- a/cpp/include/raft/error.hpp
+++ b/cpp/include/raft/error.hpp
@@ -31,14 +31,14 @@ class exception : public std::exception {
   explicit exception() noexcept : std::exception(), msg_() {}
 
   /** copy ctor */
-  exception(exception const& src) noexcept
-    : std::exception(), msg_(src.what()) {
+  exception(exception const& src) noexcept : std::exception(), msg_(src.what())
+  {
     collect_call_stack();
   }
 
   /** ctor from an input message */
-  explicit exception(std::string const msg) noexcept
-    : std::exception(), msg_(std::move(msg)) {
+  explicit exception(std::string const msg) noexcept : std::exception(), msg_(std::move(msg))
+  {
     collect_call_stack();
   }
 
@@ -51,7 +51,8 @@ class exception : public std::exception {
 
   /** append call stack info to this exception's message for ease of debug */
   // Courtesy: https://www.gnu.org/software/libc/manual/html_node/Backtraces.html
-  void collect_call_stack() noexcept {
+  void collect_call_stack() noexcept
+  {
 #ifdef __GNUC__
     constexpr int kMaxStackDepth = 64;
     void* stack[kMaxStackDepth];  // NOLINT
@@ -90,16 +91,16 @@ struct logic_error : public raft::exception {
 
 // FIXME: Need to be replaced with RAFT_FAIL
 /** macro to throw a runtime error */
-#define THROW(fmt, ...)                                                        \
-  do {                                                                         \
-    std::string msg;                                                           \
-    char errMsg[2048]; /* NOLINT */                                            \
-    std::snprintf(errMsg, sizeof(errMsg),                                      \
-                  "exception occured! file=%s line=%d: ", __FILE__, __LINE__); \
-    msg += errMsg;                                                             \
-    std::snprintf(errMsg, sizeof(errMsg), fmt, ##__VA_ARGS__);                 \
-    msg += errMsg;                                                             \
-    throw raft::exception(msg);                                                \
+#define THROW(fmt, ...)                                                                    \
+  do {                                                                                     \
+    std::string msg;                                                                       \
+    char errMsg[2048]; /* NOLINT */                                                        \
+    std::snprintf(                                                                         \
+      errMsg, sizeof(errMsg), "exception occured! file=%s line=%d: ", __FILE__, __LINE__); \
+    msg += errMsg;                                                                         \
+    std::snprintf(errMsg, sizeof(errMsg), fmt, ##__VA_ARGS__);                             \
+    msg += errMsg;                                                                         \
+    throw raft::exception(msg);                                                            \
   } while (0)
 
 // FIXME: Need to be replaced with RAFT_EXPECTS
@@ -109,16 +110,15 @@ struct logic_error : public raft::exception {
     if (!(check)) THROW(fmt, ##__VA_ARGS__); \
   } while (0)
 
-#define SET_ERROR_MSG(msg, location_prefix, fmt, ...)                      \
-  do {                                                                     \
-    char err_msg[2048]; /* NOLINT */                                       \
-    std::snprintf(err_msg, sizeof(err_msg), location_prefix);              \
-    msg += err_msg;                                                        \
-    std::snprintf(err_msg, sizeof(err_msg), "file=%s line=%d: ", __FILE__, \
-                  __LINE__);                                               \
-    msg += err_msg;                                                        \
-    std::snprintf(err_msg, sizeof(err_msg), fmt, ##__VA_ARGS__);           \
-    msg += err_msg;                                                        \
+#define SET_ERROR_MSG(msg, location_prefix, fmt, ...)                                 \
+  do {                                                                                \
+    char err_msg[2048]; /* NOLINT */                                                  \
+    std::snprintf(err_msg, sizeof(err_msg), location_prefix);                         \
+    msg += err_msg;                                                                   \
+    std::snprintf(err_msg, sizeof(err_msg), "file=%s line=%d: ", __FILE__, __LINE__); \
+    msg += err_msg;                                                                   \
+    std::snprintf(err_msg, sizeof(err_msg), fmt, ##__VA_ARGS__);                      \
+    msg += err_msg;                                                                   \
   } while (0)
 
 /**
diff --git a/cpp/include/raft/handle.hpp b/cpp/include/raft/handle.hpp
index 794951ca9c..70fff1e210 100644
--- a/cpp/include/raft/handle.hpp
+++ b/cpp/include/raft/handle.hpp
@@ -61,34 +61,30 @@ class handle_t {
         int cur_dev = -1;
         CUDA_CHECK(cudaGetDevice(&cur_dev));
         return cur_dev;
-      }()) {
-    if (n_streams != 0) {
-      streams_ = std::make_unique<rmm::cuda_stream_pool>(n_streams);
-    }
+      }())
+  {
+    if (n_streams != 0) { streams_ = std::make_unique<rmm::cuda_stream_pool>(n_streams); }
     create_resources();
     thrust_policy_ = std::make_unique<rmm::exec_policy>(user_stream_);
   }
 
   /**
-   * @brief Construct a light handle copy from another 
+   * @brief Construct a light handle copy from another
    * user stream, cuda handles, comms and worker pool are not copied
-   * The user_stream of the returned handle is set to the specified stream 
+   * The user_stream of the returned handle is set to the specified stream
    * of the other handle worker pool
    * @param[in] other other handle for which to use streams
-   * @param[in] stream_id stream id in `other` worker streams 
+   * @param[in] stream_id stream id in `other` worker streams
    * to be set as user stream in the constructed handle
    * @param[in] n_streams number worker streams to be created
    */
-  handle_t(const handle_t& other, int stream_id,
-           int n_streams = kNumDefaultWorkerStreams)
-    : dev_id_(other.get_device()) {
-    RAFT_EXPECTS(
-      other.get_num_internal_streams() > 0,
-      "ERROR: the main handle must have at least one worker stream\n");
-    if (n_streams != 0) {
-      streams_ = std::make_unique<rmm::cuda_stream_pool>(n_streams);
-    }
-    prop_ = other.get_device_properties();
+  handle_t(const handle_t& other, int stream_id, int n_streams = kNumDefaultWorkerStreams)
+    : dev_id_(other.get_device())
+  {
+    RAFT_EXPECTS(other.get_num_internal_streams() > 0,
+                 "ERROR: the main handle must have at least one worker stream\n");
+    if (n_streams != 0) { streams_ = std::make_unique<rmm::cuda_stream_pool>(n_streams); }
+    prop_                    = other.get_device_properties();
     device_prop_initialized_ = true;
     create_resources();
     set_stream(other.get_internal_stream(stream_id));
@@ -102,11 +98,10 @@ class handle_t {
 
   void set_stream(cudaStream_t stream) { user_stream_ = stream; }
   cudaStream_t get_stream() const { return user_stream_; }
-  rmm::cuda_stream_view get_stream_view() const {
-    return rmm::cuda_stream_view(user_stream_);
-  }
+  rmm::cuda_stream_view get_stream_view() const { return rmm::cuda_stream_view(user_stream_); }
 
-  cublasHandle_t get_cublas_handle() const {
+  cublasHandle_t get_cublas_handle() const
+  {
     std::lock_guard<std::mutex> _(mutex_);
     if (!cublas_initialized_) {
       CUBLAS_CHECK(cublasCreate(&cublas_handle_));
@@ -115,7 +110,8 @@ class handle_t {
     return cublas_handle_;
   }
 
-  cusolverDnHandle_t get_cusolver_dn_handle() const {
+  cusolverDnHandle_t get_cusolver_dn_handle() const
+  {
     std::lock_guard<std::mutex> _(mutex_);
     if (!cusolver_dn_initialized_) {
       CUSOLVER_CHECK(cusolverDnCreate(&cusolver_dn_handle_));
@@ -124,7 +120,8 @@ class handle_t {
     return cusolver_dn_handle_;
   }
 
-  cusolverSpHandle_t get_cusolver_sp_handle() const {
+  cusolverSpHandle_t get_cusolver_sp_handle() const
+  {
     std::lock_guard<std::mutex> _(mutex_);
     if (!cusolver_sp_initialized_) {
       CUSOLVER_CHECK(cusolverSpCreate(&cusolver_sp_handle_));
@@ -133,7 +130,8 @@ class handle_t {
     return cusolver_sp_handle_;
   }
 
-  cusparseHandle_t get_cusparse_handle() const {
+  cusparseHandle_t get_cusparse_handle() const
+  {
     std::lock_guard<std::mutex> _(mutex_);
     if (!cusparse_initialized_) {
       CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_));
@@ -145,25 +143,27 @@ class handle_t {
   rmm::exec_policy& get_thrust_policy() const { return *thrust_policy_; }
 
   // legacy compatibility for cuML
-  cudaStream_t get_internal_stream(int sid) const {
-    RAFT_EXPECTS(
-      streams_.get() != nullptr,
-      "ERROR: rmm::cuda_stream_pool was not initialized with a non-zero value");
+  cudaStream_t get_internal_stream(int sid) const
+  {
+    RAFT_EXPECTS(streams_.get() != nullptr,
+                 "ERROR: rmm::cuda_stream_pool was not initialized with a non-zero value");
     return streams_->get_stream(sid).value();
   }
   // new accessor return rmm::cuda_stream_view
-  rmm::cuda_stream_view get_internal_stream_view(int sid) const {
-    RAFT_EXPECTS(
-      streams_.get() != nullptr,
-      "ERROR: rmm::cuda_stream_pool was not initialized with a non-zero value");
+  rmm::cuda_stream_view get_internal_stream_view(int sid) const
+  {
+    RAFT_EXPECTS(streams_.get() != nullptr,
+                 "ERROR: rmm::cuda_stream_pool was not initialized with a non-zero value");
     return streams_->get_stream(sid);
   }
 
-  int get_num_internal_streams() const {
+  int get_num_internal_streams() const
+  {
     return streams_.get() != nullptr ? streams_->get_pool_size() : 0;
   }
 
-  std::vector<cudaStream_t> get_internal_streams() const {
+  std::vector<cudaStream_t> get_internal_streams() const
+  {
     std::vector<cudaStream_t> int_streams_vec;
     for (int i = 0; i < get_num_internal_streams(); i++) {
       int_streams_vec.push_back(get_internal_stream(i));
@@ -171,49 +171,51 @@ class handle_t {
     return int_streams_vec;
   }
 
-  void wait_on_user_stream() const {
+  void wait_on_user_stream() const
+  {
     CUDA_CHECK(cudaEventRecord(event_, user_stream_));
     for (int i = 0; i < get_num_internal_streams(); i++) {
       CUDA_CHECK(cudaStreamWaitEvent(get_internal_stream(i), event_, 0));
     }
   }
 
-  void wait_on_internal_streams() const {
+  void wait_on_internal_streams() const
+  {
     for (int i = 0; i < get_num_internal_streams(); i++) {
       CUDA_CHECK(cudaEventRecord(event_, get_internal_stream(i)));
       CUDA_CHECK(cudaStreamWaitEvent(user_stream_, event_, 0));
     }
   }
 
-  void set_comms(std::shared_ptr<comms::comms_t> communicator) {
-    communicator_ = communicator;
-  }
+  void set_comms(std::shared_ptr<comms::comms_t> communicator) { communicator_ = communicator; }
 
-  const comms::comms_t& get_comms() const {
-    RAFT_EXPECTS(this->comms_initialized(),
-                 "ERROR: Communicator was not initialized\n");
+  const comms::comms_t& get_comms() const
+  {
+    RAFT_EXPECTS(this->comms_initialized(), "ERROR: Communicator was not initialized\n");
     return *communicator_;
   }
 
-  void set_subcomm(std::string key, std::shared_ptr<comms::comms_t> subcomm) {
+  void set_subcomm(std::string key, std::shared_ptr<comms::comms_t> subcomm)
+  {
     subcomms_[key] = subcomm;
   }
 
-  const comms::comms_t& get_subcomm(std::string key) const {
-    RAFT_EXPECTS(subcomms_.find(key) != subcomms_.end(),
-                 "%s was not found in subcommunicators.", key.c_str());
+  const comms::comms_t& get_subcomm(std::string key) const
+  {
+    RAFT_EXPECTS(
+      subcomms_.find(key) != subcomms_.end(), "%s was not found in subcommunicators.", key.c_str());
 
     auto subcomm = subcomms_.at(key);
 
-    RAFT_EXPECTS(nullptr != subcomm.get(),
-                 "ERROR: Subcommunicator was not initialized");
+    RAFT_EXPECTS(nullptr != subcomm.get(), "ERROR: Subcommunicator was not initialized");
 
     return *subcomm;
   }
 
   bool comms_initialized() const { return (nullptr != communicator_.get()); }
 
-  const cudaDeviceProp& get_device_properties() const {
+  const cudaDeviceProp& get_device_properties() const
+  {
     std::lock_guard<std::mutex> _(mutex_);
     if (!device_prop_initialized_) {
       CUDA_CHECK(cudaGetDeviceProperties(&prop_, dev_id_));
@@ -243,29 +245,28 @@ class handle_t {
   mutable bool device_prop_initialized_{false};
   mutable std::mutex mutex_;
 
-  void create_resources() {
-    CUDA_CHECK(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
-  }
+  void create_resources() { CUDA_CHECK(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); }
 
-  void destroy_resources() {
+  void destroy_resources()
+  {
     ///@todo: enable *_NO_THROW variants once we have enabled logging
     if (cusparse_initialized_) {
-      //CUSPARSE_CHECK_NO_THROW(cusparseDestroy(cusparse_handle_));
+      // CUSPARSE_CHECK_NO_THROW(cusparseDestroy(cusparse_handle_));
       CUSPARSE_CHECK(cusparseDestroy(cusparse_handle_));
     }
     if (cusolver_dn_initialized_) {
-      //CUSOLVER_CHECK_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_));
+      // CUSOLVER_CHECK_NO_THROW(cusolverDnDestroy(cusolver_dn_handle_));
       CUSOLVER_CHECK(cusolverDnDestroy(cusolver_dn_handle_));
     }
     if (cusolver_sp_initialized_) {
-      //CUSOLVER_CHECK_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_));
+      // CUSOLVER_CHECK_NO_THROW(cusolverSpDestroy(cusolver_sp_handle_));
       CUSOLVER_CHECK(cusolverSpDestroy(cusolver_sp_handle_));
     }
     if (cublas_initialized_) {
-      //CUBLAS_CHECK_NO_THROW(cublasDestroy(cublas_handle_));
+      // CUBLAS_CHECK_NO_THROW(cublasDestroy(cublas_handle_));
       CUBLAS_CHECK(cublasDestroy(cublas_handle_));
     }
-    //CUDA_CHECK_NO_THROW(cudaEventDestroy(event_));
+    // CUDA_CHECK_NO_THROW(cudaEventDestroy(event_));
     CUDA_CHECK(cudaEventDestroy(event_));
   }
 };  // class handle_t
@@ -275,7 +276,8 @@ class handle_t {
  */
 class stream_syncer {
  public:
-  explicit stream_syncer(const handle_t& handle) : handle_(handle) {
+  explicit stream_syncer(const handle_t& handle) : handle_(handle)
+  {
     handle_.wait_on_user_stream();
   }
   ~stream_syncer() { handle_.wait_on_internal_streams(); }
diff --git a/cpp/include/raft/integer_utils.h b/cpp/include/raft/integer_utils.h
index a7cfb9287b..5fc56de14b 100644
--- a/cpp/include/raft/integer_utils.h
+++ b/cpp/include/raft/integer_utils.h
@@ -34,15 +34,13 @@ namespace raft {
  * `modulus` is positive.
  */
 template <typename S>
-inline S round_up_safe(S number_to_round, S modulus) {
+inline S round_up_safe(S number_to_round, S modulus)
+{
   auto remainder = number_to_round % modulus;
-  if (remainder == 0) {
-    return number_to_round;
-  }
+  if (remainder == 0) { return number_to_round; }
   auto rounded_up = number_to_round - remainder + modulus;
   if (rounded_up < number_to_round) {
-    throw std::invalid_argument(
-      "Attempt to round up beyond the type's maximum value");
+    throw std::invalid_argument("Attempt to round up beyond the type's maximum value");
   }
   return rounded_up;
 }
@@ -53,8 +51,9 @@ inline S round_up_safe(S number_to_round, S modulus) {
  * `modulus` is positive.
  */
 template <typename S>
-inline S round_down_safe(S number_to_round, S modulus) {
-  auto remainder = number_to_round % modulus;
+inline S round_down_safe(S number_to_round, S modulus)
+{
+  auto remainder    = number_to_round % modulus;
   auto rounded_down = number_to_round - remainder;
   return rounded_down;
 }
@@ -72,25 +71,28 @@ inline S round_down_safe(S number_to_round, S modulus) {
  * the result will be incorrect
  */
 template <typename S, typename T>
-constexpr inline S div_rounding_up_unsafe(const S& dividend,
-                                          const T& divisor) noexcept {
+constexpr inline S div_rounding_up_unsafe(const S& dividend, const T& divisor) noexcept
+{
   return (dividend + divisor - 1) / divisor;
 }
 
 namespace detail {
 template <typename I>
 constexpr inline I div_rounding_up_safe(std::integral_constant<bool, false>,
-                                        I dividend, I divisor) noexcept {
+                                        I dividend,
+                                        I divisor) noexcept
+{
   // TODO: This could probably be implemented faster
-  return (dividend > divisor)
-           ? 1 + div_rounding_up_unsafe(dividend - divisor, divisor)
-           : (dividend > 0);
+  return (dividend > divisor) ? 1 + div_rounding_up_unsafe(dividend - divisor, divisor)
+                              : (dividend > 0);
 }
 
 template <typename I>
 constexpr inline I div_rounding_up_safe(std::integral_constant<bool, true>,
-                                        I dividend, I divisor) noexcept {
-  auto quotient = dividend / divisor;
+                                        I dividend,
+                                        I divisor) noexcept
+{
+  auto quotient  = dividend / divisor;
   auto remainder = dividend % divisor;
   return quotient + (remainder != 0);
 }
@@ -110,16 +112,17 @@ constexpr inline I div_rounding_up_safe(std::integral_constant<bool, true>,
  * approach of using (dividend + divisor - 1) / divisor
  */
 template <typename I>
-constexpr inline std::enable_if_t<std::is_integral<I>::value, I>
-div_rounding_up_safe(I dividend, I divisor) noexcept {
-  using i_is_a_signed_type =
-    std::integral_constant<bool, std::is_signed<I>::value>;
+constexpr inline std::enable_if_t<std::is_integral<I>::value, I> div_rounding_up_safe(
+  I dividend, I divisor) noexcept
+{
+  using i_is_a_signed_type = std::integral_constant<bool, std::is_signed<I>::value>;
   return detail::div_rounding_up_safe(i_is_a_signed_type{}, dividend, divisor);
 }
 
 template <typename I>
-constexpr inline std::enable_if_t<std::is_integral<I>::value, bool>
-is_a_power_of_two(I val) noexcept {
+constexpr inline std::enable_if_t<std::is_integral<I>::value, bool> is_a_power_of_two(
+  I val) noexcept
+{
   return ((val - 1) & val) == 0;
 }
 
@@ -147,14 +150,14 @@ is_a_power_of_two(I val) noexcept {
  * @return Absolute value if value type is signed.
  */
 template <typename T>
-std::enable_if_t<std::is_signed<T>::value, T> constexpr inline absolute_value(
-  T value) {
+std::enable_if_t<std::is_signed<T>::value, T> constexpr inline absolute_value(T value)
+{
   return std::abs(value);
 }
 // Unsigned type just returns itself.
 template <typename T>
-std::enable_if_t<!std::is_signed<T>::value, T> constexpr inline absolute_value(
-  T value) {
+std::enable_if_t<!std::is_signed<T>::value, T> constexpr inline absolute_value(T value)
+{
   return value;
 }
 
diff --git a/cpp/include/raft/label/classlabels.cuh b/cpp/include/raft/label/classlabels.cuh
index b2302836bc..a2e29952d7 100644
--- a/cpp/include/raft/label/classlabels.cuh
+++ b/cpp/include/raft/label/classlabels.cuh
@@ -42,26 +42,25 @@ namespace label {
  * \param [in] stream cuda stream
  */
 template <typename value_t>
-int getUniquelabels(rmm::device_uvector<value_t> &unique, value_t *y, size_t n,
-                    cudaStream_t stream) {
+int getUniquelabels(rmm::device_uvector<value_t>& unique, value_t* y, size_t n, cudaStream_t stream)
+{
   rmm::device_scalar<int> d_num_selected(stream);
   rmm::device_uvector<value_t> workspace(n, stream);
-  size_t bytes = 0;
+  size_t bytes  = 0;
   size_t bytes2 = 0;
 
   // Query how much temporary storage we will need for cub operations
   // and allocate it
   cub::DeviceRadixSort::SortKeys(NULL, bytes, y, workspace.data(), n);
-  cub::DeviceSelect::Unique(NULL, bytes2, workspace.data(), workspace.data(),
-                            d_num_selected.data(), n);
+  cub::DeviceSelect::Unique(
+    NULL, bytes2, workspace.data(), workspace.data(), d_num_selected.data(), n);
   bytes = max(bytes, bytes2);
   rmm::device_uvector<char> cub_storage(bytes, stream);
 
   // Select Unique classes
-  cub::DeviceRadixSort::SortKeys(cub_storage.data(), bytes, y, workspace.data(),
-                                 n);
-  cub::DeviceSelect::Unique(cub_storage.data(), bytes, workspace.data(),
-                            workspace.data(), d_num_selected.data(), n);
+  cub::DeviceRadixSort::SortKeys(cub_storage.data(), bytes, y, workspace.data(), n);
+  cub::DeviceSelect::Unique(
+    cub_storage.data(), bytes, workspace.data(), workspace.data(), d_num_selected.data(), n);
 
   int n_unique = d_num_selected.value(stream);
   // Copy unique classes to output
@@ -90,16 +89,17 @@ int getUniquelabels(rmm::device_uvector<value_t> &unique, value_t *y, size_t n,
  * \param [in] stream cuda stream
  */
 template <typename value_t>
-void getOvrlabels(value_t *y, int n, value_t *y_unique, int n_classes,
-                  value_t *y_out, int idx, cudaStream_t stream) {
+void getOvrlabels(
+  value_t* y, int n, value_t* y_unique, int n_classes, value_t* y_out, int idx, cudaStream_t stream)
+{
   ASSERT(idx < n_classes,
          "Parameter idx should not be larger than the number "
          "of classes");
   raft::linalg::unaryOp(
-    y_out, y, n,
-    [idx, y_unique] __device__(value_t y) {
-      return y == y_unique[idx] ? +1 : -1;
-    },
+    y_out,
+    y,
+    n,
+    [idx, y_unique] __device__(value_t y) { return y == y_unique[idx] ? +1 : -1; },
     stream);
   CUDA_CHECK(cudaPeekAtLastError());
 }
@@ -108,9 +108,14 @@ void getOvrlabels(value_t *y, int n, value_t *y_unique, int n_classes,
 // +/-1, return array with the new class labels and corresponding indices.
 
 template <typename Type, int TPB_X, typename Lambda>
-__global__ void map_label_kernel(Type *map_ids, size_t N_labels, Type *in,
-                                 Type *out, size_t N, Lambda filter_op,
-                                 bool zero_based = false) {
+__global__ void map_label_kernel(Type* map_ids,
+                                 size_t N_labels,
+                                 Type* in,
+                                 Type* out,
+                                 size_t N,
+                                 Lambda filter_op,
+                                 bool zero_based = false)
+{
   int tid = threadIdx.x + blockIdx.x * TPB_X;
   if (tid < N) {
     if (!filter_op(in[tid])) {
@@ -125,27 +130,28 @@ __global__ void map_label_kernel(Type *map_ids, size_t N_labels, Type *in,
 }
 
 /**
-   * Maps an input array containing a series of numbers into a new array
-   * where numbers have been mapped to a monotonically increasing set
-   * of labels. This can be useful in machine learning algorithms, for instance,
-   * where a given set of labels is not taken from a monotonically increasing
-   * set. This can happen if they are filtered or if only a subset of the
-   * total labels are used in a dataset. This is also useful in graph algorithms
-   * where a set of vertices need to be labeled in a monotonically increasing
-   * order.
-   * @tparam Type the numeric type of the input and output arrays
-   * @tparam Lambda the type of an optional filter function, which determines
-   * which items in the array to map.
-   * @param out the output monotonic array
-   * @param in input label array
-   * @param N number of elements in the input array
-   * @param stream cuda stream to use
-   * @param filter_op an optional function for specifying which values
-   * should have monotonically increasing labels applied to them.
-   */
+ * Maps an input array containing a series of numbers into a new array
+ * where numbers have been mapped to a monotonically increasing set
+ * of labels. This can be useful in machine learning algorithms, for instance,
+ * where a given set of labels is not taken from a monotonically increasing
+ * set. This can happen if they are filtered or if only a subset of the
+ * total labels are used in a dataset. This is also useful in graph algorithms
+ * where a set of vertices need to be labeled in a monotonically increasing
+ * order.
+ * @tparam Type the numeric type of the input and output arrays
+ * @tparam Lambda the type of an optional filter function, which determines
+ * which items in the array to map.
+ * @param out the output monotonic array
+ * @param in input label array
+ * @param N number of elements in the input array
+ * @param stream cuda stream to use
+ * @param filter_op an optional function for specifying which values
+ * should have monotonically increasing labels applied to them.
+ */
 template <typename Type, typename Lambda>
-void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream,
-                    Lambda filter_op, bool zero_based = false) {
+void make_monotonic(
+  Type* out, Type* in, size_t N, cudaStream_t stream, Lambda filter_op, bool zero_based = false)
+{
   static const size_t TPB_X = 256;
 
   dim3 blocks(raft::ceildiv(N, TPB_X));
@@ -159,25 +165,25 @@ void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream,
 }
 
 /**
-   * Maps an input array containing a series of numbers into a new array
-   * where numbers have been mapped to a monotonically increasing set
-   * of labels. This can be useful in machine learning algorithms, for instance,
-   * where a given set of labels is not taken from a monotonically increasing
-   * set. This can happen if they are filtered or if only a subset of the
-   * total labels are used in a dataset. This is also useful in graph algorithms
-   * where a set of vertices need to be labeled in a monotonically increasing
-   * order.
-   * @tparam Type the numeric type of the input and output arrays
-   * @tparam Lambda the type of an optional filter function, which determines
-   * which items in the array to map.
-   * @param out output label array with labels assigned monotonically
-   * @param in input label array
-   * @param N number of elements in the input array
-   * @param stream cuda stream to use
-   */
+ * Maps an input array containing a series of numbers into a new array
+ * where numbers have been mapped to a monotonically increasing set
+ * of labels. This can be useful in machine learning algorithms, for instance,
+ * where a given set of labels is not taken from a monotonically increasing
+ * set. This can happen if they are filtered or if only a subset of the
+ * total labels are used in a dataset. This is also useful in graph algorithms
+ * where a set of vertices need to be labeled in a monotonically increasing
+ * order.
+ * @tparam Type the numeric type of the input and output arrays
+ * @tparam Lambda the type of an optional filter function, which determines
+ * which items in the array to map.
+ * @param out output label array with labels assigned monotonically
+ * @param in input label array
+ * @param N number of elements in the input array
+ * @param stream cuda stream to use
+ */
 template <typename Type>
-void make_monotonic(Type *out, Type *in, size_t N, cudaStream_t stream,
-                    bool zero_based = false) {
+void make_monotonic(Type* out, Type* in, size_t N, cudaStream_t stream, bool zero_based = false)
+{
   make_monotonic<Type>(
     out, in, N, stream, [] __device__(Type val) { return false; }, zero_based);
 }
diff --git a/cpp/include/raft/label/merge_labels.cuh b/cpp/include/raft/label/merge_labels.cuh
index bed74581a2..1ee0659b0d 100644
--- a/cpp/include/raft/label/merge_labels.cuh
+++ b/cpp/include/raft/label/merge_labels.cuh
@@ -35,8 +35,10 @@ __global__ void __launch_bounds__(TPB_X)
   propagate_label_kernel(const value_idx* __restrict__ labels_a,
                          const value_idx* __restrict__ labels_b,
                          value_idx* __restrict__ R,
-                         const bool* __restrict__ mask, bool* __restrict__ m,
-                         value_idx N) {
+                         const bool* __restrict__ mask,
+                         bool* __restrict__ m,
+                         value_idx N)
+{
   value_idx tid = threadIdx.x + blockIdx.x * TPB_X;
   if (tid < N) {
     if (__ldg((char*)mask + tid)) {
@@ -65,15 +67,17 @@ template <typename value_idx, int TPB_X = 256>
 __global__ void __launch_bounds__(TPB_X)
   reassign_label_kernel(value_idx* __restrict__ labels_a,
                         const value_idx* __restrict__ labels_b,
-                        const value_idx* __restrict__ R, value_idx N,
-                        value_idx MAX_LABEL) {
+                        const value_idx* __restrict__ R,
+                        value_idx N,
+                        value_idx MAX_LABEL)
+{
   value_idx tid = threadIdx.x + blockIdx.x * TPB_X;
   if (tid < N) {
     // Note: labels are from 1 to N
-    value_idx la = labels_a[tid];
-    value_idx lb = __ldg(labels_b + tid);
-    value_idx ra = (la == MAX_LABEL) ? MAX_LABEL : __ldg(R + (la - 1)) + 1;
-    value_idx rb = (lb == MAX_LABEL) ? MAX_LABEL : __ldg(R + (lb - 1)) + 1;
+    value_idx la  = labels_a[tid];
+    value_idx lb  = __ldg(labels_b + tid);
+    value_idx ra  = (la == MAX_LABEL) ? MAX_LABEL : __ldg(R + (la - 1)) + 1;
+    value_idx rb  = (lb == MAX_LABEL) ? MAX_LABEL : __ldg(R + (lb - 1)) + 1;
     labels_a[tid] = min(ra, rb);
   }
 }
@@ -108,9 +112,14 @@ __global__ void __launch_bounds__(TPB_X)
  * @param[in]    stream      CUDA stream
  */
 template <typename value_idx = int, int TPB_X = 256>
-void merge_labels(value_idx* labels_a, const value_idx* labels_b,
-                  const bool* mask, value_idx* R, bool* m, value_idx N,
-                  cudaStream_t stream) {
+void merge_labels(value_idx* labels_a,
+                  const value_idx* labels_b,
+                  const bool* mask,
+                  value_idx* R,
+                  bool* m,
+                  value_idx N,
+                  cudaStream_t stream)
+{
   dim3 blocks(raft::ceildiv(N, value_idx(TPB_X)));
   dim3 threads(TPB_X);
   value_idx MAX_LABEL = std::numeric_limits<value_idx>::max();
diff --git a/cpp/include/raft/lap/d_structs.h b/cpp/include/raft/lap/d_structs.h
index ed545b7198..e488dc528f 100644
--- a/cpp/include/raft/lap/d_structs.h
+++ b/cpp/include/raft/lap/d_structs.h
@@ -26,18 +26,18 @@
 
 template <typename vertex_t, typename weight_t>
 struct Vertices {
-  vertex_t *row_assignments;
-  vertex_t *col_assignments;
-  int *row_covers;
-  int *col_covers;
-  weight_t *row_duals;
-  weight_t *col_duals;
-  weight_t *col_slacks;
+  vertex_t* row_assignments;
+  vertex_t* col_assignments;
+  int* row_covers;
+  int* col_covers;
+  weight_t* row_duals;
+  weight_t* col_duals;
+  weight_t* col_slacks;
 };
 
 template <typename vertex_t>
 struct VertexData {
-  vertex_t *parents;
-  vertex_t *children;
-  int *is_visited;
+  vertex_t* parents;
+  vertex_t* children;
+  int* is_visited;
 };
diff --git a/cpp/include/raft/lap/lap.cuh b/cpp/include/raft/lap/lap.cuh
index f64afb3549..42b898ebff 100644
--- a/cpp/include/raft/lap/lap.cuh
+++ b/cpp/include/raft/lap/lap.cuh
@@ -39,12 +39,12 @@ class LinearAssignmentProblem {
   vertex_t batchsize_;
   weight_t epsilon_;
 
-  weight_t const *d_costs_;
+  weight_t const* d_costs_;
 
   Vertices<vertex_t, weight_t> d_vertices_dev;
   VertexData<vertex_t> d_row_data_dev, d_col_data_dev;
 
-  raft::handle_t const &handle_;
+  raft::handle_t const& handle_;
   rmm::device_uvector<int> row_covers_v;
   rmm::device_uvector<int> col_covers_v;
   rmm::device_uvector<weight_t> row_duals_v;
@@ -60,8 +60,10 @@ class LinearAssignmentProblem {
   rmm::device_uvector<weight_t> obj_val_dual_v;
 
  public:
-  LinearAssignmentProblem(raft::handle_t const &handle, vertex_t size,
-                          vertex_t batchsize, weight_t epsilon)
+  LinearAssignmentProblem(raft::handle_t const& handle,
+                          vertex_t size,
+                          vertex_t batchsize,
+                          weight_t epsilon)
     : handle_(handle),
       size_(size),
       batchsize_(batchsize),
@@ -79,11 +81,13 @@ class LinearAssignmentProblem {
       row_children_v(0, handle_.get_stream()),
       col_children_v(0, handle_.get_stream()),
       obj_val_primal_v(0, handle_.get_stream()),
-      obj_val_dual_v(0, handle_.get_stream()) {}
+      obj_val_dual_v(0, handle_.get_stream())
+  {
+  }
 
   // Executes Hungarian algorithm on the input cost matrix.
-  void solve(weight_t const *d_cost_matrix, vertex_t *d_row_assignment,
-             vertex_t *d_col_assignment) {
+  void solve(weight_t const* d_cost_matrix, vertex_t* d_row_assignment, vertex_t* d_col_assignment)
+  {
     initializeDevice();
 
     d_vertices_dev.row_assignments = d_row_assignment;
@@ -95,27 +99,13 @@ class LinearAssignmentProblem {
 
     while (step != 100) {
       switch (step) {
-        case 0:
-          step = hungarianStep0();
-          break;
-        case 1:
-          step = hungarianStep1();
-          break;
-        case 2:
-          step = hungarianStep2();
-          break;
-        case 3:
-          step = hungarianStep3();
-          break;
-        case 4:
-          step = hungarianStep4();
-          break;
-        case 5:
-          step = hungarianStep5();
-          break;
-        case 6:
-          step = hungarianStep6();
-          break;
+        case 0: step = hungarianStep0(); break;
+        case 1: step = hungarianStep1(); break;
+        case 2: step = hungarianStep2(); break;
+        case 3: step = hungarianStep3(); break;
+        case 4: step = hungarianStep4(); break;
+        case 5: step = hungarianStep5(); break;
+        case 6: step = hungarianStep6(); break;
       }
     }
 
@@ -123,36 +113,39 @@ class LinearAssignmentProblem {
   }
 
   // Function for getting optimal row dual vector for subproblem spId.
-  std::pair<const weight_t *, vertex_t> getRowDualVector(int spId) const {
+  std::pair<const weight_t*, vertex_t> getRowDualVector(int spId) const
+  {
     return std::make_pair(row_duals_v.data() + spId * size_, size_);
   }
 
   // Function for getting optimal col dual vector for subproblem spId.
-  std::pair<const weight_t *, vertex_t> getColDualVector(int spId) {
+  std::pair<const weight_t*, vertex_t> getColDualVector(int spId)
+  {
     return std::make_pair(col_duals_v.data() + spId * size_, size_);
   }
 
   // Function for getting optimal primal objective value for subproblem spId.
-  weight_t getPrimalObjectiveValue(int spId) {
+  weight_t getPrimalObjectiveValue(int spId)
+  {
     weight_t result;
-    raft::update_host(&result, obj_val_primal_v.data() + spId, 1,
-                      handle_.get_stream());
+    raft::update_host(&result, obj_val_primal_v.data() + spId, 1, handle_.get_stream());
     CHECK_CUDA(handle_.get_stream());
     return result;
   }
 
   // Function for getting optimal dual objective value for subproblem spId.
-  weight_t getDualObjectiveValue(int spId) {
+  weight_t getDualObjectiveValue(int spId)
+  {
     weight_t result;
-    raft::update_host(&result, obj_val_dual_v.data() + spId, 1,
-                      handle_.get_stream());
+    raft::update_host(&result, obj_val_dual_v.data() + spId, 1, handle_.get_stream());
     CHECK_CUDA(handle_.get_stream());
     return result;
   }
 
  private:
   // Helper function for initializing global variables and arrays on a single host.
-  void initializeDevice() {
+  void initializeDevice()
+  {
     cudaStream_t stream = handle_.get_stream();
     row_covers_v.resize(batchsize_ * size_, stream);
     col_covers_v.resize(batchsize_ * size_, stream);
@@ -171,39 +164,36 @@ class LinearAssignmentProblem {
     d_vertices_dev.row_covers = row_covers_v.data();
     d_vertices_dev.col_covers = col_covers_v.data();
 
-    d_vertices_dev.row_duals = row_duals_v.data();
-    d_vertices_dev.col_duals = col_duals_v.data();
+    d_vertices_dev.row_duals  = row_duals_v.data();
+    d_vertices_dev.col_duals  = col_duals_v.data();
     d_vertices_dev.col_slacks = col_slacks_v.data();
 
     d_row_data_dev.is_visited = row_is_visited_v.data();
     d_col_data_dev.is_visited = col_is_visited_v.data();
-    d_row_data_dev.parents = row_parents_v.data();
-    d_row_data_dev.children = row_children_v.data();
-    d_col_data_dev.parents = col_parents_v.data();
-    d_col_data_dev.children = col_children_v.data();
-
-    thrust::fill(thrust::device, row_covers_v.begin(), row_covers_v.end(),
-                 int{0});
-    thrust::fill(thrust::device, col_covers_v.begin(), col_covers_v.end(),
-                 int{0});
-    thrust::fill(thrust::device, row_duals_v.begin(), row_duals_v.end(),
-                 weight_t{0});
-    thrust::fill(thrust::device, col_duals_v.begin(), col_duals_v.end(),
-                 weight_t{0});
+    d_row_data_dev.parents    = row_parents_v.data();
+    d_row_data_dev.children   = row_children_v.data();
+    d_col_data_dev.parents    = col_parents_v.data();
+    d_col_data_dev.children   = col_children_v.data();
+
+    thrust::fill(thrust::device, row_covers_v.begin(), row_covers_v.end(), int{0});
+    thrust::fill(thrust::device, col_covers_v.begin(), col_covers_v.end(), int{0});
+    thrust::fill(thrust::device, row_duals_v.begin(), row_duals_v.end(), weight_t{0});
+    thrust::fill(thrust::device, col_duals_v.begin(), col_duals_v.end(), weight_t{0});
   }
 
   // Function for calculating initial zeros by subtracting row and column minima from each element.
-  int hungarianStep0() {
-    detail::initialReduction(handle_, d_costs_, d_vertices_dev, batchsize_,
-                             size_);
+  int hungarianStep0()
+  {
+    detail::initialReduction(handle_, d_costs_, d_vertices_dev, batchsize_, size_);
 
     return 1;
   }
 
   // Function for calculating initial zeros by subtracting row and column minima from each element.
-  int hungarianStep1() {
-    detail::computeInitialAssignments(handle_, d_costs_, d_vertices_dev,
-                                      batchsize_, size_, epsilon_);
+  int hungarianStep1()
+  {
+    detail::computeInitialAssignments(
+      handle_, d_costs_, d_vertices_dev, batchsize_, size_, epsilon_);
 
     int next = 2;
 
@@ -219,10 +209,10 @@ class LinearAssignmentProblem {
   }
 
   // Function for checking optimality and constructing predicates and covers.
-  int hungarianStep2() {
-    int cover_count =
-      detail::computeRowCovers(handle_, d_vertices_dev, d_row_data_dev,
-                               d_col_data_dev, batchsize_, size_);
+  int hungarianStep2()
+  {
+    int cover_count = detail::computeRowCovers(
+      handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_);
 
     int next = (cover_count == batchsize_ * size_) ? 6 : 3;
 
@@ -230,7 +220,8 @@ class LinearAssignmentProblem {
   }
 
   // Function for building alternating tree rooted at unassigned rows.
-  int hungarianStep3() {
+  int hungarianStep3()
+  {
     int next;
 
     rmm::device_scalar<bool> flag_v(handle_.get_stream());
@@ -238,8 +229,14 @@ class LinearAssignmentProblem {
     bool h_flag = false;
     flag_v.set_value_async(h_flag, handle_.get_stream());
 
-    detail::executeZeroCover(handle_, d_costs_, d_vertices_dev, d_row_data_dev,
-                             d_col_data_dev, flag_v.data(), batchsize_, size_,
+    detail::executeZeroCover(handle_,
+                             d_costs_,
+                             d_vertices_dev,
+                             d_row_data_dev,
+                             d_col_data_dev,
+                             flag_v.data(),
+                             batchsize_,
+                             size_,
                              epsilon_);
 
     h_flag = flag_v.value(handle_.get_stream());
@@ -250,31 +247,36 @@ class LinearAssignmentProblem {
   }
 
   // Function for augmenting the solution along multiple node-disjoint alternating trees.
-  int hungarianStep4() {
-    detail::reversePass(handle_, d_row_data_dev, d_col_data_dev, batchsize_,
-                        size_);
+  int hungarianStep4()
+  {
+    detail::reversePass(handle_, d_row_data_dev, d_col_data_dev, batchsize_, size_);
 
-    detail::augmentationPass(handle_, d_vertices_dev, d_row_data_dev,
-                             d_col_data_dev, batchsize_, size_);
+    detail::augmentationPass(
+      handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_);
 
     return 2;
   }
 
   // Function for updating dual solution to introduce new zero-cost arcs.
-  int hungarianStep5() {
-    detail::dualUpdate(handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev,
-                       batchsize_, size_, epsilon_);
+  int hungarianStep5()
+  {
+    detail::dualUpdate(
+      handle_, d_vertices_dev, d_row_data_dev, d_col_data_dev, batchsize_, size_, epsilon_);
 
     return 3;
   }
 
   // Function for calculating primal and dual objective values at optimality.
-  int hungarianStep6() {
-    detail::calcObjValPrimal(handle_, obj_val_primal_v.data(), d_costs_,
-                             d_vertices_dev.row_assignments, batchsize_, size_);
+  int hungarianStep6()
+  {
+    detail::calcObjValPrimal(handle_,
+                             obj_val_primal_v.data(),
+                             d_costs_,
+                             d_vertices_dev.row_assignments,
+                             batchsize_,
+                             size_);
 
-    detail::calcObjValDual(handle_, obj_val_dual_v.data(), d_vertices_dev,
-                           batchsize_, size_);
+    detail::calcObjValDual(handle_, obj_val_dual_v.data(), d_vertices_dev, batchsize_, size_);
 
     return 100;
   }
diff --git a/cpp/include/raft/lap/lap_functions.cuh b/cpp/include/raft/lap/lap_functions.cuh
index 830940f0ec..ab4aa2df59 100644
--- a/cpp/include/raft/lap/lap_functions.cuh
+++ b/cpp/include/raft/lap/lap_functions.cuh
@@ -45,20 +45,26 @@ const int BLOCKDIMX{64};
 const int BLOCKDIMY{1};
 
 // Function for calculating grid and block dimensions from the given input size.
-inline void calculateLinearDims(dim3 &blocks_per_grid, dim3 &threads_per_block,
-                                int &total_blocks, int size) {
+inline void calculateLinearDims(dim3& blocks_per_grid,
+                                dim3& threads_per_block,
+                                int& total_blocks,
+                                int size)
+{
   threads_per_block.x = BLOCKDIMX * BLOCKDIMY;
 
   int value = size / threads_per_block.x;
   if (size % threads_per_block.x > 0) value++;
 
-  total_blocks = value;
+  total_blocks      = value;
   blocks_per_grid.x = value;
 }
 
 // Function for calculating grid and block dimensions from the given input size for square grid.
-inline void calculateSquareDims(dim3 &blocks_per_grid, dim3 &threads_per_block,
-                                int &total_blocks, int size) {
+inline void calculateSquareDims(dim3& blocks_per_grid,
+                                dim3& threads_per_block,
+                                int& total_blocks,
+                                int size)
+{
   threads_per_block.x = BLOCKDIMX;
   threads_per_block.y = BLOCKDIMY;
 
@@ -67,15 +73,16 @@ inline void calculateSquareDims(dim3 &blocks_per_grid, dim3 &threads_per_block,
   int valuex = (int)ceil((float)(sq_size) / BLOCKDIMX);
   int valuey = (int)ceil((float)(sq_size) / BLOCKDIMY);
 
-  total_blocks = valuex * valuey;
+  total_blocks      = valuex * valuey;
   blocks_per_grid.x = valuex;
   blocks_per_grid.y = valuey;
 }
 
-// Function for calculating grid and block dimensions from the given input size for rectangular grid.
-inline void calculateRectangularDims(dim3 &blocks_per_grid,
-                                     dim3 &threads_per_block, int &total_blocks,
-                                     int xsize, int ysize) {
+// Function for calculating grid and block dimensions from the given input size for rectangular
+// grid.
+inline void calculateRectangularDims(
+  dim3& blocks_per_grid, dim3& threads_per_block, int& total_blocks, int xsize, int ysize)
+{
   threads_per_block.x = BLOCKDIMX;
   threads_per_block.y = BLOCKDIMY;
 
@@ -85,16 +92,18 @@ inline void calculateRectangularDims(dim3 &blocks_per_grid,
   int valuey = ysize / threads_per_block.y;
   if (ysize % threads_per_block.y > 0) valuey++;
 
-  total_blocks = valuex * valuey;
+  total_blocks      = valuex * valuey;
   blocks_per_grid.x = valuex;
   blocks_per_grid.y = valuey;
 }
 
 template <typename vertex_t, typename weight_t>
-inline void initialReduction(raft::handle_t const &handle,
-                             weight_t const *d_costs,
-                             Vertices<vertex_t, weight_t> &d_vertices_dev,
-                             int SP, vertex_t N) {
+inline void initialReduction(raft::handle_t const& handle,
+                             weight_t const* d_costs,
+                             Vertices<vertex_t, weight_t>& d_vertices_dev,
+                             int SP,
+                             vertex_t N)
+{
   dim3 blocks_per_grid;
   dim3 threads_per_block;
   int total_blocks = 0;
@@ -102,24 +111,28 @@ inline void initialReduction(raft::handle_t const &handle,
   raft::lap::detail::calculateRectangularDims(
     blocks_per_grid, threads_per_block, total_blocks, N, SP);
 
-  kernel_rowReduction<<<blocks_per_grid, threads_per_block, 0,
-                        handle.get_stream()>>>(
-    d_costs, d_vertices_dev.row_duals, SP, N,
-    std::numeric_limits<weight_t>::max());
+  kernel_rowReduction<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
+    d_costs, d_vertices_dev.row_duals, SP, N, std::numeric_limits<weight_t>::max());
 
   CHECK_CUDA(handle.get_stream());
-  kernel_columnReduction<<<blocks_per_grid, threads_per_block, 0,
-                           handle.get_stream()>>>(
-    d_costs, d_vertices_dev.row_duals, d_vertices_dev.col_duals, SP, N,
+  kernel_columnReduction<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
+    d_costs,
+    d_vertices_dev.row_duals,
+    d_vertices_dev.col_duals,
+    SP,
+    N,
     std::numeric_limits<weight_t>::max());
   CHECK_CUDA(handle.get_stream());
 }
 
 template <typename vertex_t, typename weight_t>
-inline void computeInitialAssignments(raft::handle_t const &handle,
-                                      weight_t const *d_costs,
-                                      Vertices<vertex_t, weight_t> &d_vertices,
-                                      int SP, vertex_t N, weight_t epsilon) {
+inline void computeInitialAssignments(raft::handle_t const& handle,
+                                      weight_t const* d_costs,
+                                      Vertices<vertex_t, weight_t>& d_vertices,
+                                      int SP,
+                                      vertex_t N,
+                                      weight_t epsilon)
+{
   dim3 blocks_per_grid;
   dim3 threads_per_block;
   int total_blocks = 0;
@@ -137,21 +150,29 @@ inline void computeInitialAssignments(raft::handle_t const &handle,
   raft::lap::detail::calculateRectangularDims(
     blocks_per_grid, threads_per_block, total_blocks, N, SP);
 
-  kernel_computeInitialAssignments<<<blocks_per_grid, threads_per_block, 0,
-                                     handle.get_stream()>>>(
-    d_costs, d_vertices.row_duals, d_vertices.col_duals,
-    d_vertices.row_assignments, d_vertices.col_assignments, row_lock_v.data(),
-    col_lock_v.data(), SP, N, epsilon);
+  kernel_computeInitialAssignments<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
+    d_costs,
+    d_vertices.row_duals,
+    d_vertices.col_duals,
+    d_vertices.row_assignments,
+    d_vertices.col_assignments,
+    row_lock_v.data(),
+    col_lock_v.data(),
+    SP,
+    N,
+    epsilon);
   CHECK_CUDA(handle.get_stream());
 }
 
 // Function for finding row cover on individual devices.
 template <typename vertex_t, typename weight_t>
-inline int computeRowCovers(raft::handle_t const &handle,
-                            Vertices<vertex_t, weight_t> &d_vertices,
-                            VertexData<vertex_t> &d_row_data,
-                            VertexData<vertex_t> &d_col_data, int SP,
-                            vertex_t N) {
+inline int computeRowCovers(raft::handle_t const& handle,
+                            Vertices<vertex_t, weight_t>& d_vertices,
+                            VertexData<vertex_t>& d_row_data,
+                            VertexData<vertex_t>& d_col_data,
+                            int SP,
+                            vertex_t N)
+{
   dim3 blocks_per_grid;
   dim3 threads_per_block;
   int total_blocks = 0;
@@ -160,8 +181,7 @@ inline int computeRowCovers(raft::handle_t const &handle,
 
   thrust::fill_n(thrust::device, d_vertices.row_covers, size, int{0});
   thrust::fill_n(thrust::device, d_vertices.col_covers, size, int{0});
-  thrust::fill_n(thrust::device, d_vertices.col_slacks, size,
-                 std::numeric_limits<weight_t>::max());
+  thrust::fill_n(thrust::device, d_vertices.col_slacks, size, std::numeric_limits<weight_t>::max());
   thrust::fill_n(thrust::device, d_row_data.is_visited, size, DORMANT);
   thrust::fill_n(thrust::device, d_col_data.is_visited, size, DORMANT);
   thrust::fill_n(thrust::device, d_row_data.parents, size, vertex_t{-1});
@@ -171,25 +191,28 @@ inline int computeRowCovers(raft::handle_t const &handle,
 
   raft::lap::detail::calculateRectangularDims(
     blocks_per_grid, threads_per_block, total_blocks, N, SP);
-  kernel_computeRowCovers<<<blocks_per_grid, threads_per_block, 0,
-                            handle.get_stream()>>>(
-    d_vertices.row_assignments, d_vertices.row_covers, d_row_data.is_visited,
-    SP, N);
+  kernel_computeRowCovers<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
+    d_vertices.row_assignments, d_vertices.row_covers, d_row_data.is_visited, SP, N);
 
   CHECK_CUDA(handle.get_stream());
 
-  return thrust::reduce(thrust::device, d_vertices.row_covers,
-                        d_vertices.row_covers + size);
+  return thrust::reduce(thrust::device, d_vertices.row_covers, d_vertices.row_covers + size);
 }
 
 // Function for covering the zeros in uncovered rows and expanding the frontier.
 template <typename vertex_t, typename weight_t>
-inline void coverZeroAndExpand(
-  raft::handle_t const &handle, weight_t const *d_costs_dev,
-  vertex_t const *d_rows_csr_neighbors, vertex_t const *d_rows_csr_ptrs,
-  Vertices<vertex_t, weight_t> &d_vertices_dev,
-  VertexData<vertex_t> &d_row_data_dev, VertexData<vertex_t> &d_col_data_dev,
-  bool *d_flag, int SP, vertex_t N, weight_t epsilon) {
+inline void coverZeroAndExpand(raft::handle_t const& handle,
+                               weight_t const* d_costs_dev,
+                               vertex_t const* d_rows_csr_neighbors,
+                               vertex_t const* d_rows_csr_ptrs,
+                               Vertices<vertex_t, weight_t>& d_vertices_dev,
+                               VertexData<vertex_t>& d_row_data_dev,
+                               VertexData<vertex_t>& d_col_data_dev,
+                               bool* d_flag,
+                               int SP,
+                               vertex_t N,
+                               weight_t epsilon)
+{
   int total_blocks = 0;
   dim3 blocks_per_grid;
   dim3 threads_per_block;
@@ -197,20 +220,30 @@ inline void coverZeroAndExpand(
   raft::lap::detail::calculateRectangularDims(
     blocks_per_grid, threads_per_block, total_blocks, N, SP);
 
-  kernel_coverAndExpand<<<blocks_per_grid, threads_per_block, 0,
-                          handle.get_stream()>>>(
-    d_flag, d_rows_csr_ptrs, d_rows_csr_neighbors, d_costs_dev, d_vertices_dev,
-    d_row_data_dev, d_col_data_dev, SP, N, epsilon);
+  kernel_coverAndExpand<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
+    d_flag,
+    d_rows_csr_ptrs,
+    d_rows_csr_neighbors,
+    d_costs_dev,
+    d_vertices_dev,
+    d_row_data_dev,
+    d_col_data_dev,
+    SP,
+    N,
+    epsilon);
 }
 
 template <typename vertex_t, typename weight_t>
-inline vertex_t zeroCoverIteration(raft::handle_t const &handle,
-                                   weight_t const *d_costs_dev,
-                                   Vertices<vertex_t, weight_t> &d_vertices_dev,
-                                   VertexData<vertex_t> &d_row_data_dev,
-                                   VertexData<vertex_t> &d_col_data_dev,
-                                   bool *d_flag, int SP, vertex_t N,
-                                   weight_t epsilon) {
+inline vertex_t zeroCoverIteration(raft::handle_t const& handle,
+                                   weight_t const* d_costs_dev,
+                                   Vertices<vertex_t, weight_t>& d_vertices_dev,
+                                   VertexData<vertex_t>& d_row_data_dev,
+                                   VertexData<vertex_t>& d_col_data_dev,
+                                   bool* d_flag,
+                                   int SP,
+                                   vertex_t N,
+                                   weight_t epsilon)
+{
   vertex_t M;
 
   rmm::device_uvector<vertex_t> csr_ptrs_v(0, handle.get_stream());
@@ -235,65 +268,85 @@ inline vertex_t zeroCoverIteration(raft::handle_t const &handle,
       blocks_per_grid, threads_per_block, total_blocks, N, SP);
 
     // construct predicate matrix for edges.
-    kernel_rowPredicateConstructionCSR<<<blocks_per_grid, threads_per_block, 0,
+    kernel_rowPredicateConstructionCSR<<<blocks_per_grid,
+                                         threads_per_block,
+                                         0,
                                          handle.get_stream()>>>(
-      predicates_v.data(), addresses_v.data(), d_row_data_dev.is_visited, SP,
-      N);
+      predicates_v.data(), addresses_v.data(), d_row_data_dev.is_visited, SP, N);
     CHECK_CUDA(handle.get_stream());
 
     M = thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end());
-    thrust::exclusive_scan(thrust::device, addresses_v.begin(),
-                           addresses_v.end(), addresses_v.begin());
+    thrust::exclusive_scan(
+      thrust::device, addresses_v.begin(), addresses_v.end(), addresses_v.begin());
 
     if (M > 0) {
       csr_neighbors_v.resize(M, handle.get_stream());
 
-      kernel_rowScatterCSR<<<blocks_per_grid, threads_per_block, 0,
-                             handle.get_stream()>>>(
-        predicates_v.data(), addresses_v.data(), csr_neighbors_v.data(),
-        csr_ptrs_v.data(), M, SP, N);
+      kernel_rowScatterCSR<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
+        predicates_v.data(),
+        addresses_v.data(),
+        csr_neighbors_v.data(),
+        csr_ptrs_v.data(),
+        M,
+        SP,
+        N);
 
       CHECK_CUDA(handle.get_stream());
     }
   }
 
   if (M > 0) {
-    coverZeroAndExpand(handle, d_costs_dev, csr_neighbors_v.data(),
-                       csr_ptrs_v.data(), d_vertices_dev, d_row_data_dev,
-                       d_col_data_dev, d_flag, SP, N, epsilon);
+    coverZeroAndExpand(handle,
+                       d_costs_dev,
+                       csr_neighbors_v.data(),
+                       csr_ptrs_v.data(),
+                       d_vertices_dev,
+                       d_row_data_dev,
+                       d_col_data_dev,
+                       d_flag,
+                       SP,
+                       N,
+                       epsilon);
   }
 
   return M;
 }
 
-// Function for executing recursive zero cover. Returns the next step (Step 4 or Step 5) depending on the presence of uncovered zeros.
+// Function for executing recursive zero cover. Returns the next step (Step 4 or Step 5) depending
+// on the presence of uncovered zeros.
 template <typename vertex_t, typename weight_t>
-inline void executeZeroCover(raft::handle_t const &handle,
-                             weight_t const *d_costs_dev,
-                             Vertices<vertex_t, weight_t> &d_vertices_dev,
-                             VertexData<vertex_t> &d_row_data_dev,
-                             VertexData<vertex_t> &d_col_data_dev, bool *d_flag,
-                             int SP, vertex_t N, weight_t epsilon) {
+inline void executeZeroCover(raft::handle_t const& handle,
+                             weight_t const* d_costs_dev,
+                             Vertices<vertex_t, weight_t>& d_vertices_dev,
+                             VertexData<vertex_t>& d_row_data_dev,
+                             VertexData<vertex_t>& d_col_data_dev,
+                             bool* d_flag,
+                             int SP,
+                             vertex_t N,
+                             weight_t epsilon)
+{
   vertex_t M = 1;
   while (M > 0) {
-    M = zeroCoverIteration(handle, d_costs_dev, d_vertices_dev, d_row_data_dev,
-                           d_col_data_dev, d_flag, SP, N, epsilon);
+    M = zeroCoverIteration(
+      handle, d_costs_dev, d_vertices_dev, d_row_data_dev, d_col_data_dev, d_flag, SP, N, epsilon);
   }
 }
 
 // Function for executing reverse pass of the maximum matching.
 template <typename vertex_t>
-inline void reversePass(raft::handle_t const &handle,
-                        VertexData<vertex_t> &d_row_data_dev,
-                        VertexData<vertex_t> &d_col_data_dev, int SP, int N) {
+inline void reversePass(raft::handle_t const& handle,
+                        VertexData<vertex_t>& d_row_data_dev,
+                        VertexData<vertex_t>& d_col_data_dev,
+                        int SP,
+                        int N)
+{
   int total_blocks = 0;
   dim3 blocks_per_grid;
   dim3 threads_per_block;
 
   std::size_t size = SP * N;
 
-  raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block,
-                                         total_blocks, size);
+  raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, size);
 
   rmm::device_uvector<bool> predicates_v(size, handle.get_stream());
   rmm::device_uvector<vertex_t> addresses_v(size, handle.get_stream());
@@ -302,18 +355,19 @@ inline void reversePass(raft::handle_t const &handle,
   thrust::fill_n(thrust::device, addresses_v.data(), size, vertex_t{0});
 
   // compact the reverse pass row vertices.
-  kernel_augmentPredicateConstruction<<<blocks_per_grid, threads_per_block, 0,
+  kernel_augmentPredicateConstruction<<<blocks_per_grid,
+                                        threads_per_block,
+                                        0,
                                         handle.get_stream()>>>(
     predicates_v.data(), addresses_v.data(), d_col_data_dev.is_visited, size);
 
   CHECK_CUDA(handle.get_stream());
 
   // calculate total number of vertices.
-  std::size_t csr_size =
-    thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end());
+  std::size_t csr_size = thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end());
   // exclusive scan for calculating the scatter addresses.
-  thrust::exclusive_scan(thrust::device, addresses_v.begin(), addresses_v.end(),
-                         addresses_v.begin());
+  thrust::exclusive_scan(
+    thrust::device, addresses_v.begin(), addresses_v.end(), addresses_v.begin());
 
   if (csr_size > 0) {
     int total_blocks_1 = 0;
@@ -324,14 +378,12 @@ inline void reversePass(raft::handle_t const &handle,
 
     rmm::device_uvector<vertex_t> elements_v(csr_size, handle.get_stream());
 
-    kernel_augmentScatter<<<blocks_per_grid, threads_per_block, 0,
-                            handle.get_stream()>>>(
+    kernel_augmentScatter<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
       elements_v.data(), predicates_v.data(), addresses_v.data(), size);
 
     CHECK_CUDA(handle.get_stream());
 
-    kernel_reverseTraversal<<<blocks_per_grid_1, threads_per_block_1, 0,
-                              handle.get_stream()>>>(
+    kernel_reverseTraversal<<<blocks_per_grid_1, threads_per_block_1, 0, handle.get_stream()>>>(
       elements_v.data(), d_row_data_dev, d_col_data_dev, csr_size);
     CHECK_CUDA(handle.get_stream());
   }
@@ -339,16 +391,17 @@ inline void reversePass(raft::handle_t const &handle,
 
 // Function for executing augmentation pass of the maximum matching.
 template <typename vertex_t, typename weight_t>
-inline void augmentationPass(raft::handle_t const &handle,
-                             Vertices<vertex_t, weight_t> &d_vertices_dev,
-                             VertexData<vertex_t> &d_row_data_dev,
-                             VertexData<vertex_t> &d_col_data_dev, int SP,
-                             int N) {
+inline void augmentationPass(raft::handle_t const& handle,
+                             Vertices<vertex_t, weight_t>& d_vertices_dev,
+                             VertexData<vertex_t>& d_row_data_dev,
+                             VertexData<vertex_t>& d_col_data_dev,
+                             int SP,
+                             int N)
+{
   int total_blocks = 0;
   dim3 blocks_per_grid;
   dim3 threads_per_block;
-  raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block,
-                                         total_blocks, SP * N);
+  raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP * N);
 
   rmm::device_uvector<bool> predicates_v(SP * N, handle.get_stream());
   rmm::device_uvector<vertex_t> addresses_v(SP * N, handle.get_stream());
@@ -357,7 +410,9 @@ inline void augmentationPass(raft::handle_t const &handle,
   thrust::fill_n(thrust::device, addresses_v.data(), SP * N, vertex_t{0});
 
   // compact the reverse pass row vertices.
-  kernel_augmentPredicateConstruction<<<blocks_per_grid, threads_per_block, 0,
+  kernel_augmentPredicateConstruction<<<blocks_per_grid,
+                                        threads_per_block,
+                                        0,
                                         handle.get_stream()>>>(
     predicates_v.data(), addresses_v.data(), d_row_data_dev.is_visited, SP * N);
 
@@ -368,8 +423,8 @@ inline void augmentationPass(raft::handle_t const &handle,
   vertex_t row_ids_csr_size =
     thrust::reduce(thrust::device, addresses_v.begin(), addresses_v.end());
   // exclusive scan for calculating the scatter addresses.
-  thrust::exclusive_scan(thrust::device, addresses_v.begin(), addresses_v.end(),
-                         addresses_v.begin());
+  thrust::exclusive_scan(
+    thrust::device, addresses_v.begin(), addresses_v.end(), addresses_v.begin());
 
   if (row_ids_csr_size > 0) {
     int total_blocks_1 = 0;
@@ -378,20 +433,20 @@ inline void augmentationPass(raft::handle_t const &handle,
     raft::lap::detail::calculateLinearDims(
       blocks_per_grid_1, threads_per_block_1, total_blocks_1, row_ids_csr_size);
 
-    rmm::device_uvector<vertex_t> elements_v(row_ids_csr_size,
-                                             handle.get_stream());
+    rmm::device_uvector<vertex_t> elements_v(row_ids_csr_size, handle.get_stream());
 
-    kernel_augmentScatter<<<blocks_per_grid, threads_per_block, 0,
-                            handle.get_stream()>>>(
-      elements_v.data(), predicates_v.data(), addresses_v.data(),
-      vertex_t{SP * N});
+    kernel_augmentScatter<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
+      elements_v.data(), predicates_v.data(), addresses_v.data(), vertex_t{SP * N});
 
     CHECK_CUDA(handle.get_stream());
 
-    kernel_augmentation<<<blocks_per_grid_1, threads_per_block_1, 0,
-                          handle.get_stream()>>>(
-      d_vertices_dev.row_assignments, d_vertices_dev.col_assignments,
-      elements_v.data(), d_row_data_dev, d_col_data_dev, vertex_t{N},
+    kernel_augmentation<<<blocks_per_grid_1, threads_per_block_1, 0, handle.get_stream()>>>(
+      d_vertices_dev.row_assignments,
+      d_vertices_dev.col_assignments,
+      elements_v.data(),
+      d_row_data_dev,
+      d_col_data_dev,
+      vertex_t{N},
       row_ids_csr_size);
 
     CHECK_CUDA(handle.get_stream());
@@ -399,34 +454,45 @@ inline void augmentationPass(raft::handle_t const &handle,
 }
 
 template <typename vertex_t, typename weight_t>
-inline void dualUpdate(raft::handle_t const &handle,
-                       Vertices<vertex_t, weight_t> &d_vertices_dev,
-                       VertexData<vertex_t> &d_row_data_dev,
-                       VertexData<vertex_t> &d_col_data_dev, int SP, vertex_t N,
-                       weight_t epsilon) {
+inline void dualUpdate(raft::handle_t const& handle,
+                       Vertices<vertex_t, weight_t>& d_vertices_dev,
+                       VertexData<vertex_t>& d_row_data_dev,
+                       VertexData<vertex_t>& d_col_data_dev,
+                       int SP,
+                       vertex_t N,
+                       weight_t epsilon)
+{
   dim3 blocks_per_grid;
   dim3 threads_per_block;
   int total_blocks;
 
   rmm::device_scalar<weight_t> sp_min_v(handle.get_stream());
 
-  raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block,
-                                         total_blocks, SP);
-  kernel_dualUpdate_1<<<blocks_per_grid, threads_per_block, 0,
-                        handle.get_stream()>>>(
-    sp_min_v.data(), d_vertices_dev.col_slacks, d_vertices_dev.col_covers, SP,
-    N, std::numeric_limits<weight_t>::max());
+  raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP);
+  kernel_dualUpdate_1<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
+    sp_min_v.data(),
+    d_vertices_dev.col_slacks,
+    d_vertices_dev.col_covers,
+    SP,
+    N,
+    std::numeric_limits<weight_t>::max());
 
   CHECK_CUDA(handle.get_stream());
 
   raft::lap::detail::calculateRectangularDims(
     blocks_per_grid, threads_per_block, total_blocks, N, SP);
-  kernel_dualUpdate_2<<<blocks_per_grid, threads_per_block, 0,
-                        handle.get_stream()>>>(
-    sp_min_v.data(), d_vertices_dev.row_duals, d_vertices_dev.col_duals,
-    d_vertices_dev.col_slacks, d_vertices_dev.row_covers,
-    d_vertices_dev.col_covers, d_row_data_dev.is_visited,
-    d_col_data_dev.parents, SP, N, std::numeric_limits<weight_t>::max(),
+  kernel_dualUpdate_2<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
+    sp_min_v.data(),
+    d_vertices_dev.row_duals,
+    d_vertices_dev.col_duals,
+    d_vertices_dev.col_slacks,
+    d_vertices_dev.row_covers,
+    d_vertices_dev.col_covers,
+    d_row_data_dev.is_visited,
+    d_col_data_dev.parents,
+    SP,
+    N,
+    std::numeric_limits<weight_t>::max(),
     epsilon);
 
   CHECK_CUDA(handle.get_stream());
@@ -434,18 +500,19 @@ inline void dualUpdate(raft::handle_t const &handle,
 
 // Function for calculating optimal objective function value using dual variables.
 template <typename vertex_t, typename weight_t>
-inline void calcObjValDual(raft::handle_t const &handle, weight_t *d_obj_val,
-                           Vertices<vertex_t, weight_t> &d_vertices_dev, int SP,
-                           int N) {
+inline void calcObjValDual(raft::handle_t const& handle,
+                           weight_t* d_obj_val,
+                           Vertices<vertex_t, weight_t>& d_vertices_dev,
+                           int SP,
+                           int N)
+{
   dim3 blocks_per_grid;
   dim3 threads_per_block;
   int total_blocks = 0;
 
-  raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block,
-                                         total_blocks, SP);
+  raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP);
 
-  kernel_calcObjValDual<<<blocks_per_grid, threads_per_block, 0,
-                          handle.get_stream()>>>(
+  kernel_calcObjValDual<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
     d_obj_val, d_vertices_dev.row_duals, d_vertices_dev.col_duals, SP, N);
 
   CHECK_CUDA(handle.get_stream());
@@ -453,20 +520,21 @@ inline void calcObjValDual(raft::handle_t const &handle, weight_t *d_obj_val,
 
 // Function for calculating optimal objective function value using dual variables.
 template <typename vertex_t, typename weight_t>
-inline void calcObjValPrimal(raft::handle_t const &handle, weight_t *d_obj_val,
-                             weight_t const *d_costs,
-                             vertex_t const *d_row_assignments, int SP,
-                             vertex_t N) {
+inline void calcObjValPrimal(raft::handle_t const& handle,
+                             weight_t* d_obj_val,
+                             weight_t const* d_costs,
+                             vertex_t const* d_row_assignments,
+                             int SP,
+                             vertex_t N)
+{
   dim3 blocks_per_grid;
   dim3 threads_per_block;
   int total_blocks = 0;
 
-  raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block,
-                                         total_blocks, SP);
+  raft::lap::detail::calculateLinearDims(blocks_per_grid, threads_per_block, total_blocks, SP);
 
-  kernel_calcObjValPrimal<<<blocks_per_grid, threads_per_block, 0,
-                            handle.get_stream()>>>(d_obj_val, d_costs,
-                                                   d_row_assignments, SP, N);
+  kernel_calcObjValPrimal<<<blocks_per_grid, threads_per_block, 0, handle.get_stream()>>>(
+    d_obj_val, d_costs, d_row_assignments, SP, N);
 
   CHECK_CUDA(handle.get_stream());
 }
diff --git a/cpp/include/raft/lap/lap_kernels.cuh b/cpp/include/raft/lap/lap_kernels.cuh
index 14ad877aa4..328cbf3e74 100644
--- a/cpp/include/raft/lap/lap_kernels.cuh
+++ b/cpp/include/raft/lap/lap_kernels.cuh
@@ -45,42 +45,57 @@ const int AUGMENT{4};
 const int MODIFIED{5};
 
 template <typename weight_t>
-bool __device__ near_zero(weight_t w, weight_t epsilon) {
+bool __device__ near_zero(weight_t w, weight_t epsilon)
+{
   return ((w > -epsilon) && (w < epsilon));
 }
 
 template <>
-bool __device__ near_zero<int32_t>(int32_t w, int32_t epsilon) {
+bool __device__ near_zero<int32_t>(int32_t w, int32_t epsilon)
+{
   return (w == 0);
 }
 
 template <>
-bool __device__ near_zero<int64_t>(int64_t w, int64_t epsilon) {
+bool __device__ near_zero<int64_t>(int64_t w, int64_t epsilon)
+{
   return (w == 0);
 }
 
-// Device function for traversing the neighbors from start pointer to end pointer and updating the covers.
-// The function sets d_next to 4 if there are uncovered zeros, indicating the requirement of Step 4 execution.
+// Device function for traversing the neighbors from start pointer to end pointer and updating the
+// covers. The function sets d_next to 4 if there are uncovered zeros, indicating the requirement of
+// Step 4 execution.
 template <typename vertex_t, typename weight_t>
-__device__ void cover_and_expand_row(
-  weight_t const *d_elements, weight_t const *d_row_duals,
-  weight_t const *d_col_duals, weight_t *d_col_slacks, int *d_row_covers,
-  int *d_col_covers, vertex_t const *d_col_assignments, bool *d_flag,
-  vertex_t *d_row_parents, vertex_t *d_col_parents, int *d_row_visited,
-  int *d_col_visited, vertex_t rowid, int spid, int colid, vertex_t N,
-  weight_t epsilon) {
+__device__ void cover_and_expand_row(weight_t const* d_elements,
+                                     weight_t const* d_row_duals,
+                                     weight_t const* d_col_duals,
+                                     weight_t* d_col_slacks,
+                                     int* d_row_covers,
+                                     int* d_col_covers,
+                                     vertex_t const* d_col_assignments,
+                                     bool* d_flag,
+                                     vertex_t* d_row_parents,
+                                     vertex_t* d_col_parents,
+                                     int* d_row_visited,
+                                     int* d_col_visited,
+                                     vertex_t rowid,
+                                     int spid,
+                                     int colid,
+                                     vertex_t N,
+                                     weight_t epsilon)
+{
   int ROWID = spid * N + rowid;
   int COLID = spid * N + colid;
 
-  weight_t slack = d_elements[spid * N * N + rowid * N + colid] -
-                   d_row_duals[ROWID] - d_col_duals[COLID];
+  weight_t slack =
+    d_elements[spid * N * N + rowid * N + colid] - d_row_duals[ROWID] - d_col_duals[COLID];
 
   int nxt_rowid = d_col_assignments[COLID];
   int NXT_ROWID = spid * N + nxt_rowid;
 
   if (rowid != nxt_rowid && d_col_covers[COLID] == 0) {
     if (slack < d_col_slacks[COLID]) {
-      d_col_slacks[COLID] = slack;
+      d_col_slacks[COLID]  = slack;
       d_col_parents[COLID] = ROWID;
     }
 
@@ -89,13 +104,12 @@ __device__ void cover_and_expand_row(
         d_row_parents[NXT_ROWID] = COLID;  // update parent info
 
         d_row_covers[NXT_ROWID] = 0;
-        d_col_covers[COLID] = 1;
+        d_col_covers[COLID]     = 1;
 
-        if (d_row_visited[NXT_ROWID] != VISITED)
-          d_row_visited[NXT_ROWID] = ACTIVE;
+        if (d_row_visited[NXT_ROWID] != VISITED) d_row_visited[NXT_ROWID] = ACTIVE;
       } else {
         d_col_visited[COLID] = REVERSE;
-        *d_flag = true;
+        *d_flag              = true;
       }
     }
   }
@@ -104,28 +118,34 @@ __device__ void cover_and_expand_row(
 
 // Device function for traversing an alternating path from unassigned row to unassigned column.
 template <typename vertex_t>
-__device__ void __reverse_traversal(
-  int *d_row_visited, vertex_t *d_row_children, vertex_t *d_col_children,
-  vertex_t const *d_row_parents, vertex_t const *d_col_parents, int cur_colid) {
+__device__ void __reverse_traversal(int* d_row_visited,
+                                    vertex_t* d_row_children,
+                                    vertex_t* d_col_children,
+                                    vertex_t const* d_row_parents,
+                                    vertex_t const* d_col_parents,
+                                    int cur_colid)
+{
   int cur_rowid = -1;
 
   while (cur_colid != -1) {
     d_col_children[cur_colid] = cur_rowid;
-    cur_rowid = d_col_parents[cur_colid];
+    cur_rowid                 = d_col_parents[cur_colid];
 
     d_row_children[cur_rowid] = cur_colid;
-    cur_colid = d_row_parents[cur_rowid];
+    cur_colid                 = d_row_parents[cur_rowid];
   }
   d_row_visited[cur_rowid] = AUGMENT;
 }
 
 // Device function for augmenting the alternating path from unassigned column to unassigned row.
 template <typename vertex_t>
-__device__ void __augment(vertex_t *d_row_assignments,
-                          vertex_t *d_col_assignments,
-                          vertex_t const *d_row_children,
-                          vertex_t const *d_col_children, vertex_t cur_rowid,
-                          vertex_t N) {
+__device__ void __augment(vertex_t* d_row_assignments,
+                          vertex_t* d_col_assignments,
+                          vertex_t const* d_row_children,
+                          vertex_t const* d_col_children,
+                          vertex_t cur_rowid,
+                          vertex_t N)
+{
   int cur_colid = -1;
 
   while (cur_rowid != -1) {
@@ -142,20 +162,18 @@ __device__ void __augment(vertex_t *d_row_assignments,
 //  FIXME:  Once cuda 10.2 is the standard should replace passing infinity
 //          here with using cuda::std::numeric_limits<weight_t>::max()
 template <typename vertex_t, typename weight_t>
-__global__ void kernel_rowReduction(weight_t const *d_costs,
-                                    weight_t *d_row_duals, int SP, vertex_t N,
-                                    weight_t infinity) {
-  int spid = blockIdx.y * blockDim.y + threadIdx.y;
-  int rowid = blockIdx.x * blockDim.x + threadIdx.x;
+__global__ void kernel_rowReduction(
+  weight_t const* d_costs, weight_t* d_row_duals, int SP, vertex_t N, weight_t infinity)
+{
+  int spid     = blockIdx.y * blockDim.y + threadIdx.y;
+  int rowid    = blockIdx.x * blockDim.x + threadIdx.x;
   weight_t min = infinity;
 
   if (spid < SP && rowid < N) {
     for (int colid = 0; colid < N; colid++) {
       weight_t slack = d_costs[spid * N * N + rowid * N + colid];
 
-      if (slack < min) {
-        min = slack;
-      }
+      if (slack < min) { min = slack; }
     }
 
     d_row_duals[spid * N + rowid] = min;
@@ -166,25 +184,26 @@ __global__ void kernel_rowReduction(weight_t const *d_costs,
 //  FIXME:  Once cuda 10.2 is the standard should replace passing infinity
 //          here with using cuda::std::numeric_limits<weight_t>::max()
 template <typename vertex_t, typename weight_t>
-__global__ void kernel_columnReduction(weight_t const *d_costs,
-                                       weight_t const *d_row_duals,
-                                       weight_t *d_col_duals, int SP,
-                                       vertex_t N, weight_t infinity) {
-  int spid = blockIdx.y * blockDim.y + threadIdx.y;
+__global__ void kernel_columnReduction(weight_t const* d_costs,
+                                       weight_t const* d_row_duals,
+                                       weight_t* d_col_duals,
+                                       int SP,
+                                       vertex_t N,
+                                       weight_t infinity)
+{
+  int spid  = blockIdx.y * blockDim.y + threadIdx.y;
   int colid = blockIdx.x * blockDim.x + threadIdx.x;
 
   weight_t min = infinity;
 
   if (spid < SP && colid < N) {
     for (int rowid = 0; rowid < N; rowid++) {
-      weight_t cost = d_costs[spid * N * N + rowid * N + colid];
+      weight_t cost     = d_costs[spid * N * N + rowid * N + colid];
       weight_t row_dual = d_row_duals[spid * N + rowid];
 
       weight_t slack = cost - row_dual;
 
-      if (slack < min) {
-        min = slack;
-      }
+      if (slack < min) { min = slack; }
     }
 
     d_col_duals[spid * N + colid] = min;
@@ -193,12 +212,18 @@ __global__ void kernel_columnReduction(weight_t const *d_costs,
 
 // Kernel for calculating initial assignments.
 template <typename vertex_t, typename weight_t>
-__global__ void kernel_computeInitialAssignments(
-  weight_t const *d_costs, weight_t const *d_row_duals,
-  weight_t const *d_col_duals, vertex_t *d_row_assignments,
-  vertex_t *d_col_assignments, int *d_row_lock, int *d_col_lock, int SP,
-  vertex_t N, weight_t epsilon) {
-  int spid = blockIdx.y * blockDim.y + threadIdx.y;
+__global__ void kernel_computeInitialAssignments(weight_t const* d_costs,
+                                                 weight_t const* d_row_duals,
+                                                 weight_t const* d_col_duals,
+                                                 vertex_t* d_row_assignments,
+                                                 vertex_t* d_col_assignments,
+                                                 int* d_row_lock,
+                                                 int* d_col_lock,
+                                                 int SP,
+                                                 vertex_t N,
+                                                 weight_t epsilon)
+{
+  int spid  = blockIdx.y * blockDim.y + threadIdx.y;
   int colid = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (spid < SP && colid < N) {
@@ -210,15 +235,15 @@ __global__ void kernel_computeInitialAssignments(
 
       if (d_col_lock[overall_colid] == 1) break;
 
-      weight_t cost = d_costs[spid * N * N + rowid * N + colid];
+      weight_t cost     = d_costs[spid * N * N + rowid * N + colid];
       weight_t row_dual = d_row_duals[overall_rowid];
-      weight_t slack = cost - row_dual - col_dual;
+      weight_t slack    = cost - row_dual - col_dual;
 
       if (near_zero(slack, epsilon)) {
         if (atomicCAS(&d_row_lock[overall_rowid], 0, 1) == 0) {
           d_row_assignments[overall_rowid] = colid;
           d_col_assignments[overall_colid] = rowid;
-          d_col_lock[overall_colid] = 1;
+          d_col_lock[overall_colid]        = 1;
         }
       }
     }
@@ -227,10 +252,10 @@ __global__ void kernel_computeInitialAssignments(
 
 // Kernel for populating the cover arrays and initializing alternating tree.
 template <typename vertex_t>
-__global__ void kernel_computeRowCovers(vertex_t *d_row_assignments,
-                                        int *d_row_covers, int *d_row_visited,
-                                        int SP, vertex_t N) {
-  int spid = blockIdx.y * blockDim.y + threadIdx.y;
+__global__ void kernel_computeRowCovers(
+  vertex_t* d_row_assignments, int* d_row_covers, int* d_row_visited, int SP, vertex_t N)
+{
+  int spid  = blockIdx.y * blockDim.y + threadIdx.y;
   int rowid = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (spid < SP && rowid < N) {
@@ -246,11 +271,10 @@ __global__ void kernel_computeRowCovers(vertex_t *d_row_assignments,
 
 // Kernel for populating the predicate matrix for edges in row major format.
 template <typename vertex_t>
-__global__ void kernel_rowPredicateConstructionCSR(bool *d_predicates,
-                                                   vertex_t *d_addresses,
-                                                   int *d_row_visited, int SP,
-                                                   vertex_t N) {
-  int spid = blockIdx.y * blockDim.y + threadIdx.y;
+__global__ void kernel_rowPredicateConstructionCSR(
+  bool* d_predicates, vertex_t* d_addresses, int* d_row_visited, int SP, vertex_t N)
+{
+  int spid  = blockIdx.y * blockDim.y + threadIdx.y;
   int rowid = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (spid < SP && rowid < N) {
@@ -258,130 +282,160 @@ __global__ void kernel_rowPredicateConstructionCSR(bool *d_predicates,
 
     if (d_row_visited[index] == ACTIVE) {
       d_predicates[index] = true;
-      d_addresses[index] = 1;
+      d_addresses[index]  = 1;
     } else {
       d_predicates[index] = false;
-      d_addresses[index] = 0;
+      d_addresses[index]  = 0;
     }
   }
 }
 
 // Kernel for scattering the edges based on the scatter addresses.
 template <typename vertex_t>
-__global__ void kernel_rowScatterCSR(bool const *d_predicates,
-                                     vertex_t const *d_addresses,
-                                     vertex_t *d_neighbors, vertex_t *d_ptrs,
-                                     vertex_t M, int SP, vertex_t N) {
-  int spid = blockIdx.y * blockDim.y + threadIdx.y;
+__global__ void kernel_rowScatterCSR(bool const* d_predicates,
+                                     vertex_t const* d_addresses,
+                                     vertex_t* d_neighbors,
+                                     vertex_t* d_ptrs,
+                                     vertex_t M,
+                                     int SP,
+                                     vertex_t N)
+{
+  int spid  = blockIdx.y * blockDim.y + threadIdx.y;
   int rowid = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (spid < SP && rowid < N) {
     int index = spid * N + rowid;
 
-    bool predicate = d_predicates[index];
+    bool predicate  = d_predicates[index];
     vertex_t compid = d_addresses[index];
 
-    if (predicate) {
-      d_neighbors[compid] = rowid;
-    }
+    if (predicate) { d_neighbors[compid] = rowid; }
     if (rowid == 0) {
       d_ptrs[spid] = compid;
-      d_ptrs[SP] = M;
+      d_ptrs[SP]   = M;
     }
   }
 }
 
 // Kernel for finding the minimum zero cover.
 template <typename vertex_t, typename weight_t>
-__global__ void kernel_coverAndExpand(bool *d_flag, vertex_t const *d_ptrs,
-                                      vertex_t const *d_neighbors,
-                                      weight_t const *d_elements,
+__global__ void kernel_coverAndExpand(bool* d_flag,
+                                      vertex_t const* d_ptrs,
+                                      vertex_t const* d_neighbors,
+                                      weight_t const* d_elements,
                                       Vertices<vertex_t, weight_t> d_vertices,
                                       VertexData<vertex_t> d_row_data,
-                                      VertexData<vertex_t> d_col_data, int SP,
-                                      vertex_t N, weight_t epsilon) {
-  int spid = blockIdx.y * blockDim.y + threadIdx.y;
+                                      VertexData<vertex_t> d_col_data,
+                                      int SP,
+                                      vertex_t N,
+                                      weight_t epsilon)
+{
+  int spid  = blockIdx.y * blockDim.y + threadIdx.y;
   int colid = blockIdx.x * blockDim.x + threadIdx.x;
 
   // Load values into local memory
 
   if (spid < SP && colid < N) {
     thrust::for_each(
-      thrust::seq, d_neighbors + d_ptrs[spid], d_neighbors + d_ptrs[spid + 1],
-      [d_elements, d_vertices, d_flag, d_row_data, d_col_data, spid, colid, N,
-       epsilon] __device__(vertex_t rowid) {
-        cover_and_expand_row(
-          d_elements, d_vertices.row_duals, d_vertices.col_duals,
-          d_vertices.col_slacks, d_vertices.row_covers, d_vertices.col_covers,
-          d_vertices.col_assignments, d_flag, d_row_data.parents,
-          d_col_data.parents, d_row_data.is_visited, d_col_data.is_visited,
-          rowid, spid, colid, N, epsilon);
+      thrust::seq,
+      d_neighbors + d_ptrs[spid],
+      d_neighbors + d_ptrs[spid + 1],
+      [d_elements, d_vertices, d_flag, d_row_data, d_col_data, spid, colid, N, epsilon] __device__(
+        vertex_t rowid) {
+        cover_and_expand_row(d_elements,
+                             d_vertices.row_duals,
+                             d_vertices.col_duals,
+                             d_vertices.col_slacks,
+                             d_vertices.row_covers,
+                             d_vertices.col_covers,
+                             d_vertices.col_assignments,
+                             d_flag,
+                             d_row_data.parents,
+                             d_col_data.parents,
+                             d_row_data.is_visited,
+                             d_col_data.is_visited,
+                             rowid,
+                             spid,
+                             colid,
+                             N,
+                             epsilon);
       });
   }
 }
 
 // Kernel for constructing the predicates for reverse pass or augmentation candidates.
 template <typename vertex_t>
-__global__ void kernel_augmentPredicateConstruction(bool *d_predicates,
-                                                    vertex_t *d_addresses,
-                                                    int *d_visited, int size) {
+__global__ void kernel_augmentPredicateConstruction(bool* d_predicates,
+                                                    vertex_t* d_addresses,
+                                                    int* d_visited,
+                                                    int size)
+{
   int id = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (id < size) {
     int visited = d_visited[id];
     if ((visited == REVERSE) || (visited == AUGMENT)) {
       d_predicates[id] = true;
-      d_addresses[id] = 1;
+      d_addresses[id]  = 1;
     } else {
       d_predicates[id] = false;
-      d_addresses[id] = 0;
+      d_addresses[id]  = 0;
     }
   }
 }
 
 // Kernel for scattering the vertices based on the scatter addresses.
 template <typename vertex_t>
-__global__ void kernel_augmentScatter(vertex_t *d_elements,
-                                      bool const *d_predicates,
-                                      vertex_t const *d_addresses,
-                                      std::size_t size) {
+__global__ void kernel_augmentScatter(vertex_t* d_elements,
+                                      bool const* d_predicates,
+                                      vertex_t const* d_addresses,
+                                      std::size_t size)
+{
   int id = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (id < size) {
-    if (d_predicates[id]) {
-      d_elements[d_addresses[id]] = id;
-    }
+    if (d_predicates[id]) { d_elements[d_addresses[id]] = id; }
   }
 }
 
 // Kernel for executing the reverse pass of the maximum matching algorithm.
 template <typename vertex_t>
-__global__ void kernel_reverseTraversal(vertex_t *d_elements,
+__global__ void kernel_reverseTraversal(vertex_t* d_elements,
                                         VertexData<vertex_t> d_row_data,
                                         VertexData<vertex_t> d_col_data,
-                                        int size) {
+                                        int size)
+{
   int id = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (id < size) {
-    __reverse_traversal(d_row_data.is_visited, d_row_data.children,
-                        d_col_data.children, d_row_data.parents,
-                        d_col_data.parents, d_elements[id]);
+    __reverse_traversal(d_row_data.is_visited,
+                        d_row_data.children,
+                        d_col_data.children,
+                        d_row_data.parents,
+                        d_col_data.parents,
+                        d_elements[id]);
   }
 }
 
 // Kernel for executing the augmentation pass of the maximum matching algorithm.
 template <typename vertex_t>
-__global__ void kernel_augmentation(vertex_t *d_row_assignments,
-                                    vertex_t *d_col_assignments,
-                                    vertex_t const *d_row_elements,
+__global__ void kernel_augmentation(vertex_t* d_row_assignments,
+                                    vertex_t* d_col_assignments,
+                                    vertex_t const* d_row_elements,
                                     VertexData<vertex_t> d_row_data,
-                                    VertexData<vertex_t> d_col_data, vertex_t N,
-                                    vertex_t size) {
+                                    VertexData<vertex_t> d_col_data,
+                                    vertex_t N,
+                                    vertex_t size)
+{
   int id = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (id < size) {
-    __augment(d_row_assignments, d_col_assignments, d_row_data.children,
-              d_col_data.children, d_row_elements[id], N);
+    __augment(d_row_assignments,
+              d_col_assignments,
+              d_row_data.children,
+              d_col_data.children,
+              d_row_elements[id],
+              N);
   }
 }
 
@@ -389,18 +443,21 @@ __global__ void kernel_augmentation(vertex_t *d_row_assignments,
 //  FIXME:  Once cuda 10.2 is the standard should replace passing infinity
 //          here with using cuda::std::numeric_limits<weight_t>::max()
 template <typename vertex_t, typename weight_t>
-__global__ void kernel_dualUpdate_1(weight_t *d_sp_min,
-                                    weight_t const *d_col_slacks,
-                                    int const *d_col_covers, int SP, vertex_t N,
-                                    weight_t infinity) {
+__global__ void kernel_dualUpdate_1(weight_t* d_sp_min,
+                                    weight_t const* d_col_slacks,
+                                    int const* d_col_covers,
+                                    int SP,
+                                    vertex_t N,
+                                    weight_t infinity)
+{
   int spid = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (spid < SP) {
     weight_t min = infinity;
     for (int colid = 0; colid < N; colid++) {
-      int index = spid * N + colid;
+      int index      = spid * N + colid;
       weight_t slack = d_col_slacks[index];
-      int col_cover = d_col_covers[index];
+      int col_cover  = d_col_covers[index];
 
       if (col_cover == 0)
         if (slack < min) min = slack;
@@ -414,21 +471,29 @@ __global__ void kernel_dualUpdate_1(weight_t *d_sp_min,
 //  FIXME:  Once cuda 10.2 is the standard should replace passing infinity
 //          here with using cuda::std::numeric_limits<weight_t>::max()
 template <typename vertex_t, typename weight_t>
-__global__ void kernel_dualUpdate_2(
-  weight_t const *d_sp_min, weight_t *d_row_duals, weight_t *d_col_duals,
-  weight_t *d_col_slacks, int const *d_row_covers, int const *d_col_covers,
-  int *d_row_visited, vertex_t *d_col_parents, int SP, vertex_t N,
-  weight_t infinity, weight_t epsilon) {
+__global__ void kernel_dualUpdate_2(weight_t const* d_sp_min,
+                                    weight_t* d_row_duals,
+                                    weight_t* d_col_duals,
+                                    weight_t* d_col_slacks,
+                                    int const* d_row_covers,
+                                    int const* d_col_covers,
+                                    int* d_row_visited,
+                                    vertex_t* d_col_parents,
+                                    int SP,
+                                    vertex_t N,
+                                    weight_t infinity,
+                                    weight_t epsilon)
+{
   int spid = blockIdx.y * blockDim.y + threadIdx.y;
-  int id = blockIdx.x * blockDim.x + threadIdx.x;
+  int id   = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (spid < SP && id < N) {
     int index = spid * N + id;
 
     if (d_sp_min[spid] < infinity) {
       weight_t theta = d_sp_min[spid];
-      int row_cover = d_row_covers[index];
-      int col_cover = d_col_covers[index];
+      int row_cover  = d_row_covers[index];
+      int col_cover  = d_col_covers[index];
 
       if (row_cover == 0)  // Row vertex is reachable from source.
         d_row_duals[index] += theta;
@@ -450,10 +515,12 @@ __global__ void kernel_dualUpdate_2(
 
 // Kernel for calculating optimal objective function value using dual variables.
 template <typename vertex_t, typename weight_t>
-__global__ void kernel_calcObjValDual(weight_t *d_obj_val_dual,
-                                      weight_t const *d_row_duals,
-                                      weight_t const *d_col_duals, int SP,
-                                      vertex_t N) {
+__global__ void kernel_calcObjValDual(weight_t* d_obj_val_dual,
+                                      weight_t const* d_row_duals,
+                                      weight_t const* d_col_duals,
+                                      int SP,
+                                      vertex_t N)
+{
   int spid = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (spid < SP) {
@@ -468,10 +535,12 @@ __global__ void kernel_calcObjValDual(weight_t *d_obj_val_dual,
 
 // Kernel for calculating optimal objective function value using dual variables.
 template <typename vertex_t, typename weight_t>
-__global__ void kernel_calcObjValPrimal(weight_t *d_obj_val_primal,
-                                        weight_t const *d_costs,
-                                        vertex_t const *d_row_assignments,
-                                        int SP, vertex_t N) {
+__global__ void kernel_calcObjValPrimal(weight_t* d_obj_val_primal,
+                                        weight_t const* d_costs,
+                                        vertex_t const* d_row_assignments,
+                                        int SP,
+                                        vertex_t N)
+{
   int spid = blockIdx.x * blockDim.x + threadIdx.x;
 
   if (spid < SP) {
diff --git a/cpp/include/raft/linalg/add.cuh b/cpp/include/raft/linalg/add.cuh
index 7a454f64e2..11d3174951 100644
--- a/cpp/include/raft/linalg/add.cuh
+++ b/cpp/include/raft/linalg/add.cuh
@@ -37,8 +37,8 @@ namespace linalg {
  * @param stream cuda stream where to launch work
  */
 template <typename InT, typename OutT = InT, typename IdxType = int>
-void addScalar(OutT *out, const InT *in, InT scalar, IdxType len,
-               cudaStream_t stream) {
+void addScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream)
+{
   auto op = [scalar] __device__(InT in) { return OutT(in + scalar); };
   unaryOp<InT, decltype(op), IdxType, OutT>(out, in, len, op, stream);
 }
@@ -57,23 +57,24 @@ void addScalar(OutT *out, const InT *in, InT scalar, IdxType len,
  * @param stream cuda stream where to launch work
  */
 template <typename InT, typename OutT = InT, typename IdxType = int>
-void add(OutT *out, const InT *in1, const InT *in2, IdxType len,
-         cudaStream_t stream) {
+void add(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream)
+{
   auto op = [] __device__(InT a, InT b) { return OutT(a + b); };
   binaryOp<InT, decltype(op), OutT, IdxType>(out, in1, in2, len, op, stream);
 }
 
 template <class math_t, typename IdxType>
-__global__ void add_dev_scalar_kernel(math_t *outDev, const math_t *inDev,
-                                      const math_t *singleScalarDev,
-                                      IdxType len) {
+__global__ void add_dev_scalar_kernel(math_t* outDev,
+                                      const math_t* inDev,
+                                      const math_t* singleScalarDev,
+                                      IdxType len)
+{
   IdxType i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x;
-  if (i < len) {
-    outDev[i] = inDev[i] + *singleScalarDev;
-  }
+  if (i < len) { outDev[i] = inDev[i] + *singleScalarDev; }
 }
 
-/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and write result to outDev[i]
+/** Add single value pointed by singleScalarDev parameter in device memory to inDev[i] and
+ * write result to outDev[i]
  * @tparam math_t data-type upon which the math operation will be performed
  * @tparam IdxType Integer type used to for addressing
  * @param outDev the output buffer
@@ -83,14 +84,16 @@ __global__ void add_dev_scalar_kernel(math_t *outDev, const math_t *inDev,
  * @param stream cuda stream
  */
 template <typename math_t, typename IdxType = int>
-void addDevScalar(math_t *outDev, const math_t *inDev,
-                  const math_t *singleScalarDev, IdxType len,
-                  cudaStream_t stream) {
+void addDevScalar(math_t* outDev,
+                  const math_t* inDev,
+                  const math_t* singleScalarDev,
+                  IdxType len,
+                  cudaStream_t stream)
+{
   // TODO: block dimension has not been tuned
   dim3 block(256);
   dim3 grid(raft::ceildiv(len, (IdxType)block.x));
-  add_dev_scalar_kernel<math_t>
-    <<<grid, block, 0, stream>>>(outDev, inDev, singleScalarDev, len);
+  add_dev_scalar_kernel<math_t><<<grid, block, 0, stream>>>(outDev, inDev, singleScalarDev, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
diff --git a/cpp/include/raft/linalg/binary_op.cuh b/cpp/include/raft/linalg/binary_op.cuh
index 940d786e87..a49a433941 100644
--- a/cpp/include/raft/linalg/binary_op.cuh
+++ b/cpp/include/raft/linalg/binary_op.cuh
@@ -22,10 +22,10 @@
 namespace raft {
 namespace linalg {
 
-template <typename InType, int VecLen, typename Lambda, typename IdxType,
-          typename OutType>
-__global__ void binaryOpKernel(OutType *out, const InType *in1,
-                               const InType *in2, IdxType len, Lambda op) {
+template <typename InType, int VecLen, typename Lambda, typename IdxType, typename OutType>
+__global__ void binaryOpKernel(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op)
+{
   typedef TxN_t<InType, VecLen> InVecType;
   typedef TxN_t<OutType, VecLen> OutVecType;
   InVecType a, b;
@@ -42,12 +42,11 @@ __global__ void binaryOpKernel(OutType *out, const InType *in1,
   c.store(out, idx);
 }
 
-template <typename InType, int VecLen, typename Lambda, typename IdxType,
-          typename OutType, int TPB>
-void binaryOpImpl(OutType *out, const InType *in1, const InType *in2,
-                  IdxType len, Lambda op, cudaStream_t stream) {
-  const IdxType nblks =
-    raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB);
+template <typename InType, int VecLen, typename Lambda, typename IdxType, typename OutType, int TPB>
+void binaryOpImpl(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream)
+{
+  const IdxType nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB);
   binaryOpKernel<InType, VecLen, Lambda, IdxType, OutType>
     <<<nblks, TPB, 0, stream>>>(out, in1, in2, len, op);
   CUDA_CHECK(cudaPeekAtLastError());
@@ -56,8 +55,8 @@ void binaryOpImpl(OutType *out, const InType *in1, const InType *in2,
 /**
  * @brief Checks if addresses are aligned on N bytes
  */
-inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3,
-                           uint64_t N) {
+inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, uint64_t N)
+{
   return addr1 % N == 0 && addr2 % N == 0 && addr3 % N == 0;
 }
 
@@ -77,38 +76,36 @@ inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3,
  * @note Lambda must be a functor with the following signature:
  *       `OutType func(const InType& val1, const InType& val2);`
  */
-template <typename InType, typename Lambda, typename OutType = InType,
-          typename IdxType = int, int TPB = 256>
-void binaryOp(OutType *out, const InType *in1, const InType *in2, IdxType len,
-              Lambda op, cudaStream_t stream) {
-  constexpr auto maxSize =
-    sizeof(InType) > sizeof(OutType) ? sizeof(InType) : sizeof(OutType);
-  size_t bytes = len * maxSize;
-  uint64_t in1Addr = uint64_t(in1);
-  uint64_t in2Addr = uint64_t(in2);
-  uint64_t outAddr = uint64_t(out);
-  if (16 / maxSize && bytes % 16 == 0 &&
-      addressAligned(in1Addr, in2Addr, outAddr, 16)) {
+template <typename InType,
+          typename Lambda,
+          typename OutType = InType,
+          typename IdxType = int,
+          int TPB          = 256>
+void binaryOp(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream)
+{
+  constexpr auto maxSize = sizeof(InType) > sizeof(OutType) ? sizeof(InType) : sizeof(OutType);
+  size_t bytes           = len * maxSize;
+  uint64_t in1Addr       = uint64_t(in1);
+  uint64_t in2Addr       = uint64_t(in2);
+  uint64_t outAddr       = uint64_t(out);
+  if (16 / maxSize && bytes % 16 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 16)) {
     binaryOpImpl<InType, 16 / maxSize, Lambda, IdxType, OutType, TPB>(
       out, in1, in2, len, op, stream);
-  } else if (8 / maxSize && bytes % 8 == 0 &&
-             addressAligned(in1Addr, in2Addr, outAddr, 8)) {
+  } else if (8 / maxSize && bytes % 8 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 8)) {
     binaryOpImpl<InType, 8 / maxSize, Lambda, IdxType, OutType, TPB>(
       out, in1, in2, len, op, stream);
-  } else if (4 / maxSize && bytes % 4 == 0 &&
-             addressAligned(in1Addr, in2Addr, outAddr, 4)) {
+  } else if (4 / maxSize && bytes % 4 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 4)) {
     binaryOpImpl<InType, 4 / maxSize, Lambda, IdxType, OutType, TPB>(
       out, in1, in2, len, op, stream);
-  } else if (2 / maxSize && bytes % 2 == 0 &&
-             addressAligned(in1Addr, in2Addr, outAddr, 2)) {
+  } else if (2 / maxSize && bytes % 2 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 2)) {
     binaryOpImpl<InType, 2 / maxSize, Lambda, IdxType, OutType, TPB>(
       out, in1, in2, len, op, stream);
   } else if (1 / maxSize) {
     binaryOpImpl<InType, 1 / maxSize, Lambda, IdxType, OutType, TPB>(
       out, in1, in2, len, op, stream);
   } else {
-    binaryOpImpl<InType, 1, Lambda, IdxType, OutType, TPB>(out, in1, in2, len,
-                                                           op, stream);
+    binaryOpImpl<InType, 1, Lambda, IdxType, OutType, TPB>(out, in1, in2, len, op, stream);
   }
 }
 
diff --git a/cpp/include/raft/linalg/cholesky_r1_update.cuh b/cpp/include/raft/linalg/cholesky_r1_update.cuh
index d6d064c20e..4b58133ac5 100644
--- a/cpp/include/raft/linalg/cholesky_r1_update.cuh
+++ b/cpp/include/raft/linalg/cholesky_r1_update.cuh
@@ -122,9 +122,16 @@ namespace linalg {
  *    conditioned systems. Negative values mean no regularizaton.
  */
 template <typename math_t>
-void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld,
-                         void *workspace, int *n_bytes, cublasFillMode_t uplo,
-                         cudaStream_t stream, math_t eps = -1) {
+void choleskyRank1Update(const raft::handle_t& handle,
+                         math_t* L,
+                         int n,
+                         int ld,
+                         void* workspace,
+                         int* n_bytes,
+                         cublasFillMode_t uplo,
+                         cudaStream_t stream,
+                         math_t eps = -1)
+{
   // The matrix A' is defined as:
   // A' = [[A_11, A_12]
   //       [A_21, A_22]]
@@ -144,18 +151,17 @@ void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld,
   // We need a workspace in device memory to store a scalar. Additionally, in
   // CUBLAS_FILL_MODE_LOWER we need space for n-1 floats.
   const int align = 256;
-  int offset = (uplo == CUBLAS_FILL_MODE_LOWER)
-                 ? raft::alignTo<int>(sizeof(math_t) * (n - 1), align)
-                 : 0;
+  int offset =
+    (uplo == CUBLAS_FILL_MODE_LOWER) ? raft::alignTo<int>(sizeof(math_t) * (n - 1), align) : 0;
   if (workspace == nullptr) {
     *n_bytes = offset + 1 * sizeof(math_t);
     return;
   }
-  math_t *s = reinterpret_cast<math_t *>(((char *)workspace) + offset);
-  math_t *L_22 = L + (n - 1) * ld + n - 1;
+  math_t* s    = reinterpret_cast<math_t*>(((char*)workspace) + offset);
+  math_t* L_22 = L + (n - 1) * ld + n - 1;
 
-  math_t *A_new;
-  math_t *A_row;
+  math_t* A_new;
+  math_t* A_row;
   if (uplo == CUBLAS_FILL_MODE_UPPER) {
     // A_new is stored as the n-1 th column of L
     A_new = L + (n - 1) * ld;
@@ -164,27 +170,36 @@ void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld,
     // as the n-th row of L. Since the matrix is column major, this is non
     // contiguous. We copy elements from A_row to a contiguous workspace A_new.
     A_row = L + n - 1;
-    A_new = reinterpret_cast<math_t *>(workspace);
-    CUBLAS_CHECK(raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1,
-                                          A_row, ld, A_new, 1, stream));
+    A_new = reinterpret_cast<math_t*>(workspace);
+    CUBLAS_CHECK(
+      raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_row, ld, A_new, 1, stream));
   }
-  cublasOperation_t op =
-    (uplo == CUBLAS_FILL_MODE_UPPER) ? CUBLAS_OP_T : CUBLAS_OP_N;
+  cublasOperation_t op = (uplo == CUBLAS_FILL_MODE_UPPER) ? CUBLAS_OP_T : CUBLAS_OP_N;
   if (n > 1) {
     // Calculate L_12 = x by solving equation L_11 x = A_12
     math_t alpha = 1;
-    CUBLAS_CHECK(raft::linalg::cublastrsm(
-      handle.get_cublas_handle(), CUBLAS_SIDE_LEFT, uplo, op,
-      CUBLAS_DIAG_NON_UNIT, n - 1, 1, &alpha, L, ld, A_new, n - 1, stream));
+    CUBLAS_CHECK(raft::linalg::cublastrsm(handle.get_cublas_handle(),
+                                          CUBLAS_SIDE_LEFT,
+                                          uplo,
+                                          op,
+                                          CUBLAS_DIAG_NON_UNIT,
+                                          n - 1,
+                                          1,
+                                          &alpha,
+                                          L,
+                                          ld,
+                                          A_new,
+                                          n - 1,
+                                          stream));
 
     // A_new now stores L_12, we calculate s = L_12 * L_12
-    CUBLAS_CHECK(raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1,
-                                         A_new, 1, A_new, 1, s, stream));
+    CUBLAS_CHECK(
+      raft::linalg::cublasdot(handle.get_cublas_handle(), n - 1, A_new, 1, A_new, 1, s, stream));
 
     if (uplo == CUBLAS_FILL_MODE_LOWER) {
       // Copy back the L_12 elements as the n-th row of L
-      CUBLAS_CHECK(raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1,
-                                            A_new, 1, A_row, ld, stream));
+      CUBLAS_CHECK(
+        raft::linalg::cublasCopy(handle.get_cublas_handle(), n - 1, A_new, 1, A_row, ld, stream));
     }
   } else {  // n == 1 case
     CUDA_CHECK(cudaMemsetAsync(s, 0, sizeof(math_t), stream));
@@ -202,9 +217,7 @@ void choleskyRank1Update(const raft::handle_t &handle, math_t *L, int n, int ld,
   // the system is very ill conditioned then the A_22 - L_12 * L_12 can be
   // negative, which would result L_22 = NaN. A small positive eps parameter
   // can be used to prevent this.
-  if (eps >= 0 && (std::isnan(L_22_host) || L_22_host < eps)) {
-    L_22_host = eps;
-  }
+  if (eps >= 0 && (std::isnan(L_22_host) || L_22_host < eps)) { L_22_host = eps; }
   ASSERT(!std::isnan(L_22_host), "Error during Cholesky rank one update");
   raft::update_device(L_22, &L_22_host, 1, stream);
 }
diff --git a/cpp/include/raft/linalg/coalesced_reduction.cuh b/cpp/include/raft/linalg/coalesced_reduction.cuh
index ef983ff3d0..7e0744f98a 100644
--- a/cpp/include/raft/linalg/coalesced_reduction.cuh
+++ b/cpp/include/raft/linalg/coalesced_reduction.cuh
@@ -26,18 +26,27 @@ namespace linalg {
 // of the matrix, i.e. reduce along rows for row major or reduce along columns
 // for column major layout. Kernel does an inplace reduction adding to original
 // values of dots.
-template <typename InType, typename OutType, typename IdxType, int TPB,
-          typename MainLambda, typename ReduceLambda, typename FinalLambda>
-__global__ void coalescedReductionKernel(OutType *dots, const InType *data,
-                                         int D, int N, OutType init,
+template <typename InType,
+          typename OutType,
+          typename IdxType,
+          int TPB,
+          typename MainLambda,
+          typename ReduceLambda,
+          typename FinalLambda>
+__global__ void coalescedReductionKernel(OutType* dots,
+                                         const InType* data,
+                                         int D,
+                                         int N,
+                                         OutType init,
                                          MainLambda main_op,
                                          ReduceLambda reduce_op,
                                          FinalLambda final_op,
-                                         bool inplace = false) {
+                                         bool inplace = false)
+{
   typedef cub::BlockReduce<OutType, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   OutType thread_data = init;
-  IdxType rowStart = blockIdx.x * D;
+  IdxType rowStart    = blockIdx.x * D;
   for (IdxType i = threadIdx.x; i < D; i += TPB) {
     IdxType idx = rowStart + i;
     thread_data = reduce_op(thread_data, main_op(data[idx], i));
@@ -79,33 +88,37 @@ __global__ void coalescedReductionKernel(OutType *dots, const InType *data,
  * @param inplace reduction result added inplace or overwrites old values?
  * @param stream cuda stream where to launch work
  */
-template <typename InType, typename OutType = InType, typename IdxType = int,
-          typename MainLambda = raft::Nop<InType, IdxType>,
+template <typename InType,
+          typename OutType      = InType,
+          typename IdxType      = int,
+          typename MainLambda   = raft::Nop<InType, IdxType>,
           typename ReduceLambda = raft::Sum<OutType>,
-          typename FinalLambda = raft::Nop<OutType>>
-void coalescedReduction(OutType *dots, const InType *data, int D, int N,
-                        OutType init, cudaStream_t stream, bool inplace = false,
-                        MainLambda main_op = raft::Nop<InType, IdxType>(),
+          typename FinalLambda  = raft::Nop<OutType>>
+void coalescedReduction(OutType* dots,
+                        const InType* data,
+                        int D,
+                        int N,
+                        OutType init,
+                        cudaStream_t stream,
+                        bool inplace           = false,
+                        MainLambda main_op     = raft::Nop<InType, IdxType>(),
                         ReduceLambda reduce_op = raft::Sum<OutType>(),
-                        FinalLambda final_op = raft::Nop<OutType>()) {
+                        FinalLambda final_op   = raft::Nop<OutType>())
+{
   // One block per reduction
   // Efficient only for large leading dimensions
   if (D <= 32) {
     coalescedReductionKernel<InType, OutType, IdxType, 32>
-      <<<N, 32, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op,
-                             final_op, inplace);
+      <<<N, 32, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace);
   } else if (D <= 64) {
     coalescedReductionKernel<InType, OutType, IdxType, 64>
-      <<<N, 64, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op,
-                             final_op, inplace);
+      <<<N, 64, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace);
   } else if (D <= 128) {
     coalescedReductionKernel<InType, OutType, IdxType, 128>
-      <<<N, 128, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op,
-                              final_op, inplace);
+      <<<N, 128, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace);
   } else {
     coalescedReductionKernel<InType, OutType, IdxType, 256>
-      <<<N, 256, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op,
-                              final_op, inplace);
+      <<<N, 256, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace);
   }
   CUDA_CHECK(cudaPeekAtLastError());
 }
diff --git a/cpp/include/raft/linalg/contractions.cuh b/cpp/include/raft/linalg/contractions.cuh
index e6ff8a49ce..817bfeab5c 100644
--- a/cpp/include/raft/linalg/contractions.cuh
+++ b/cpp/include/raft/linalg/contractions.cuh
@@ -55,8 +55,7 @@ namespace linalg {
  *                 thread block. This also determines the number of threads per
  *                 thread block
  */
-template <typename DataT, int _veclen, int _kblk, int _rpt, int _cpt, int _tr,
-          int _tc>
+template <typename DataT, int _veclen, int _kblk, int _rpt, int _cpt, int _tr, int _tc>
 struct KernelPolicy {
   enum {
     /** number of elements along K worked upon per main loop iteration */
@@ -101,8 +100,7 @@ struct KernelPolicy {
 
 };  // struct KernelPolicy
 
-template <typename DataT, int _veclen, int _kblk, int _rpt, int _cpt, int _tr,
-          int _tc>
+template <typename DataT, int _veclen, int _kblk, int _rpt, int _cpt, int _tr, int _tc>
 struct ColKernelPolicy {
   enum {
     /** number of elements along K worked upon per main loop iteration */
@@ -151,7 +149,8 @@ struct ColKernelPolicy {
  * @{
  */
 template <typename DataT, int _veclen>
-struct Policy4x4 {};
+struct Policy4x4 {
+};
 
 template <int _veclen>
 struct Policy4x4<float, _veclen> {
@@ -171,7 +170,8 @@ struct Policy4x4<double, _veclen> {
  * @{
  */
 template <typename DataT, int _veclen = 1>
-struct Policy2x8 {};
+struct Policy2x8 {
+};
 
 template <int _veclen>
 struct Policy2x8<float, _veclen> {
@@ -201,8 +201,7 @@ struct Policy2x8<double, _veclen> {
  * @tparam Policy policy used to customize memory access behavior.
  *                See documentation for `KernelPolicy` to know more.
  */
-template <typename DataT, typename IdxT, typename Policy,
-          bool isRowMajor = true>
+template <typename DataT, typename IdxT, typename Policy, bool isRowMajor = true>
 struct Contractions_NT {
  protected:
   typedef Policy P;
@@ -268,8 +267,7 @@ struct Contractions_NT {
    * @param[in] _k number of cols of X and Y
    * @param[in] _smem shared memory region used during computations
    */
-  DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n,
-                     IdxT _k, char* _smem)
+  DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n, IdxT _k, char* _smem)
     : m(_m),
       n(_n),
       k(_k),
@@ -286,7 +284,9 @@ struct Contractions_NT {
       sx((DataT*)_smem),
       sy(&(sx[P::SmemPageX])),
       pageWr(0),
-      pageRd(0) {}
+      pageRd(0)
+  {
+  }
 
   /**
    * @brief Ctor
@@ -297,8 +297,15 @@ struct Contractions_NT {
    * @param[in] _k number of cols of X and Y
    * @param[in] _smem shared memory region used during computations
    */
-  DI Contractions_NT(const DataT* _x, const DataT* _y, IdxT _m, IdxT _n,
-                     IdxT _k, IdxT _lda, IdxT _ldb, IdxT _ldd, char* _smem)
+  DI Contractions_NT(const DataT* _x,
+                     const DataT* _y,
+                     IdxT _m,
+                     IdxT _n,
+                     IdxT _k,
+                     IdxT _lda,
+                     IdxT _ldb,
+                     IdxT _ldd,
+                     char* _smem)
     : m(_m),
       n(_n),
       k(_k),
@@ -312,17 +319,18 @@ struct Contractions_NT {
       sx((DataT*)_smem),
       sy(&(sx[P::SmemPageX])),
       pageWr(0),
-      pageRd(0) {
+      pageRd(0)
+  {
     if (isRowMajor) {
       xrowid = IdxT(blockIdx.y) * P::Mblk + srowid;
       yrowid = IdxT(blockIdx.x) * P::Nblk + srowid;
-      x = _x + xrowid * lda;
-      y = _y + yrowid * ldb;
+      x      = _x + xrowid * lda;
+      y      = _y + yrowid * ldb;
     } else {
       xrowid = IdxT(blockIdx.y) * P::Mblk;
       yrowid = IdxT(blockIdx.x) * P::Nblk;
-      x = _x + xrowid + srowid * lda;
-      y = _y + yrowid + srowid * ldb;
+      x      = _x + xrowid + srowid * lda;
+      y      = _y + yrowid + srowid * ldb;
     }
   }
 
@@ -331,7 +339,8 @@ struct Contractions_NT {
    * @brief Load current block of X/Y from global memory to registers
    * @param[in] kidx current start index of k to be loaded
    */
-  DI void ldgXY(IdxT kidx) {
+  DI void ldgXY(IdxT kidx)
+  {
     ldgX(kidx);
     ldgY(kidx);
   }
@@ -340,7 +349,8 @@ struct Contractions_NT {
    * @brief Store current block of X/Y from registers to smem
    * @param[in] kidx current start index of k to be loaded
    */
-  DI void stsXY() {
+  DI void stsXY()
+  {
     stsX(sx + pageWr * P::SmemPage);
     stsY(sy + pageWr * P::SmemPage);
   }
@@ -349,13 +359,15 @@ struct Contractions_NT {
    * @brief Load X and Y block from shared memory to registers
    * @param[in] kidx k value from the current k-block to be loaded from smem
    */
-  DI void ldsXY(int kidx) {
+  DI void ldsXY(int kidx)
+  {
     ldsX(kidx, sx + pageRd * P::SmemPage);
     ldsY(kidx, sy + pageRd * P::SmemPage);
   }
 
  private:
-  DI void ldgX(IdxT kidx) {
+  DI void ldgX(IdxT kidx)
+  {
     if (isRowMajor) {
       auto numRows = m;
       auto koffset = kidx + scolid;
@@ -372,11 +384,10 @@ struct Contractions_NT {
       }
     } else {
       const auto numRows = k;
-      auto koffset = scolid;
+      auto koffset       = scolid;
 #pragma unroll
       for (int i = 0; i < P::LdgPerThX; ++i) {
-        if ((koffset + xrowid) < lda &&
-            (srowid + kidx + i * P::LdgRowsX) < numRows) {
+        if ((koffset + xrowid) < lda && (srowid + kidx + i * P::LdgRowsX) < numRows) {
           ldg(ldgDataX[i], x + (kidx + i * P::LdgRowsX) * lda + koffset);
         } else {
 #pragma unroll
@@ -388,7 +399,8 @@ struct Contractions_NT {
     }
   }
 
-  DI void ldgY(IdxT kidx) {
+  DI void ldgY(IdxT kidx)
+  {
     if (isRowMajor) {
       auto numRows = n;
       auto koffset = kidx + scolid;
@@ -408,8 +420,7 @@ struct Contractions_NT {
       auto koffset = scolid;
 #pragma unroll
       for (int i = 0; i < P::LdgPerThY; ++i) {
-        if ((koffset + yrowid) < ldb &&
-            (srowid + kidx + i * P::LdgRowsY) < numRows) {
+        if ((koffset + yrowid) < ldb && (srowid + kidx + i * P::LdgRowsY) < numRows) {
           ldg(ldgDataY[i], y + (kidx + i * P::LdgRowsY) * ldb + koffset);
         } else {
 #pragma unroll
@@ -421,7 +432,8 @@ struct Contractions_NT {
     }
   }
 
-  DI void stsX(DataT* smem) {
+  DI void stsX(DataT* smem)
+  {
     auto* saddr = smem + srowid * P::SmemStride + scolid;
 #pragma unroll
     for (int i = 0; i < P::LdgPerThX; ++i) {
@@ -429,7 +441,8 @@ struct Contractions_NT {
     }
   }
 
-  DI void stsY(DataT* smem) {
+  DI void stsY(DataT* smem)
+  {
     auto* saddr = smem + srowid * P::SmemStride + scolid;
 #pragma unroll
     for (int i = 0; i < P::LdgPerThY; ++i) {
@@ -437,7 +450,8 @@ struct Contractions_NT {
     }
   }
 
-  DI void ldsX(int kidx, DataT* smem) {
+  DI void ldsX(int kidx, DataT* smem)
+  {
     if (isRowMajor) {
       auto* saddr = smem + accrowid * P::SmemStride + kidx;
 #pragma unroll
@@ -456,7 +470,8 @@ struct Contractions_NT {
     }
   }
 
-  DI void ldsY(int kidx, DataT* smem) {
+  DI void ldsY(int kidx, DataT* smem)
+  {
     if (isRowMajor) {
       auto* saddr = smem + acccolid * P::SmemStride + kidx;
 #pragma unroll
diff --git a/cpp/include/raft/linalg/cublas_wrappers.h b/cpp/include/raft/linalg/cublas_wrappers.h
index 1be14a550d..3616d54506 100644
--- a/cpp/include/raft/linalg/cublas_wrappers.h
+++ b/cpp/include/raft/linalg/cublas_wrappers.h
@@ -25,8 +25,7 @@
 #include <cstdint>
 
 #define _CUBLAS_ERR_TO_STR(err) \
-  case err:                     \
-    return #err
+  case err: return #err
 
 namespace raft {
 
@@ -34,15 +33,15 @@ namespace raft {
  * @brief Exception thrown when a cuBLAS error is encountered.
  */
 struct cublas_error : public raft::exception {
-  explicit cublas_error(char const *const message) : raft::exception(message) {}
-  explicit cublas_error(std::string const &message)
-    : raft::exception(message) {}
+  explicit cublas_error(char const* const message) : raft::exception(message) {}
+  explicit cublas_error(std::string const& message) : raft::exception(message) {}
 };
 
 namespace linalg {
 namespace detail {
 
-inline const char *cublas_error_to_string(cublasStatus_t err) {
+inline const char* cublas_error_to_string(cublasStatus_t err)
+{
   switch (err) {
     _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_SUCCESS);
     _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_INITIALIZED);
@@ -54,8 +53,7 @@ inline const char *cublas_error_to_string(cublasStatus_t err) {
     _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_INTERNAL_ERROR);
     _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_NOT_SUPPORTED);
     _CUBLAS_ERR_TO_STR(CUBLAS_STATUS_LICENSE_ERROR);
-    default:
-      return "CUBLAS_STATUS_UNKNOWN";
+    default: return "CUBLAS_STATUS_UNKNOWN";
   };
 }
 
@@ -71,29 +69,34 @@ inline const char *cublas_error_to_string(cublasStatus_t err) {
  * Invokes a cuBLAS runtime API function call, if the call does not return
  * CUBLAS_STATUS_SUCCESS, throws an exception detailing the cuBLAS error that occurred
  */
-#define CUBLAS_TRY(call)                                                      \
-  do {                                                                        \
-    cublasStatus_t const status = (call);                                     \
-    if (CUBLAS_STATUS_SUCCESS != status) {                                    \
-      std::string msg{};                                                      \
-      SET_ERROR_MSG(                                                          \
-        msg, "cuBLAS error encountered at: ", "call='%s', Reason=%d:%s",      \
-        #call, status, raft::linalg::detail::cublas_error_to_string(status)); \
-      throw raft::cublas_error(msg);                                          \
-    }                                                                         \
+#define CUBLAS_TRY(call)                                                   \
+  do {                                                                     \
+    cublasStatus_t const status = (call);                                  \
+    if (CUBLAS_STATUS_SUCCESS != status) {                                 \
+      std::string msg{};                                                   \
+      SET_ERROR_MSG(msg,                                                   \
+                    "cuBLAS error encountered at: ",                       \
+                    "call='%s', Reason=%d:%s",                             \
+                    #call,                                                 \
+                    status,                                                \
+                    raft::linalg::detail::cublas_error_to_string(status)); \
+      throw raft::cublas_error(msg);                                       \
+    }                                                                      \
   } while (0)
 
 /** FIXME: temporary alias for cuML compatibility */
 #define CUBLAS_CHECK(call) CUBLAS_TRY(call)
 
 /** check for cublas runtime API errors but do not assert */
-#define CUBLAS_CHECK_NO_THROW(call)                                          \
-  do {                                                                       \
-    cublasStatus_t err = call;                                               \
-    if (err != CUBLAS_STATUS_SUCCESS) {                                      \
-      CUML_LOG_ERROR("CUBLAS call='%s' got errorcode=%d err=%s", #call, err, \
-                     raft::linalg::detail::cublas_error_to_string(err));     \
-    }                                                                        \
+#define CUBLAS_CHECK_NO_THROW(call)                                      \
+  do {                                                                   \
+    cublasStatus_t err = call;                                           \
+    if (err != CUBLAS_STATUS_SUCCESS) {                                  \
+      CUML_LOG_ERROR("CUBLAS call='%s' got errorcode=%d err=%s",         \
+                     #call,                                              \
+                     err,                                                \
+                     raft::linalg::detail::cublas_error_to_string(err)); \
+    }                                                                    \
   } while (0)
 
 namespace raft {
@@ -104,22 +107,39 @@ namespace linalg {
  * @{
  */
 template <typename T>
-cublasStatus_t cublasaxpy(cublasHandle_t handle, int n, const T *alpha,
-                          const T *x, int incx, T *y, int incy,
+cublasStatus_t cublasaxpy(cublasHandle_t handle,
+                          int n,
+                          const T* alpha,
+                          const T* x,
+                          int incx,
+                          T* y,
+                          int incy,
                           cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int n,
-                                 const float *alpha, const float *x, int incx,
-                                 float *y, int incy, cudaStream_t stream) {
+inline cublasStatus_t cublasaxpy(cublasHandle_t handle,
+                                 int n,
+                                 const float* alpha,
+                                 const float* x,
+                                 int incx,
+                                 float* y,
+                                 int incy,
+                                 cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSaxpy(handle, n, alpha, x, incx, y, incy);
 }
 
 template <>
-inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int n,
-                                 const double *alpha, const double *x, int incx,
-                                 double *y, int incy, cudaStream_t stream) {
+inline cublasStatus_t cublasaxpy(cublasHandle_t handle,
+                                 int n,
+                                 const double* alpha,
+                                 const double* x,
+                                 int incx,
+                                 double* y,
+                                 int incy,
+                                 cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDaxpy(handle, n, alpha, x, incx, y, incy);
 }
@@ -130,21 +150,21 @@ inline cublasStatus_t cublasaxpy(cublasHandle_t handle, int n,
  * @{
  */
 template <typename T>
-cublasStatus_t cublasSwap(cublasHandle_t handle, int n, T *x, int incx, T *y,
-                          int incy, cudaStream_t stream);
+cublasStatus_t cublasSwap(
+  cublasHandle_t handle, int n, T* x, int incx, T* y, int incy, cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublasSwap(cublasHandle_t handle, int n, float *x,
-                                 int incx, float *y, int incy,
-                                 cudaStream_t stream) {
+inline cublasStatus_t cublasSwap(
+  cublasHandle_t handle, int n, float* x, int incx, float* y, int incy, cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSswap(handle, n, x, incx, y, incy);
 }
 
 template <>
-inline cublasStatus_t cublasSwap(cublasHandle_t handle, int n, double *x,
-                                 int incx, double *y, int incy,
-                                 cudaStream_t stream) {
+inline cublasStatus_t cublasSwap(
+  cublasHandle_t handle, int n, double* x, int incx, double* y, int incy, cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDswap(handle, n, x, incx, y, incy);
 }
@@ -156,20 +176,20 @@ inline cublasStatus_t cublasSwap(cublasHandle_t handle, int n, double *x,
  * @{
  */
 template <typename T>
-cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const T *x, int incx,
-                          T *y, int incy, cudaStream_t stream);
+cublasStatus_t cublasCopy(
+  cublasHandle_t handle, int n, const T* x, int incx, T* y, int incy, cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const float *x,
-                                 int incx, float *y, int incy,
-                                 cudaStream_t stream) {
+inline cublasStatus_t cublasCopy(
+  cublasHandle_t handle, int n, const float* x, int incx, float* y, int incy, cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasScopy(handle, n, x, incx, y, incy);
 }
 template <>
-inline cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const double *x,
-                                 int incx, double *y, int incy,
-                                 cudaStream_t stream) {
+inline cublasStatus_t cublasCopy(
+  cublasHandle_t handle, int n, const double* x, int incx, double* y, int incy, cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDcopy(handle, n, x, incx, y, incy);
 }
@@ -180,31 +200,56 @@ inline cublasStatus_t cublasCopy(cublasHandle_t handle, int n, const double *x,
  * @{
  */
 template <typename T>
-cublasStatus_t cublasgemv(cublasHandle_t handle, cublasOperation_t transA,
-                          int m, int n, const T *alfa, const T *A, int lda,
-                          const T *x, int incx, const T *beta, T *y, int incy,
+cublasStatus_t cublasgemv(cublasHandle_t handle,
+                          cublasOperation_t transA,
+                          int m,
+                          int n,
+                          const T* alfa,
+                          const T* A,
+                          int lda,
+                          const T* x,
+                          int incx,
+                          const T* beta,
+                          T* y,
+                          int incy,
                           cudaStream_t stream);
 
 template <>
 inline cublasStatus_t cublasgemv(cublasHandle_t handle,
-                                 cublasOperation_t transA, int m, int n,
-                                 const float *alfa, const float *A, int lda,
-                                 const float *x, int incx, const float *beta,
-                                 float *y, int incy, cudaStream_t stream) {
+                                 cublasOperation_t transA,
+                                 int m,
+                                 int n,
+                                 const float* alfa,
+                                 const float* A,
+                                 int lda,
+                                 const float* x,
+                                 int incx,
+                                 const float* beta,
+                                 float* y,
+                                 int incy,
+                                 cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasSgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y,
-                     incy);
+  return cublasSgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, incy);
 }
 
 template <>
 inline cublasStatus_t cublasgemv(cublasHandle_t handle,
-                                 cublasOperation_t transA, int m, int n,
-                                 const double *alfa, const double *A, int lda,
-                                 const double *x, int incx, const double *beta,
-                                 double *y, int incy, cudaStream_t stream) {
+                                 cublasOperation_t transA,
+                                 int m,
+                                 int n,
+                                 const double* alfa,
+                                 const double* A,
+                                 int lda,
+                                 const double* x,
+                                 int incx,
+                                 const double* beta,
+                                 double* y,
+                                 int incy,
+                                 cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasDgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y,
-                     incy);
+  return cublasDgemv(handle, transA, m, n, alfa, A, lda, x, incx, beta, y, incy);
 }
 /** @} */
 
@@ -213,23 +258,47 @@ inline cublasStatus_t cublasgemv(cublasHandle_t handle,
  * @{
  */
 template <typename T>
-cublasStatus_t cublasger(cublasHandle_t handle, int m, int n, const T *alpha,
-                         const T *x, int incx, const T *y, int incy, T *A,
-                         int lda, cudaStream_t stream);
+cublasStatus_t cublasger(cublasHandle_t handle,
+                         int m,
+                         int n,
+                         const T* alpha,
+                         const T* x,
+                         int incx,
+                         const T* y,
+                         int incy,
+                         T* A,
+                         int lda,
+                         cudaStream_t stream);
 template <>
-inline cublasStatus_t cublasger(cublasHandle_t handle, int m, int n,
-                                const float *alpha, const float *x, int incx,
-                                const float *y, int incy, float *A, int lda,
-                                cudaStream_t stream) {
+inline cublasStatus_t cublasger(cublasHandle_t handle,
+                                int m,
+                                int n,
+                                const float* alpha,
+                                const float* x,
+                                int incx,
+                                const float* y,
+                                int incy,
+                                float* A,
+                                int lda,
+                                cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSger(handle, m, n, alpha, x, incx, y, incy, A, lda);
 }
 
 template <>
-inline cublasStatus_t cublasger(cublasHandle_t handle, int m, int n,
-                                const double *alpha, const double *x, int incx,
-                                const double *y, int incy, double *A, int lda,
-                                cudaStream_t stream) {
+inline cublasStatus_t cublasger(cublasHandle_t handle,
+                                int m,
+                                int n,
+                                const double* alpha,
+                                const double* x,
+                                int incx,
+                                const double* y,
+                                int incy,
+                                double* A,
+                                int lda,
+                                cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDger(handle, m, n, alpha, x, incx, y, incy, A, lda);
 }
@@ -240,34 +309,62 @@ inline cublasStatus_t cublasger(cublasHandle_t handle, int m, int n,
  * @{
  */
 template <typename T>
-cublasStatus_t cublasgemm(cublasHandle_t handle, cublasOperation_t transA,
-                          cublasOperation_t transB, int m, int n, int k,
-                          const T *alfa, const T *A, int lda, const T *B,
-                          int ldb, const T *beta, T *C, int ldc,
+cublasStatus_t cublasgemm(cublasHandle_t handle,
+                          cublasOperation_t transA,
+                          cublasOperation_t transB,
+                          int m,
+                          int n,
+                          int k,
+                          const T* alfa,
+                          const T* A,
+                          int lda,
+                          const T* B,
+                          int ldb,
+                          const T* beta,
+                          T* C,
+                          int ldc,
                           cudaStream_t stream);
 
 template <>
 inline cublasStatus_t cublasgemm(cublasHandle_t handle,
                                  cublasOperation_t transA,
-                                 cublasOperation_t transB, int m, int n, int k,
-                                 const float *alfa, const float *A, int lda,
-                                 const float *B, int ldb, const float *beta,
-                                 float *C, int ldc, cudaStream_t stream) {
+                                 cublasOperation_t transB,
+                                 int m,
+                                 int n,
+                                 int k,
+                                 const float* alfa,
+                                 const float* A,
+                                 int lda,
+                                 const float* B,
+                                 int ldb,
+                                 const float* beta,
+                                 float* C,
+                                 int ldc,
+                                 cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasSgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb,
-                     beta, C, ldc);
+  return cublasSgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, beta, C, ldc);
 }
 
 template <>
 inline cublasStatus_t cublasgemm(cublasHandle_t handle,
                                  cublasOperation_t transA,
-                                 cublasOperation_t transB, int m, int n, int k,
-                                 const double *alfa, const double *A, int lda,
-                                 const double *B, int ldb, const double *beta,
-                                 double *C, int ldc, cudaStream_t stream) {
+                                 cublasOperation_t transB,
+                                 int m,
+                                 int n,
+                                 int k,
+                                 const double* alfa,
+                                 const double* A,
+                                 int lda,
+                                 const double* B,
+                                 int ldb,
+                                 const double* beta,
+                                 double* C,
+                                 int ldc,
+                                 cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasDgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb,
-                     beta, C, ldc);
+  return cublasDgemm(handle, transA, transB, m, n, k, alfa, A, lda, B, ldb, beta, C, ldc);
 }
 /** @} */
 
@@ -278,38 +375,93 @@ inline cublasStatus_t cublasgemm(cublasHandle_t handle,
 template <typename T>
 cublasStatus_t cublasgemmBatched(cublasHandle_t handle,  // NOLINT
                                  cublasOperation_t transa,
-                                 cublasOperation_t transb, int m, int n, int k,
-                                 const T *alpha,
-                                 const T *const Aarray[],           // NOLINT
-                                 int lda, const T *const Barray[],  // NOLINT
-                                 int ldb, const T *beta,
-                                 T *Carray[],  // NOLINT
-                                 int ldc, int batchCount, cudaStream_t stream);
+                                 cublasOperation_t transb,
+                                 int m,
+                                 int n,
+                                 int k,
+                                 const T* alpha,
+                                 const T* const Aarray[],  // NOLINT
+                                 int lda,
+                                 const T* const Barray[],  // NOLINT
+                                 int ldb,
+                                 const T* beta,
+                                 T* Carray[],  // NOLINT
+                                 int ldc,
+                                 int batchCount,
+                                 cudaStream_t stream);
 
 template <>
 inline cublasStatus_t cublasgemmBatched(  // NOLINT
-  cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-  int m, int n, int k, const float *alpha,
-  const float *const Aarray[],                  // NOLINT
-  int lda, const float *const Barray[],         // NOLINT
-  int ldb, const float *beta, float *Carray[],  // NOLINT
-  int ldc, int batchCount, cudaStream_t stream) {
+  cublasHandle_t handle,
+  cublasOperation_t transa,
+  cublasOperation_t transb,
+  int m,
+  int n,
+  int k,
+  const float* alpha,
+  const float* const Aarray[],  // NOLINT
+  int lda,
+  const float* const Barray[],  // NOLINT
+  int ldb,
+  const float* beta,
+  float* Carray[],  // NOLINT
+  int ldc,
+  int batchCount,
+  cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasSgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda,
-                            Barray, ldb, beta, Carray, ldc, batchCount);
+  return cublasSgemmBatched(handle,
+                            transa,
+                            transb,
+                            m,
+                            n,
+                            k,
+                            alpha,
+                            Aarray,
+                            lda,
+                            Barray,
+                            ldb,
+                            beta,
+                            Carray,
+                            ldc,
+                            batchCount);
 }
 
 template <>
 inline cublasStatus_t cublasgemmBatched(  // NOLINT
-  cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-  int m, int n, int k, const double *alpha,
-  const double *const Aarray[],                   // NOLINT
-  int lda, const double *const Barray[],          // NOLINT
-  int ldb, const double *beta, double *Carray[],  // NOLINT
-  int ldc, int batchCount, cudaStream_t stream) {
+  cublasHandle_t handle,
+  cublasOperation_t transa,
+  cublasOperation_t transb,
+  int m,
+  int n,
+  int k,
+  const double* alpha,
+  const double* const Aarray[],  // NOLINT
+  int lda,
+  const double* const Barray[],  // NOLINT
+  int ldb,
+  const double* beta,
+  double* Carray[],  // NOLINT
+  int ldc,
+  int batchCount,
+  cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasDgemmBatched(handle, transa, transb, m, n, k, alpha, Aarray, lda,
-                            Barray, ldb, beta, Carray, ldc, batchCount);
+  return cublasDgemmBatched(handle,
+                            transa,
+                            transb,
+                            m,
+                            n,
+                            k,
+                            alpha,
+                            Aarray,
+                            lda,
+                            Barray,
+                            ldb,
+                            beta,
+                            Carray,
+                            ldc,
+                            batchCount);
 }
 /** @} */
 
@@ -319,36 +471,110 @@ inline cublasStatus_t cublasgemmBatched(  // NOLINT
  */
 template <typename T>
 cublasStatus_t cublasgemmStridedBatched(  // NOLINT
-  cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-  int m, int n, int k, const T *alpha, const T *const Aarray, int lda,
-  int64_t strideA, const T *const Barray, int ldb, int64_t strideB,
-  const T *beta, T *Carray, int ldc, int64_t strideC, int batchCount,
+  cublasHandle_t handle,
+  cublasOperation_t transa,
+  cublasOperation_t transb,
+  int m,
+  int n,
+  int k,
+  const T* alpha,
+  const T* const Aarray,
+  int lda,
+  int64_t strideA,
+  const T* const Barray,
+  int ldb,
+  int64_t strideB,
+  const T* beta,
+  T* Carray,
+  int ldc,
+  int64_t strideC,
+  int batchCount,
   cudaStream_t stream);
 
 template <>
 inline cublasStatus_t cublasgemmStridedBatched(  // NOLINT
-  cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-  int m, int n, int k, const float *alpha, const float *const Aarray, int lda,
-  int64_t strideA, const float *const Barray, int ldb, int64_t strideB,
-  const float *beta, float *Carray, int ldc, int64_t strideC, int batchCount,
-  cudaStream_t stream) {
+  cublasHandle_t handle,
+  cublasOperation_t transa,
+  cublasOperation_t transb,
+  int m,
+  int n,
+  int k,
+  const float* alpha,
+  const float* const Aarray,
+  int lda,
+  int64_t strideA,
+  const float* const Barray,
+  int ldb,
+  int64_t strideB,
+  const float* beta,
+  float* Carray,
+  int ldc,
+  int64_t strideC,
+  int batchCount,
+  cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasSgemmStridedBatched(handle, transa, transb, m, n, k, alpha,
-                                   Aarray, lda, strideA, Barray, ldb, strideB,
-                                   beta, Carray, ldc, strideC, batchCount);
+  return cublasSgemmStridedBatched(handle,
+                                   transa,
+                                   transb,
+                                   m,
+                                   n,
+                                   k,
+                                   alpha,
+                                   Aarray,
+                                   lda,
+                                   strideA,
+                                   Barray,
+                                   ldb,
+                                   strideB,
+                                   beta,
+                                   Carray,
+                                   ldc,
+                                   strideC,
+                                   batchCount);
 }
 
 template <>
 inline cublasStatus_t cublasgemmStridedBatched(  // NOLINT
-  cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb,
-  int m, int n, int k, const double *alpha, const double *const Aarray, int lda,
-  int64_t strideA, const double *const Barray, int ldb, int64_t strideB,
-  const double *beta, double *Carray, int ldc, int64_t strideC, int batchCount,
-  cudaStream_t stream) {
+  cublasHandle_t handle,
+  cublasOperation_t transa,
+  cublasOperation_t transb,
+  int m,
+  int n,
+  int k,
+  const double* alpha,
+  const double* const Aarray,
+  int lda,
+  int64_t strideA,
+  const double* const Barray,
+  int ldb,
+  int64_t strideB,
+  const double* beta,
+  double* Carray,
+  int ldc,
+  int64_t strideC,
+  int batchCount,
+  cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasDgemmStridedBatched(handle, transa, transb, m, n, k, alpha,
-                                   Aarray, lda, strideA, Barray, ldb, strideB,
-                                   beta, Carray, ldc, strideC, batchCount);
+  return cublasDgemmStridedBatched(handle,
+                                   transa,
+                                   transb,
+                                   m,
+                                   n,
+                                   k,
+                                   alpha,
+                                   Aarray,
+                                   lda,
+                                   strideA,
+                                   Barray,
+                                   ldb,
+                                   strideB,
+                                   beta,
+                                   Carray,
+                                   ldc,
+                                   strideC,
+                                   batchCount);
 }
 /** @} */
 
@@ -358,51 +584,85 @@ inline cublasStatus_t cublasgemmStridedBatched(  // NOLINT
  */
 
 template <typename T>
-cublasStatus_t cublasgetrfBatched(cublasHandle_t handle, int n,  // NOLINT
-                                  T *const A[],                  // NOLINT
-                                  int lda, int *P, int *info, int batchSize,
+cublasStatus_t cublasgetrfBatched(cublasHandle_t handle,
+                                  int n,         // NOLINT
+                                  T* const A[],  // NOLINT
+                                  int lda,
+                                  int* P,
+                                  int* info,
+                                  int batchSize,
                                   cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle,    // NOLINT
-                                         int n, float *const A[],  // NOLINT
-                                         int lda, int *P, int *info,
-                                         int batchSize, cudaStream_t stream) {
+inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle,  // NOLINT
+                                         int n,
+                                         float* const A[],  // NOLINT
+                                         int lda,
+                                         int* P,
+                                         int* info,
+                                         int batchSize,
+                                         cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSgetrfBatched(handle, n, A, lda, P, info, batchSize);
 }
 
 template <>
-inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle,     // NOLINT
-                                         int n, double *const A[],  // NOLINT
-                                         int lda, int *P, int *info,
-                                         int batchSize, cudaStream_t stream) {
+inline cublasStatus_t cublasgetrfBatched(cublasHandle_t handle,  // NOLINT
+                                         int n,
+                                         double* const A[],  // NOLINT
+                                         int lda,
+                                         int* P,
+                                         int* info,
+                                         int batchSize,
+                                         cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDgetrfBatched(handle, n, A, lda, P, info, batchSize);
 }
 
 template <typename T>
-cublasStatus_t cublasgetriBatched(cublasHandle_t handle, int n,  // NOLINT
-                                  const T *const A[],            // NOLINT
-                                  int lda, const int *P,
-                                  T *const C[],  // NOLINT
-                                  int ldc, int *info, int batchSize,
+cublasStatus_t cublasgetriBatched(cublasHandle_t handle,
+                                  int n,               // NOLINT
+                                  const T* const A[],  // NOLINT
+                                  int lda,
+                                  const int* P,
+                                  T* const C[],  // NOLINT
+                                  int ldc,
+                                  int* info,
+                                  int batchSize,
                                   cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublasgetriBatched(                // NOLINT
-  cublasHandle_t handle, int n, const float *const A[],  // NOLINT
-  int lda, const int *P, float *const C[],               // NOLINT
-  int ldc, int *info, int batchSize, cudaStream_t stream) {
+inline cublasStatus_t cublasgetriBatched(  // NOLINT
+  cublasHandle_t handle,
+  int n,
+  const float* const A[],  // NOLINT
+  int lda,
+  const int* P,
+  float* const C[],  // NOLINT
+  int ldc,
+  int* info,
+  int batchSize,
+  cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize);
 }
 
 template <>
-inline cublasStatus_t cublasgetriBatched(                 // NOLINT
-  cublasHandle_t handle, int n, const double *const A[],  // NOLINT
-  int lda, const int *P, double *const C[],               // NOLINT
-  int ldc, int *info, int batchSize, cudaStream_t stream) {
+inline cublasStatus_t cublasgetriBatched(  // NOLINT
+  cublasHandle_t handle,
+  int n,
+  const double* const A[],  // NOLINT
+  int lda,
+  const int* P,
+  double* const C[],  // NOLINT
+  int ldc,
+  int* info,
+  int batchSize,
+  cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDgetriBatched(handle, n, A, lda, P, C, ldc, info, batchSize);
 }
@@ -416,34 +676,57 @@ inline cublasStatus_t cublasgetriBatched(                 // NOLINT
 
 template <typename T>
 inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle,  // NOLINT
-                                        cublasOperation_t trans, int m, int n,
-                                        int nrhs, T *Aarray[],  // NOLINT
-                                        int lda, T *Carray[],   // NOLINT
-                                        int ldc, int *info, int *devInfoArray,
-                                        int batchSize, cudaStream_t stream);
+                                        cublasOperation_t trans,
+                                        int m,
+                                        int n,
+                                        int nrhs,
+                                        T* Aarray[],  // NOLINT
+                                        int lda,
+                                        T* Carray[],  // NOLINT
+                                        int ldc,
+                                        int* info,
+                                        int* devInfoArray,
+                                        int batchSize,
+                                        cudaStream_t stream);
 
 template <>
 inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle,  // NOLINT
-                                        cublasOperation_t trans, int m, int n,
-                                        int nrhs, float *Aarray[],  // NOLINT
-                                        int lda, float *Carray[],   // NOLINT
-                                        int ldc, int *info, int *devInfoArray,
-                                        int batchSize, cudaStream_t stream) {
+                                        cublasOperation_t trans,
+                                        int m,
+                                        int n,
+                                        int nrhs,
+                                        float* Aarray[],  // NOLINT
+                                        int lda,
+                                        float* Carray[],  // NOLINT
+                                        int ldc,
+                                        int* info,
+                                        int* devInfoArray,
+                                        int batchSize,
+                                        cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasSgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc,
-                            info, devInfoArray, batchSize);
+  return cublasSgelsBatched(
+    handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize);
 }
 
 template <>
 inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle,  // NOLINT
-                                        cublasOperation_t trans, int m, int n,
-                                        int nrhs, double *Aarray[],  // NOLINT
-                                        int lda, double *Carray[],   // NOLINT
-                                        int ldc, int *info, int *devInfoArray,
-                                        int batchSize, cudaStream_t stream) {
+                                        cublasOperation_t trans,
+                                        int m,
+                                        int n,
+                                        int nrhs,
+                                        double* Aarray[],  // NOLINT
+                                        int lda,
+                                        double* Carray[],  // NOLINT
+                                        int ldc,
+                                        int* info,
+                                        int* devInfoArray,
+                                        int batchSize,
+                                        cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasDgelsBatched(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc,
-                            info, devInfoArray, batchSize);
+  return cublasDgelsBatched(
+    handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize);
 }
 
 /** @} */
@@ -453,33 +736,59 @@ inline cublasStatus_t cublasgelsBatched(cublasHandle_t handle,  // NOLINT
  * @{
  */
 template <typename T>
-cublasStatus_t cublasgeam(cublasHandle_t handle, cublasOperation_t transA,
-                          cublasOperation_t transB, int m, int n, const T *alfa,
-                          const T *A, int lda, const T *beta, const T *B,
-                          int ldb, T *C, int ldc, cudaStream_t stream);
+cublasStatus_t cublasgeam(cublasHandle_t handle,
+                          cublasOperation_t transA,
+                          cublasOperation_t transB,
+                          int m,
+                          int n,
+                          const T* alfa,
+                          const T* A,
+                          int lda,
+                          const T* beta,
+                          const T* B,
+                          int ldb,
+                          T* C,
+                          int ldc,
+                          cudaStream_t stream);
 
 template <>
 inline cublasStatus_t cublasgeam(cublasHandle_t handle,
                                  cublasOperation_t transA,
-                                 cublasOperation_t transB, int m, int n,
-                                 const float *alfa, const float *A, int lda,
-                                 const float *beta, const float *B, int ldb,
-                                 float *C, int ldc, cudaStream_t stream) {
+                                 cublasOperation_t transB,
+                                 int m,
+                                 int n,
+                                 const float* alfa,
+                                 const float* A,
+                                 int lda,
+                                 const float* beta,
+                                 const float* B,
+                                 int ldb,
+                                 float* C,
+                                 int ldc,
+                                 cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasSgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb,
-                     C, ldc);
+  return cublasSgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, C, ldc);
 }
 
 template <>
 inline cublasStatus_t cublasgeam(cublasHandle_t handle,
                                  cublasOperation_t transA,
-                                 cublasOperation_t transB, int m, int n,
-                                 const double *alfa, const double *A, int lda,
-                                 const double *beta, const double *B, int ldb,
-                                 double *C, int ldc, cudaStream_t stream) {
+                                 cublasOperation_t transB,
+                                 int m,
+                                 int n,
+                                 const double* alfa,
+                                 const double* A,
+                                 int lda,
+                                 const double* beta,
+                                 const double* B,
+                                 int ldb,
+                                 double* C,
+                                 int ldc,
+                                 cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasDgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb,
-                     C, ldc);
+  return cublasDgeam(handle, transA, transB, m, n, alfa, A, lda, beta, B, ldb, C, ldc);
 }
 /** @} */
 
@@ -488,31 +797,59 @@ inline cublasStatus_t cublasgeam(cublasHandle_t handle,
  * @{
  */
 template <typename T>
-cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side,
-                          cublasFillMode_t uplo, int m, int n, const T *alpha,
-                          const T *A, int lda, const T *B, int ldb,
-                          const T *beta, T *C, int ldc, cudaStream_t stream);
+cublasStatus_t cublassymm(cublasHandle_t handle,
+                          cublasSideMode_t side,
+                          cublasFillMode_t uplo,
+                          int m,
+                          int n,
+                          const T* alpha,
+                          const T* A,
+                          int lda,
+                          const T* B,
+                          int ldb,
+                          const T* beta,
+                          T* C,
+                          int ldc,
+                          cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side,
-                                 cublasFillMode_t uplo, int m, int n,
-                                 const float *alpha, const float *A, int lda,
-                                 const float *B, int ldb, const float *beta,
-                                 float *C, int ldc, cudaStream_t stream) {
+inline cublasStatus_t cublassymm(cublasHandle_t handle,
+                                 cublasSideMode_t side,
+                                 cublasFillMode_t uplo,
+                                 int m,
+                                 int n,
+                                 const float* alpha,
+                                 const float* A,
+                                 int lda,
+                                 const float* B,
+                                 int ldb,
+                                 const float* beta,
+                                 float* C,
+                                 int ldc,
+                                 cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasSsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C,
-                     ldc);
+  return cublasSsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
 }
 
 template <>
-inline cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side,
-                                 cublasFillMode_t uplo, int m, int n,
-                                 const double *alpha, const double *A, int lda,
-                                 const double *B, int ldb, const double *beta,
-                                 double *C, int ldc, cudaStream_t stream) {
+inline cublasStatus_t cublassymm(cublasHandle_t handle,
+                                 cublasSideMode_t side,
+                                 cublasFillMode_t uplo,
+                                 int m,
+                                 int n,
+                                 const double* alpha,
+                                 const double* A,
+                                 int lda,
+                                 const double* B,
+                                 int ldb,
+                                 const double* beta,
+                                 double* C,
+                                 int ldc,
+                                 cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasDsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C,
-                     ldc);
+  return cublasDsymm(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc);
 }
 /** @} */
 
@@ -521,27 +858,51 @@ inline cublasStatus_t cublassymm(cublasHandle_t handle, cublasSideMode_t side,
  * @{
  */
 template <typename T>
-cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo,
-                          cublasOperation_t trans, int n, int k, const T *alpha,
-                          const T *A, int lda, const T *beta, T *C, int ldc,
+cublasStatus_t cublassyrk(cublasHandle_t handle,
+                          cublasFillMode_t uplo,
+                          cublasOperation_t trans,
+                          int n,
+                          int k,
+                          const T* alpha,
+                          const T* A,
+                          int lda,
+                          const T* beta,
+                          T* C,
+                          int ldc,
                           cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo,
-                                 cublasOperation_t trans, int n, int k,
-                                 const float *alpha, const float *A, int lda,
-                                 const float *beta, float *C, int ldc,
-                                 cudaStream_t stream) {
+inline cublasStatus_t cublassyrk(cublasHandle_t handle,
+                                 cublasFillMode_t uplo,
+                                 cublasOperation_t trans,
+                                 int n,
+                                 int k,
+                                 const float* alpha,
+                                 const float* A,
+                                 int lda,
+                                 const float* beta,
+                                 float* C,
+                                 int ldc,
+                                 cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
 }
 
 template <>
-inline cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo,
-                                 cublasOperation_t trans, int n, int k,
-                                 const double *alpha, const double *A, int lda,
-                                 const double *beta, double *C, int ldc,
-                                 cudaStream_t stream) {
+inline cublasStatus_t cublassyrk(cublasHandle_t handle,
+                                 cublasFillMode_t uplo,
+                                 cublasOperation_t trans,
+                                 int n,
+                                 int k,
+                                 const double* alpha,
+                                 const double* A,
+                                 int lda,
+                                 const double* beta,
+                                 double* C,
+                                 int ldc,
+                                 cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDsyrk(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc);
 }
@@ -552,52 +913,77 @@ inline cublasStatus_t cublassyrk(cublasHandle_t handle, cublasFillMode_t uplo,
  * @{
  */
 template <typename T>
-cublasStatus_t cublasnrm2(cublasHandle_t handle, int n, const T *x, int incx,
-                          T *result, cudaStream_t stream);
+cublasStatus_t cublasnrm2(
+  cublasHandle_t handle, int n, const T* x, int incx, T* result, cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublasnrm2(cublasHandle_t handle, int n, const float *x,
-                                 int incx, float *result, cudaStream_t stream) {
+inline cublasStatus_t cublasnrm2(
+  cublasHandle_t handle, int n, const float* x, int incx, float* result, cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSnrm2(handle, n, x, incx, result);
 }
 
 template <>
-inline cublasStatus_t cublasnrm2(cublasHandle_t handle, int n, const double *x,
-                                 int incx, double *result,
-                                 cudaStream_t stream) {
+inline cublasStatus_t cublasnrm2(
+  cublasHandle_t handle, int n, const double* x, int incx, double* result, cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDnrm2(handle, n, x, incx, result);
 }
 /** @} */
 
 template <typename T>
-cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side,
-                          cublasFillMode_t uplo, cublasOperation_t trans,
-                          cublasDiagType_t diag, int m, int n, const T *alpha,
-                          const T *A, int lda, T *B, int ldb,
+cublasStatus_t cublastrsm(cublasHandle_t handle,
+                          cublasSideMode_t side,
+                          cublasFillMode_t uplo,
+                          cublasOperation_t trans,
+                          cublasDiagType_t diag,
+                          int m,
+                          int n,
+                          const T* alpha,
+                          const T* A,
+                          int lda,
+                          T* B,
+                          int ldb,
                           cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side,
-                                 cublasFillMode_t uplo, cublasOperation_t trans,
-                                 cublasDiagType_t diag, int m, int n,
-                                 const float *alpha, const float *A, int lda,
-                                 float *B, int ldb, cudaStream_t stream) {
+inline cublasStatus_t cublastrsm(cublasHandle_t handle,
+                                 cublasSideMode_t side,
+                                 cublasFillMode_t uplo,
+                                 cublasOperation_t trans,
+                                 cublasDiagType_t diag,
+                                 int m,
+                                 int n,
+                                 const float* alpha,
+                                 const float* A,
+                                 int lda,
+                                 float* B,
+                                 int ldb,
+                                 cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasStrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B,
-                     ldb);
+  return cublasStrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
 }
 
 template <>
-inline cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side,
-                                 cublasFillMode_t uplo, cublasOperation_t trans,
-                                 cublasDiagType_t diag, int m, int n,
-                                 const double *alpha, const double *A, int lda,
-                                 double *B, int ldb, cudaStream_t stream) {
+inline cublasStatus_t cublastrsm(cublasHandle_t handle,
+                                 cublasSideMode_t side,
+                                 cublasFillMode_t uplo,
+                                 cublasOperation_t trans,
+                                 cublasDiagType_t diag,
+                                 int m,
+                                 int n,
+                                 const double* alpha,
+                                 const double* A,
+                                 int lda,
+                                 double* B,
+                                 int ldb,
+                                 cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
-  return cublasDtrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B,
-                     ldb);
+  return cublasDtrsm(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb);
 }
 
 /**
@@ -605,21 +991,39 @@ inline cublasStatus_t cublastrsm(cublasHandle_t handle, cublasSideMode_t side,
  * @{
  */
 template <typename T>
-cublasStatus_t cublasdot(cublasHandle_t handle, int n, const T *x, int incx,
-                         const T *y, int incy, T *result, cudaStream_t stream);
+cublasStatus_t cublasdot(cublasHandle_t handle,
+                         int n,
+                         const T* x,
+                         int incx,
+                         const T* y,
+                         int incy,
+                         T* result,
+                         cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublasdot(cublasHandle_t handle, int n, const float *x,
-                                int incx, const float *y, int incy,
-                                float *result, cudaStream_t stream) {
+inline cublasStatus_t cublasdot(cublasHandle_t handle,
+                                int n,
+                                const float* x,
+                                int incx,
+                                const float* y,
+                                int incy,
+                                float* result,
+                                cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSdot(handle, n, x, incx, y, incy, result);
 }
 
 template <>
-inline cublasStatus_t cublasdot(cublasHandle_t handle, int n, const double *x,
-                                int incx, const double *y, int incy,
-                                double *result, cudaStream_t stream) {
+inline cublasStatus_t cublasdot(cublasHandle_t handle,
+                                int n,
+                                const double* x,
+                                int incx,
+                                const double* y,
+                                int incy,
+                                double* result,
+                                cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDdot(handle, n, x, incx, y, incy, result);
 }
@@ -639,7 +1043,8 @@ inline cublasStatus_t cublasdot(cublasHandle_t handle, int n, const double *x,
 // template<>
 inline cublasStatus_t cublassetpointermode(cublasHandle_t handle,
                                            cublasPointerMode_t mode,
-                                           cudaStream_t stream) {
+                                           cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSetPointerMode(handle, mode);
 }
@@ -650,21 +1055,21 @@ inline cublasStatus_t cublassetpointermode(cublasHandle_t handle,
  * @{
  */
 template <typename T>
-cublasStatus_t cublasscal(cublasHandle_t handle, int n, const T *alpha, T *x,
-                          int incx, cudaStream_t stream);
+cublasStatus_t cublasscal(
+  cublasHandle_t handle, int n, const T* alpha, T* x, int incx, cudaStream_t stream);
 
 template <>
-inline cublasStatus_t cublasscal(cublasHandle_t handle, int n,
-                                 const float *alpha, float *x, int incx,
-                                 cudaStream_t stream) {
+inline cublasStatus_t cublasscal(
+  cublasHandle_t handle, int n, const float* alpha, float* x, int incx, cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasSscal(handle, n, alpha, x, incx);
 }
 
 template <>
-inline cublasStatus_t cublasscal(cublasHandle_t handle, int n,
-                                 const double *alpha, double *x, int incx,
-                                 cudaStream_t stream) {
+inline cublasStatus_t cublasscal(
+  cublasHandle_t handle, int n, const double* alpha, double* x, int incx, cudaStream_t stream)
+{
   CUBLAS_CHECK(cublasSetStream(handle, stream));
   return cublasDscal(handle, n, alpha, x, incx);
 }
diff --git a/cpp/include/raft/linalg/cusolver_wrappers.h b/cpp/include/raft/linalg/cusolver_wrappers.h
index 6aa5e74455..85f2740647 100644
--- a/cpp/include/raft/linalg/cusolver_wrappers.h
+++ b/cpp/include/raft/linalg/cusolver_wrappers.h
@@ -24,8 +24,7 @@
 #include <type_traits>
 
 #define _CUSOLVER_ERR_TO_STR(err) \
-  case err:                       \
-    return #err;
+  case err: return #err;
 
 namespace raft {
 
@@ -33,16 +32,15 @@ namespace raft {
  * @brief Exception thrown when a cuSOLVER error is encountered.
  */
 struct cusolver_error : public raft::exception {
-  explicit cusolver_error(char const *const message)
-    : raft::exception(message) {}
-  explicit cusolver_error(std::string const &message)
-    : raft::exception(message) {}
+  explicit cusolver_error(char const* const message) : raft::exception(message) {}
+  explicit cusolver_error(std::string const& message) : raft::exception(message) {}
 };
 
 namespace linalg {
 namespace detail {
 
-inline const char *cusolver_error_to_string(cusolverStatus_t err) {
+inline const char* cusolver_error_to_string(cusolverStatus_t err)
+{
   switch (err) {
     _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_SUCCESS);
     _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_INITIALIZED);
@@ -54,8 +52,7 @@ inline const char *cusolver_error_to_string(cusolverStatus_t err) {
     _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED);
     _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_ZERO_PIVOT);
     _CUSOLVER_ERR_TO_STR(CUSOLVER_STATUS_NOT_SUPPORTED);
-    default:
-      return "CUSOLVER_STATUS_UNKNOWN";
+    default: return "CUSOLVER_STATUS_UNKNOWN";
   };
 }
 
@@ -76,8 +73,11 @@ inline const char *cusolver_error_to_string(cusolverStatus_t err) {
     cusolverStatus_t const status = (call);                                  \
     if (CUSOLVER_STATUS_SUCCESS != status) {                                 \
       std::string msg{};                                                     \
-      SET_ERROR_MSG(msg, "cuSOLVER error encountered at: ",                  \
-                    "call='%s', Reason=%d:%s", #call, status,                \
+      SET_ERROR_MSG(msg,                                                     \
+                    "cuSOLVER error encountered at: ",                       \
+                    "call='%s', Reason=%d:%s",                               \
+                    #call,                                                   \
+                    status,                                                  \
                     raft::linalg::detail::cusolver_error_to_string(status)); \
       throw raft::cusolver_error(msg);                                       \
     }                                                                        \
@@ -107,42 +107,76 @@ namespace linalg {
  * @{
  */
 template <typename T>
-cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle, int m,  // NOLINT
-                                 int n, T *A, int lda, T *Workspace,
-                                 int *devIpiv, int *devInfo,
+cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle,
+                                 int m,  // NOLINT
+                                 int n,
+                                 T* A,
+                                 int lda,
+                                 T* Workspace,
+                                 int* devIpiv,
+                                 int* devInfo,
                                  cudaStream_t stream);
 
 template <>
 inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle,  // NOLINT
-                                        int m, int n, float *A, int lda,
-                                        float *Workspace, int *devIpiv,
-                                        int *devInfo, cudaStream_t stream) {
+                                        int m,
+                                        int n,
+                                        float* A,
+                                        int lda,
+                                        float* Workspace,
+                                        int* devIpiv,
+                                        int* devInfo,
+                                        cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnSgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo);
 }
 
 template <>
 inline cusolverStatus_t cusolverDngetrf(cusolverDnHandle_t handle,  // NOLINT
-                                        int m, int n, double *A, int lda,
-                                        double *Workspace, int *devIpiv,
-                                        int *devInfo, cudaStream_t stream) {
+                                        int m,
+                                        int n,
+                                        double* A,
+                                        int lda,
+                                        double* Workspace,
+                                        int* devIpiv,
+                                        int* devInfo,
+                                        cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnDgetrf(handle, m, n, A, lda, Workspace, devIpiv, devInfo);
 }
 
 template <typename T>
 cusolverStatus_t cusolverDngetrf_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, int m, int n, T *A, int lda, int *Lwork);
+  cusolverDnHandle_t handle,
+  int m,
+  int n,
+  T* A,
+  int lda,
+  int* Lwork);
 
 template <>
 inline cusolverStatus_t cusolverDngetrf_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *Lwork) {
+  cusolverDnHandle_t handle,
+  int m,
+  int n,
+  float* A,
+  int lda,
+  int* Lwork)
+{
   return cusolverDnSgetrf_bufferSize(handle, m, n, A, lda, Lwork);
 }
 
 template <>
 inline cusolverStatus_t cusolverDngetrf_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, int m, int n, double *A, int lda, int *Lwork) {
+  cusolverDnHandle_t handle,
+  int m,
+  int n,
+  double* A,
+  int lda,
+  int* Lwork)
+{
   return cusolverDnDgetrf_bufferSize(handle, m, n, A, lda, Lwork);
 }
 
@@ -152,30 +186,49 @@ inline cusolverStatus_t cusolverDngetrf_bufferSize(  // NOLINT
  */
 template <typename T>
 cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle,  // NOLINT
-                                 cublasOperation_t trans, int n, int nrhs,
-                                 const T *A, int lda, const int *devIpiv, T *B,
-                                 int ldb, int *devInfo, cudaStream_t stream);
+                                 cublasOperation_t trans,
+                                 int n,
+                                 int nrhs,
+                                 const T* A,
+                                 int lda,
+                                 const int* devIpiv,
+                                 T* B,
+                                 int ldb,
+                                 int* devInfo,
+                                 cudaStream_t stream);
 
 template <>
 inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle,  // NOLINT
-                                        cublasOperation_t trans, int n,
-                                        int nrhs, const float *A, int lda,
-                                        const int *devIpiv, float *B, int ldb,
-                                        int *devInfo, cudaStream_t stream) {
+                                        cublasOperation_t trans,
+                                        int n,
+                                        int nrhs,
+                                        const float* A,
+                                        int lda,
+                                        const int* devIpiv,
+                                        float* B,
+                                        int ldb,
+                                        int* devInfo,
+                                        cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnSgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb,
-                          devInfo);
+  return cusolverDnSgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo);
 }
 
 template <>
 inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle,  // NOLINT
-                                        cublasOperation_t trans, int n,
-                                        int nrhs, const double *A, int lda,
-                                        const int *devIpiv, double *B, int ldb,
-                                        int *devInfo, cudaStream_t stream) {
+                                        cublasOperation_t trans,
+                                        int n,
+                                        int nrhs,
+                                        const double* A,
+                                        int lda,
+                                        const int* devIpiv,
+                                        double* B,
+                                        int ldb,
+                                        int* devInfo,
+                                        cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnDgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb,
-                          devInfo);
+  return cusolverDnDgetrs(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo);
 }
 /** @} */
 
@@ -185,20 +238,40 @@ inline cusolverStatus_t cusolverDngetrs(cusolverDnHandle_t handle,  // NOLINT
  */
 template <typename T>
 cusolverStatus_t cusolverDnsyevd_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-  int n, const T *A, int lda, const T *W, int *lwork);
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  cublasFillMode_t uplo,
+  int n,
+  const T* A,
+  int lda,
+  const T* W,
+  int* lwork);
 
 template <>
 inline cusolverStatus_t cusolverDnsyevd_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-  int n, const float *A, int lda, const float *W, int *lwork) {
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  cublasFillMode_t uplo,
+  int n,
+  const float* A,
+  int lda,
+  const float* W,
+  int* lwork)
+{
   return cusolverDnSsyevd_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnsyevd_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-  int n, const double *A, int lda, const double *W, int *lwork) {
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  cublasFillMode_t uplo,
+  int n,
+  const double* A,
+  int lda,
+  const double* W,
+  int* lwork)
+{
   return cusolverDnDsyevd_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork);
 }
 /** @} */
@@ -209,52 +282,96 @@ inline cusolverStatus_t cusolverDnsyevd_bufferSize(  // NOLINT
  */
 template <typename T>
 cusolverStatus_t cusolverDnsyevj(cusolverDnHandle_t handle,  // NOLINT
-                                 cusolverEigMode_t jobz, cublasFillMode_t uplo,
-                                 int n, T *A, int lda, T *W, T *work, int lwork,
-                                 int *info, syevjInfo_t params,
+                                 cusolverEigMode_t jobz,
+                                 cublasFillMode_t uplo,
+                                 int n,
+                                 T* A,
+                                 int lda,
+                                 T* W,
+                                 T* work,
+                                 int lwork,
+                                 int* info,
+                                 syevjInfo_t params,
                                  cudaStream_t stream);
 
 template <>
 inline cusolverStatus_t cusolverDnsyevj(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-  int n, float *A, int lda, float *W, float *work, int lwork, int *info,
-  syevjInfo_t params, cudaStream_t stream) {
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  cublasFillMode_t uplo,
+  int n,
+  float* A,
+  int lda,
+  float* W,
+  float* work,
+  int lwork,
+  int* info,
+  syevjInfo_t params,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnSsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info,
-                          params);
+  return cusolverDnSsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnsyevj(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-  int n, double *A, int lda, double *W, double *work, int lwork, int *info,
-  syevjInfo_t params, cudaStream_t stream) {
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  cublasFillMode_t uplo,
+  int n,
+  double* A,
+  int lda,
+  double* W,
+  double* work,
+  int lwork,
+  int* info,
+  syevjInfo_t params,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnDsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info,
-                          params);
+  return cusolverDnDsyevj(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params);
 }
 
 template <typename T>
 cusolverStatus_t cusolverDnsyevj_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-  int n, const T *A, int lda, const T *W, int *lwork, syevjInfo_t params);
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  cublasFillMode_t uplo,
+  int n,
+  const T* A,
+  int lda,
+  const T* W,
+  int* lwork,
+  syevjInfo_t params);
 
 template <>
 inline cusolverStatus_t cusolverDnsyevj_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-  int n, const float *A, int lda, const float *W, int *lwork,
-  syevjInfo_t params) {
-  return cusolverDnSsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork,
-                                     params);
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  cublasFillMode_t uplo,
+  int n,
+  const float* A,
+  int lda,
+  const float* W,
+  int* lwork,
+  syevjInfo_t params)
+{
+  return cusolverDnSsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, params);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnsyevj_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo,
-  int n, const double *A, int lda, const double *W, int *lwork,
-  syevjInfo_t params) {
-  return cusolverDnDsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork,
-                                     params);
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  cublasFillMode_t uplo,
+  int n,
+  const double* A,
+  int lda,
+  const double* W,
+  int* lwork,
+  syevjInfo_t params)
+{
+  return cusolverDnDsyevj_bufferSize(handle, jobz, uplo, n, A, lda, W, lwork, params);
 }
 /** @} */
 
@@ -264,32 +381,49 @@ inline cusolverStatus_t cusolverDnsyevj_bufferSize(  // NOLINT
  */
 template <typename T>
 cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle,  // NOLINT
-                                 cusolverEigMode_t jobz, cublasFillMode_t uplo,
-                                 int n, T *A, int lda, T *W, T *work, int lwork,
-                                 int *devInfo, cudaStream_t stream);
+                                 cusolverEigMode_t jobz,
+                                 cublasFillMode_t uplo,
+                                 int n,
+                                 T* A,
+                                 int lda,
+                                 T* W,
+                                 T* work,
+                                 int lwork,
+                                 int* devInfo,
+                                 cudaStream_t stream);
 
 template <>
 inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle,  // NOLINT
                                         cusolverEigMode_t jobz,
-                                        cublasFillMode_t uplo, int n, float *A,
-                                        int lda, float *W, float *work,
-                                        int lwork, int *devInfo,
-                                        cudaStream_t stream) {
+                                        cublasFillMode_t uplo,
+                                        int n,
+                                        float* A,
+                                        int lda,
+                                        float* W,
+                                        float* work,
+                                        int lwork,
+                                        int* devInfo,
+                                        cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnSsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork,
-                          devInfo);
+  return cusolverDnSsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, devInfo);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle,  // NOLINT
                                         cusolverEigMode_t jobz,
-                                        cublasFillMode_t uplo, int n, double *A,
-                                        int lda, double *W, double *work,
-                                        int lwork, int *devInfo,
-                                        cudaStream_t stream) {
+                                        cublasFillMode_t uplo,
+                                        int n,
+                                        double* A,
+                                        int lda,
+                                        double* W,
+                                        double* work,
+                                        int lwork,
+                                        int* devInfo,
+                                        cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnDsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork,
-                          devInfo);
+  return cusolverDnDsyevd(handle, jobz, uplo, n, A, lda, W, work, lwork, devInfo);
 }
 /** @} */
 
@@ -297,57 +431,134 @@ inline cusolverStatus_t cusolverDnsyevd(cusolverDnHandle_t handle,  // NOLINT
 /**
  * @defgroup syevdx cusolver syevdx operations
  * @{
-*/
+ */
 template <typename T>
 cusolverStatus_t cusolverDnsyevdx_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-  cublasFillMode_t uplo, int n, const T *A, int lda, T vl, T vu, int il, int iu,
-  int *h_meig, const T *W, int *lwork);
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  cusolverEigRange_t range,
+  cublasFillMode_t uplo,
+  int n,
+  const T* A,
+  int lda,
+  T vl,
+  T vu,
+  int il,
+  int iu,
+  int* h_meig,
+  const T* W,
+  int* lwork);
 
 template <>
 inline cusolverStatus_t cusolverDnsyevdx_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-  cublasFillMode_t uplo, int n, const float *A, int lda, float vl, float vu,
-  int il, int iu, int *h_meig, const float *W, int *lwork) {
-  return cusolverDnSsyevdx_bufferSize(handle, jobz, range, uplo, n, A, lda, vl,
-                                      vu, il, iu, h_meig, W, lwork);
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  cusolverEigRange_t range,
+  cublasFillMode_t uplo,
+  int n,
+  const float* A,
+  int lda,
+  float vl,
+  float vu,
+  int il,
+  int iu,
+  int* h_meig,
+  const float* W,
+  int* lwork)
+{
+  return cusolverDnSsyevdx_bufferSize(
+    handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, lwork);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnsyevdx_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-  cublasFillMode_t uplo, int n, const double *A, int lda, double vl, double vu,
-  int il, int iu, int *h_meig, const double *W, int *lwork) {
-  return cusolverDnDsyevdx_bufferSize(handle, jobz, range, uplo, n, A, lda, vl,
-                                      vu, il, iu, h_meig, W, lwork);
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  cusolverEigRange_t range,
+  cublasFillMode_t uplo,
+  int n,
+  const double* A,
+  int lda,
+  double vl,
+  double vu,
+  int il,
+  int iu,
+  int* h_meig,
+  const double* W,
+  int* lwork)
+{
+  return cusolverDnDsyevdx_bufferSize(
+    handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, lwork);
 }
 
 template <typename T>
 cusolverStatus_t cusolverDnsyevdx(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-  cublasFillMode_t uplo, int n, T *A, int lda, T vl, T vu, int il, int iu,
-  int *h_meig, T *W, T *work, int lwork, int *devInfo, cudaStream_t stream);
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  cusolverEigRange_t range,
+  cublasFillMode_t uplo,
+  int n,
+  T* A,
+  int lda,
+  T vl,
+  T vu,
+  int il,
+  int iu,
+  int* h_meig,
+  T* W,
+  T* work,
+  int lwork,
+  int* devInfo,
+  cudaStream_t stream);
 
 template <>
 inline cusolverStatus_t cusolverDnsyevdx(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-  cublasFillMode_t uplo, int n, float *A, int lda, float vl, float vu, int il,
-  int iu, int *h_meig, float *W, float *work, int lwork, int *devInfo,
-  cudaStream_t stream) {
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  cusolverEigRange_t range,
+  cublasFillMode_t uplo,
+  int n,
+  float* A,
+  int lda,
+  float vl,
+  float vu,
+  int il,
+  int iu,
+  int* h_meig,
+  float* W,
+  float* work,
+  int lwork,
+  int* devInfo,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnSsyevdx(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu,
-                           h_meig, W, work, lwork, devInfo);
+  return cusolverDnSsyevdx(
+    handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, work, lwork, devInfo);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnsyevdx(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range,
-  cublasFillMode_t uplo, int n, double *A, int lda, double vl, double vu,
-  int il, int iu, int *h_meig, double *W, double *work, int lwork, int *devInfo,
-  cudaStream_t stream) {
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  cusolverEigRange_t range,
+  cublasFillMode_t uplo,
+  int n,
+  double* A,
+  int lda,
+  double vl,
+  double vu,
+  int il,
+  int iu,
+  int* h_meig,
+  double* W,
+  double* work,
+  int lwork,
+  int* devInfo,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnDsyevdx(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu,
-                           h_meig, W, work, lwork, devInfo);
+  return cusolverDnDsyevdx(
+    handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, h_meig, W, work, lwork, devInfo);
 }
 /** @} */
 #endif
@@ -358,7 +569,11 @@ inline cusolverStatus_t cusolverDnsyevdx(  // NOLINT
  */
 template <typename T>
 cusolverStatus_t cusolverDngesvd_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, int m, int n, int *lwork) {
+  cusolverDnHandle_t handle,
+  int m,
+  int n,
+  int* lwork)
+{
   if (std::is_same<std::decay_t<T>, float>::value) {
     return cusolverDnSgesvd_bufferSize(handle, m, n, lwork);
   } else {
@@ -367,72 +582,194 @@ cusolverStatus_t cusolverDngesvd_bufferSize(  // NOLINT
 }
 template <typename T>
 cusolverStatus_t cusolverDngesvd(  // NOLINT
-  cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, int n,
-  T *A, int lda, T *S, T *U, int ldu, T *VT, int ldvt, T *work, int lwork,
-  T *rwork, int *devInfo, cudaStream_t stream);
+  cusolverDnHandle_t handle,
+  signed char jobu,
+  signed char jobvt,
+  int m,
+  int n,
+  T* A,
+  int lda,
+  T* S,
+  T* U,
+  int ldu,
+  T* VT,
+  int ldvt,
+  T* work,
+  int lwork,
+  T* rwork,
+  int* devInfo,
+  cudaStream_t stream);
 template <>
 inline cusolverStatus_t cusolverDngesvd(  // NOLINT
-  cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, int n,
-  float *A, int lda, float *S, float *U, int ldu, float *VT, int ldvt,
-  float *work, int lwork, float *rwork, int *devInfo, cudaStream_t stream) {
+  cusolverDnHandle_t handle,
+  signed char jobu,
+  signed char jobvt,
+  int m,
+  int n,
+  float* A,
+  int lda,
+  float* S,
+  float* U,
+  int ldu,
+  float* VT,
+  int ldvt,
+  float* work,
+  int lwork,
+  float* rwork,
+  int* devInfo,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnSgesvd(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT,
-                          ldvt, work, lwork, rwork, devInfo);
+  return cusolverDnSgesvd(
+    handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, devInfo);
 }
 template <>
 inline cusolverStatus_t cusolverDngesvd(  // NOLINT
-  cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, int n,
-  double *A, int lda, double *S, double *U, int ldu, double *VT, int ldvt,
-  double *work, int lwork, double *rwork, int *devInfo, cudaStream_t stream) {
+  cusolverDnHandle_t handle,
+  signed char jobu,
+  signed char jobvt,
+  int m,
+  int n,
+  double* A,
+  int lda,
+  double* S,
+  double* U,
+  int ldu,
+  double* VT,
+  int ldvt,
+  double* work,
+  int lwork,
+  double* rwork,
+  int* devInfo,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnDgesvd(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT,
-                          ldvt, work, lwork, rwork, devInfo);
+  return cusolverDnDgesvd(
+    handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, devInfo);
 }
 
 template <typename T>
 inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-  const T *A, int lda, const T *S, const T *U, int ldu, const T *V, int ldv,
-  int *lwork, gesvdjInfo_t params);
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  int econ,
+  int m,
+  int n,
+  const T* A,
+  int lda,
+  const T* S,
+  const T* U,
+  int ldu,
+  const T* V,
+  int ldv,
+  int* lwork,
+  gesvdjInfo_t params);
 template <>
 inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-  const float *A, int lda, const float *S, const float *U, int ldu,
-  const float *V, int ldv, int *lwork, gesvdjInfo_t params) {
-  return cusolverDnSgesvdj_bufferSize(handle, jobz, econ, m, n, A, lda, S, U,
-                                      ldu, V, ldv, lwork, params);
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  int econ,
+  int m,
+  int n,
+  const float* A,
+  int lda,
+  const float* S,
+  const float* U,
+  int ldu,
+  const float* V,
+  int ldv,
+  int* lwork,
+  gesvdjInfo_t params)
+{
+  return cusolverDnSgesvdj_bufferSize(
+    handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, params);
 }
 template <>
 inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-  const double *A, int lda, const double *S, const double *U, int ldu,
-  const double *V, int ldv, int *lwork, gesvdjInfo_t params) {
-  return cusolverDnDgesvdj_bufferSize(handle, jobz, econ, m, n, A, lda, S, U,
-                                      ldu, V, ldv, lwork, params);
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  int econ,
+  int m,
+  int n,
+  const double* A,
+  int lda,
+  const double* S,
+  const double* U,
+  int ldu,
+  const double* V,
+  int ldv,
+  int* lwork,
+  gesvdjInfo_t params)
+{
+  return cusolverDnDgesvdj_bufferSize(
+    handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, params);
 }
 template <typename T>
 inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-  T *A, int lda, T *S, T *U, int ldu, T *V, int ldv, T *work, int lwork,
-  int *info, gesvdjInfo_t params, cudaStream_t stream);
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  int econ,
+  int m,
+  int n,
+  T* A,
+  int lda,
+  T* S,
+  T* U,
+  int ldu,
+  T* V,
+  int ldv,
+  T* work,
+  int lwork,
+  int* info,
+  gesvdjInfo_t params,
+  cudaStream_t stream);
 template <>
 inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-  float *A, int lda, float *S, float *U, int ldu, float *V, int ldv,
-  float *work, int lwork, int *info, gesvdjInfo_t params, cudaStream_t stream) {
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  int econ,
+  int m,
+  int n,
+  float* A,
+  int lda,
+  float* S,
+  float* U,
+  int ldu,
+  float* V,
+  int ldv,
+  float* work,
+  int lwork,
+  int* info,
+  gesvdjInfo_t params,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnSgesvdj(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv,
-                           work, lwork, info, params);
+  return cusolverDnSgesvdj(
+    handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params);
 }
 template <>
 inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj(  // NOLINT
-  cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n,
-  double *A, int lda, double *S, double *U, int ldu, double *V, int ldv,
-  double *work, int lwork, int *info, gesvdjInfo_t params,
-  cudaStream_t stream) {
+  cusolverDnHandle_t handle,
+  cusolverEigMode_t jobz,
+  int econ,
+  int m,
+  int n,
+  double* A,
+  int lda,
+  double* S,
+  double* U,
+  int ldu,
+  double* V,
+  int ldv,
+  double* work,
+  int lwork,
+  int* info,
+  gesvdjInfo_t params,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnDgesvdj(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv,
-                           work, lwork, info, params);
+  return cusolverDnDgesvdj(
+    handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params);
 }
 /** @} */
 
@@ -442,43 +779,74 @@ inline cusolverStatus_t CUSOLVERAPI cusolverDngesvdj(  // NOLINT
  */
 template <typename T>
 cusolverStatus_t cusolverDnpotrf_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, T *A, int lda,
-  int *Lwork);
+  cusolverDnHandle_t handle,
+  cublasFillMode_t uplo,
+  int n,
+  T* A,
+  int lda,
+  int* Lwork);
 
 template <>
 inline cusolverStatus_t cusolverDnpotrf_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, float *A, int lda,
-  int *Lwork) {
+  cusolverDnHandle_t handle,
+  cublasFillMode_t uplo,
+  int n,
+  float* A,
+  int lda,
+  int* Lwork)
+{
   return cusolverDnSpotrf_bufferSize(handle, uplo, n, A, lda, Lwork);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnpotrf_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, double *A, int lda,
-  int *Lwork) {
+  cusolverDnHandle_t handle,
+  cublasFillMode_t uplo,
+  int n,
+  double* A,
+  int lda,
+  int* Lwork)
+{
   return cusolverDnDpotrf_bufferSize(handle, uplo, n, A, lda, Lwork);
 }
 
 template <typename T>
 inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle,  // NOLINT
-                                        cublasFillMode_t uplo, int n, T *A,
-                                        int lda, T *Workspace, int Lwork,
-                                        int *devInfo, cudaStream_t stream);
+                                        cublasFillMode_t uplo,
+                                        int n,
+                                        T* A,
+                                        int lda,
+                                        T* Workspace,
+                                        int Lwork,
+                                        int* devInfo,
+                                        cudaStream_t stream);
 
 template <>
 inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle,  // NOLINT
-                                        cublasFillMode_t uplo, int n, float *A,
-                                        int lda, float *Workspace, int Lwork,
-                                        int *devInfo, cudaStream_t stream) {
+                                        cublasFillMode_t uplo,
+                                        int n,
+                                        float* A,
+                                        int lda,
+                                        float* Workspace,
+                                        int Lwork,
+                                        int* devInfo,
+                                        cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnSpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle,  // NOLINT
-                                        cublasFillMode_t uplo, int n, double *A,
-                                        int lda, double *Workspace, int Lwork,
-                                        int *devInfo, cudaStream_t stream) {
+                                        cublasFillMode_t uplo,
+                                        int n,
+                                        double* A,
+                                        int lda,
+                                        double* Workspace,
+                                        int Lwork,
+                                        int* devInfo,
+                                        cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnDpotrf(handle, uplo, n, A, lda, Workspace, Lwork, devInfo);
 }
@@ -490,26 +858,44 @@ inline cusolverStatus_t cusolverDnpotrf(cusolverDnHandle_t handle,  // NOLINT
  */
 template <typename T>
 cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle,  // NOLINT
-                                 cublasFillMode_t uplo, int n, int nrhs,
-                                 const T *A, int lda, T *B, int ldb,
-                                 int *devInfo, cudaStream_t stream);
+                                 cublasFillMode_t uplo,
+                                 int n,
+                                 int nrhs,
+                                 const T* A,
+                                 int lda,
+                                 T* B,
+                                 int ldb,
+                                 int* devInfo,
+                                 cudaStream_t stream);
 
 template <>
 inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle,  // NOLINT
-                                        cublasFillMode_t uplo, int n, int nrhs,
-                                        const float *A, int lda, float *B,
-                                        int ldb, int *devInfo,
-                                        cudaStream_t stream) {
+                                        cublasFillMode_t uplo,
+                                        int n,
+                                        int nrhs,
+                                        const float* A,
+                                        int lda,
+                                        float* B,
+                                        int ldb,
+                                        int* devInfo,
+                                        cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnSpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle,  // NOLINT
-                                        cublasFillMode_t uplo, int n, int nrhs,
-                                        const double *A, int lda, double *B,
-                                        int ldb, int *devInfo,
-                                        cudaStream_t stream) {
+                                        cublasFillMode_t uplo,
+                                        int n,
+                                        int nrhs,
+                                        const double* A,
+                                        int lda,
+                                        double* B,
+                                        int ldb,
+                                        int* devInfo,
+                                        cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnDpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo);
 }
@@ -520,38 +906,75 @@ inline cusolverStatus_t cusolverDnpotrs(cusolverDnHandle_t handle,  // NOLINT
  * @{
  */
 template <typename T>
-cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle, int m,  // NOLINT
-                                 int n, T *A, int lda, T *TAU, T *Workspace,
-                                 int Lwork, int *devInfo, cudaStream_t stream);
+cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle,  // NOLINT
+                                 int m,
+                                 int n,
+                                 T* A,
+                                 int lda,
+                                 T* TAU,
+                                 T* Workspace,
+                                 int Lwork,
+                                 int* devInfo,
+                                 cudaStream_t stream);
 template <>
 inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle,  // NOLINT
-                                        int m, int n, float *A, int lda,
-                                        float *TAU, float *Workspace, int Lwork,
-                                        int *devInfo, cudaStream_t stream) {
+                                        int m,
+                                        int n,
+                                        float* A,
+                                        int lda,
+                                        float* TAU,
+                                        float* Workspace,
+                                        int Lwork,
+                                        int* devInfo,
+                                        cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnSgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
 }
 template <>
 inline cusolverStatus_t cusolverDngeqrf(cusolverDnHandle_t handle,  // NOLINT
-                                        int m, int n, double *A, int lda,
-                                        double *TAU, double *Workspace,
-                                        int Lwork, int *devInfo,
-                                        cudaStream_t stream) {
+                                        int m,
+                                        int n,
+                                        double* A,
+                                        int lda,
+                                        double* TAU,
+                                        double* Workspace,
+                                        int Lwork,
+                                        int* devInfo,
+                                        cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnDgeqrf(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo);
 }
 
 template <typename T>
 cusolverStatus_t cusolverDngeqrf_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, int m, int n, T *A, int lda, int *Lwork);
+  cusolverDnHandle_t handle,
+  int m,
+  int n,
+  T* A,
+  int lda,
+  int* Lwork);
 template <>
 inline cusolverStatus_t cusolverDngeqrf_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *Lwork) {
+  cusolverDnHandle_t handle,
+  int m,
+  int n,
+  float* A,
+  int lda,
+  int* Lwork)
+{
   return cusolverDnSgeqrf_bufferSize(handle, m, n, A, lda, Lwork);
 }
 template <>
 inline cusolverStatus_t cusolverDngeqrf_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, int m, int n, double *A, int lda, int *Lwork) {
+  cusolverDnHandle_t handle,
+  int m,
+  int n,
+  double* A,
+  int lda,
+  int* Lwork)
+{
   return cusolverDnDgeqrf_bufferSize(handle, m, n, A, lda, Lwork);
 }
 /** @} */
@@ -562,38 +985,86 @@ inline cusolverStatus_t cusolverDngeqrf_bufferSize(  // NOLINT
  */
 template <typename T>
 cusolverStatus_t cusolverDnorgqr(  // NOLINT
-  cusolverDnHandle_t handle, int m, int n, int k, T *A, int lda, const T *tau,
-  T *work, int lwork, int *devInfo, cudaStream_t stream);
+  cusolverDnHandle_t handle,
+  int m,
+  int n,
+  int k,
+  T* A,
+  int lda,
+  const T* tau,
+  T* work,
+  int lwork,
+  int* devInfo,
+  cudaStream_t stream);
 template <>
 inline cusolverStatus_t cusolverDnorgqr(  // NOLINT
-  cusolverDnHandle_t handle, int m, int n, int k, float *A, int lda,
-  const float *tau, float *work, int lwork, int *devInfo, cudaStream_t stream) {
+  cusolverDnHandle_t handle,
+  int m,
+  int n,
+  int k,
+  float* A,
+  int lda,
+  const float* tau,
+  float* work,
+  int lwork,
+  int* devInfo,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnSorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo);
 }
 template <>
 inline cusolverStatus_t cusolverDnorgqr(  // NOLINT
-  cusolverDnHandle_t handle, int m, int n, int k, double *A, int lda,
-  const double *tau, double *work, int lwork, int *devInfo,
-  cudaStream_t stream) {
+  cusolverDnHandle_t handle,
+  int m,
+  int n,
+  int k,
+  double* A,
+  int lda,
+  const double* tau,
+  double* work,
+  int lwork,
+  int* devInfo,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
   return cusolverDnDorgqr(handle, m, n, k, A, lda, tau, work, lwork, devInfo);
 }
 
 template <typename T>
 cusolverStatus_t cusolverDnorgqr_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, int m, int n, int k, const T *A, int lda,
-  const T *TAU, int *lwork);
+  cusolverDnHandle_t handle,
+  int m,
+  int n,
+  int k,
+  const T* A,
+  int lda,
+  const T* TAU,
+  int* lwork);
 template <>
 inline cusolverStatus_t cusolverDnorgqr_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, int m, int n, int k, const float *A, int lda,
-  const float *TAU, int *lwork) {
+  cusolverDnHandle_t handle,
+  int m,
+  int n,
+  int k,
+  const float* A,
+  int lda,
+  const float* TAU,
+  int* lwork)
+{
   return cusolverDnSorgqr_bufferSize(handle, m, n, k, A, lda, TAU, lwork);
 }
 template <>
 inline cusolverStatus_t cusolverDnorgqr_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, int m, int n, int k, const double *A, int lda,
-  const double *TAU, int *lwork) {
+  cusolverDnHandle_t handle,
+  int m,
+  int n,
+  int k,
+  const double* A,
+  int lda,
+  const double* TAU,
+  int* lwork)
+{
   return cusolverDnDorgqr_bufferSize(handle, m, n, k, A, lda, TAU, lwork);
 }
 /** @} */
@@ -604,53 +1075,114 @@ inline cusolverStatus_t cusolverDnorgqr_bufferSize(  // NOLINT
  */
 template <typename T>
 cusolverStatus_t cusolverDnormqr(cusolverDnHandle_t handle,  // NOLINT
-                                 cublasSideMode_t side, cublasOperation_t trans,
-                                 int m, int n, int k, const T *A, int lda,
-                                 const T *tau, T *C, int ldc, T *work,
-                                 int lwork, int *devInfo, cudaStream_t stream);
+                                 cublasSideMode_t side,
+                                 cublasOperation_t trans,
+                                 int m,
+                                 int n,
+                                 int k,
+                                 const T* A,
+                                 int lda,
+                                 const T* tau,
+                                 T* C,
+                                 int ldc,
+                                 T* work,
+                                 int lwork,
+                                 int* devInfo,
+                                 cudaStream_t stream);
 
 template <>
 inline cusolverStatus_t cusolverDnormqr(  // NOLINT
-  cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-  int m, int n, int k, const float *A, int lda, const float *tau, float *C,
-  int ldc, float *work, int lwork, int *devInfo, cudaStream_t stream) {
+  cusolverDnHandle_t handle,
+  cublasSideMode_t side,
+  cublasOperation_t trans,
+  int m,
+  int n,
+  int k,
+  const float* A,
+  int lda,
+  const float* tau,
+  float* C,
+  int ldc,
+  float* work,
+  int lwork,
+  int* devInfo,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnSormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc,
-                          work, lwork, devInfo);
+  return cusolverDnSormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnormqr(  // NOLINT
-  cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-  int m, int n, int k, const double *A, int lda, const double *tau, double *C,
-  int ldc, double *work, int lwork, int *devInfo, cudaStream_t stream) {
+  cusolverDnHandle_t handle,
+  cublasSideMode_t side,
+  cublasOperation_t trans,
+  int m,
+  int n,
+  int k,
+  const double* A,
+  int lda,
+  const double* tau,
+  double* C,
+  int ldc,
+  double* work,
+  int lwork,
+  int* devInfo,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnDormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc,
-                          work, lwork, devInfo);
+  return cusolverDnDormqr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo);
 }
 
 template <typename T>
 cusolverStatus_t cusolverDnormqr_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-  int m, int n, int k, const T *A, int lda, const T *tau, const T *C, int ldc,
-  int *lwork);
+  cusolverDnHandle_t handle,
+  cublasSideMode_t side,
+  cublasOperation_t trans,
+  int m,
+  int n,
+  int k,
+  const T* A,
+  int lda,
+  const T* tau,
+  const T* C,
+  int ldc,
+  int* lwork);
 
 template <>
 inline cusolverStatus_t cusolverDnormqr_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-  int m, int n, int k, const float *A, int lda, const float *tau,
-  const float *C, int ldc, int *lwork) {
-  return cusolverDnSormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau,
-                                     C, ldc, lwork);
+  cusolverDnHandle_t handle,
+  cublasSideMode_t side,
+  cublasOperation_t trans,
+  int m,
+  int n,
+  int k,
+  const float* A,
+  int lda,
+  const float* tau,
+  const float* C,
+  int ldc,
+  int* lwork)
+{
+  return cusolverDnSormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnormqr_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans,
-  int m, int n, int k, const double *A, int lda, const double *tau,
-  const double *C, int ldc, int *lwork) {
-  return cusolverDnDormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau,
-                                     C, ldc, lwork);
+  cusolverDnHandle_t handle,
+  cublasSideMode_t side,
+  cublasOperation_t trans,
+  int m,
+  int n,
+  int k,
+  const double* A,
+  int lda,
+  const double* tau,
+  const double* C,
+  int ldc,
+  int* lwork)
+{
+  return cusolverDnDormqr_bufferSize(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork);
 }
 /** @} */
 
@@ -660,62 +1192,136 @@ inline cusolverStatus_t cusolverDnormqr_bufferSize(  // NOLINT
  */
 template <typename T>
 cusolverStatus_t cusolverSpcsrqrBufferInfoBatched(  // NOLINT
-  cusolverSpHandle_t handle, int m, int n, int nnzA,
-  const cusparseMatDescr_t descrA, const T *csrValA, const int *csrRowPtrA,
-  const int *csrColIndA, int batchSize, csrqrInfo_t info,
-  size_t *internalDataInBytes, size_t *workspaceInBytes);
+  cusolverSpHandle_t handle,
+  int m,
+  int n,
+  int nnzA,
+  const cusparseMatDescr_t descrA,
+  const T* csrValA,
+  const int* csrRowPtrA,
+  const int* csrColIndA,
+  int batchSize,
+  csrqrInfo_t info,
+  size_t* internalDataInBytes,
+  size_t* workspaceInBytes);
 
 template <>
 inline cusolverStatus_t cusolverSpcsrqrBufferInfoBatched(  // NOLINT
-  cusolverSpHandle_t handle, int m, int n, int nnzA,
-  const cusparseMatDescr_t descrA, const float *csrValA, const int *csrRowPtrA,
-  const int *csrColIndA, int batchSize, csrqrInfo_t info,
-  size_t *internalDataInBytes, size_t *workspaceInBytes) {
-  return cusolverSpScsrqrBufferInfoBatched(
-    handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, batchSize,
-    info, internalDataInBytes, workspaceInBytes);
+  cusolverSpHandle_t handle,
+  int m,
+  int n,
+  int nnzA,
+  const cusparseMatDescr_t descrA,
+  const float* csrValA,
+  const int* csrRowPtrA,
+  const int* csrColIndA,
+  int batchSize,
+  csrqrInfo_t info,
+  size_t* internalDataInBytes,
+  size_t* workspaceInBytes)
+{
+  return cusolverSpScsrqrBufferInfoBatched(handle,
+                                           m,
+                                           n,
+                                           nnzA,
+                                           descrA,
+                                           csrValA,
+                                           csrRowPtrA,
+                                           csrColIndA,
+                                           batchSize,
+                                           info,
+                                           internalDataInBytes,
+                                           workspaceInBytes);
 }
 
 template <>
 inline cusolverStatus_t cusolverSpcsrqrBufferInfoBatched(  // NOLINT
-  cusolverSpHandle_t handle, int m, int n, int nnzA,
-  const cusparseMatDescr_t descrA, const double *csrValA, const int *csrRowPtrA,
-  const int *csrColIndA, int batchSize, csrqrInfo_t info,
-  size_t *internalDataInBytes, size_t *workspaceInBytes) {
-  return cusolverSpDcsrqrBufferInfoBatched(
-    handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, batchSize,
-    info, internalDataInBytes, workspaceInBytes);
+  cusolverSpHandle_t handle,
+  int m,
+  int n,
+  int nnzA,
+  const cusparseMatDescr_t descrA,
+  const double* csrValA,
+  const int* csrRowPtrA,
+  const int* csrColIndA,
+  int batchSize,
+  csrqrInfo_t info,
+  size_t* internalDataInBytes,
+  size_t* workspaceInBytes)
+{
+  return cusolverSpDcsrqrBufferInfoBatched(handle,
+                                           m,
+                                           n,
+                                           nnzA,
+                                           descrA,
+                                           csrValA,
+                                           csrRowPtrA,
+                                           csrColIndA,
+                                           batchSize,
+                                           info,
+                                           internalDataInBytes,
+                                           workspaceInBytes);
 }
 
 template <typename T>
 cusolverStatus_t cusolverSpcsrqrsvBatched(  // NOLINT
-  cusolverSpHandle_t handle, int m, int n, int nnzA,
-  const cusparseMatDescr_t descrA, const T *csrValA, const int *csrRowPtrA,
-  const int *csrColIndA, const T *b, T *x, int batchSize, csrqrInfo_t info,
-  void *pBuffer, cudaStream_t stream);
+  cusolverSpHandle_t handle,
+  int m,
+  int n,
+  int nnzA,
+  const cusparseMatDescr_t descrA,
+  const T* csrValA,
+  const int* csrRowPtrA,
+  const int* csrColIndA,
+  const T* b,
+  T* x,
+  int batchSize,
+  csrqrInfo_t info,
+  void* pBuffer,
+  cudaStream_t stream);
 
 template <>
 inline cusolverStatus_t cusolverSpcsrqrsvBatched(  // NOLINT
-  cusolverSpHandle_t handle, int m, int n, int nnzA,
-  const cusparseMatDescr_t descrA, const float *csrValA, const int *csrRowPtrA,
-  const int *csrColIndA, const float *b, float *x, int batchSize,
-  csrqrInfo_t info, void *pBuffer, cudaStream_t stream) {
+  cusolverSpHandle_t handle,
+  int m,
+  int n,
+  int nnzA,
+  const cusparseMatDescr_t descrA,
+  const float* csrValA,
+  const int* csrRowPtrA,
+  const int* csrColIndA,
+  const float* b,
+  float* x,
+  int batchSize,
+  csrqrInfo_t info,
+  void* pBuffer,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverSpSetStream(handle, stream));
-  return cusolverSpScsrqrsvBatched(handle, m, n, nnzA, descrA, csrValA,
-                                   csrRowPtrA, csrColIndA, b, x, batchSize,
-                                   info, pBuffer);
+  return cusolverSpScsrqrsvBatched(
+    handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, b, x, batchSize, info, pBuffer);
 }
 
 template <>
 inline cusolverStatus_t cusolverSpcsrqrsvBatched(  // NOLINT
-  cusolverSpHandle_t handle, int m, int n, int nnzA,
-  const cusparseMatDescr_t descrA, const double *csrValA, const int *csrRowPtrA,
-  const int *csrColIndA, const double *b, double *x, int batchSize,
-  csrqrInfo_t info, void *pBuffer, cudaStream_t stream) {
+  cusolverSpHandle_t handle,
+  int m,
+  int n,
+  int nnzA,
+  const cusparseMatDescr_t descrA,
+  const double* csrValA,
+  const int* csrRowPtrA,
+  const int* csrColIndA,
+  const double* b,
+  double* x,
+  int batchSize,
+  csrqrInfo_t info,
+  void* pBuffer,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverSpSetStream(handle, stream));
-  return cusolverSpDcsrqrsvBatched(handle, m, n, nnzA, descrA, csrValA,
-                                   csrRowPtrA, csrColIndA, b, x, batchSize,
-                                   info, pBuffer);
+  return cusolverSpDcsrqrsvBatched(
+    handle, m, n, nnzA, descrA, csrValA, csrRowPtrA, csrColIndA, b, x, batchSize, info, pBuffer);
 }
 /** @} */
 
@@ -726,66 +1332,165 @@ inline cusolverStatus_t cusolverSpcsrqrsvBatched(  // NOLINT
  */
 template <typename T>
 cusolverStatus_t cusolverDnxsyevd_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobz,
-  cublasFillMode_t uplo, int64_t n, const T *A, int64_t lda, const T *W,
-  size_t *workspaceInBytesOnDevice, size_t *workspaceInBytesOnHost,
+  cusolverDnHandle_t handle,
+  cusolverDnParams_t params,
+  cusolverEigMode_t jobz,
+  cublasFillMode_t uplo,
+  int64_t n,
+  const T* A,
+  int64_t lda,
+  const T* W,
+  size_t* workspaceInBytesOnDevice,
+  size_t* workspaceInBytesOnHost,
   cudaStream_t stream);
 
 template <>
 inline cusolverStatus_t cusolverDnxsyevd_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobz,
-  cublasFillMode_t uplo, int64_t n, const float *A, int64_t lda, const float *W,
-  size_t *workspaceInBytesOnDevice, size_t *workspaceInBytesOnHost,
-  cudaStream_t stream) {
+  cusolverDnHandle_t handle,
+  cusolverDnParams_t params,
+  cusolverEigMode_t jobz,
+  cublasFillMode_t uplo,
+  int64_t n,
+  const float* A,
+  int64_t lda,
+  const float* W,
+  size_t* workspaceInBytesOnDevice,
+  size_t* workspaceInBytesOnHost,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnXsyevd_bufferSize(
-    handle, params, jobz, uplo, n, CUDA_R_32F, A, lda, CUDA_R_32F, W,
-    CUDA_R_32F, workspaceInBytesOnDevice, workspaceInBytesOnHost);
+  return cusolverDnXsyevd_bufferSize(handle,
+                                     params,
+                                     jobz,
+                                     uplo,
+                                     n,
+                                     CUDA_R_32F,
+                                     A,
+                                     lda,
+                                     CUDA_R_32F,
+                                     W,
+                                     CUDA_R_32F,
+                                     workspaceInBytesOnDevice,
+                                     workspaceInBytesOnHost);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnxsyevd_bufferSize(  // NOLINT
-  cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobz,
-  cublasFillMode_t uplo, int64_t n, const double *A, int64_t lda,
-  const double *W, size_t *workspaceInBytesOnDevice,
-  size_t *workspaceInBytesOnHost, cudaStream_t stream) {
+  cusolverDnHandle_t handle,
+  cusolverDnParams_t params,
+  cusolverEigMode_t jobz,
+  cublasFillMode_t uplo,
+  int64_t n,
+  const double* A,
+  int64_t lda,
+  const double* W,
+  size_t* workspaceInBytesOnDevice,
+  size_t* workspaceInBytesOnHost,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnXsyevd_bufferSize(
-    handle, params, jobz, uplo, n, CUDA_R_64F, A, lda, CUDA_R_64F, W,
-    CUDA_R_64F, workspaceInBytesOnDevice, workspaceInBytesOnHost);
+  return cusolverDnXsyevd_bufferSize(handle,
+                                     params,
+                                     jobz,
+                                     uplo,
+                                     n,
+                                     CUDA_R_64F,
+                                     A,
+                                     lda,
+                                     CUDA_R_64F,
+                                     W,
+                                     CUDA_R_64F,
+                                     workspaceInBytesOnDevice,
+                                     workspaceInBytesOnHost);
 }
 
 template <typename T>
 cusolverStatus_t cusolverDnxsyevd(  // NOLINT
-  cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobz,
-  cublasFillMode_t uplo, int64_t n, T *A, int64_t lda, T *W, T *bufferOnDevice,
-  size_t workspaceInBytesOnDevice, T *bufferOnHost,
-  size_t workspaceInBytesOnHost, int *info, cudaStream_t stream);
+  cusolverDnHandle_t handle,
+  cusolverDnParams_t params,
+  cusolverEigMode_t jobz,
+  cublasFillMode_t uplo,
+  int64_t n,
+  T* A,
+  int64_t lda,
+  T* W,
+  T* bufferOnDevice,
+  size_t workspaceInBytesOnDevice,
+  T* bufferOnHost,
+  size_t workspaceInBytesOnHost,
+  int* info,
+  cudaStream_t stream);
 
 template <>
 inline cusolverStatus_t cusolverDnxsyevd(  // NOLINT
-  cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobz,
-  cublasFillMode_t uplo, int64_t n, float *A, int64_t lda, float *W,
-  float *bufferOnDevice, size_t workspaceInBytesOnDevice, float *bufferOnHost,
-  size_t workspaceInBytesOnHost, int *info, cudaStream_t stream) {
+  cusolverDnHandle_t handle,
+  cusolverDnParams_t params,
+  cusolverEigMode_t jobz,
+  cublasFillMode_t uplo,
+  int64_t n,
+  float* A,
+  int64_t lda,
+  float* W,
+  float* bufferOnDevice,
+  size_t workspaceInBytesOnDevice,
+  float* bufferOnHost,
+  size_t workspaceInBytesOnHost,
+  int* info,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnXsyevd(handle, params, jobz, uplo, n, CUDA_R_32F, A, lda,
-                          CUDA_R_32F, W, CUDA_R_32F, bufferOnDevice,
-                          workspaceInBytesOnDevice, bufferOnHost,
-                          workspaceInBytesOnHost, info);
+  return cusolverDnXsyevd(handle,
+                          params,
+                          jobz,
+                          uplo,
+                          n,
+                          CUDA_R_32F,
+                          A,
+                          lda,
+                          CUDA_R_32F,
+                          W,
+                          CUDA_R_32F,
+                          bufferOnDevice,
+                          workspaceInBytesOnDevice,
+                          bufferOnHost,
+                          workspaceInBytesOnHost,
+                          info);
 }
 
 template <>
 inline cusolverStatus_t cusolverDnxsyevd(  // NOLINT
-  cusolverDnHandle_t handle, cusolverDnParams_t params, cusolverEigMode_t jobz,
-  cublasFillMode_t uplo, int64_t n, double *A, int64_t lda, double *W,
-  double *bufferOnDevice, size_t workspaceInBytesOnDevice, double *bufferOnHost,
-  size_t workspaceInBytesOnHost, int *info, cudaStream_t stream) {
+  cusolverDnHandle_t handle,
+  cusolverDnParams_t params,
+  cusolverEigMode_t jobz,
+  cublasFillMode_t uplo,
+  int64_t n,
+  double* A,
+  int64_t lda,
+  double* W,
+  double* bufferOnDevice,
+  size_t workspaceInBytesOnDevice,
+  double* bufferOnHost,
+  size_t workspaceInBytesOnHost,
+  int* info,
+  cudaStream_t stream)
+{
   CUSOLVER_CHECK(cusolverDnSetStream(handle, stream));
-  return cusolverDnXsyevd(handle, params, jobz, uplo, n, CUDA_R_64F, A, lda,
-                          CUDA_R_64F, W, CUDA_R_64F, bufferOnDevice,
-                          workspaceInBytesOnDevice, bufferOnHost,
-                          workspaceInBytesOnHost, info);
+  return cusolverDnXsyevd(handle,
+                          params,
+                          jobz,
+                          uplo,
+                          n,
+                          CUDA_R_64F,
+                          A,
+                          lda,
+                          CUDA_R_64F,
+                          W,
+                          CUDA_R_64F,
+                          bufferOnDevice,
+                          workspaceInBytesOnDevice,
+                          bufferOnHost,
+                          workspaceInBytesOnHost,
+                          info);
 }
 /** @} */
 #endif
diff --git a/cpp/include/raft/linalg/divide.cuh b/cpp/include/raft/linalg/divide.cuh
index c848ac1f4b..562a3d8991 100644
--- a/cpp/include/raft/linalg/divide.cuh
+++ b/cpp/include/raft/linalg/divide.cuh
@@ -33,11 +33,10 @@ namespace linalg {
  * @{
  */
 template <typename math_t, typename IdxType = int>
-void divideScalar(math_t *out, const math_t *in, math_t scalar, IdxType len,
-                  cudaStream_t stream) {
+void divideScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream)
+{
   unaryOp(
-    out, in, len, [scalar] __device__(math_t in) { return in / scalar; },
-    stream);
+    out, in, len, [scalar] __device__(math_t in) { return in / scalar; }, stream);
 }
 /** @} */
 
diff --git a/cpp/include/raft/linalg/eig.cuh b/cpp/include/raft/linalg/eig.cuh
index e141883b6c..288d379dac 100644
--- a/cpp/include/raft/linalg/eig.cuh
+++ b/cpp/include/raft/linalg/eig.cuh
@@ -29,25 +29,42 @@ namespace raft {
 namespace linalg {
 
 template <typename math_t>
-void eigDC_legacy(const raft::handle_t &handle, const math_t *in,
-                  std::size_t n_rows, std::size_t n_cols, math_t *eig_vectors,
-                  math_t *eig_vals, cudaStream_t stream) {
+void eigDC_legacy(const raft::handle_t& handle,
+                  const math_t* in,
+                  std::size_t n_rows,
+                  std::size_t n_cols,
+                  math_t* eig_vectors,
+                  math_t* eig_vals,
+                  cudaStream_t stream)
+{
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
 
   int lwork;
-  CUSOLVER_CHECK(cusolverDnsyevd_bufferSize(cusolverH, CUSOLVER_EIG_MODE_VECTOR,
-                                            CUBLAS_FILL_MODE_UPPER, n_rows, in,
-                                            n_cols, eig_vals, &lwork));
+  CUSOLVER_CHECK(cusolverDnsyevd_bufferSize(cusolverH,
+                                            CUSOLVER_EIG_MODE_VECTOR,
+                                            CUBLAS_FILL_MODE_UPPER,
+                                            n_rows,
+                                            in,
+                                            n_cols,
+                                            eig_vals,
+                                            &lwork));
 
   rmm::device_uvector<math_t> d_work(lwork, stream);
   rmm::device_scalar<int> d_dev_info(stream);
 
   raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream);
 
-  CUSOLVER_CHECK(cusolverDnsyevd(cusolverH, CUSOLVER_EIG_MODE_VECTOR,
-                                 CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors,
-                                 n_cols, eig_vals, d_work.data(), lwork,
-                                 d_dev_info.data(), stream));
+  CUSOLVER_CHECK(cusolverDnsyevd(cusolverH,
+                                 CUSOLVER_EIG_MODE_VECTOR,
+                                 CUBLAS_FILL_MODE_UPPER,
+                                 n_rows,
+                                 eig_vectors,
+                                 n_cols,
+                                 eig_vals,
+                                 d_work.data(),
+                                 lwork,
+                                 d_dev_info.data(),
+                                 stream));
   CUDA_CHECK(cudaGetLastError());
 
   auto dev_info = d_dev_info.value(stream);
@@ -70,9 +87,14 @@ void eigDC_legacy(const raft::handle_t &handle, const math_t *in,
  * @{
  */
 template <typename math_t>
-void eigDC(const raft::handle_t &handle, const math_t *in, std::size_t n_rows,
-           std::size_t n_cols, math_t *eig_vectors, math_t *eig_vals,
-           cudaStream_t stream) {
+void eigDC(const raft::handle_t& handle,
+           const math_t* in,
+           std::size_t n_rows,
+           std::size_t n_cols,
+           math_t* eig_vectors,
+           math_t* eig_vals,
+           cudaStream_t stream)
+{
 #if CUDART_VERSION < 11010
   eigDC_legacy(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream);
 #else
@@ -82,11 +104,18 @@ void eigDC(const raft::handle_t &handle, const math_t *in, std::size_t n_rows,
   CUSOLVER_CHECK(cusolverDnCreateParams(&dn_params));
 
   size_t workspaceDevice = 0;
-  size_t workspaceHost = 0;
-  CUSOLVER_CHECK(cusolverDnxsyevd_bufferSize(
-    cusolverH, dn_params, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER,
-    static_cast<int64_t>(n_rows), eig_vectors, static_cast<int64_t>(n_cols),
-    eig_vals, &workspaceDevice, &workspaceHost, stream));
+  size_t workspaceHost   = 0;
+  CUSOLVER_CHECK(cusolverDnxsyevd_bufferSize(cusolverH,
+                                             dn_params,
+                                             CUSOLVER_EIG_MODE_VECTOR,
+                                             CUBLAS_FILL_MODE_UPPER,
+                                             static_cast<int64_t>(n_rows),
+                                             eig_vectors,
+                                             static_cast<int64_t>(n_cols),
+                                             eig_vals,
+                                             &workspaceDevice,
+                                             &workspaceHost,
+                                             stream));
 
   rmm::device_uvector<math_t> d_work(workspaceDevice / sizeof(math_t), stream);
   rmm::device_scalar<int> d_dev_info(stream);
@@ -94,11 +123,20 @@ void eigDC(const raft::handle_t &handle, const math_t *in, std::size_t n_rows,
 
   raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream);
 
-  CUSOLVER_CHECK(cusolverDnxsyevd(
-    cusolverH, dn_params, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER,
-    static_cast<int64_t>(n_rows), eig_vectors, static_cast<int64_t>(n_cols),
-    eig_vals, d_work.data(), workspaceDevice, h_work.data(), workspaceHost,
-    d_dev_info.data(), stream));
+  CUSOLVER_CHECK(cusolverDnxsyevd(cusolverH,
+                                  dn_params,
+                                  CUSOLVER_EIG_MODE_VECTOR,
+                                  CUBLAS_FILL_MODE_UPPER,
+                                  static_cast<int64_t>(n_rows),
+                                  eig_vectors,
+                                  static_cast<int64_t>(n_cols),
+                                  eig_vals,
+                                  d_work.data(),
+                                  workspaceDevice,
+                                  h_work.data(),
+                                  workspaceHost,
+                                  d_dev_info.data(),
+                                  stream));
 
   CUDA_CHECK(cudaGetLastError());
   CUSOLVER_CHECK(cusolverDnDestroyParams(dn_params));
@@ -128,38 +166,79 @@ enum EigVecMemUsage { OVERWRITE_INPUT, COPY_INPUT };
  * @{
  */
 template <typename math_t>
-void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols,
-              int n_eig_vals, math_t *eig_vectors, math_t *eig_vals,
-              EigVecMemUsage memUsage, cudaStream_t stream) {
+void eigSelDC(const raft::handle_t& handle,
+              math_t* in,
+              int n_rows,
+              int n_cols,
+              int n_eig_vals,
+              math_t* eig_vectors,
+              math_t* eig_vals,
+              EigVecMemUsage memUsage,
+              cudaStream_t stream)
+{
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
 
   int lwork;
   int h_meig;
 
-  CUSOLVER_CHECK(cusolverDnsyevdx_bufferSize(
-    cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I,
-    CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0),
-    n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, &lwork));
+  CUSOLVER_CHECK(cusolverDnsyevdx_bufferSize(cusolverH,
+                                             CUSOLVER_EIG_MODE_VECTOR,
+                                             CUSOLVER_EIG_RANGE_I,
+                                             CUBLAS_FILL_MODE_UPPER,
+                                             n_rows,
+                                             in,
+                                             n_cols,
+                                             math_t(0.0),
+                                             math_t(0.0),
+                                             n_cols - n_eig_vals + 1,
+                                             n_cols,
+                                             &h_meig,
+                                             eig_vals,
+                                             &lwork));
 
   rmm::device_uvector<math_t> d_work(lwork, stream);
   rmm::device_scalar<int> d_dev_info(stream);
   rmm::device_uvector<math_t> d_eig_vectors(0, stream);
 
   if (memUsage == OVERWRITE_INPUT) {
-    CUSOLVER_CHECK(cusolverDnsyevdx(
-      cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I,
-      CUBLAS_FILL_MODE_UPPER, n_rows, in, n_cols, math_t(0.0), math_t(0.0),
-      n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals, d_work.data(), lwork,
-      d_dev_info.data(), stream));
+    CUSOLVER_CHECK(cusolverDnsyevdx(cusolverH,
+                                    CUSOLVER_EIG_MODE_VECTOR,
+                                    CUSOLVER_EIG_RANGE_I,
+                                    CUBLAS_FILL_MODE_UPPER,
+                                    n_rows,
+                                    in,
+                                    n_cols,
+                                    math_t(0.0),
+                                    math_t(0.0),
+                                    n_cols - n_eig_vals + 1,
+                                    n_cols,
+                                    &h_meig,
+                                    eig_vals,
+                                    d_work.data(),
+                                    lwork,
+                                    d_dev_info.data(),
+                                    stream));
   } else if (memUsage == COPY_INPUT) {
     d_eig_vectors.resize(n_rows * n_cols, stream);
     raft::matrix::copy(in, d_eig_vectors.data(), n_rows, n_cols, stream);
 
-    CUSOLVER_CHECK(cusolverDnsyevdx(
-      cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I,
-      CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors, n_cols, math_t(0.0),
-      math_t(0.0), n_cols - n_eig_vals + 1, n_cols, &h_meig, eig_vals,
-      d_work.data(), lwork, d_dev_info.data(), stream));
+    CUSOLVER_CHECK(cusolverDnsyevdx(cusolverH,
+                                    CUSOLVER_EIG_MODE_VECTOR,
+                                    CUSOLVER_EIG_RANGE_I,
+                                    CUBLAS_FILL_MODE_UPPER,
+                                    n_rows,
+                                    eig_vectors,
+                                    n_cols,
+                                    math_t(0.0),
+                                    math_t(0.0),
+                                    n_cols - n_eig_vals + 1,
+                                    n_cols,
+                                    &h_meig,
+                                    eig_vals,
+                                    d_work.data(),
+                                    lwork,
+                                    d_dev_info.data(),
+                                    stream));
   }
 
   CUDA_CHECK(cudaGetLastError());
@@ -170,11 +249,10 @@ void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols,
          "This usually occurs when some of the features do not vary enough.");
 
   if (memUsage == OVERWRITE_INPUT) {
-    raft::matrix::truncZeroOrigin(in, n_rows, eig_vectors, n_rows, n_eig_vals,
-                                  stream);
+    raft::matrix::truncZeroOrigin(in, n_rows, eig_vectors, n_rows, n_eig_vals, stream);
   } else if (memUsage == COPY_INPUT) {
-    raft::matrix::truncZeroOrigin(d_eig_vectors.data(), n_rows, eig_vectors,
-                                  n_rows, n_eig_vals, stream);
+    raft::matrix::truncZeroOrigin(
+      d_eig_vectors.data(), n_rows, eig_vectors, n_rows, n_eig_vals, stream);
   }
 }
 
@@ -195,36 +273,54 @@ void eigSelDC(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols,
  * @{
  */
 template <typename math_t>
-void eigJacobi(const raft::handle_t &handle, const math_t *in,
-               std::size_t n_rows, std::size_t n_cols, math_t *eig_vectors,
-               math_t *eig_vals, cudaStream_t stream, math_t tol = 1.e-7,
-               std::uint32_t sweeps = 15) {
+void eigJacobi(const raft::handle_t& handle,
+               const math_t* in,
+               std::size_t n_rows,
+               std::size_t n_cols,
+               math_t* eig_vectors,
+               math_t* eig_vals,
+               cudaStream_t stream,
+               math_t tol           = 1.e-7,
+               std::uint32_t sweeps = 15)
+{
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
 
   syevjInfo_t syevj_params = nullptr;
   CUSOLVER_CHECK(cusolverDnCreateSyevjInfo(&syevj_params));
   CUSOLVER_CHECK(cusolverDnXsyevjSetTolerance(syevj_params, tol));
-  CUSOLVER_CHECK(
-    cusolverDnXsyevjSetMaxSweeps(syevj_params, static_cast<int>(sweeps)));
+  CUSOLVER_CHECK(cusolverDnXsyevjSetMaxSweeps(syevj_params, static_cast<int>(sweeps)));
 
   int lwork;
-  CUSOLVER_CHECK(cusolverDnsyevj_bufferSize(
-    cusolverH, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, n_rows,
-    eig_vectors, n_cols, eig_vals, &lwork, syevj_params));
+  CUSOLVER_CHECK(cusolverDnsyevj_bufferSize(cusolverH,
+                                            CUSOLVER_EIG_MODE_VECTOR,
+                                            CUBLAS_FILL_MODE_UPPER,
+                                            n_rows,
+                                            eig_vectors,
+                                            n_cols,
+                                            eig_vals,
+                                            &lwork,
+                                            syevj_params));
 
   rmm::device_uvector<math_t> d_work(lwork, stream);
   rmm::device_scalar<int> dev_info(stream);
 
   raft::matrix::copy(in, eig_vectors, n_rows, n_cols, stream);
 
-  CUSOLVER_CHECK(cusolverDnsyevj(cusolverH, CUSOLVER_EIG_MODE_VECTOR,
-                                 CUBLAS_FILL_MODE_UPPER, n_rows, eig_vectors,
-                                 n_cols, eig_vals, d_work.data(), lwork,
-                                 dev_info.data(), syevj_params, stream));
+  CUSOLVER_CHECK(cusolverDnsyevj(cusolverH,
+                                 CUSOLVER_EIG_MODE_VECTOR,
+                                 CUBLAS_FILL_MODE_UPPER,
+                                 n_rows,
+                                 eig_vectors,
+                                 n_cols,
+                                 eig_vals,
+                                 d_work.data(),
+                                 lwork,
+                                 dev_info.data(),
+                                 syevj_params,
+                                 stream));
 
   int executed_sweeps;
-  CUSOLVER_CHECK(
-    cusolverDnXsyevjGetSweeps(cusolverH, syevj_params, &executed_sweeps));
+  CUSOLVER_CHECK(cusolverDnXsyevjGetSweeps(cusolverH, syevj_params, &executed_sweeps));
 
   CUDA_CHECK(cudaGetLastError());
   CUSOLVER_CHECK(cusolverDnDestroySyevjInfo(syevj_params));
diff --git a/cpp/include/raft/linalg/eltwise.cuh b/cpp/include/raft/linalg/eltwise.cuh
index 1c6dee562d..097c3ac218 100644
--- a/cpp/include/raft/linalg/eltwise.cuh
+++ b/cpp/include/raft/linalg/eltwise.cuh
@@ -34,19 +34,17 @@ namespace linalg {
  * @{
  */
 template <typename InType, typename IdxType, typename OutType = InType>
-void scalarAdd(OutType *out, const InType *in, InType scalar, IdxType len,
-               cudaStream_t stream) {
+void scalarAdd(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream)
+{
   raft::linalg::unaryOp(
-    out, in, len, [scalar] __device__(InType in) { return in + scalar; },
-    stream);
+    out, in, len, [scalar] __device__(InType in) { return in + scalar; }, stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
-void scalarMultiply(OutType *out, const InType *in, InType scalar, IdxType len,
-                    cudaStream_t stream) {
+void scalarMultiply(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream)
+{
   raft::linalg::unaryOp(
-    out, in, len, [scalar] __device__(InType in) { return in * scalar; },
-    stream);
+    out, in, len, [scalar] __device__(InType in) { return in * scalar; }, stream);
 }
 /** @} */
 
@@ -62,42 +60,46 @@ void scalarMultiply(OutType *out, const InType *in, InType scalar, IdxType len,
  * @{
  */
 template <typename InType, typename IdxType, typename OutType = InType>
-void eltwiseAdd(OutType *out, const InType *in1, const InType *in2, IdxType len,
-                cudaStream_t stream) {
+void eltwiseAdd(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
+{
   binaryOp(
-    out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; },
-    stream);
+    out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
-void eltwiseSub(OutType *out, const InType *in1, const InType *in2, IdxType len,
-                cudaStream_t stream) {
+void eltwiseSub(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
+{
   binaryOp(
-    out, in1, in2, len, [] __device__(InType a, InType b) { return a - b; },
-    stream);
+    out, in1, in2, len, [] __device__(InType a, InType b) { return a - b; }, stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
-void eltwiseMultiply(OutType *out, const InType *in1, const InType *in2,
-                     IdxType len, cudaStream_t stream) {
+void eltwiseMultiply(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
+{
   binaryOp(
-    out, in1, in2, len, [] __device__(InType a, InType b) { return a * b; },
-    stream);
+    out, in1, in2, len, [] __device__(InType a, InType b) { return a * b; }, stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
-void eltwiseDivide(OutType *out, const InType *in1, const InType *in2,
-                   IdxType len, cudaStream_t stream) {
+void eltwiseDivide(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
+{
   binaryOp(
-    out, in1, in2, len, [] __device__(InType a, InType b) { return a / b; },
-    stream);
+    out, in1, in2, len, [] __device__(InType a, InType b) { return a / b; }, stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
-void eltwiseDivideCheckZero(OutType *out, const InType *in1, const InType *in2,
-                            IdxType len, cudaStream_t stream) {
+void eltwiseDivideCheckZero(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
+{
   binaryOp(
-    out, in1, in2, len,
+    out,
+    in1,
+    in2,
+    len,
     [] __device__(InType a, InType b) {
       if (b == InType(0.0))
         return InType(0.0);
diff --git a/cpp/include/raft/linalg/gemm.cuh b/cpp/include/raft/linalg/gemm.cuh
index 0a4897cc0b..d5942b7446 100644
--- a/cpp/include/raft/linalg/gemm.cuh
+++ b/cpp/include/raft/linalg/gemm.cuh
@@ -43,35 +43,53 @@ namespace linalg {
  * @param stream cuda stream
  */
 template <typename math_t>
-void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a,
-          int n_cols_a, const math_t *b, math_t *c, int n_rows_c, int n_cols_c,
-          cublasOperation_t trans_a, cublasOperation_t trans_b, math_t alpha,
-          math_t beta, cudaStream_t stream) {
+void gemm(const raft::handle_t& handle,
+          const math_t* a,
+          int n_rows_a,
+          int n_cols_a,
+          const math_t* b,
+          math_t* c,
+          int n_rows_c,
+          int n_cols_c,
+          cublasOperation_t trans_a,
+          cublasOperation_t trans_b,
+          math_t alpha,
+          math_t beta,
+          cudaStream_t stream)
+{
   cublasHandle_t cublas_h = handle.get_cublas_handle();
 
-  int m = n_rows_c;
-  int n = n_cols_c;
-  int k = trans_a == CUBLAS_OP_T ? n_rows_a : n_cols_a;
+  int m   = n_rows_c;
+  int n   = n_cols_c;
+  int k   = trans_a == CUBLAS_OP_T ? n_rows_a : n_cols_a;
   int lda = trans_a == CUBLAS_OP_T ? k : m;
   int ldb = trans_b == CUBLAS_OP_T ? n : k;
   int ldc = m;
-  CUBLAS_CHECK(cublasgemm(cublas_h, trans_a, trans_b, m, n, k, &alpha, a, lda,
-                          b, ldb, &beta, c, ldc, stream));
+  CUBLAS_CHECK(
+    cublasgemm(cublas_h, trans_a, trans_b, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc, stream));
 }
 
 template <typename math_t>
-void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a,
-          int n_cols_a, const math_t *b, math_t *c, int n_rows_c, int n_cols_c,
-          cublasOperation_t trans_a, cublasOperation_t trans_b,
-          cudaStream_t stream) {
+void gemm(const raft::handle_t& handle,
+          const math_t* a,
+          int n_rows_a,
+          int n_cols_a,
+          const math_t* b,
+          math_t* c,
+          int n_rows_c,
+          int n_cols_c,
+          cublasOperation_t trans_a,
+          cublasOperation_t trans_b,
+          cudaStream_t stream)
+{
   math_t alpha = math_t(1);
-  math_t beta = math_t(0);
-  gemm(handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a,
-       trans_b, alpha, beta, stream);
+  math_t beta  = math_t(0);
+  gemm(
+    handle, a, n_rows_a, n_cols_a, b, c, n_rows_c, n_cols_c, trans_a, trans_b, alpha, beta, stream);
 }
 
 /**
- * @brief A wrapper for CUBLS GEMM function designed for handling all possible 
+ * @brief A wrapper for the cuBLAS GEMM function designed for handling all possible
  * combinations of operand layouts.
  * It computes the following equation: Z = alpha . X * Y + beta . Z
  * @tparam T Data type of input/output matrices (float/double)
@@ -90,9 +108,20 @@ void gemm(const raft::handle_t &handle, const math_t *a, int n_rows_a,
  * @param beta scalar
  */
 template <typename T>
-void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N,
-          int _K, bool isZColMajor, bool isXColMajor, bool isYColMajor,
-          cudaStream_t stream, T alpha = T(1.0), T beta = T(0.0)) {
+void gemm(const raft::handle_t& handle,
+          T* z,
+          T* x,
+          T* y,
+          int _M,
+          int _N,
+          int _K,
+          bool isZColMajor,
+          bool isXColMajor,
+          bool isYColMajor,
+          cudaStream_t stream,
+          T alpha = T(1.0),
+          T beta  = T(0.0))
+{
   cublasHandle_t cublas_h = handle.get_cublas_handle();
 
   cublasOperation_t trans_a, trans_b;
@@ -119,13 +148,13 @@ void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N,
     // therefore trans_x needs to be CUBLAS_OP_T. If x is in column major
     // layout, trans_b needs to be CUBLAS_OP_N.
     trans_b = isYColMajor == true ? CUBLAS_OP_N : CUBLAS_OP_T;
-    ldb = isYColMajor == true ? _K : _N;
+    ldb     = isYColMajor == true ? _K : _N;
 
-    c = z;
+    c   = z;
     ldc = _M;
-    M = _M;
-    N = _N;
-    K = _K;
+    M   = _M;
+    N   = _N;
+    K   = _K;
   } else {
     // Result c is required in row major layout Thus we pick
     // a = y, b = x and c = a * b = y * x
@@ -154,7 +183,7 @@ void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N,
     // Set leading dimension appropriately
     ldb = isXColMajor == true ? _M : _K;
 
-    c = z;
+    c   = z;
     ldc = _N;
 
     M = _N;
@@ -162,8 +191,8 @@ void gemm(const raft::handle_t &handle, T *z, T *x, T *y, int _M, int _N,
     K = _K;
   }
   // Actual cuBLAS call
-  CUBLAS_CHECK(cublasgemm(cublas_h, trans_a, trans_b, M, N, K, &alpha, a, lda,
-                          b, ldb, &beta, c, ldc, stream));
+  CUBLAS_CHECK(
+    cublasgemm(cublas_h, trans_a, trans_b, M, N, K, &alpha, a, lda, b, ldb, &beta, c, ldc, stream));
 }
 
 }  // end namespace linalg
diff --git a/cpp/include/raft/linalg/gemv.h b/cpp/include/raft/linalg/gemv.h
index 0be11a0301..ac0547e30a 100644
--- a/cpp/include/raft/linalg/gemv.h
+++ b/cpp/include/raft/linalg/gemv.h
@@ -26,14 +26,23 @@ namespace raft {
 namespace linalg {
 
 template <typename math_t>
-void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows,
-          const int n_cols, const math_t *x, const int incx, math_t *y,
-          const int incy, const bool trans_a, const math_t alpha,
-          const math_t beta, cudaStream_t stream) {
+void gemv(const raft::handle_t& handle,
+          const math_t* A,
+          const int n_rows,
+          const int n_cols,
+          const math_t* x,
+          const int incx,
+          math_t* y,
+          const int incy,
+          const bool trans_a,
+          const math_t alpha,
+          const math_t beta,
+          cudaStream_t stream)
+{
   cublasHandle_t cublas_h = handle.get_cublas_handle();
-  cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
-  CUBLAS_CHECK(cublasgemv(cublas_h, op_a, n_rows, n_cols, &alpha, A, n_rows, x,
-                          incx, &beta, y, incy, stream));
+  cublasOperation_t op_a  = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
+  CUBLAS_CHECK(
+    cublasgemv(cublas_h, op_a, n_rows, n_cols, &alpha, A, n_rows, x, incx, &beta, y, incy, stream));
 }
 
 /**
@@ -53,9 +62,17 @@ void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows,
  * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`.
  */
 template <typename math_t>
-void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows_a,
-          const int n_cols_a, const math_t *x, math_t *y, const bool trans_a,
-          const math_t alpha, const math_t beta, cudaStream_t stream) {
+void gemv(const raft::handle_t& handle,
+          const math_t* A,
+          const int n_rows_a,
+          const int n_cols_a,
+          const math_t* x,
+          math_t* y,
+          const bool trans_a,
+          const math_t alpha,
+          const math_t beta,
+          cudaStream_t stream)
+{
   gemv(handle, A, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream);
 }
 
@@ -72,11 +89,17 @@ void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows_a,
  * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`.
  */
 template <typename math_t>
-void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows_a,
-          const int n_cols_a, const math_t *x, math_t *y, const bool trans_a,
-          cudaStream_t stream) {
+void gemv(const raft::handle_t& handle,
+          const math_t* A,
+          const int n_rows_a,
+          const int n_cols_a,
+          const math_t* x,
+          math_t* y,
+          const bool trans_a,
+          cudaStream_t stream)
+{
   math_t alpha = math_t(1);
-  math_t beta = math_t(0);
+  math_t beta  = math_t(0);
 
   gemv(handle, A, n_rows_a, n_cols_a, x, 1, y, 1, trans_a, alpha, beta, stream);
 }
@@ -102,14 +125,22 @@ void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows_a,
  * @param y is a vector of size `trans_a ? n_cols_a : n_rows_a`.
  */
 template <typename math_t>
-void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows_a,
-          const int n_cols_a, const int lda, const math_t *x, math_t *y,
-          const bool trans_a, const math_t alpha, const math_t beta,
-          cudaStream_t stream) {
+void gemv(const raft::handle_t& handle,
+          const math_t* A,
+          const int n_rows_a,
+          const int n_cols_a,
+          const int lda,
+          const math_t* x,
+          math_t* y,
+          const bool trans_a,
+          const math_t alpha,
+          const math_t beta,
+          cudaStream_t stream)
+{
   cublasHandle_t cublas_h = handle.get_cublas_handle();
-  cublasOperation_t op_a = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
-  CUBLAS_CHECK(cublasgemv(cublas_h, op_a, n_rows_a, n_cols_a, &alpha, A, lda, x,
-                          1, &beta, y, 1, stream));
+  cublasOperation_t op_a  = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
+  CUBLAS_CHECK(
+    cublasgemv(cublas_h, op_a, n_rows_a, n_cols_a, &alpha, A, lda, x, 1, &beta, y, 1, stream));
 }
 
 /**
@@ -130,11 +161,18 @@ void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows_a,
  *
  */
 template <typename math_t>
-void gemv(const raft::handle_t &handle, const math_t *A, const int n_rows_a,
-          const int n_cols_a, const int lda, const math_t *x, math_t *y,
-          const bool trans_a, cudaStream_t stream) {
+void gemv(const raft::handle_t& handle,
+          const math_t* A,
+          const int n_rows_a,
+          const int n_cols_a,
+          const int lda,
+          const math_t* x,
+          math_t* y,
+          const bool trans_a,
+          cudaStream_t stream)
+{
   math_t alpha = math_t(1);
-  math_t beta = math_t(0);
+  math_t beta  = math_t(0);
   gemv(handle, A, n_rows_a, n_cols_a, lda, x, y, trans_a, alpha, beta, stream);
 }
 
diff --git a/cpp/include/raft/linalg/init.h b/cpp/include/raft/linalg/init.h
index 9944685a1f..41ef4d4641 100644
--- a/cpp/include/raft/linalg/init.h
+++ b/cpp/include/raft/linalg/init.h
@@ -37,7 +37,8 @@ namespace {
  * \param [in] stream cuda stream
  */
 template <typename T>
-void range(T *out, int start, int end, cudaStream_t stream) {
+void range(T* out, int start, int end, cudaStream_t stream)
+{
   thrust::counting_iterator<int> first(start);
   thrust::counting_iterator<int> last = first + (end - start);
   thrust::device_ptr<T> ptr(out);
@@ -54,7 +55,8 @@ void range(T *out, int start, int end, cudaStream_t stream) {
  * \param [in] stream cuda stream
  */
 template <typename T, int TPB = 256>
-void range(T *out, int n, cudaStream_t stream) {
+void range(T* out, int n, cudaStream_t stream)
+{
   range(out, 0, n, stream);
 }
 }  // unnamed namespace
diff --git a/cpp/include/raft/linalg/lanczos.hpp b/cpp/include/raft/linalg/lanczos.hpp
index b775a1f696..39089473e3 100644
--- a/cpp/include/raft/linalg/lanczos.hpp
+++ b/cpp/include/raft/linalg/lanczos.hpp
@@ -16,7 +16,7 @@
 
 #pragma once
 
-//for cmath:
+// for cmath:
 #define _USE_MATH_DEFINES
 
 #include <cmath>
@@ -40,14 +40,14 @@ using namespace linalg;
 namespace spectral {
 
 // curandGeneratorNormalX
-inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator,
-                                            float *outputPtr, size_t n,
-                                            float mean, float stddev) {
+inline curandStatus_t curandGenerateNormalX(
+  curandGenerator_t generator, float* outputPtr, size_t n, float mean, float stddev)
+{
   return curandGenerateNormal(generator, outputPtr, n, mean, stddev);
 }
-inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator,
-                                            double *outputPtr, size_t n,
-                                            double mean, double stddev) {
+inline curandStatus_t curandGenerateNormalX(
+  curandGenerator_t generator, double* outputPtr, size_t n, double mean, double stddev)
+{
   return curandGenerateNormalDouble(generator, outputPtr, n, mean, stddev);
 }
 
@@ -55,7 +55,7 @@ inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator,
 // Helper functions
 // =========================================================
 
-/**  
+/**
  *  @brief  Perform Lanczos iteration
  *    Lanczos iteration is performed on a shifted matrix A+shift*I.
  *  @tparam index_type_t the type of data used for indexing.
@@ -85,25 +85,30 @@ inline curandStatus_t curandGenerateNormalX(curandGenerator_t generator,
  *  @return Zero if successful. Otherwise non-zero.
  */
 template <typename index_type_t, typename value_type_t>
-int performLanczosIteration(
-  handle_t const &handle, sparse_matrix_t<index_type_t, value_type_t> const *A,
-  index_type_t *iter, index_type_t maxIter, value_type_t shift,
-  value_type_t tol, bool reorthogonalize, value_type_t *__restrict__ alpha_host,
-  value_type_t *__restrict__ beta_host,
-  value_type_t *__restrict__ lanczosVecs_dev,
-  value_type_t *__restrict__ work_dev) {
+int performLanczosIteration(handle_t const& handle,
+                            sparse_matrix_t<index_type_t, value_type_t> const* A,
+                            index_type_t* iter,
+                            index_type_t maxIter,
+                            value_type_t shift,
+                            value_type_t tol,
+                            bool reorthogonalize,
+                            value_type_t* __restrict__ alpha_host,
+                            value_type_t* __restrict__ beta_host,
+                            value_type_t* __restrict__ lanczosVecs_dev,
+                            value_type_t* __restrict__ work_dev)
+{
   // -------------------------------------------------------
   // Variable declaration
   // -------------------------------------------------------
 
   // Useful variables
-  constexpr value_type_t one = 1;
+  constexpr value_type_t one    = 1;
   constexpr value_type_t negOne = -1;
-  constexpr value_type_t zero = 0;
+  constexpr value_type_t zero   = 0;
   value_type_t alpha;
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream = handle.get_stream();
+  auto stream   = handle.get_stream();
 
   RAFT_EXPECTS(A != nullptr, "Null matrix pointer.");
 
@@ -117,29 +122,28 @@ int performLanczosIteration(
 
     // Apply matrix
     if (shift != 0)
-      CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n, lanczosVecs_dev,
+      CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + n,
+                               lanczosVecs_dev,
                                n * sizeof(value_type_t),
-                               cudaMemcpyDeviceToDevice, stream));
+                               cudaMemcpyDeviceToDevice,
+                               stream));
     A->mv(1, lanczosVecs_dev, shift, lanczosVecs_dev + n);
 
     // Orthogonalize Lanczos vector
-    CUBLAS_CHECK(cublasdot(cublas_h, n, lanczosVecs_dev, 1,
-                           lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host,
-                           stream));
+    CUBLAS_CHECK(cublasdot(
+      cublas_h, n, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, alpha_host, stream));
 
     alpha = -alpha_host[0];
-    CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha, lanczosVecs_dev, 1,
-                            lanczosVecs_dev + IDX(0, 1, n), 1, stream));
-    CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1,
-                            beta_host, stream));
+    CUBLAS_CHECK(cublasaxpy(
+      cublas_h, n, &alpha, lanczosVecs_dev, 1, lanczosVecs_dev + IDX(0, 1, n), 1, stream));
+    CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, 1, n), 1, beta_host, stream));
 
     // Check if Lanczos has converged
     if (beta_host[0] <= tol) return 0;
 
     // Normalize Lanczos vector
     alpha = 1 / beta_host[0];
-    CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n),
-                            1, stream));
+    CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, 1, n), 1, stream));
   }
 
   // -------------------------------------------------------
@@ -151,65 +155,121 @@ int performLanczosIteration(
 
     // Apply matrix
     if (shift != 0)
-      CUDA_TRY(cudaMemcpyAsync(
-        lanczosVecs_dev + (*iter) * n, lanczosVecs_dev + (*iter - 1) * n,
-        n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream));
-    A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift,
-          lanczosVecs_dev + IDX(0, *iter, n));
+      CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + (*iter) * n,
+                               lanczosVecs_dev + (*iter - 1) * n,
+                               n * sizeof(value_type_t),
+                               cudaMemcpyDeviceToDevice,
+                               stream));
+    A->mv(1, lanczosVecs_dev + IDX(0, *iter - 1, n), shift, lanczosVecs_dev + IDX(0, *iter, n));
 
     // Full reorthogonalization
     //   "Twice is enough" algorithm per Kahan and Parlett
     if (reorthogonalize) {
-      CUBLAS_CHECK(cublasgemv(
-        cublas_h, CUBLAS_OP_T, n, *iter, &one, lanczosVecs_dev, n,
-        lanczosVecs_dev + IDX(0, *iter, n), 1, &zero, work_dev, 1, stream));
-
-      CUBLAS_CHECK(cublasgemv(cublas_h, CUBLAS_OP_N, n, *iter, &negOne,
-                              lanczosVecs_dev, n, work_dev, 1, &one,
-                              lanczosVecs_dev + IDX(0, *iter, n), 1, stream));
-
-      CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1), work_dev + (*iter - 1),
-                               sizeof(value_type_t), cudaMemcpyDeviceToHost,
+      CUBLAS_CHECK(cublasgemv(cublas_h,
+                              CUBLAS_OP_T,
+                              n,
+                              *iter,
+                              &one,
+                              lanczosVecs_dev,
+                              n,
+                              lanczosVecs_dev + IDX(0, *iter, n),
+                              1,
+                              &zero,
+                              work_dev,
+                              1,
+                              stream));
+
+      CUBLAS_CHECK(cublasgemv(cublas_h,
+                              CUBLAS_OP_N,
+                              n,
+                              *iter,
+                              &negOne,
+                              lanczosVecs_dev,
+                              n,
+                              work_dev,
+                              1,
+                              &one,
+                              lanczosVecs_dev + IDX(0, *iter, n),
+                              1,
+                              stream));
+
+      CUDA_TRY(cudaMemcpyAsync(alpha_host + (*iter - 1),
+                               work_dev + (*iter - 1),
+                               sizeof(value_type_t),
+                               cudaMemcpyDeviceToHost,
                                stream));
 
-      CUBLAS_CHECK(cublasgemv(
-        cublas_h, CUBLAS_OP_T, n, *iter, &one, lanczosVecs_dev, n,
-        lanczosVecs_dev + IDX(0, *iter, n), 1, &zero, work_dev, 1, stream));
-
-      CUBLAS_CHECK(cublasgemv(cublas_h, CUBLAS_OP_N, n, *iter, &negOne,
-                              lanczosVecs_dev, n, work_dev, 1, &one,
-                              lanczosVecs_dev + IDX(0, *iter, n), 1, stream));
+      CUBLAS_CHECK(cublasgemv(cublas_h,
+                              CUBLAS_OP_T,
+                              n,
+                              *iter,
+                              &one,
+                              lanczosVecs_dev,
+                              n,
+                              lanczosVecs_dev + IDX(0, *iter, n),
+                              1,
+                              &zero,
+                              work_dev,
+                              1,
+                              stream));
+
+      CUBLAS_CHECK(cublasgemv(cublas_h,
+                              CUBLAS_OP_N,
+                              n,
+                              *iter,
+                              &negOne,
+                              lanczosVecs_dev,
+                              n,
+                              work_dev,
+                              1,
+                              &one,
+                              lanczosVecs_dev + IDX(0, *iter, n),
+                              1,
+                              stream));
     }
 
     // Orthogonalization with 3-term recurrence relation
     else {
-      CUBLAS_CHECK(cublasdot(cublas_h, n,
-                             lanczosVecs_dev + IDX(0, *iter - 1, n), 1,
-                             lanczosVecs_dev + IDX(0, *iter, n), 1,
-                             alpha_host + (*iter - 1), stream));
+      CUBLAS_CHECK(cublasdot(cublas_h,
+                             n,
+                             lanczosVecs_dev + IDX(0, *iter - 1, n),
+                             1,
+                             lanczosVecs_dev + IDX(0, *iter, n),
+                             1,
+                             alpha_host + (*iter - 1),
+                             stream));
 
       auto alpha = -alpha_host[*iter - 1];
-      CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha,
-                              lanczosVecs_dev + IDX(0, *iter - 1, n), 1,
-                              lanczosVecs_dev + IDX(0, *iter, n), 1, stream));
+      CUBLAS_CHECK(cublasaxpy(cublas_h,
+                              n,
+                              &alpha,
+                              lanczosVecs_dev + IDX(0, *iter - 1, n),
+                              1,
+                              lanczosVecs_dev + IDX(0, *iter, n),
+                              1,
+                              stream));
 
       alpha = -beta_host[*iter - 2];
-      CUBLAS_CHECK(cublasaxpy(cublas_h, n, &alpha,
-                              lanczosVecs_dev + IDX(0, *iter - 2, n), 1,
-                              lanczosVecs_dev + IDX(0, *iter, n), 1, stream));
+      CUBLAS_CHECK(cublasaxpy(cublas_h,
+                              n,
+                              &alpha,
+                              lanczosVecs_dev + IDX(0, *iter - 2, n),
+                              1,
+                              lanczosVecs_dev + IDX(0, *iter, n),
+                              1,
+                              stream));
     }
 
     // Compute residual
-    CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1,
-                            beta_host + *iter - 1, stream));
+    CUBLAS_CHECK(cublasnrm2(
+      cublas_h, n, lanczosVecs_dev + IDX(0, *iter, n), 1, beta_host + *iter - 1, stream));
 
     // Check if Lanczos has converged
     if (beta_host[*iter - 1] <= tol) break;
 
     // Normalize Lanczos vector
     alpha = 1 / beta_host[*iter - 1];
-    CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha,
-                            lanczosVecs_dev + IDX(0, *iter, n), 1, stream));
+    CUBLAS_CHECK(cublasscal(cublas_h, n, &alpha, lanczosVecs_dev + IDX(0, *iter, n), 1, stream));
   }
 
   CUDA_TRY(cudaStreamSynchronize(stream));
@@ -217,7 +277,7 @@ int performLanczosIteration(
   return 0;
 }
 
-/** 
+/**
  *  @brief  Find Householder transform for 3-dimensional system
  *    Given an input vector v=[x,y,z]', this function finds a
  *    Householder transform P such that P*v is a multiple of
@@ -235,8 +295,8 @@ int performLanczosIteration(
  *    matrix. Matrix dimensions are 3 x 3.
  */
 template <typename index_type_t, typename value_type_t>
-static void findHouseholder3(value_type_t *v, value_type_t *Pv,
-                             value_type_t *P) {
+static void findHouseholder3(value_type_t* v, value_type_t* Pv, value_type_t* P)
+{
   // Compute norm of vector
   *Pv = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
 
@@ -246,8 +306,7 @@ static void findHouseholder3(value_type_t *v, value_type_t *Pv,
   v[0] -= *Pv;
 
   // Normalize Householder vector
-  value_type_t normHouseholder =
-    std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
+  value_type_t normHouseholder = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]);
   if (normHouseholder != 0) {
     v[0] /= normHouseholder;
     v[1] /= normHouseholder;
@@ -261,11 +320,13 @@ static void findHouseholder3(value_type_t *v, value_type_t *Pv,
   // Construct Householder matrix
   index_type_t i, j;
   for (j = 0; j < 3; ++j)
-    for (i = 0; i < 3; ++i) P[IDX(i, j, 3)] = -2 * v[i] * v[j];
-  for (i = 0; i < 3; ++i) P[IDX(i, i, 3)] += 1;
+    for (i = 0; i < 3; ++i)
+      P[IDX(i, j, 3)] = -2 * v[i] * v[j];
+  for (i = 0; i < 3; ++i)
+    P[IDX(i, i, 3)] += 1;
 }
 
-/**  
+/**
  *  @brief  Apply 3-dimensional Householder transform to 4 x 4 matrix
  *    The Householder transform is pre-applied to the top three rows
  *  of the matrix and post-applied to the left three columns. The
@@ -277,7 +338,8 @@ static void findHouseholder3(value_type_t *v, value_type_t *Pv,
  *  @param A (Input/output, host memory, 16 entries) 4 x 4 matrix.
  */
 template <typename index_type_t, typename value_type_t>
-static void applyHouseholder3(const value_type_t *v, value_type_t *A) {
+static void applyHouseholder3(const value_type_t* v, value_type_t* A)
+{
   // Loop indices
   index_type_t i, j;
   // Dot product between Householder vector and matrix row/column
@@ -286,19 +348,23 @@ static void applyHouseholder3(const value_type_t *v, value_type_t *A) {
   // Pre-apply Householder transform
   for (j = 0; j < 4; ++j) {
     vDotA = 0;
-    for (i = 0; i < 3; ++i) vDotA += v[i] * A[IDX(i, j, 4)];
-    for (i = 0; i < 3; ++i) A[IDX(i, j, 4)] -= 2 * v[i] * vDotA;
+    for (i = 0; i < 3; ++i)
+      vDotA += v[i] * A[IDX(i, j, 4)];
+    for (i = 0; i < 3; ++i)
+      A[IDX(i, j, 4)] -= 2 * v[i] * vDotA;
   }
 
   // Post-apply Householder transform
   for (i = 0; i < 4; ++i) {
     vDotA = 0;
-    for (j = 0; j < 3; ++j) vDotA += A[IDX(i, j, 4)] * v[j];
-    for (j = 0; j < 3; ++j) A[IDX(i, j, 4)] -= 2 * vDotA * v[j];
+    for (j = 0; j < 3; ++j)
+      vDotA += A[IDX(i, j, 4)] * v[j];
+    for (j = 0; j < 3; ++j)
+      A[IDX(i, j, 4)] -= 2 * vDotA * v[j];
   }
 }
 
-/**  
+/**
  *  @brief  Perform one step of Francis QR algorithm
  *    Equivalent to two steps of the classical QR algorithm on a
  *    tridiagonal matrix.
@@ -319,10 +385,14 @@ static void applyHouseholder3(const value_type_t *v, value_type_t *A) {
  *  @return Zero if successful. Otherwise non-zero.
  */
 template <typename index_type_t, typename value_type_t>
-static int francisQRIteration(index_type_t n, value_type_t shift1,
-                              value_type_t shift2, value_type_t *alpha,
-                              value_type_t *beta, value_type_t *V,
-                              value_type_t *work) {
+static int francisQRIteration(index_type_t n,
+                              value_type_t shift1,
+                              value_type_t shift2,
+                              value_type_t* alpha,
+                              value_type_t* beta,
+                              value_type_t* V,
+                              value_type_t* work)
+{
   // -------------------------------------------------------
   // Variable declaration
   // -------------------------------------------------------
@@ -352,30 +422,30 @@ static int francisQRIteration(index_type_t n, value_type_t shift1,
   householder[0] = alpha[0] * alpha[0] + beta[0] * beta[0] + b * alpha[0] + c;
   householder[1] = beta[0] * (alpha[0] + alpha[1] + b);
   householder[2] = beta[0] * beta[1];
-  findHouseholder3<index_type_t, value_type_t>(householder, &temp,
-                                               householderMatrix);
+  findHouseholder3<index_type_t, value_type_t>(householder, &temp, householderMatrix);
 
   // Apply initial Householder transform to create bulge
   memset(bulge, 0, 16 * sizeof(value_type_t));
-  for (i = 0; i < 4; ++i) bulge[IDX(i, i, 4)] = alpha[i];
+  for (i = 0; i < 4; ++i)
+    bulge[IDX(i, i, 4)] = alpha[i];
   for (i = 0; i < 3; ++i) {
     bulge[IDX(i + 1, i, 4)] = beta[i];
     bulge[IDX(i, i + 1, 4)] = beta[i];
   }
   applyHouseholder3<index_type_t, value_type_t>(householder, bulge);
-  Lapack<value_type_t>::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix,
-                             3, 0, work, n);
+  Lapack<value_type_t>::gemm(false, false, n, 3, 3, 1, V, n, householderMatrix, 3, 0, work, n);
   memcpy(V, work, 3 * n * sizeof(value_type_t));
 
   // Chase bulge to bottom-right of matrix with Householder transforms
   for (pos = 0; pos < n - 4; ++pos) {
     // Move to next position
-    alpha[pos] = bulge[IDX(0, 0, 4)];
+    alpha[pos]     = bulge[IDX(0, 0, 4)];
     householder[0] = bulge[IDX(1, 0, 4)];
     householder[1] = bulge[IDX(2, 0, 4)];
     householder[2] = bulge[IDX(3, 0, 4)];
     for (j = 0; j < 3; ++j)
-      for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)];
+      for (i = 0; i < 3; ++i)
+        bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)];
     bulge[IDX(3, 0, 4)] = 0;
     bulge[IDX(3, 1, 4)] = 0;
     bulge[IDX(3, 2, 4)] = beta[pos + 3];
@@ -385,22 +455,22 @@ static int francisQRIteration(index_type_t n, value_type_t shift1,
     bulge[IDX(3, 3, 4)] = alpha[pos + 4];
 
     // Apply Householder transform
-    findHouseholder3<index_type_t, value_type_t>(householder, beta + pos,
-                                                 householderMatrix);
+    findHouseholder3<index_type_t, value_type_t>(householder, beta + pos, householderMatrix);
     applyHouseholder3<index_type_t, value_type_t>(householder, bulge);
-    Lapack<value_type_t>::gemm(false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n),
-                               n, householderMatrix, 3, 0, work, n);
+    Lapack<value_type_t>::gemm(
+      false, false, n, 3, 3, 1, V + IDX(0, pos + 1, n), n, householderMatrix, 3, 0, work, n);
     memcpy(V + IDX(0, pos + 1, n), work, 3 * n * sizeof(value_type_t));
   }
 
   // Apply penultimate Householder transform
   //   Values in the last row and column are zero
-  alpha[n - 4] = bulge[IDX(0, 0, 4)];
+  alpha[n - 4]   = bulge[IDX(0, 0, 4)];
   householder[0] = bulge[IDX(1, 0, 4)];
   householder[1] = bulge[IDX(2, 0, 4)];
   householder[2] = bulge[IDX(3, 0, 4)];
   for (j = 0; j < 3; ++j)
-    for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)];
+    for (i = 0; i < 3; ++i)
+      bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)];
   bulge[IDX(3, 0, 4)] = 0;
   bulge[IDX(3, 1, 4)] = 0;
   bulge[IDX(3, 2, 4)] = 0;
@@ -408,37 +478,36 @@ static int francisQRIteration(index_type_t n, value_type_t shift1,
   bulge[IDX(1, 3, 4)] = 0;
   bulge[IDX(2, 3, 4)] = 0;
   bulge[IDX(3, 3, 4)] = 0;
-  findHouseholder3<index_type_t, value_type_t>(householder, beta + n - 4,
-                                               householderMatrix);
+  findHouseholder3<index_type_t, value_type_t>(householder, beta + n - 4, householderMatrix);
   applyHouseholder3<index_type_t, value_type_t>(householder, bulge);
-  Lapack<value_type_t>::gemm(false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n,
-                             householderMatrix, 3, 0, work, n);
+  Lapack<value_type_t>::gemm(
+    false, false, n, 3, 3, 1, V + IDX(0, n - 3, n), n, householderMatrix, 3, 0, work, n);
   memcpy(V + IDX(0, n - 3, n), work, 3 * n * sizeof(value_type_t));
 
   // Apply final Householder transform
   //   Values in the last two rows and columns are zero
-  alpha[n - 3] = bulge[IDX(0, 0, 4)];
+  alpha[n - 3]   = bulge[IDX(0, 0, 4)];
   householder[0] = bulge[IDX(1, 0, 4)];
   householder[1] = bulge[IDX(2, 0, 4)];
   householder[2] = 0;
   for (j = 0; j < 3; ++j)
-    for (i = 0; i < 3; ++i) bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)];
-  findHouseholder3<index_type_t, value_type_t>(householder, beta + n - 3,
-                                               householderMatrix);
+    for (i = 0; i < 3; ++i)
+      bulge[IDX(i, j, 4)] = bulge[IDX(i + 1, j + 1, 4)];
+  findHouseholder3<index_type_t, value_type_t>(householder, beta + n - 3, householderMatrix);
   applyHouseholder3<index_type_t, value_type_t>(householder, bulge);
-  Lapack<value_type_t>::gemm(false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n,
-                             householderMatrix, 3, 0, work, n);
+  Lapack<value_type_t>::gemm(
+    false, false, n, 2, 2, 1, V + IDX(0, n - 2, n), n, householderMatrix, 3, 0, work, n);
   memcpy(V + IDX(0, n - 2, n), work, 2 * n * sizeof(value_type_t));
 
   // Bulge has been eliminated
   alpha[n - 2] = bulge[IDX(0, 0, 4)];
   alpha[n - 1] = bulge[IDX(1, 1, 4)];
-  beta[n - 2] = bulge[IDX(1, 0, 4)];
+  beta[n - 2]  = bulge[IDX(1, 0, 4)];
 
   return 0;
 }
 
-/**  
+/**
  *  @brief  Perform implicit restart of Lanczos algorithm
  *    Shifts are Chebyshev nodes of unwanted region of matrix spectrum.
  *  @tparam index_type_t the type of data used for indexing.
@@ -474,23 +543,30 @@ static int francisQRIteration(index_type_t n, value_type_t shift1,
  *  @return error flag.
  */
 template <typename index_type_t, typename value_type_t>
-static int lanczosRestart(
-  handle_t const &handle, index_type_t n, index_type_t iter,
-  index_type_t iter_new, value_type_t *shiftUpper, value_type_t *shiftLower,
-  value_type_t *__restrict__ alpha_host, value_type_t *__restrict__ beta_host,
-  value_type_t *__restrict__ V_host, value_type_t *__restrict__ work_host,
-  value_type_t *__restrict__ lanczosVecs_dev,
-  value_type_t *__restrict__ work_dev, bool smallest_eig) {
+static int lanczosRestart(handle_t const& handle,
+                          index_type_t n,
+                          index_type_t iter,
+                          index_type_t iter_new,
+                          value_type_t* shiftUpper,
+                          value_type_t* shiftLower,
+                          value_type_t* __restrict__ alpha_host,
+                          value_type_t* __restrict__ beta_host,
+                          value_type_t* __restrict__ V_host,
+                          value_type_t* __restrict__ work_host,
+                          value_type_t* __restrict__ lanczosVecs_dev,
+                          value_type_t* __restrict__ work_dev,
+                          bool smallest_eig)
+{
   // -------------------------------------------------------
   // Variable declaration
   // -------------------------------------------------------
 
   // Useful constants
   constexpr value_type_t zero = 0;
-  constexpr value_type_t one = 1;
+  constexpr value_type_t one  = 1;
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream = handle.get_stream();
+  auto stream   = handle.get_stream();
 
   // Loop index
   index_type_t i;
@@ -501,12 +577,12 @@ static int lanczosRestart(
   index_type_t restartSteps = iter - iter_new;
 
   // Ritz values from Lanczos method
-  value_type_t *ritzVals_host = work_host + 3 * iter;
+  value_type_t* ritzVals_host = work_host + 3 * iter;
   // Shifts for implicit restart
-  value_type_t *shifts_host;
+  value_type_t* shifts_host;
 
   // Orthonormal matrix for similarity transform
-  value_type_t *V_dev = work_dev + n * iter;
+  value_type_t* V_dev = work_dev + n * iter;
 
   // -------------------------------------------------------
   // Implementation
@@ -524,7 +600,8 @@ static int lanczosRestart(
 
   // Initialize similarity transform with identity matrix
   memset(V_host, 0, iter * iter * sizeof(value_type_t));
-  for (i = 0; i < iter; ++i) V_host[IDX(i, i, iter)] = 1;
+  for (i = 0; i < iter; ++i)
+    V_host[IDX(i, i, iter)] = 1;
 
   // Determine interval to suppress eigenvalues
   if (smallest_eig) {
@@ -548,49 +625,71 @@ static int lanczosRestart(
   // Calculate Chebyshev nodes as shifts
   shifts_host = ritzVals_host;
   for (i = 0; i < restartSteps; ++i) {
-    shifts_host[i] =
-      cos((i + 0.5) * static_cast<value_type_t>(M_PI) / restartSteps);
+    shifts_host[i] = cos((i + 0.5) * static_cast<value_type_t>(M_PI) / restartSteps);
     shifts_host[i] *= 0.5 * ((*shiftUpper) - (*shiftLower));
     shifts_host[i] += 0.5 * ((*shiftUpper) + (*shiftLower));
   }
 
   // Apply Francis QR algorithm to implicitly restart Lanczos
   for (i = 0; i < restartSteps; i += 2)
-    if (francisQRIteration(iter, shifts_host[i], shifts_host[i + 1], alpha_host,
-                           beta_host, V_host, work_host))
+    if (francisQRIteration(
+          iter, shifts_host[i], shifts_host[i + 1], alpha_host, beta_host, V_host, work_host))
       WARNING("error in implicitly shifted QR algorithm");
 
   // Obtain new residual
-  CUDA_TRY(cudaMemcpyAsync(V_dev, V_host, iter * iter * sizeof(value_type_t),
-                           cudaMemcpyHostToDevice, stream));
-
-  beta_host[iter - 1] =
-    beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)];
-  CUBLAS_CHECK(cublasgemv(
-    cublas_h, CUBLAS_OP_N, n, iter, beta_host + iter_new - 1, lanczosVecs_dev,
-    n, V_dev + IDX(0, iter_new, iter), 1, beta_host + iter - 1,
-    lanczosVecs_dev + IDX(0, iter, n), 1, stream));
+  CUDA_TRY(cudaMemcpyAsync(
+    V_dev, V_host, iter * iter * sizeof(value_type_t), cudaMemcpyHostToDevice, stream));
+
+  beta_host[iter - 1] = beta_host[iter - 1] * V_host[IDX(iter - 1, iter_new - 1, iter)];
+  CUBLAS_CHECK(cublasgemv(cublas_h,
+                          CUBLAS_OP_N,
+                          n,
+                          iter,
+                          beta_host + iter_new - 1,
+                          lanczosVecs_dev,
+                          n,
+                          V_dev + IDX(0, iter_new, iter),
+                          1,
+                          beta_host + iter - 1,
+                          lanczosVecs_dev + IDX(0, iter, n),
+                          1,
+                          stream));
 
   // Obtain new Lanczos vectors
-  CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, iter_new, iter,
-                          &one, lanczosVecs_dev, n, V_dev, iter, &zero,
-                          work_dev, n, stream));
-
-  CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev, work_dev,
+  CUBLAS_CHECK(cublasgemm(cublas_h,
+                          CUBLAS_OP_N,
+                          CUBLAS_OP_N,
+                          n,
+                          iter_new,
+                          iter,
+                          &one,
+                          lanczosVecs_dev,
+                          n,
+                          V_dev,
+                          iter,
+                          &zero,
+                          work_dev,
+                          n,
+                          stream));
+
+  CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev,
+                           work_dev,
                            n * iter_new * sizeof(value_type_t),
-                           cudaMemcpyDeviceToDevice, stream));
+                           cudaMemcpyDeviceToDevice,
+                           stream));
 
   // Normalize residual to obtain new Lanczos vector
-  CUDA_TRY(cudaMemcpyAsync(
-    lanczosVecs_dev + IDX(0, iter_new, n), lanczosVecs_dev + IDX(0, iter, n),
-    n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream));
+  CUDA_TRY(cudaMemcpyAsync(lanczosVecs_dev + IDX(0, iter_new, n),
+                           lanczosVecs_dev + IDX(0, iter, n),
+                           n * sizeof(value_type_t),
+                           cudaMemcpyDeviceToDevice,
+                           stream));
 
-  CUBLAS_CHECK(cublasnrm2(cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1,
-                          beta_host + iter_new - 1, stream));
+  CUBLAS_CHECK(cublasnrm2(
+    cublas_h, n, lanczosVecs_dev + IDX(0, iter_new, n), 1, beta_host + iter_new - 1, stream));
 
   auto h_beta = 1 / beta_host[iter_new - 1];
-  CUBLAS_CHECK(cublasscal(cublas_h, n, &h_beta,
-                          lanczosVecs_dev + IDX(0, iter_new, n), 1, stream));
+  CUBLAS_CHECK(cublasscal(cublas_h, n, &h_beta, lanczosVecs_dev + IDX(0, iter_new, n), 1, stream));
 
   return 0;
 }
@@ -601,7 +700,7 @@ static int lanczosRestart(
 // Eigensolver
 // =========================================================
 
-/**  
+/**
  * @brief  Compute smallest eigenvectors of symmetric matrix
  *    Computes eigenvalues and eigenvectors that are least
  *    positive. If matrix is positive definite or positive
@@ -651,19 +750,28 @@ static int lanczosRestart(
  *  @return error flag.
  */
 template <typename index_type_t, typename value_type_t>
-int computeSmallestEigenvectors(
-  handle_t const &handle, sparse_matrix_t<index_type_t, value_type_t> const *A,
-  index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter,
-  value_type_t tol, bool reorthogonalize, index_type_t *effIter,
-  index_type_t *totalIter, value_type_t *shift,
-  value_type_t *__restrict__ alpha_host, value_type_t *__restrict__ beta_host,
-  value_type_t *__restrict__ lanczosVecs_dev,
-  value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev,
-  value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) {
+int computeSmallestEigenvectors(handle_t const& handle,
+                                sparse_matrix_t<index_type_t, value_type_t> const* A,
+                                index_type_t nEigVecs,
+                                index_type_t maxIter,
+                                index_type_t restartIter,
+                                value_type_t tol,
+                                bool reorthogonalize,
+                                index_type_t* effIter,
+                                index_type_t* totalIter,
+                                value_type_t* shift,
+                                value_type_t* __restrict__ alpha_host,
+                                value_type_t* __restrict__ beta_host,
+                                value_type_t* __restrict__ lanczosVecs_dev,
+                                value_type_t* __restrict__ work_dev,
+                                value_type_t* __restrict__ eigVals_dev,
+                                value_type_t* __restrict__ eigVecs_dev,
+                                unsigned long long seed)
+{
   using namespace spectral;
 
   // Useful constants
-  constexpr value_type_t one = 1;
+  constexpr value_type_t one  = 1;
   constexpr value_type_t zero = 0;
 
   // Matrix dimension
@@ -683,21 +791,20 @@ int computeSmallestEigenvectors(
   index_type_t i;
 
   // Host memory
-  value_type_t *Z_host;     // Eigenvectors in Lanczos basis
-  value_type_t *work_host;  // Workspace
+  value_type_t* Z_host;     // Eigenvectors in Lanczos basis
+  value_type_t* work_host;  // Workspace
 
   // -------------------------------------------------------
   // Check that parameters are valid
   // -------------------------------------------------------
-  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n,
-               "Invalid number of eigenvectors.");
+  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors.");
   RAFT_EXPECTS(restartIter > 0, "Invalid restartIter.");
   RAFT_EXPECTS(tol > 0, "Invalid tolerance.");
   RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter.");
   RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter.");
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream = handle.get_stream();
+  auto stream   = handle.get_stream();
 
   // -------------------------------------------------------
   // Variable initialization
@@ -710,12 +817,11 @@ int computeSmallestEigenvectors(
   std::vector<value_type_t> Z_host_v(restartIter * restartIter);
   std::vector<value_type_t> work_host_v(4 * restartIter);
 
-  Z_host = Z_host_v.data();
+  Z_host    = Z_host_v.data();
   work_host = work_host_v.data();
 
   // Initialize cuBLAS
-  CUBLAS_CHECK(
-    cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+  CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
 
   // -------------------------------------------------------
   // Compute largest eigenvalue to determine shift
@@ -738,10 +844,18 @@ int computeSmallestEigenvectors(
 
   // Obtain tridiagonal matrix with Lanczos
   *effIter = 0;
-  *shift = 0;
-  status = performLanczosIteration<index_type_t, value_type_t>(
-    handle, A, effIter, maxIter_curr, *shift, 0.0, reorthogonalize, alpha_host,
-    beta_host, lanczosVecs_dev, work_dev);
+  *shift   = 0;
+  status   = performLanczosIteration<index_type_t, value_type_t>(handle,
+                                                               A,
+                                                               effIter,
+                                                               maxIter_curr,
+                                                               *shift,
+                                                               0.0,
+                                                               reorthogonalize,
+                                                               alpha_host,
+                                                               beta_host,
+                                                               lanczosVecs_dev,
+                                                               work_dev);
   if (status) WARNING("error in Lanczos iteration");
 
   // Determine largest eigenvalue
@@ -756,9 +870,17 @@ int computeSmallestEigenvectors(
   // Obtain tridiagonal matrix with Lanczos
   *effIter = 0;
 
-  status = performLanczosIteration<index_type_t, value_type_t>(
-    handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host,
-    beta_host, lanczosVecs_dev, work_dev);
+  status = performLanczosIteration<index_type_t, value_type_t>(handle,
+                                                               A,
+                                                               effIter,
+                                                               maxIter_curr,
+                                                               *shift,
+                                                               0,
+                                                               reorthogonalize,
+                                                               alpha_host,
+                                                               beta_host,
+                                                               lanczosVecs_dev,
+                                                               work_dev);
   if (status) WARNING("error in Lanczos iteration");
   *totalIter += *effIter;
 
@@ -775,9 +897,19 @@ int computeSmallestEigenvectors(
     if (iter_new == *effIter) break;
 
     // Implicit restart of Lanczos method
-    status = lanczosRestart<index_type_t, value_type_t>(
-      handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host,
-      beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, true);
+    status = lanczosRestart<index_type_t, value_type_t>(handle,
+                                                        n,
+                                                        *effIter,
+                                                        iter_new,
+                                                        &shiftUpper,
+                                                        &shiftLower,
+                                                        alpha_host,
+                                                        beta_host,
+                                                        Z_host,
+                                                        work_host,
+                                                        lanczosVecs_dev,
+                                                        work_dev,
+                                                        true);
     if (status) WARNING("error in Lanczos implicit restart");
     *effIter = iter_new;
 
@@ -786,9 +918,17 @@ int computeSmallestEigenvectors(
 
     // Proceed with Lanczos method
 
-    status = performLanczosIteration<index_type_t, value_type_t>(
-      handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower),
-      reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev);
+    status = performLanczosIteration<index_type_t, value_type_t>(handle,
+                                                                 A,
+                                                                 effIter,
+                                                                 maxIter_curr,
+                                                                 *shift,
+                                                                 tol * fabs(shiftLower),
+                                                                 reorthogonalize,
+                                                                 alpha_host,
+                                                                 beta_host,
+                                                                 lanczosVecs_dev,
+                                                                 work_dev);
     if (status) WARNING("error in Lanczos iteration");
     *totalIter += *effIter - iter_new;
   }
@@ -799,39 +939,59 @@ int computeSmallestEigenvectors(
   }
 
   // Solve tridiagonal system
-  memcpy(work_host + 2 * (*effIter), alpha_host,
-         (*effIter) * sizeof(value_type_t));
-  memcpy(work_host + 3 * (*effIter), beta_host,
-         (*effIter - 1) * sizeof(value_type_t));
-  Lapack<value_type_t>::steqr('I', *effIter, work_host + 2 * (*effIter),
-                              work_host + 3 * (*effIter), Z_host, *effIter,
+  memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(value_type_t));
+  memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(value_type_t));
+  Lapack<value_type_t>::steqr('I',
+                              *effIter,
+                              work_host + 2 * (*effIter),
+                              work_host + 3 * (*effIter),
+                              Z_host,
+                              *effIter,
                               work_host);
 
   // Obtain desired eigenvalues by applying shift
-  for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift;
-  for (i = *effIter; i < nEigVecs; ++i) work_host[i + 2 * (*effIter)] = 0;
+  for (i = 0; i < *effIter; ++i)
+    work_host[i + 2 * (*effIter)] -= *shift;
+  for (i = *effIter; i < nEigVecs; ++i)
+    work_host[i + 2 * (*effIter)] = 0;
 
   // Copy results to device memory
-  CUDA_TRY(cudaMemcpyAsync(eigVals_dev, work_host + 2 * (*effIter),
+  CUDA_TRY(cudaMemcpyAsync(eigVals_dev,
+                           work_host + 2 * (*effIter),
                            nEigVecs * sizeof(value_type_t),
-                           cudaMemcpyHostToDevice, stream));
+                           cudaMemcpyHostToDevice,
+                           stream));
 
-  CUDA_TRY(cudaMemcpyAsync(work_dev, Z_host,
+  CUDA_TRY(cudaMemcpyAsync(work_dev,
+                           Z_host,
                            (*effIter) * nEigVecs * sizeof(value_type_t),
-                           cudaMemcpyHostToDevice, stream));
+                           cudaMemcpyHostToDevice,
+                           stream));
   CHECK_CUDA(stream);
 
   // Convert eigenvectors from Lanczos basis to standard basis
-  CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs,
-                          *effIter, &one, lanczosVecs_dev, n, work_dev,
-                          *effIter, &zero, eigVecs_dev, n, stream));
+  CUBLAS_CHECK(cublasgemm(cublas_h,
+                          CUBLAS_OP_N,
+                          CUBLAS_OP_N,
+                          n,
+                          nEigVecs,
+                          *effIter,
+                          &one,
+                          lanczosVecs_dev,
+                          n,
+                          work_dev,
+                          *effIter,
+                          &zero,
+                          eigVecs_dev,
+                          n,
+                          stream));
 
   // Clean up and exit
   curandDestroyGenerator(randGen);
   return 0;
 }
 
-/**  
+/**
  *  @brief  Compute smallest eigenvectors of symmetric matrix
  *    Computes eigenvalues and eigenvectors that are least
  *    positive. If matrix is positive definite or positive
@@ -869,20 +1029,25 @@ int computeSmallestEigenvectors(
  *  @return error flag.
  */
 template <typename index_type_t, typename value_type_t>
-int computeSmallestEigenvectors(
-  handle_t const &handle, sparse_matrix_t<index_type_t, value_type_t> const &A,
-  index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter,
-  value_type_t tol, bool reorthogonalize, index_type_t &iter,
-  value_type_t *__restrict__ eigVals_dev,
-  value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 1234567) {
+int computeSmallestEigenvectors(handle_t const& handle,
+                                sparse_matrix_t<index_type_t, value_type_t> const& A,
+                                index_type_t nEigVecs,
+                                index_type_t maxIter,
+                                index_type_t restartIter,
+                                value_type_t tol,
+                                bool reorthogonalize,
+                                index_type_t& iter,
+                                value_type_t* __restrict__ eigVals_dev,
+                                value_type_t* __restrict__ eigVecs_dev,
+                                unsigned long long seed = 1234567)
+{
   using namespace spectral;
 
   // Matrix dimension
   index_type_t n = A.nrows_;
 
   // Check that parameters are valid
-  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n,
-               "Invalid number of eigenvectors.");
+  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors.");
   RAFT_EXPECTS(restartIter > 0, "Invalid restartIter.");
   RAFT_EXPECTS(tol > 0, "Invalid tolerance.");
   RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter.");
@@ -892,8 +1057,8 @@ int computeSmallestEigenvectors(
   std::vector<value_type_t> alpha_host_v(restartIter);
   std::vector<value_type_t> beta_host_v(restartIter);
 
-  value_type_t *alpha_host = alpha_host_v.data();
-  value_type_t *beta_host = beta_host_v.data();
+  value_type_t* alpha_host = alpha_host_v.data();
+  value_type_t* beta_host  = beta_host_v.data();
 
   vector_t<value_type_t> lanczosVecs_dev(handle, n * (restartIter + 1));
   vector_t<value_type_t> work_dev(handle, (n + restartIter) * restartIter);
@@ -901,10 +1066,23 @@ int computeSmallestEigenvectors(
   // Perform Lanczos method
   index_type_t effIter;
   value_type_t shift;
-  int status = computeSmallestEigenvectors(
-    handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter,
-    &iter, &shift, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(),
-    eigVals_dev, eigVecs_dev, seed);
+  int status = computeSmallestEigenvectors(handle,
+                                           &A,
+                                           nEigVecs,
+                                           maxIter,
+                                           restartIter,
+                                           tol,
+                                           reorthogonalize,
+                                           &effIter,
+                                           &iter,
+                                           &shift,
+                                           alpha_host,
+                                           beta_host,
+                                           lanczosVecs_dev.raw(),
+                                           work_dev.raw(),
+                                           eigVals_dev,
+                                           eigVecs_dev,
+                                           seed);
 
   // Clean up and return
   return status;
@@ -914,7 +1092,7 @@ int computeSmallestEigenvectors(
 // Eigensolver
 // =========================================================
 
-/**  
+/**
  *  @brief Compute largest eigenvectors of symmetric matrix
  *    Computes eigenvalues and eigenvectors that are least
  *    positive. If matrix is positive definite or positive
@@ -959,19 +1137,27 @@ int computeSmallestEigenvectors(
  *  @return error flag.
  */
 template <typename index_type_t, typename value_type_t>
-int computeLargestEigenvectors(
-  handle_t const &handle, sparse_matrix_t<index_type_t, value_type_t> const *A,
-  index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter,
-  value_type_t tol, bool reorthogonalize, index_type_t *effIter,
-  index_type_t *totalIter, value_type_t *__restrict__ alpha_host,
-  value_type_t *__restrict__ beta_host,
-  value_type_t *__restrict__ lanczosVecs_dev,
-  value_type_t *__restrict__ work_dev, value_type_t *__restrict__ eigVals_dev,
-  value_type_t *__restrict__ eigVecs_dev, unsigned long long seed) {
+int computeLargestEigenvectors(handle_t const& handle,
+                               sparse_matrix_t<index_type_t, value_type_t> const* A,
+                               index_type_t nEigVecs,
+                               index_type_t maxIter,
+                               index_type_t restartIter,
+                               value_type_t tol,
+                               bool reorthogonalize,
+                               index_type_t* effIter,
+                               index_type_t* totalIter,
+                               value_type_t* __restrict__ alpha_host,
+                               value_type_t* __restrict__ beta_host,
+                               value_type_t* __restrict__ lanczosVecs_dev,
+                               value_type_t* __restrict__ work_dev,
+                               value_type_t* __restrict__ eigVals_dev,
+                               value_type_t* __restrict__ eigVecs_dev,
+                               unsigned long long seed)
+{
   using namespace spectral;
 
   // Useful constants
-  constexpr value_type_t one = 1;
+  constexpr value_type_t one  = 1;
   constexpr value_type_t zero = 0;
 
   // Matrix dimension
@@ -987,8 +1173,8 @@ int computeLargestEigenvectors(
   index_type_t i;
 
   // Host memory
-  value_type_t *Z_host;     // Eigenvectors in Lanczos basis
-  value_type_t *work_host;  // Workspace
+  value_type_t* Z_host;     // Eigenvectors in Lanczos basis
+  value_type_t* work_host;  // Workspace
 
   // -------------------------------------------------------
   // Check that LAPACK is enabled
@@ -998,15 +1184,14 @@ int computeLargestEigenvectors(
   // -------------------------------------------------------
   // Check that parameters are valid
   // -------------------------------------------------------
-  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n,
-               "Invalid number of eigenvectors.");
+  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors.");
   RAFT_EXPECTS(restartIter > 0, "Invalid restartIter.");
   RAFT_EXPECTS(tol > 0, "Invalid tolerance.");
   RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter.");
   RAFT_EXPECTS(restartIter >= nEigVecs, "Invalid restartIter.");
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream = handle.get_stream();
+  auto stream   = handle.get_stream();
 
   // -------------------------------------------------------
   // Variable initialization
@@ -1019,12 +1204,11 @@ int computeLargestEigenvectors(
   std::vector<value_type_t> Z_host_v(restartIter * restartIter);
   std::vector<value_type_t> work_host_v(4 * restartIter);
 
-  Z_host = Z_host_v.data();
+  Z_host    = Z_host_v.data();
   work_host = work_host_v.data();
 
   // Initialize cuBLAS
-  CUBLAS_CHECK(
-    cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+  CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
 
   // -------------------------------------------------------
   // Compute largest eigenvalue
@@ -1044,13 +1228,21 @@ int computeLargestEigenvectors(
   CUBLAS_CHECK(cublasscal(cublas_h, n, &h_val, lanczosVecs_dev, 1, stream));
 
   // Obtain tridiagonal matrix with Lanczos
-  *effIter = 0;
+  *effIter               = 0;
   value_type_t shift_val = 0.0;
-  value_type_t *shift = &shift_val;
-
-  status = performLanczosIteration<index_type_t, value_type_t>(
-    handle, A, effIter, maxIter_curr, *shift, 0, reorthogonalize, alpha_host,
-    beta_host, lanczosVecs_dev, work_dev);
+  value_type_t* shift    = &shift_val;
+
+  status = performLanczosIteration<index_type_t, value_type_t>(handle,
+                                                               A,
+                                                               effIter,
+                                                               maxIter_curr,
+                                                               *shift,
+                                                               0,
+                                                               reorthogonalize,
+                                                               alpha_host,
+                                                               beta_host,
+                                                               lanczosVecs_dev,
+                                                               work_dev);
   if (status) WARNING("error in Lanczos iteration");
   *totalIter += *effIter;
 
@@ -1067,9 +1259,19 @@ int computeLargestEigenvectors(
     if (iter_new == *effIter) break;
 
     // Implicit restart of Lanczos method
-    status = lanczosRestart<index_type_t, value_type_t>(
-      handle, n, *effIter, iter_new, &shiftUpper, &shiftLower, alpha_host,
-      beta_host, Z_host, work_host, lanczosVecs_dev, work_dev, false);
+    status = lanczosRestart<index_type_t, value_type_t>(handle,
+                                                        n,
+                                                        *effIter,
+                                                        iter_new,
+                                                        &shiftUpper,
+                                                        &shiftLower,
+                                                        alpha_host,
+                                                        beta_host,
+                                                        Z_host,
+                                                        work_host,
+                                                        lanczosVecs_dev,
+                                                        work_dev,
+                                                        false);
     if (status) WARNING("error in Lanczos implicit restart");
     *effIter = iter_new;
 
@@ -1078,9 +1280,17 @@ int computeLargestEigenvectors(
 
     // Proceed with Lanczos method
 
-    status = performLanczosIteration<index_type_t, value_type_t>(
-      handle, A, effIter, maxIter_curr, *shift, tol * fabs(shiftLower),
-      reorthogonalize, alpha_host, beta_host, lanczosVecs_dev, work_dev);
+    status = performLanczosIteration<index_type_t, value_type_t>(handle,
+                                                                 A,
+                                                                 effIter,
+                                                                 maxIter_curr,
+                                                                 *shift,
+                                                                 tol * fabs(shiftLower),
+                                                                 reorthogonalize,
+                                                                 alpha_host,
+                                                                 beta_host,
+                                                                 lanczosVecs_dev,
+                                                                 work_dev);
     if (status) WARNING("error in Lanczos iteration");
     *totalIter += *effIter - iter_new;
   }
@@ -1090,15 +1300,18 @@ int computeLargestEigenvectors(
     WARNING("implicitly restarted Lanczos failed to converge");
   }
   for (int i = 0; i < restartIter; ++i) {
-    for (int j = 0; j < restartIter; ++j) Z_host[i * restartIter + j] = 0;
+    for (int j = 0; j < restartIter; ++j)
+      Z_host[i * restartIter + j] = 0;
   }
   // Solve tridiagonal system
-  memcpy(work_host + 2 * (*effIter), alpha_host,
-         (*effIter) * sizeof(value_type_t));
-  memcpy(work_host + 3 * (*effIter), beta_host,
-         (*effIter - 1) * sizeof(value_type_t));
-  Lapack<value_type_t>::steqr('I', *effIter, work_host + 2 * (*effIter),
-                              work_host + 3 * (*effIter), Z_host, *effIter,
+  memcpy(work_host + 2 * (*effIter), alpha_host, (*effIter) * sizeof(value_type_t));
+  memcpy(work_host + 3 * (*effIter), beta_host, (*effIter - 1) * sizeof(value_type_t));
+  Lapack<value_type_t>::steqr('I',
+                              *effIter,
+                              work_host + 2 * (*effIter),
+                              work_host + 3 * (*effIter),
+                              Z_host,
+                              *effIter,
                               work_host);
 
   // note: We need to pick the top nEigVecs eigenvalues
@@ -1123,36 +1336,52 @@ int computeLargestEigenvectors(
   //}
 
   // Obtain desired eigenvalues by applying shift
-  for (i = 0; i < *effIter; ++i) work_host[i + 2 * (*effIter)] -= *shift;
+  for (i = 0; i < *effIter; ++i)
+    work_host[i + 2 * (*effIter)] -= *shift;
 
   for (i = 0; i < top_eigenparis_idx_offset; ++i)
     work_host[i + 2 * (*effIter)] = 0;
 
   // Copy results to device memory
   // skip smallest eigenvalue if needed
-  CUDA_TRY(cudaMemcpyAsync(
-    eigVals_dev, work_host + 2 * (*effIter) + top_eigenparis_idx_offset,
-    nEigVecs * sizeof(value_type_t), cudaMemcpyHostToDevice, stream));
+  CUDA_TRY(cudaMemcpyAsync(eigVals_dev,
+                           work_host + 2 * (*effIter) + top_eigenparis_idx_offset,
+                           nEigVecs * sizeof(value_type_t),
+                           cudaMemcpyHostToDevice,
+                           stream));
 
   // skip smallest eigenvector if needed
   CUDA_TRY(cudaMemcpyAsync(work_dev,
                            Z_host + (top_eigenparis_idx_offset * (*effIter)),
                            (*effIter) * nEigVecs * sizeof(value_type_t),
-                           cudaMemcpyHostToDevice, stream));
+                           cudaMemcpyHostToDevice,
+                           stream));
 
   CHECK_CUDA(stream);
 
   // Convert eigenvectors from Lanczos basis to standard basis
-  CUBLAS_CHECK(cublasgemm(cublas_h, CUBLAS_OP_N, CUBLAS_OP_N, n, nEigVecs,
-                          *effIter, &one, lanczosVecs_dev, n, work_dev,
-                          *effIter, &zero, eigVecs_dev, n, stream));
+  CUBLAS_CHECK(cublasgemm(cublas_h,
+                          CUBLAS_OP_N,
+                          CUBLAS_OP_N,
+                          n,
+                          nEigVecs,
+                          *effIter,
+                          &one,
+                          lanczosVecs_dev,
+                          n,
+                          work_dev,
+                          *effIter,
+                          &zero,
+                          eigVecs_dev,
+                          n,
+                          stream));
 
   // Clean up and exit
   curandDestroyGenerator(randGen);
   return 0;
 }
 
-/**  
+/**
  *  @brief  Compute largest eigenvectors of symmetric matrix
  *    Computes eigenvalues and eigenvectors that are least
  *    positive. If matrix is positive definite or positive
@@ -1190,18 +1419,23 @@ int computeLargestEigenvectors(
  *  @return error flag.
  */
 template <typename index_type_t, typename value_type_t>
-int computeLargestEigenvectors(
-  handle_t const &handle, sparse_matrix_t<index_type_t, value_type_t> const &A,
-  index_type_t nEigVecs, index_type_t maxIter, index_type_t restartIter,
-  value_type_t tol, bool reorthogonalize, index_type_t &iter,
-  value_type_t *__restrict__ eigVals_dev,
-  value_type_t *__restrict__ eigVecs_dev, unsigned long long seed = 123456) {
+int computeLargestEigenvectors(handle_t const& handle,
+                               sparse_matrix_t<index_type_t, value_type_t> const& A,
+                               index_type_t nEigVecs,
+                               index_type_t maxIter,
+                               index_type_t restartIter,
+                               value_type_t tol,
+                               bool reorthogonalize,
+                               index_type_t& iter,
+                               value_type_t* __restrict__ eigVals_dev,
+                               value_type_t* __restrict__ eigVecs_dev,
+                               unsigned long long seed = 123456)
+{
   // Matrix dimension
   index_type_t n = A.nrows_;
 
   // Check that parameters are valid
-  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n,
-               "Invalid number of eigenvectors.");
+  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors.");
   RAFT_EXPECTS(restartIter > 0, "Invalid restartIter.");
   RAFT_EXPECTS(tol > 0, "Invalid tolerance.");
   RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter.");
@@ -1211,18 +1445,30 @@ int computeLargestEigenvectors(
   std::vector<value_type_t> alpha_host_v(restartIter);
   std::vector<value_type_t> beta_host_v(restartIter);
 
-  value_type_t *alpha_host = alpha_host_v.data();
-  value_type_t *beta_host = beta_host_v.data();
+  value_type_t* alpha_host = alpha_host_v.data();
+  value_type_t* beta_host  = beta_host_v.data();
 
   vector_t<value_type_t> lanczosVecs_dev(handle, n * (restartIter + 1));
   vector_t<value_type_t> work_dev(handle, (n + restartIter) * restartIter);
 
   // Perform Lanczos method
   index_type_t effIter;
-  int status = computeLargestEigenvectors(
-    handle, &A, nEigVecs, maxIter, restartIter, tol, reorthogonalize, &effIter,
-    &iter, alpha_host, beta_host, lanczosVecs_dev.raw(), work_dev.raw(),
-    eigVals_dev, eigVecs_dev, seed);
+  int status = computeLargestEigenvectors(handle,
+                                          &A,
+                                          nEigVecs,
+                                          maxIter,
+                                          restartIter,
+                                          tol,
+                                          reorthogonalize,
+                                          &effIter,
+                                          &iter,
+                                          alpha_host,
+                                          beta_host,
+                                          lanczosVecs_dev.raw(),
+                                          work_dev.raw(),
+                                          eigVals_dev,
+                                          eigVecs_dev,
+                                          seed);
 
   // Clean up and return
   return status;
diff --git a/cpp/include/raft/linalg/map.cuh b/cpp/include/raft/linalg/map.cuh
index aff08da2d3..200818fdc3 100644
--- a/cpp/include/raft/linalg/map.cuh
+++ b/cpp/include/raft/linalg/map.cuh
@@ -24,21 +24,18 @@
 namespace raft {
 namespace linalg {
 
-template <typename InType, typename OutType, typename MapOp, int TPB,
-          typename... Args>
-__global__ void mapKernel(OutType *out, size_t len, MapOp map, const InType *in,
-                          Args... args) {
+template <typename InType, typename OutType, typename MapOp, int TPB, typename... Args>
+__global__ void mapKernel(OutType* out, size_t len, MapOp map, const InType* in, Args... args)
+{
   auto idx = (threadIdx.x + (blockIdx.x * blockDim.x));
 
-  if (idx < len) {
-    out[idx] = map(in[idx], args[idx]...);
-  }
+  if (idx < len) { out[idx] = map(in[idx], args[idx]...); }
 }
 
-template <typename InType, typename OutType, typename MapOp, int TPB,
-          typename... Args>
-void mapImpl(OutType *out, size_t len, MapOp map, cudaStream_t stream,
-             const InType *in, Args... args) {
+template <typename InType, typename OutType, typename MapOp, int TPB, typename... Args>
+void mapImpl(
+  OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args)
+{
   const int nblks = raft::ceildiv(len, (size_t)TPB);
   mapKernel<InType, OutType, MapOp, TPB, Args...>
     <<<nblks, TPB, 0, stream>>>(out, len, map, in, args...);
@@ -60,12 +57,14 @@ void mapImpl(OutType *out, size_t len, MapOp map, cudaStream_t stream,
  * @param args additional input arrays
  */
 
-template <typename InType, typename MapOp, int TPB = 256, typename... Args,
+template <typename InType,
+          typename MapOp,
+          int TPB = 256,
+          typename... Args,
           typename OutType = InType>
-void map(OutType *out, size_t len, MapOp map, cudaStream_t stream,
-         const InType *in, Args... args) {
-  mapImpl<InType, OutType, MapOp, TPB, Args...>(out, len, map, stream, in,
-                                                args...);
+void map(OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args)
+{
+  mapImpl<InType, OutType, MapOp, TPB, Args...>(out, len, map, stream, in, args...);
 }
 
 }  // namespace linalg
diff --git a/cpp/include/raft/linalg/map_then_reduce.cuh b/cpp/include/raft/linalg/map_then_reduce.cuh
index f2f198670a..78a7017c5c 100644
--- a/cpp/include/raft/linalg/map_then_reduce.cuh
+++ b/cpp/include/raft/linalg/map_then_reduce.cuh
@@ -24,50 +24,66 @@
 namespace raft {
 namespace linalg {
 
-struct sum_tag {};
+struct sum_tag {
+};
 
 template <typename InType, typename OutType, int TPB>
-__device__ void reduce(OutType *out, const InType acc, sum_tag) {
+__device__ void reduce(OutType* out, const InType acc, sum_tag)
+{
   typedef cub::BlockReduce<InType, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   OutType tmp = BlockReduce(temp_storage).Sum(acc);
-  if (threadIdx.x == 0) {
-    raft::myAtomicAdd(out, tmp);
-  }
+  if (threadIdx.x == 0) { raft::myAtomicAdd(out, tmp); }
 }
 
 template <typename InType, typename OutType, int TPB, typename ReduceLambda>
-__device__ void reduce(OutType *out, const InType acc, ReduceLambda op) {
+__device__ void reduce(OutType* out, const InType acc, ReduceLambda op)
+{
   typedef cub::BlockReduce<InType, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   OutType tmp = BlockReduce(temp_storage).Reduce(acc, op);
-  if (threadIdx.x == 0) {
-    raft::myAtomicReduce(out, tmp, op);
-  }
+  if (threadIdx.x == 0) { raft::myAtomicReduce(out, tmp, op); }
 }
 
-template <typename InType, typename OutType, typename MapOp,
-          typename ReduceLambda, int TPB, typename... Args>
-__global__ void mapThenReduceKernel(OutType *out, size_t len, OutType neutral,
-                                    MapOp map, ReduceLambda op,
-                                    const InType *in, Args... args) {
+template <typename InType,
+          typename OutType,
+          typename MapOp,
+          typename ReduceLambda,
+          int TPB,
+          typename... Args>
+__global__ void mapThenReduceKernel(OutType* out,
+                                    size_t len,
+                                    OutType neutral,
+                                    MapOp map,
+                                    ReduceLambda op,
+                                    const InType* in,
+                                    Args... args)
+{
   OutType acc = neutral;
-  auto idx = (threadIdx.x + (blockIdx.x * blockDim.x));
+  auto idx    = (threadIdx.x + (blockIdx.x * blockDim.x));
 
-  if (idx < len) {
-    acc = map(in[idx], args[idx]...);
-  }
+  if (idx < len) { acc = map(in[idx], args[idx]...); }
 
   __syncthreads();
 
   reduce<InType, OutType, TPB>(out, acc, op);
 }
 
-template <typename InType, typename OutType, typename MapOp,
-          typename ReduceLambda, int TPB, typename... Args>
-void mapThenReduceImpl(OutType *out, size_t len, OutType neutral, MapOp map,
-                       ReduceLambda op, cudaStream_t stream, const InType *in,
-                       Args... args) {
+template <typename InType,
+          typename OutType,
+          typename MapOp,
+          typename ReduceLambda,
+          int TPB,
+          typename... Args>
+void mapThenReduceImpl(OutType* out,
+                       size_t len,
+                       OutType neutral,
+                       MapOp map,
+                       ReduceLambda op,
+                       cudaStream_t stream,
+                       const InType* in,
+                       Args... args)
+{
   raft::update_device(out, &neutral, 1, stream);
   const int nblks = raft::ceildiv(len, (size_t)TPB);
   mapThenReduceKernel<InType, OutType, MapOp, ReduceLambda, TPB, Args...>
@@ -89,10 +105,14 @@ void mapThenReduceImpl(OutType *out, size_t len, OutType neutral, MapOp map,
  * @param args additional input arrays
  */
 
-template <typename InType, typename MapOp, int TPB = 256, typename... Args,
+template <typename InType,
+          typename MapOp,
+          int TPB = 256,
+          typename... Args,
           typename OutType = InType>
-void mapThenSumReduce(OutType *out, size_t len, MapOp map, cudaStream_t stream,
-                      const InType *in, Args... args) {
+void mapThenSumReduce(
+  OutType* out, size_t len, MapOp map, cudaStream_t stream, const InType* in, Args... args)
+{
   mapThenReduceImpl<InType, OutType, MapOp, sum_tag, TPB, Args...>(
     out, len, (OutType)0, map, sum_tag(), stream, in, args...);
 }
@@ -115,11 +135,21 @@ void mapThenSumReduce(OutType *out, size_t len, MapOp map, cudaStream_t stream,
  * @param args additional input arrays
  */
 
-template <typename InType, typename MapOp, typename ReduceLambda, int TPB = 256,
-          typename OutType = InType, typename... Args>
-void mapThenReduce(OutType *out, size_t len, OutType neutral, MapOp map,
-                   ReduceLambda op, cudaStream_t stream, const InType *in,
-                   Args... args) {
+template <typename InType,
+          typename MapOp,
+          typename ReduceLambda,
+          int TPB          = 256,
+          typename OutType = InType,
+          typename... Args>
+void mapThenReduce(OutType* out,
+                   size_t len,
+                   OutType neutral,
+                   MapOp map,
+                   ReduceLambda op,
+                   cudaStream_t stream,
+                   const InType* in,
+                   Args... args)
+{
   mapThenReduceImpl<InType, OutType, MapOp, ReduceLambda, TPB, Args...>(
     out, len, neutral, map, op, stream, in, args...);
 }
diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh
index 93f2d746fa..81c1919b2e 100644
--- a/cpp/include/raft/linalg/matrix_vector_op.cuh
+++ b/cpp/include/raft/linalg/matrix_vector_op.cuh
@@ -27,19 +27,24 @@ namespace {
 template <size_t VecBytes>
 struct AlignedAccess {
   template <typename T>
-  static inline bool test(const T *matrix, size_t strideBytes) {
-    return Pow2<VecBytes>::isAligned(matrix) &&
-           Pow2<VecBytes>::isAligned(strideBytes) &&
+  static inline bool test(const T* matrix, size_t strideBytes)
+  {
+    return Pow2<VecBytes>::isAligned(matrix) && Pow2<VecBytes>::isAligned(strideBytes) &&
            Pow2<sizeof(T)>::isAligned(VecBytes);
   }
 };
 };  // namespace
 
 template <typename Type, int veclen_, typename Lambda, typename IdxType>
-__global__ void matrixVectorOpKernel(Type *out, const Type *matrix,
-                                     const Type *vector, IdxType D, IdxType N,
-                                     bool rowMajor, bool bcastAlongRows,
-                                     Lambda op) {
+__global__ void matrixVectorOpKernel(Type* out,
+                                     const Type* matrix,
+                                     const Type* vector,
+                                     IdxType D,
+                                     IdxType N,
+                                     bool rowMajor,
+                                     bool bcastAlongRows,
+                                     Lambda op)
+{
   typedef TxN_t<Type, veclen_> VecType;
   IdxType len = N * D;
   IdxType idx = threadIdx.x;
@@ -70,17 +75,21 @@ __global__ void matrixVectorOpKernel(Type *out, const Type *matrix,
   mat.store(out, idx);
 }
 
-template <typename Type, int veclen_, typename Lambda, typename IdxType,
-          int TPB>
-void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec,
-                        IdxType D, IdxType N, bool rowMajor,
-                        bool bcastAlongRows, Lambda op, cudaStream_t stream) {
-  IdxType len = N * D;
-  IdxType nblks =
-    raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB);
+template <typename Type, int veclen_, typename Lambda, typename IdxType, int TPB>
+void matrixVectorOpImpl(Type* out,
+                        const Type* matrix,
+                        const Type* vec,
+                        IdxType D,
+                        IdxType N,
+                        bool rowMajor,
+                        bool bcastAlongRows,
+                        Lambda op,
+                        cudaStream_t stream)
+{
+  IdxType len   = N * D;
+  IdxType nblks = raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB);
   matrixVectorOpKernel<Type, veclen_, Lambda, IdxType>
-    <<<nblks, TPB, 0, stream>>>(out, matrix, vec, D, N, rowMajor,
-                                bcastAlongRows, op);
+    <<<nblks, TPB, 0, stream>>>(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -108,10 +117,17 @@ void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec,
  * @param stream cuda stream where to launch work
  */
 template <typename Type, typename Lambda, typename IdxType = int, int TPB = 256>
-void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D,
-                    IdxType N, bool rowMajor, bool bcastAlongRows, Lambda op,
-                    cudaStream_t stream) {
-  IdxType stride = rowMajor ? D : N;
+void matrixVectorOp(Type* out,
+                    const Type* matrix,
+                    const Type* vec,
+                    IdxType D,
+                    IdxType N,
+                    bool rowMajor,
+                    bool bcastAlongRows,
+                    Lambda op,
+                    cudaStream_t stream)
+{
+  IdxType stride      = rowMajor ? D : N;
   size_t stride_bytes = stride * sizeof(Type);
 
   if (AlignedAccess<16>::test(matrix, stride_bytes)) {
@@ -138,10 +154,16 @@ void matrixVectorOp(Type *out, const Type *matrix, const Type *vec, IdxType D,
 ///@todo: come up with a cleaner interface to support these cases in future!
 
 template <typename Type, int veclen_, typename Lambda, typename IdxType>
-__global__ void matrixVectorOpKernel(Type *out, const Type *matrix,
-                                     const Type *vector1, const Type *vector2,
-                                     IdxType D, IdxType N, bool rowMajor,
-                                     bool bcastAlongRows, Lambda op) {
+__global__ void matrixVectorOpKernel(Type* out,
+                                     const Type* matrix,
+                                     const Type* vector1,
+                                     const Type* vector2,
+                                     IdxType D,
+                                     IdxType N,
+                                     bool rowMajor,
+                                     bool bcastAlongRows,
+                                     Lambda op)
+{
   typedef TxN_t<Type, veclen_> VecType;
   IdxType len = N * D;
   IdxType idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * VecType::Ratio;
@@ -174,15 +196,21 @@ __global__ void matrixVectorOpKernel(Type *out, const Type *matrix,
   mat.store(out, idx);
 }
 
-template <typename Type, int veclen_, typename Lambda, typename IdxType,
-          int TPB>
-void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec1,
-                        const Type *vec2, IdxType D, IdxType N, bool rowMajor,
-                        bool bcastAlongRows, Lambda op, cudaStream_t stream) {
+template <typename Type, int veclen_, typename Lambda, typename IdxType, int TPB>
+void matrixVectorOpImpl(Type* out,
+                        const Type* matrix,
+                        const Type* vec1,
+                        const Type* vec2,
+                        IdxType D,
+                        IdxType N,
+                        bool rowMajor,
+                        bool bcastAlongRows,
+                        Lambda op,
+                        cudaStream_t stream)
+{
   IdxType nblks = raft::ceildiv(N * D, (IdxType)TPB);
   matrixVectorOpKernel<Type, veclen_, Lambda, IdxType>
-    <<<nblks, TPB, 0, stream>>>(out, matrix, vec1, vec2, D, N, rowMajor,
-                                bcastAlongRows, op);
+    <<<nblks, TPB, 0, stream>>>(out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -211,10 +239,18 @@ void matrixVectorOpImpl(Type *out, const Type *matrix, const Type *vec1,
  * @param stream cuda stream where to launch work
  */
 template <typename Type, typename Lambda, typename IdxType = int, int TPB = 256>
-void matrixVectorOp(Type *out, const Type *matrix, const Type *vec1,
-                    const Type *vec2, IdxType D, IdxType N, bool rowMajor,
-                    bool bcastAlongRows, Lambda op, cudaStream_t stream) {
-  IdxType stride = rowMajor ? D : N;
+void matrixVectorOp(Type* out,
+                    const Type* matrix,
+                    const Type* vec1,
+                    const Type* vec2,
+                    IdxType D,
+                    IdxType N,
+                    bool rowMajor,
+                    bool bcastAlongRows,
+                    Lambda op,
+                    cudaStream_t stream)
+{
+  IdxType stride      = rowMajor ? D : N;
   size_t stride_bytes = stride * sizeof(Type);
 
   if (AlignedAccess<16>::test(matrix, stride_bytes)) {
diff --git a/cpp/include/raft/linalg/mean_squared_error.cuh b/cpp/include/raft/linalg/mean_squared_error.cuh
index 9d1538c172..a3fcc5bac6 100644
--- a/cpp/include/raft/linalg/mean_squared_error.cuh
+++ b/cpp/include/raft/linalg/mean_squared_error.cuh
@@ -24,7 +24,7 @@ namespace linalg {
 /**
  * @brief CUDA version mean squared error function mean((A-B)**2)
  * @tparam math_t data-type upon which the math operation will be performed
- * @tparam TPB threads-per-block 
+ * @tparam TPB threads-per-block
  * @param out the output mean squared error value (assumed to be a device pointer)
  * @param A input array (assumed to be a device pointer)
  * @param B input array (assumed to be a device pointer)
@@ -33,14 +33,14 @@ namespace linalg {
  * @param stream cuda-stream where to launch this kernel
  */
 template <typename math_t, int TPB = 256>
-void meanSquaredError(math_t* out, const math_t* A, const math_t* B, size_t len,
-                      math_t weight, cudaStream_t stream) {
+void meanSquaredError(
+  math_t* out, const math_t* A, const math_t* B, size_t len, math_t weight, cudaStream_t stream)
+{
   auto sq_diff = [len, weight] __device__(const math_t a, const math_t b) {
     math_t diff = a - b;
     return diff * diff * weight / len;
   };
-  mapThenSumReduce<math_t, decltype(sq_diff), TPB>(out, len, sq_diff, stream, A,
-                                                   B);
+  mapThenSumReduce<math_t, decltype(sq_diff), TPB>(out, len, sq_diff, stream, A, B);
 }
 
 };  // end namespace linalg
diff --git a/cpp/include/raft/linalg/multiply.cuh b/cpp/include/raft/linalg/multiply.cuh
index ce948c927d..53d57ecd00 100644
--- a/cpp/include/raft/linalg/multiply.cuh
+++ b/cpp/include/raft/linalg/multiply.cuh
@@ -33,11 +33,10 @@ namespace linalg {
  * @{
  */
 template <typename math_t, typename IdxType = int>
-void multiplyScalar(math_t *out, const math_t *in, math_t scalar, IdxType len,
-                    cudaStream_t stream) {
+void multiplyScalar(math_t* out, const math_t* in, math_t scalar, IdxType len, cudaStream_t stream)
+{
   unaryOp(
-    out, in, len, [scalar] __device__(math_t in) { return in * scalar; },
-    stream);
+    out, in, len, [scalar] __device__(math_t in) { return in * scalar; }, stream);
 }
 /** @} */
 
diff --git a/cpp/include/raft/linalg/norm.cuh b/cpp/include/raft/linalg/norm.cuh
index 64930a7123..82558c8023 100644
--- a/cpp/include/raft/linalg/norm.cuh
+++ b/cpp/include/raft/linalg/norm.cuh
@@ -44,22 +44,46 @@ enum NormType { L1Norm = 0, L2Norm };
  * @param stream cuda stream where to launch work
  * @param fin_op the final lambda op
  */
-template <typename Type, typename IdxType = int,
-          typename Lambda = raft::Nop<Type, IdxType>>
-void rowNorm(Type *dots, const Type *data, IdxType D, IdxType N, NormType type,
-             bool rowMajor, cudaStream_t stream,
-             Lambda fin_op = raft::Nop<Type, IdxType>()) {
+template <typename Type, typename IdxType = int, typename Lambda = raft::Nop<Type, IdxType>>
+void rowNorm(Type* dots,
+             const Type* data,
+             IdxType D,
+             IdxType N,
+             NormType type,
+             bool rowMajor,
+             cudaStream_t stream,
+             Lambda fin_op = raft::Nop<Type, IdxType>())
+{
   switch (type) {
     case L1Norm:
-      reduce(dots, data, D, N, (Type)0, rowMajor, true, stream, false,
-             raft::L1Op<Type, IdxType>(), raft::Sum<Type>(), fin_op);
+      reduce(dots,
+             data,
+             D,
+             N,
+             (Type)0,
+             rowMajor,
+             true,
+             stream,
+             false,
+             raft::L1Op<Type, IdxType>(),
+             raft::Sum<Type>(),
+             fin_op);
       break;
     case L2Norm:
-      reduce(dots, data, D, N, (Type)0, rowMajor, true, stream, false,
-             raft::L2Op<Type>(), raft::Sum<Type>(), fin_op);
+      reduce(dots,
+             data,
+             D,
+             N,
+             (Type)0,
+             rowMajor,
+             true,
+             stream,
+             false,
+             raft::L2Op<Type>(),
+             raft::Sum<Type>(),
+             fin_op);
       break;
-    default:
-      ASSERT(false, "Invalid norm type passed! [%d]", type);
+    default: ASSERT(false, "Invalid norm type passed! [%d]", type);
   };
 }
 
@@ -77,22 +101,46 @@ void rowNorm(Type *dots, const Type *data, IdxType D, IdxType N, NormType type,
  * @param stream cuda stream where to launch work
  * @param fin_op the final lambda op
  */
-template <typename Type, typename IdxType = int,
-          typename Lambda = raft::Nop<Type, IdxType>>
-void colNorm(Type *dots, const Type *data, IdxType D, IdxType N, NormType type,
-             bool rowMajor, cudaStream_t stream,
-             Lambda fin_op = raft::Nop<Type, IdxType>()) {
+template <typename Type, typename IdxType = int, typename Lambda = raft::Nop<Type, IdxType>>
+void colNorm(Type* dots,
+             const Type* data,
+             IdxType D,
+             IdxType N,
+             NormType type,
+             bool rowMajor,
+             cudaStream_t stream,
+             Lambda fin_op = raft::Nop<Type, IdxType>())
+{
   switch (type) {
     case L1Norm:
-      reduce(dots, data, D, N, (Type)0, rowMajor, false, stream, false,
-             raft::L1Op<Type, IdxType>(), raft::Sum<Type>(), fin_op);
+      reduce(dots,
+             data,
+             D,
+             N,
+             (Type)0,
+             rowMajor,
+             false,
+             stream,
+             false,
+             raft::L1Op<Type, IdxType>(),
+             raft::Sum<Type>(),
+             fin_op);
       break;
     case L2Norm:
-      reduce(dots, data, D, N, (Type)0, rowMajor, false, stream, false,
-             raft::L2Op<Type, IdxType>(), raft::Sum<Type>(), fin_op);
+      reduce(dots,
+             data,
+             D,
+             N,
+             (Type)0,
+             rowMajor,
+             false,
+             stream,
+             false,
+             raft::L2Op<Type, IdxType>(),
+             raft::Sum<Type>(),
+             fin_op);
       break;
-    default:
-      ASSERT(false, "Invalid norm type passed! [%d]", type);
+    default: ASSERT(false, "Invalid norm type passed! [%d]", type);
   };
 }
 
diff --git a/cpp/include/raft/linalg/qr.cuh b/cpp/include/raft/linalg/qr.cuh
index a50448acbe..c85cfda934 100644
--- a/cpp/include/raft/linalg/qr.cuh
+++ b/cpp/include/raft/linalg/qr.cuh
@@ -41,14 +41,18 @@ namespace linalg {
  * @{
  */
 template <typename math_t>
-void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q,
-            int n_rows, int n_cols, cudaStream_t stream) {
+void qrGetQ(const raft::handle_t& handle,
+            const math_t* M,
+            math_t* Q,
+            int n_rows,
+            int n_cols,
+            cudaStream_t stream)
+{
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
 
   int m = n_rows, n = n_cols;
   int k = min(m, n);
-  CUDA_CHECK(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n,
-                             cudaMemcpyDeviceToDevice, stream));
+  CUDA_CHECK(cudaMemcpyAsync(Q, M, sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream));
 
   rmm::device_uvector<math_t> tau(k, stream);
   CUDA_CHECK(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * k, stream));
@@ -58,19 +62,16 @@ void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q,
 
   CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, m, n, Q, m, &Lwork));
   rmm::device_uvector<math_t> workspace(Lwork, stream);
-  CUSOLVER_CHECK(cusolverDngeqrf(cusolverH, m, n, Q, m, tau.data(),
-                                 workspace.data(), Lwork, devInfo.data(),
-                                 stream));
+  CUSOLVER_CHECK(cusolverDngeqrf(
+    cusolverH, m, n, Q, m, tau.data(), workspace.data(), Lwork, devInfo.data(), stream));
   /// @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail.
 #if defined(CUDART_VERSION) && CUDART_VERSION <= 9020
   CUDA_CHECK(cudaDeviceSynchronize());
 #endif
-  CUSOLVER_CHECK(
-    cusolverDnorgqr_bufferSize(cusolverH, m, n, k, Q, m, tau.data(), &Lwork));
+  CUSOLVER_CHECK(cusolverDnorgqr_bufferSize(cusolverH, m, n, k, Q, m, tau.data(), &Lwork));
   workspace.resize(Lwork, stream);
-  CUSOLVER_CHECK(cusolverDnorgqr(cusolverH, m, n, k, Q, m, tau.data(),
-                                 workspace.data(), Lwork, devInfo.data(),
-                                 stream));
+  CUSOLVER_CHECK(cusolverDnorgqr(
+    cusolverH, m, n, k, Q, m, tau.data(), workspace.data(), Lwork, devInfo.data(), stream));
 }
 
 /**
@@ -84,29 +85,40 @@ void qrGetQ(const raft::handle_t &handle, const math_t *M, math_t *Q,
  * @param stream cuda stream
  */
 template <typename math_t>
-void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R,
-             int n_rows, int n_cols, cudaStream_t stream) {
+void qrGetQR(const raft::handle_t& handle,
+             math_t* M,
+             math_t* Q,
+             math_t* R,
+             int n_rows,
+             int n_cols,
+             cudaStream_t stream)
+{
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
 
   int m = n_rows, n = n_cols;
   rmm::device_uvector<math_t> R_full(m * n, stream);
   rmm::device_uvector<math_t> tau(min(m, n), stream);
-  CUDA_CHECK(
-    cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * min(m, n), stream));
+  CUDA_CHECK(cudaMemsetAsync(tau.data(), 0, sizeof(math_t) * min(m, n), stream));
   int R_full_nrows = m, R_full_ncols = n;
-  CUDA_CHECK(cudaMemcpyAsync(R_full.data(), M, sizeof(math_t) * m * n,
-                             cudaMemcpyDeviceToDevice, stream));
+  CUDA_CHECK(
+    cudaMemcpyAsync(R_full.data(), M, sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream));
 
   int Lwork;
   rmm::device_scalar<int> devInfo(stream);
 
-  CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(cusolverH, R_full_nrows,
-                                            R_full_ncols, R_full.data(),
-                                            R_full_nrows, &Lwork));
+  CUSOLVER_CHECK(cusolverDngeqrf_bufferSize(
+    cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows, &Lwork));
   rmm::device_uvector<math_t> workspace(Lwork, stream);
-  CUSOLVER_CHECK(cusolverDngeqrf(
-    cusolverH, R_full_nrows, R_full_ncols, R_full.data(), R_full_nrows,
-    tau.data(), workspace.data(), Lwork, devInfo.data(), stream));
+  CUSOLVER_CHECK(cusolverDngeqrf(cusolverH,
+                                 R_full_nrows,
+                                 R_full_ncols,
+                                 R_full.data(),
+                                 R_full_nrows,
+                                 tau.data(),
+                                 workspace.data(),
+                                 Lwork,
+                                 devInfo.data(),
+                                 stream));
   // @note in v9.2, without deviceSynchronize *SquareMatrixNorm* ml-prims unit-tests fail.
 #if defined(CUDART_VERSION) && CUDART_VERSION <= 9020
   CUDA_CHECK(cudaDeviceSynchronize());
@@ -114,17 +126,24 @@ void qrGetQR(const raft::handle_t &handle, math_t *M, math_t *Q, math_t *R,
 
   raft::matrix::copyUpperTriangular(R_full.data(), R, m, n, stream);
 
-  CUDA_CHECK(cudaMemcpyAsync(Q, R_full.data(), sizeof(math_t) * m * n,
-                             cudaMemcpyDeviceToDevice, stream));
+  CUDA_CHECK(
+    cudaMemcpyAsync(Q, R_full.data(), sizeof(math_t) * m * n, cudaMemcpyDeviceToDevice, stream));
   int Q_nrows = m, Q_ncols = n;
 
-  CUSOLVER_CHECK(cusolverDnorgqr_bufferSize(cusolverH, Q_nrows, Q_ncols,
-                                            min(Q_ncols, Q_nrows), Q, Q_nrows,
-                                            tau.data(), &Lwork));
+  CUSOLVER_CHECK(cusolverDnorgqr_bufferSize(
+    cusolverH, Q_nrows, Q_ncols, min(Q_ncols, Q_nrows), Q, Q_nrows, tau.data(), &Lwork));
   workspace.resize(Lwork, stream);
-  CUSOLVER_CHECK(cusolverDnorgqr(
-    cusolverH, Q_nrows, Q_ncols, min(Q_ncols, Q_nrows), Q, Q_nrows, tau.data(),
-    workspace.data(), Lwork, devInfo.data(), stream));
+  CUSOLVER_CHECK(cusolverDnorgqr(cusolverH,
+                                 Q_nrows,
+                                 Q_ncols,
+                                 min(Q_ncols, Q_nrows),
+                                 Q,
+                                 Q_nrows,
+                                 tau.data(),
+                                 workspace.data(),
+                                 Lwork,
+                                 devInfo.data(),
+                                 stream));
 }
 /** @} */
 
diff --git a/cpp/include/raft/linalg/reduce.cuh b/cpp/include/raft/linalg/reduce.cuh
index d39577bbdd..693a797db9 100644
--- a/cpp/include/raft/linalg/reduce.cuh
+++ b/cpp/include/raft/linalg/reduce.cuh
@@ -52,28 +52,33 @@ namespace linalg {
  * @param reduce_op binary reduction operation
  * @param final_op elementwise operation to apply before storing results
  */
-template <typename InType, typename OutType = InType, typename IdxType = int,
-          typename MainLambda = raft::Nop<InType, IdxType>,
+template <typename InType,
+          typename OutType      = InType,
+          typename IdxType      = int,
+          typename MainLambda   = raft::Nop<InType, IdxType>,
           typename ReduceLambda = raft::Sum<OutType>,
-          typename FinalLambda = raft::Nop<OutType>>
-void reduce(OutType *dots, const InType *data, int D, int N, OutType init,
-            bool rowMajor, bool alongRows, cudaStream_t stream,
-            bool inplace = false,
-            MainLambda main_op = raft::Nop<InType, IdxType>(),
+          typename FinalLambda  = raft::Nop<OutType>>
+void reduce(OutType* dots,
+            const InType* data,
+            int D,
+            int N,
+            OutType init,
+            bool rowMajor,
+            bool alongRows,
+            cudaStream_t stream,
+            bool inplace           = false,
+            MainLambda main_op     = raft::Nop<InType, IdxType>(),
             ReduceLambda reduce_op = raft::Sum<OutType>(),
-            FinalLambda final_op = raft::Nop<OutType>()) {
+            FinalLambda final_op   = raft::Nop<OutType>())
+{
   if (rowMajor && alongRows) {
-    coalescedReduction(dots, data, D, N, init, stream, inplace, main_op,
-                       reduce_op, final_op);
+    coalescedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
   } else if (rowMajor && !alongRows) {
-    stridedReduction(dots, data, D, N, init, stream, inplace, main_op,
-                     reduce_op, final_op);
+    stridedReduction(dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
   } else if (!rowMajor && alongRows) {
-    stridedReduction(dots, data, N, D, init, stream, inplace, main_op,
-                     reduce_op, final_op);
+    stridedReduction(dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op);
   } else {
-    coalescedReduction(dots, data, N, D, init, stream, inplace, main_op,
-                       reduce_op, final_op);
+    coalescedReduction(dots, data, N, D, init, stream, inplace, main_op, reduce_op, final_op);
   }
 }
 
diff --git a/cpp/include/raft/linalg/strided_reduction.cuh b/cpp/include/raft/linalg/strided_reduction.cuh
index bba652e137..f931c976fd 100644
--- a/cpp/include/raft/linalg/strided_reduction.cuh
+++ b/cpp/include/raft/linalg/strided_reduction.cuh
@@ -28,14 +28,15 @@ namespace linalg {
 // of the matrix, i.e. reduce along columns for row major or reduce along rows
 // for column major layout
 template <typename Type, typename MainLambda>
-__global__ void stridedSummationKernel(Type *dots, const Type *data, int D,
-                                       int N, Type init, MainLambda main_op) {
+__global__ void stridedSummationKernel(
+  Type* dots, const Type* data, int D, int N, Type init, MainLambda main_op)
+{
   // Thread reduction
   Type thread_data = Type(init);
-  int colStart = blockIdx.x * blockDim.x + threadIdx.x;
+  int colStart     = blockIdx.x * blockDim.x + threadIdx.x;
   if (colStart < D) {
     int rowStart = blockIdx.y * blockDim.y + threadIdx.y;
-    int stride = blockDim.y * gridDim.y;
+    int stride   = blockDim.y * gridDim.y;
     for (int j = rowStart; j < N; j += stride) {
       int idx = colStart + j * D;
       thread_data += main_op(data[idx], j);
@@ -44,8 +45,8 @@ __global__ void stridedSummationKernel(Type *dots, const Type *data, int D,
 
   // Block reduction
   extern __shared__ char tmp[];  // One element per thread in block
-  Type *temp = (Type *)tmp;      // Cast to desired type
-  int myidx = threadIdx.x + blockDim.x * threadIdx.y;
+  Type* temp  = (Type*)tmp;      // Cast to desired type
+  int myidx   = threadIdx.x + blockDim.x * threadIdx.y;
   temp[myidx] = thread_data;
   __syncthreads();
   for (int j = blockDim.y / 2; j > 0; j /= 2) {
@@ -54,24 +55,31 @@ __global__ void stridedSummationKernel(Type *dots, const Type *data, int D,
   }
 
   // Grid reduction
-  if ((colStart < D) && (threadIdx.y == 0))
-    raft::myAtomicAdd(dots + colStart, temp[myidx]);
+  if ((colStart < D) && (threadIdx.y == 0)) raft::myAtomicAdd(dots + colStart, temp[myidx]);
 }
 
 // Kernel to perform reductions along the strided dimension
 // of the matrix, i.e. reduce along columns for row major or reduce along rows
 // for column major layout
-template <typename InType, typename OutType, typename IdxType,
-          typename MainLambda, typename ReduceLambda>
-__global__ void stridedReductionKernel(OutType *dots, const InType *data, int D,
-                                       int N, OutType init, MainLambda main_op,
-                                       ReduceLambda reduce_op) {
+template <typename InType,
+          typename OutType,
+          typename IdxType,
+          typename MainLambda,
+          typename ReduceLambda>
+__global__ void stridedReductionKernel(OutType* dots,
+                                       const InType* data,
+                                       int D,
+                                       int N,
+                                       OutType init,
+                                       MainLambda main_op,
+                                       ReduceLambda reduce_op)
+{
   // Thread reduction
   OutType thread_data = init;
-  IdxType colStart = blockIdx.x * blockDim.x + threadIdx.x;
+  IdxType colStart    = blockIdx.x * blockDim.x + threadIdx.x;
   if (colStart < D) {
     IdxType rowStart = blockIdx.y * blockDim.y + threadIdx.y;
-    IdxType stride = blockDim.y * gridDim.y;
+    IdxType stride   = blockDim.y * gridDim.y;
     for (IdxType j = rowStart; j < N; j += stride) {
       IdxType idx = colStart + j * D;
       thread_data = reduce_op(thread_data, main_op(data[idx], j));
@@ -79,14 +87,13 @@ __global__ void stridedReductionKernel(OutType *dots, const InType *data, int D,
   }
 
   // Block reduction
-  extern __shared__ char tmp[];  // One element per thread in block
-  auto *temp = (OutType *)tmp;   // Cast to desired type
+  extern __shared__ char tmp[];   // One element per thread in block
+  auto* temp    = (OutType*)tmp;  // Cast to desired type
   IdxType myidx = threadIdx.x + ((IdxType)blockDim.x * (IdxType)threadIdx.y);
-  temp[myidx] = thread_data;
+  temp[myidx]   = thread_data;
   __syncthreads();
   for (int j = blockDim.y / 2; j > 0; j /= 2) {
-    if (threadIdx.y < j)
-      temp[myidx] = reduce_op(temp[myidx], temp[myidx + j * blockDim.x]);
+    if (threadIdx.y < j) temp[myidx] = reduce_op(temp[myidx], temp[myidx + j * blockDim.x]);
     __syncthreads();
   }
 
@@ -122,15 +129,23 @@ __global__ void stridedReductionKernel(OutType *dots, const InType *data, int D,
  * @param inplace reduction result added inplace or overwrites old values?
  * @param stream cuda stream where to launch work
  */
-template <typename InType, typename OutType = InType, typename IdxType = int,
-          typename MainLambda = raft::Nop<InType, IdxType>,
+template <typename InType,
+          typename OutType      = InType,
+          typename IdxType      = int,
+          typename MainLambda   = raft::Nop<InType, IdxType>,
           typename ReduceLambda = raft::Sum<OutType>,
-          typename FinalLambda = raft::Nop<OutType>>
-void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N,
-                      OutType init, cudaStream_t stream, bool inplace = false,
-                      MainLambda main_op = raft::Nop<InType, IdxType>(),
+          typename FinalLambda  = raft::Nop<OutType>>
+void stridedReduction(OutType* dots,
+                      const InType* data,
+                      IdxType D,
+                      IdxType N,
+                      OutType init,
+                      cudaStream_t stream,
+                      bool inplace           = false,
+                      MainLambda main_op     = raft::Nop<InType, IdxType>(),
                       ReduceLambda reduce_op = raft::Sum<OutType>(),
-                      FinalLambda final_op = raft::Nop<OutType>()) {
+                      FinalLambda final_op   = raft::Nop<OutType>())
+{
   ///@todo: this extra should go away once we have eliminated the need
   /// for atomics in stridedKernel (redesign for this is already underway)
   if (!inplace)
@@ -140,7 +155,7 @@ void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N,
   // Arbitrary numbers for now, probably need to tune
   const dim3 thrds(32, 16);
   IdxType elemsPerThread = raft::ceildiv(N, (IdxType)thrds.y);
-  elemsPerThread = (elemsPerThread > 8) ? 8 : elemsPerThread;
+  elemsPerThread         = (elemsPerThread > 8) ? 8 : elemsPerThread;
   const dim3 nblks(raft::ceildiv(D, (IdxType)thrds.x),
                    raft::ceildiv(N, (IdxType)thrds.y * elemsPerThread));
   const size_t shmemSize = sizeof(OutType) * thrds.x * thrds.y;
@@ -153,8 +168,7 @@ void stridedReduction(OutType *dots, const InType *data, IdxType D, IdxType N,
       <<<nblks, thrds, shmemSize, stream>>>(dots, data, D, N, init, main_op);
   else
     stridedReductionKernel<InType, OutType, IdxType>
-      <<<nblks, thrds, shmemSize, stream>>>(dots, data, D, N, init, main_op,
-                                            reduce_op);
+      <<<nblks, thrds, shmemSize, stream>>>(dots, data, D, N, init, main_op, reduce_op);
 
   ///@todo: this complication should go away once we have eliminated the need
   /// for atomics in stridedKernel (redesign for this is already underway)
diff --git a/cpp/include/raft/linalg/subtract.cuh b/cpp/include/raft/linalg/subtract.cuh
index 882c105689..43060d0818 100644
--- a/cpp/include/raft/linalg/subtract.cuh
+++ b/cpp/include/raft/linalg/subtract.cuh
@@ -38,8 +38,8 @@ namespace linalg {
  * @param stream cuda stream where to launch work
  */
 template <typename InT, typename OutT = InT, typename IdxType = int>
-void subtractScalar(OutT *out, const InT *in, InT scalar, IdxType len,
-                    cudaStream_t stream) {
+void subtractScalar(OutT* out, const InT* in, InT scalar, IdxType len, cudaStream_t stream)
+{
   auto op = [scalar] __device__(InT in) { return OutT(in - scalar); };
   unaryOp<InT, decltype(op), IdxType, OutT>(out, in, len, op, stream);
 }
@@ -58,24 +58,25 @@ void subtractScalar(OutT *out, const InT *in, InT scalar, IdxType len,
  * @param stream cuda stream where to launch work
  */
 template <typename InT, typename OutT = InT, typename IdxType = int>
-void subtract(OutT *out, const InT *in1, const InT *in2, IdxType len,
-              cudaStream_t stream) {
+void subtract(OutT* out, const InT* in1, const InT* in2, IdxType len, cudaStream_t stream)
+{
   auto op = [] __device__(InT a, InT b) { return OutT(a - b); };
   binaryOp<InT, decltype(op), OutT, IdxType>(out, in1, in2, len, op, stream);
 }
 
 template <class math_t, typename IdxType>
-__global__ void subtract_dev_scalar_kernel(math_t *outDev, const math_t *inDev,
-                                           const math_t *singleScalarDev,
-                                           IdxType len) {
-  //TODO: kernel do not use shared memory in current implementation
+__global__ void subtract_dev_scalar_kernel(math_t* outDev,
+                                           const math_t* inDev,
+                                           const math_t* singleScalarDev,
+                                           IdxType len)
+{
+  // TODO: kernel does not use shared memory in the current implementation
   int i = ((IdxType)blockIdx.x * (IdxType)blockDim.x) + threadIdx.x;
-  if (i < len) {
-    outDev[i] = inDev[i] - *singleScalarDev;
-  }
+  if (i < len) { outDev[i] = inDev[i] - *singleScalarDev; }
 }
 
-/** Substract single value pointed by singleScalarDev parameter in device memory from inDev[i] and write result to outDev[i]
+/** Subtract the single value pointed to by singleScalarDev in device memory from inDev[i] and
+ * write the result to outDev[i]
  * @tparam math_t data-type upon which the math operation will be performed
  * @tparam IdxType Integer type used to for addressing
  * @param outDev the output buffer
@@ -86,9 +87,12 @@ __global__ void subtract_dev_scalar_kernel(math_t *outDev, const math_t *inDev,
  * @remark block size has not been tuned
  */
 template <typename math_t, typename IdxType = int, int TPB = 256>
-void subtractDevScalar(math_t *outDev, const math_t *inDev,
-                       const math_t *singleScalarDev, IdxType len,
-                       cudaStream_t stream) {
+void subtractDevScalar(math_t* outDev,
+                       const math_t* inDev,
+                       const math_t* singleScalarDev,
+                       IdxType len,
+                       cudaStream_t stream)
+{
   // Just for the note - there is no way to express such operation with cuBLAS in effective way
   // https://stackoverflow.com/questions/14051064/add-scalar-to-vector-in-blas-cublas-cuda
   const IdxType nblks = raft::ceildiv(len, (IdxType)TPB);
diff --git a/cpp/include/raft/linalg/svd.cuh b/cpp/include/raft/linalg/svd.cuh
index 2315920689..e14a5b6a50 100644
--- a/cpp/include/raft/linalg/svd.cuh
+++ b/cpp/include/raft/linalg/svd.cuh
@@ -51,12 +51,20 @@ namespace linalg {
 // TODO: couldn't template this function due to cusolverDnSgesvd and
 // cusolverSnSgesvd. Check if there is any other way.
 template <typename T>
-void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols,
-           T *sing_vals, T *left_sing_vecs, T *right_sing_vecs,
-           bool trans_right, bool gen_left_vec, bool gen_right_vec,
-           cudaStream_t stream) {
+void svdQR(const raft::handle_t& handle,
+           T* in,
+           int n_rows,
+           int n_cols,
+           T* sing_vals,
+           T* left_sing_vecs,
+           T* right_sing_vecs,
+           bool trans_right,
+           bool gen_left_vec,
+           bool gen_right_vec,
+           cudaStream_t stream)
+{
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
-  cublasHandle_t cublasH = handle.get_cublas_handle();
+  cublasHandle_t cublasH       = handle.get_cublas_handle();
 
 #if CUDART_VERSION >= 10010 && CUDART_VERSION < 11000
   // 46340: sqrt of max int value
@@ -71,14 +79,13 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols,
   const int n = n_cols;
 
   rmm::device_scalar<int> devInfo(stream);
-  T *d_rwork = nullptr;
+  T* d_rwork = nullptr;
 
   int lwork = 0;
-  CUSOLVER_CHECK(
-    cusolverDngesvd_bufferSize<T>(cusolverH, n_rows, n_cols, &lwork));
+  CUSOLVER_CHECK(cusolverDngesvd_bufferSize<T>(cusolverH, n_rows, n_cols, &lwork));
   rmm::device_uvector<T> d_work(lwork, stream);
 
-  char jobu = 'S';
+  char jobu  = 'S';
   char jobvt = 'A';
 
   if (!gen_left_vec) {
@@ -91,9 +98,23 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols,
     strcpy(&jobvt, &new_vt);
   }
 
-  CUSOLVER_CHECK(cusolverDngesvd(
-    cusolverH, jobu, jobvt, m, n, in, m, sing_vals, left_sing_vecs, m,
-    right_sing_vecs, n, d_work.data(), lwork, d_rwork, devInfo.data(), stream));
+  CUSOLVER_CHECK(cusolverDngesvd(cusolverH,
+                                 jobu,
+                                 jobvt,
+                                 m,
+                                 n,
+                                 in,
+                                 m,
+                                 sing_vals,
+                                 left_sing_vecs,
+                                 m,
+                                 right_sing_vecs,
+                                 n,
+                                 d_work.data(),
+                                 lwork,
+                                 d_rwork,
+                                 devInfo.data(),
+                                 stream));
 
   // Transpose the right singular vector back
   if (trans_right) raft::linalg::transpose(right_sing_vecs, n_cols, stream);
@@ -109,18 +130,36 @@ void svdQR(const raft::handle_t &handle, T *in, int n_rows, int n_cols,
 }
 
 template <typename T>
-void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S,
-            T *U, T *V, bool gen_left_vec, cudaStream_t stream) {
+void svdEig(const raft::handle_t& handle,
+            T* in,
+            int n_rows,
+            int n_cols,
+            T* S,
+            T* U,
+            T* V,
+            bool gen_left_vec,
+            cudaStream_t stream)
+{
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
-  cublasHandle_t cublasH = handle.get_cublas_handle();
+  cublasHandle_t cublasH       = handle.get_cublas_handle();
 
   int len = n_cols * n_cols;
   rmm::device_uvector<T> in_cross_mult(len, stream);
 
   T alpha = T(1);
-  T beta = T(0);
-  raft::linalg::gemm(handle, in, n_rows, n_cols, in, in_cross_mult.data(),
-                     n_cols, n_cols, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta,
+  T beta  = T(0);
+  raft::linalg::gemm(handle,
+                     in,
+                     n_rows,
+                     n_cols,
+                     in,
+                     in_cross_mult.data(),
+                     n_cols,
+                     n_cols,
+                     CUBLAS_OP_T,
+                     CUBLAS_OP_N,
+                     alpha,
+                     beta,
                      stream);
 
   eigDC(handle, in_cross_mult.data(), n_cols, n_cols, V, S, stream);
@@ -131,10 +170,20 @@ void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S,
   raft::matrix::seqRoot(S, S, alpha, n_cols, stream, true);
 
   if (gen_left_vec) {
-    raft::linalg::gemm(handle, in, n_rows, n_cols, V, U, n_rows, n_cols,
-                       CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream);
-    raft::matrix::matrixVectorBinaryDivSkipZero(U, S, n_rows, n_cols, false,
-                                                true, stream);
+    raft::linalg::gemm(handle,
+                       in,
+                       n_rows,
+                       n_cols,
+                       V,
+                       U,
+                       n_rows,
+                       n_cols,
+                       CUBLAS_OP_N,
+                       CUBLAS_OP_N,
+                       alpha,
+                       beta,
+                       stream);
+    raft::matrix::matrixVectorBinaryDivSkipZero(U, S, n_rows, n_cols, false, true, stream);
   }
 }
 
@@ -156,10 +205,19 @@ void svdEig(const raft::handle_t &handle, T *in, int n_rows, int n_cols, T *S,
  * @param stream cuda stream
  */
 template <typename math_t>
-void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols,
-               math_t *sing_vals, math_t *left_sing_vecs,
-               math_t *right_sing_vecs, bool gen_left_vec, bool gen_right_vec,
-               math_t tol, int max_sweeps, cudaStream_t stream) {
+void svdJacobi(const raft::handle_t& handle,
+               math_t* in,
+               int n_rows,
+               int n_cols,
+               math_t* sing_vals,
+               math_t* left_sing_vecs,
+               math_t* right_sing_vecs,
+               bool gen_left_vec,
+               bool gen_right_vec,
+               math_t tol,
+               int max_sweeps,
+               cudaStream_t stream)
+{
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
 
   gesvdjInfo_t gesvdj_params = NULL;
@@ -174,18 +232,42 @@ void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols,
   rmm::device_scalar<int> devInfo(stream);
 
   int lwork = 0;
-  int econ = 1;
-
-  CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj_bufferSize(
-    cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals,
-    left_sing_vecs, m, right_sing_vecs, n, &lwork, gesvdj_params));
+  int econ  = 1;
+
+  CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj_bufferSize(cusolverH,
+                                                           CUSOLVER_EIG_MODE_VECTOR,
+                                                           econ,
+                                                           m,
+                                                           n,
+                                                           in,
+                                                           m,
+                                                           sing_vals,
+                                                           left_sing_vecs,
+                                                           m,
+                                                           right_sing_vecs,
+                                                           n,
+                                                           &lwork,
+                                                           gesvdj_params));
 
   rmm::device_uvector<math_t> d_work(lwork, stream);
 
-  CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj(
-    cusolverH, CUSOLVER_EIG_MODE_VECTOR, econ, m, n, in, m, sing_vals,
-    left_sing_vecs, m, right_sing_vecs, n, d_work.data(), lwork, devInfo.data(),
-    gesvdj_params, stream));
+  CUSOLVER_CHECK(raft::linalg::cusolverDngesvdj(cusolverH,
+                                                CUSOLVER_EIG_MODE_VECTOR,
+                                                econ,
+                                                m,
+                                                n,
+                                                in,
+                                                m,
+                                                sing_vals,
+                                                left_sing_vecs,
+                                                m,
+                                                right_sing_vecs,
+                                                n,
+                                                d_work.data(),
+                                                lwork,
+                                                devInfo.data(),
+                                                gesvdj_params,
+                                                stream));
 
   CUSOLVER_CHECK(cusolverDnDestroyGesvdjInfo(gesvdj_params));
 }
@@ -204,16 +286,34 @@ void svdJacobi(const raft::handle_t &handle, math_t *in, int n_rows, int n_cols,
  * @param stream cuda stream
  */
 template <typename math_t>
-void svdReconstruction(const raft::handle_t &handle, math_t *U, math_t *S,
-                       math_t *V, math_t *out, int n_rows, int n_cols, int k,
-                       cudaStream_t stream) {
+void svdReconstruction(const raft::handle_t& handle,
+                       math_t* U,
+                       math_t* S,
+                       math_t* V,
+                       math_t* out,
+                       int n_rows,
+                       int n_cols,
+                       int k,
+                       cudaStream_t stream)
+{
   const math_t alpha = 1.0, beta = 0.0;
   rmm::device_uvector<math_t> SVT(k * n_cols, stream);
 
-  raft::linalg::gemm(handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N,
-                     CUBLAS_OP_T, alpha, beta, stream);
-  raft::linalg::gemm(handle, U, n_rows, k, SVT.data(), out, n_rows, n_cols,
-                     CUBLAS_OP_N, CUBLAS_OP_N, alpha, beta, stream);
+  raft::linalg::gemm(
+    handle, S, k, k, V, SVT.data(), k, n_cols, CUBLAS_OP_N, CUBLAS_OP_T, alpha, beta, stream);
+  raft::linalg::gemm(handle,
+                     U,
+                     n_rows,
+                     k,
+                     SVT.data(),
+                     out,
+                     n_rows,
+                     n_cols,
+                     CUBLAS_OP_N,
+                     CUBLAS_OP_N,
+                     alpha,
+                     beta,
+                     stream);
 }
 
 /**
@@ -231,9 +331,17 @@ void svdReconstruction(const raft::handle_t &handle, math_t *U, math_t *S,
  * @param stream cuda stream
  */
 template <typename math_t>
-bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U,
-                         math_t *S_vec, math_t *V, int n_rows, int n_cols,
-                         int k, math_t tol, cudaStream_t stream) {
+bool evaluateSVDByL2Norm(const raft::handle_t& handle,
+                         math_t* A_d,
+                         math_t* U,
+                         math_t* S_vec,
+                         math_t* V,
+                         int n_rows,
+                         int n_cols,
+                         int k,
+                         math_t tol,
+                         cudaStream_t stream)
+{
   cublasHandle_t cublasH = handle.get_cublas_handle();
 
   int m = n_rows, n = n_cols;
@@ -257,16 +365,25 @@ bool evaluateSVDByL2Norm(const raft::handle_t &handle, math_t *A_d, math_t *U,
   // calculate percent error
   const math_t alpha = 1.0, beta = -1.0;
   rmm::device_uvector<math_t> A_minus_P(m * n, stream);
-  CUDA_CHECK(
-    cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream));
-
-  CUBLAS_CHECK(raft::linalg::cublasgeam(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, m, n,
-                                        &alpha, A_d, m, &beta, P_d.data(), m,
-                                        A_minus_P.data(), m, stream));
-
-  math_t norm_A_minus_P =
-    raft::matrix::getL2Norm(handle, A_minus_P.data(), m * n, stream);
-  math_t percent_error = 100.0 * norm_A_minus_P / normA;
+  CUDA_CHECK(cudaMemsetAsync(A_minus_P.data(), 0, sizeof(math_t) * m * n, stream));
+
+  CUBLAS_CHECK(raft::linalg::cublasgeam(cublasH,
+                                        CUBLAS_OP_N,
+                                        CUBLAS_OP_N,
+                                        m,
+                                        n,
+                                        &alpha,
+                                        A_d,
+                                        m,
+                                        &beta,
+                                        P_d.data(),
+                                        m,
+                                        A_minus_P.data(),
+                                        m,
+                                        stream));
+
+  math_t norm_A_minus_P = raft::matrix::getL2Norm(handle, A_minus_P.data(), m * n, stream);
+  math_t percent_error  = 100.0 * norm_A_minus_P / normA;
   return (percent_error / 100.0 < tol);
 }
 
diff --git a/cpp/include/raft/linalg/transpose.h b/cpp/include/raft/linalg/transpose.h
index db1cabd694..e84ddd1166 100644
--- a/cpp/include/raft/linalg/transpose.h
+++ b/cpp/include/raft/linalg/transpose.h
@@ -33,18 +33,34 @@ namespace linalg {
  * @param stream: cuda stream
  */
 template <typename math_t>
-void transpose(const raft::handle_t &handle, math_t *in, math_t *out,
-               int n_rows, int n_cols, cudaStream_t stream) {
+void transpose(const raft::handle_t& handle,
+               math_t* in,
+               math_t* out,
+               int n_rows,
+               int n_cols,
+               cudaStream_t stream)
+{
   cublasHandle_t cublas_h = handle.get_cublas_handle();
 
   int out_n_rows = n_cols;
   int out_n_cols = n_rows;
 
   const math_t alpha = 1.0;
-  const math_t beta = 0.0;
-  CUBLAS_CHECK(raft::linalg::cublasgeam(
-    cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, out_n_rows, out_n_cols, &alpha, in,
-    n_rows, &beta, out, out_n_rows, out, out_n_rows, stream));
+  const math_t beta  = 0.0;
+  CUBLAS_CHECK(raft::linalg::cublasgeam(cublas_h,
+                                        CUBLAS_OP_T,
+                                        CUBLAS_OP_N,
+                                        out_n_rows,
+                                        out_n_cols,
+                                        &alpha,
+                                        in,
+                                        n_rows,
+                                        &beta,
+                                        out,
+                                        out_n_rows,
+                                        out,
+                                        out_n_rows,
+                                        stream));
 }
 
 /**
@@ -54,24 +70,24 @@ void transpose(const raft::handle_t &handle, math_t *in, math_t *out,
  * @param stream: cuda stream
  */
 template <typename math_t>
-void transpose(math_t *inout, int n, cudaStream_t stream) {
-  auto m = n;
-  auto size = n * n;
-  auto d_inout = inout;
+void transpose(math_t* inout, int n, cudaStream_t stream)
+{
+  auto m        = n;
+  auto size     = n * n;
+  auto d_inout  = inout;
   auto counting = thrust::make_counting_iterator<int>(0);
 
-  thrust::for_each(rmm::exec_policy(stream), counting, counting + size,
-                   [=] __device__(int idx) {
-                     int s_row = idx % m;
-                     int s_col = idx / m;
-                     int d_row = s_col;
-                     int d_col = s_row;
-                     if (s_row < s_col) {
-                       auto temp = d_inout[d_col * m + d_row];
-                       d_inout[d_col * m + d_row] = d_inout[s_col * m + s_row];
-                       d_inout[s_col * m + s_row] = temp;
-                     }
-                   });
+  thrust::for_each(rmm::exec_policy(stream), counting, counting + size, [=] __device__(int idx) {
+    int s_row = idx % m;
+    int s_col = idx / m;
+    int d_row = s_col;
+    int d_col = s_row;
+    if (s_row < s_col) {
+      auto temp                  = d_inout[d_col * m + d_row];
+      d_inout[d_col * m + d_row] = d_inout[s_col * m + s_row];
+      d_inout[s_col * m + s_row] = temp;
+    }
+  });
 }
 
 };  // end namespace linalg
diff --git a/cpp/include/raft/linalg/unary_op.cuh b/cpp/include/raft/linalg/unary_op.cuh
index 46b4d296cb..198b9b2b10 100644
--- a/cpp/include/raft/linalg/unary_op.cuh
+++ b/cpp/include/raft/linalg/unary_op.cuh
@@ -23,10 +23,9 @@
 namespace raft {
 namespace linalg {
 
-template <typename InType, int VecLen, typename Lambda, typename OutType,
-          typename IdxType>
-__global__ void unaryOpKernel(OutType *out, const InType *in, IdxType len,
-                              Lambda op) {
+template <typename InType, int VecLen, typename Lambda, typename OutType, typename IdxType>
+__global__ void unaryOpKernel(OutType* out, const InType* in, IdxType len, Lambda op)
+{
   typedef TxN_t<InType, VecLen> InVecType;
   typedef TxN_t<OutType, VecLen> OutVecType;
   InVecType a;
@@ -42,12 +41,10 @@ __global__ void unaryOpKernel(OutType *out, const InType *in, IdxType len,
   b.store(out, idx);
 }
 
-template <typename InType, int VecLen, typename Lambda, typename OutType,
-          typename IdxType, int TPB>
-void unaryOpImpl(OutType *out, const InType *in, IdxType len, Lambda op,
-                 cudaStream_t stream) {
-  const IdxType nblks =
-    raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB);
+template <typename InType, int VecLen, typename Lambda, typename OutType, typename IdxType, int TPB>
+void unaryOpImpl(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream)
+{
+  const IdxType nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB);
   unaryOpKernel<InType, VecLen, Lambda, OutType, IdxType>
     <<<nblks, TPB, 0, stream>>>(out, in, len, op);
   CUDA_CHECK(cudaPeekAtLastError());
@@ -68,47 +65,38 @@ void unaryOpImpl(OutType *out, const InType *in, IdxType len, Lambda op,
  * @note Lambda must be a functor with the following signature:
  *       `OutType func(const InType& val);`
  */
-template <typename InType, typename Lambda, typename IdxType = int,
-          typename OutType = InType, int TPB = 256>
-void unaryOp(OutType *out, const InType *in, IdxType len, Lambda op,
-             cudaStream_t stream) {
-  if (len <= 0) return;  //silently skip in case of 0 length input
-  constexpr auto maxSize =
-    sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType);
-  size_t bytes = len * maxSize;
-  uint64_t inAddr = uint64_t(in);
-  uint64_t outAddr = uint64_t(out);
-  if (16 / maxSize && bytes % 16 == 0 && inAddr % 16 == 0 &&
-      outAddr % 16 == 0) {
-    unaryOpImpl<InType, 16 / maxSize, Lambda, OutType, IdxType, TPB>(
-      out, in, len, op, stream);
-  } else if (8 / maxSize && bytes % 8 == 0 && inAddr % 8 == 0 &&
-             outAddr % 8 == 0) {
-    unaryOpImpl<InType, 8 / maxSize, Lambda, OutType, IdxType, TPB>(
-      out, in, len, op, stream);
-  } else if (4 / maxSize && bytes % 4 == 0 && inAddr % 4 == 0 &&
-             outAddr % 4 == 0) {
-    unaryOpImpl<InType, 4 / maxSize, Lambda, OutType, IdxType, TPB>(
-      out, in, len, op, stream);
-  } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 &&
-             outAddr % 2 == 0) {
-    unaryOpImpl<InType, 2 / maxSize, Lambda, OutType, IdxType, TPB>(
-      out, in, len, op, stream);
+template <typename InType,
+          typename Lambda,
+          typename IdxType = int,
+          typename OutType = InType,
+          int TPB          = 256>
+void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream)
+{
+  if (len <= 0) return;  // silently skip in case of 0 length input
+  constexpr auto maxSize = sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType);
+  size_t bytes           = len * maxSize;
+  uint64_t inAddr        = uint64_t(in);
+  uint64_t outAddr       = uint64_t(out);
+  if (16 / maxSize && bytes % 16 == 0 && inAddr % 16 == 0 && outAddr % 16 == 0) {
+    unaryOpImpl<InType, 16 / maxSize, Lambda, OutType, IdxType, TPB>(out, in, len, op, stream);
+  } else if (8 / maxSize && bytes % 8 == 0 && inAddr % 8 == 0 && outAddr % 8 == 0) {
+    unaryOpImpl<InType, 8 / maxSize, Lambda, OutType, IdxType, TPB>(out, in, len, op, stream);
+  } else if (4 / maxSize && bytes % 4 == 0 && inAddr % 4 == 0 && outAddr % 4 == 0) {
+    unaryOpImpl<InType, 4 / maxSize, Lambda, OutType, IdxType, TPB>(out, in, len, op, stream);
+  } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 && outAddr % 2 == 0) {
+    unaryOpImpl<InType, 2 / maxSize, Lambda, OutType, IdxType, TPB>(out, in, len, op, stream);
   } else if (1 / maxSize) {
-    unaryOpImpl<InType, 1 / maxSize, Lambda, OutType, IdxType, TPB>(
-      out, in, len, op, stream);
+    unaryOpImpl<InType, 1 / maxSize, Lambda, OutType, IdxType, TPB>(out, in, len, op, stream);
   } else {
-    unaryOpImpl<InType, 1, Lambda, OutType, IdxType, TPB>(out, in, len, op,
-                                                          stream);
+    unaryOpImpl<InType, 1, Lambda, OutType, IdxType, TPB>(out, in, len, op, stream);
   }
 }
 
 template <typename OutType, typename Lambda, typename IdxType>
-__global__ void writeOnlyUnaryOpKernel(OutType *out, IdxType len, Lambda op) {
+__global__ void writeOnlyUnaryOpKernel(OutType* out, IdxType len, Lambda op)
+{
   IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x);
-  if (idx < len) {
-    op(out + idx, idx);
-  }
+  if (idx < len) { op(out + idx, idx); }
 }
 
 /**
@@ -128,14 +116,12 @@ __global__ void writeOnlyUnaryOpKernel(OutType *out, IdxType len, Lambda op) {
  *                    where outLocationOffset will be out + idx.
  * @param[in]  stream cuda stream where to launch work
  */
-template <typename OutType, typename Lambda, typename IdxType = int,
-          int TPB = 256>
-void writeOnlyUnaryOp(OutType *out, IdxType len, Lambda op,
-                      cudaStream_t stream) {
+template <typename OutType, typename Lambda, typename IdxType = int, int TPB = 256>
+void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream)
+{
   if (len <= 0) return;  // silently skip in case of 0 length input
   auto nblks = raft::ceildiv<IdxType>(len, TPB);
-  writeOnlyUnaryOpKernel<OutType, Lambda, IdxType>
-    <<<nblks, TPB, 0, stream>>>(out, len, op);
+  writeOnlyUnaryOpKernel<OutType, Lambda, IdxType><<<nblks, TPB, 0, stream>>>(out, len, op);
   CUDA_CHECK(cudaGetLastError());
 }
 
diff --git a/cpp/include/raft/matrix/detail/math.cuh b/cpp/include/raft/matrix/detail/math.cuh
index f79cb397b7..4b56f3986f 100644
--- a/cpp/include/raft/matrix/detail/math.cuh
+++ b/cpp/include/raft/matrix/detail/math.cuh
@@ -25,30 +25,29 @@ namespace detail {
 
 // Computes the argmax(d_in) column-wise in a DxN matrix
 template <typename T, int TPB>
-__global__ void argmaxKernel(const T *d_in, int D, int N, T *argmax) {
+__global__ void argmaxKernel(const T* d_in, int D, int N, T* argmax)
+{
   typedef cub::BlockReduce<cub::KeyValuePair<int, T>, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
 
   // compute maxIndex=argMax  index for column
-  using KVP = cub::KeyValuePair<int, T>;
+  using KVP    = cub::KeyValuePair<int, T>;
   int rowStart = blockIdx.x * D;
   KVP thread_data(-1, -raft::myInf<T>());
 
   for (int i = threadIdx.x; i < D; i += TPB) {
-    int idx = rowStart + i;
+    int idx     = rowStart + i;
     thread_data = cub::ArgMax()(thread_data, KVP(i, d_in[idx]));
   }
 
   auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, cub::ArgMax());
 
-  if (threadIdx.x == 0) {
-    argmax[blockIdx.x] = maxKV.key;
-  }
+  if (threadIdx.x == 0) { argmax[blockIdx.x] = maxKV.key; }
 }
 
 template <typename math_t>
-void argmax(const math_t *in, int n_rows, int n_cols, math_t *out,
-            cudaStream_t stream) {
+void argmax(const math_t* in, int n_rows, int n_cols, math_t* out, cudaStream_t stream)
+{
   int D = n_rows;
   int N = n_cols;
   if (D <= 32) {
@@ -67,39 +66,39 @@ void argmax(const math_t *in, int n_rows, int n_cols, math_t *out,
 // Computes the argmax(abs(d_in)) column-wise in a DxN matrix followed by
 // flipping the sign if the |max| value for each column is negative.
 template <typename T, int TPB>
-__global__ void signFlipKernel(T *d_in, int D, int N) {
+__global__ void signFlipKernel(T* d_in, int D, int N)
+{
   typedef cub::BlockReduce<cub::KeyValuePair<int, T>, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
 
   // compute maxIndex=argMax (with abs()) index for column
-  using KVP = cub::KeyValuePair<int, T>;
+  using KVP    = cub::KeyValuePair<int, T>;
   int rowStart = blockIdx.x * D;
   KVP thread_data(0, 0);
   for (int i = threadIdx.x; i < D; i += TPB) {
-    int idx = rowStart + i;
+    int idx     = rowStart + i;
     thread_data = cub::ArgMax()(thread_data, KVP(idx, abs(d_in[idx])));
   }
   auto maxKV = BlockReduce(temp_storage).Reduce(thread_data, cub::ArgMax());
 
   // flip column sign if d_in[maxIndex] < 0
   __shared__ bool need_sign_flip;
-  if (threadIdx.x == 0) {
-    need_sign_flip = d_in[maxKV.key] < T(0);
-  }
+  if (threadIdx.x == 0) { need_sign_flip = d_in[maxKV.key] < T(0); }
   __syncthreads();
 
   if (need_sign_flip) {
     for (int i = threadIdx.x; i < D; i += TPB) {
-      int idx = rowStart + i;
+      int idx   = rowStart + i;
       d_in[idx] = -d_in[idx];
     }
   }
 }
 
 template <typename math_t>
-void signFlip(math_t *inout, int n_rows, int n_cols, cudaStream_t stream) {
-  int D = n_rows;
-  int N = n_cols;
+void signFlip(math_t* inout, int n_rows, int n_cols, cudaStream_t stream)
+{
+  int D     = n_rows;
+  int N     = n_cols;
   auto data = inout;
   if (D <= 32) {
     signFlipKernel<math_t, 32><<<N, 32, 0, stream>>>(data, D, N);
diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh
index 8293d01bdb..709570ae56 100644
--- a/cpp/include/raft/matrix/detail/matrix.cuh
+++ b/cpp/include/raft/matrix/detail/matrix.cuh
@@ -28,29 +28,32 @@ namespace matrix {
 namespace detail {
 
 template <typename m_t, typename idx_array_t = int, typename idx_t = size_t>
-void copyRows(const m_t *in, idx_t n_rows, idx_t n_cols, m_t *out,
-              const idx_array_t *indices, idx_t n_rows_indices,
-              cudaStream_t stream, bool rowMajor = false) {
+void copyRows(const m_t* in,
+              idx_t n_rows,
+              idx_t n_cols,
+              m_t* out,
+              const idx_array_t* indices,
+              idx_t n_rows_indices,
+              cudaStream_t stream,
+              bool rowMajor = false)
+{
   if (rowMajor) {
     const idx_t TPB = 256;
-    cache::
-      get_vecs<<<raft::ceildiv(n_rows_indices * n_cols, TPB), TPB, 0, stream>>>(
-        in, n_cols, indices, n_rows_indices, out);
+    cache::get_vecs<<<raft::ceildiv(n_rows_indices * n_cols, TPB), TPB, 0, stream>>>(
+      in, n_cols, indices, n_rows_indices, out);
     CUDA_CHECK(cudaPeekAtLastError());
     return;
   }
 
-  idx_t size = n_rows_indices * n_cols;
+  idx_t size    = n_rows_indices * n_cols;
   auto counting = thrust::make_counting_iterator<idx_t>(0);
 
-  thrust::for_each(rmm::exec_policy(stream), counting, counting + size,
-                   [=] __device__(idx_t idx) {
-                     idx_t row = idx % n_rows_indices;
-                     idx_t col = idx / n_rows_indices;
+  thrust::for_each(rmm::exec_policy(stream), counting, counting + size, [=] __device__(idx_t idx) {
+    idx_t row = idx % n_rows_indices;
+    idx_t col = idx / n_rows_indices;
 
-                     out[col * n_rows_indices + row] =
-                       in[col * n_rows + indices[row]];
-                   });
+    out[col * n_rows_indices + row] = in[col * n_rows + indices[row]];
+  });
 }
 
 /**
@@ -65,8 +68,9 @@ void copyRows(const m_t *in, idx_t n_rows, idx_t n_cols, m_t *out,
  * (1-based)
  */
 template <typename m_t, typename idx_t = int>
-__global__ void slice(m_t *src_d, idx_t m, idx_t n, m_t *dst_d, idx_t x1,
-                      idx_t y1, idx_t x2, idx_t y2) {
+__global__ void slice(
+  m_t* src_d, idx_t m, idx_t n, m_t* dst_d, idx_t x1, idx_t y1, idx_t x2, idx_t y2)
+{
   idx_t idx = threadIdx.x + blockDim.x * blockIdx.x;
   idx_t dm = x2 - x1, dn = y2 - y1;
   if (idx < dm * dn) {
@@ -77,8 +81,16 @@ __global__ void slice(m_t *src_d, idx_t m, idx_t n, m_t *dst_d, idx_t x1,
 }
 
 template <typename m_t, typename idx_t = int>
-void sliceMatrix(m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, idx_t x1,
-                 idx_t y1, idx_t x2, idx_t y2, cudaStream_t stream) {
+void sliceMatrix(m_t* in,
+                 idx_t n_rows,
+                 idx_t n_cols,
+                 m_t* out,
+                 idx_t x1,
+                 idx_t y1,
+                 idx_t x2,
+                 idx_t y2,
+                 cudaStream_t stream)
+{
   // Slicing
   dim3 block(64);
   dim3 grid(((x2 - x1) * (y2 - y1) + block.x - 1) / block.x);
@@ -94,21 +106,19 @@ void sliceMatrix(m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, idx_t x1,
  * @param k: min(n_rows, n_cols)
  */
 template <typename m_t, typename idx_t = int>
-__global__ void getUpperTriangular(m_t *src, m_t *dst, idx_t n_rows,
-                                   idx_t n_cols, idx_t k) {
+__global__ void getUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, idx_t k)
+{
   idx_t idx = threadIdx.x + blockDim.x * blockIdx.x;
   idx_t m = n_rows, n = n_cols;
   if (idx < m * n) {
     idx_t i = idx % m, j = idx / m;
-    if (i < k && j < k && j >= i) {
-      dst[i + j * k] = src[idx];
-    }
+    if (i < k && j < k && j >= i) { dst[i + j * k] = src[idx]; }
   }
 }
 
 template <typename m_t, typename idx_t = int>
-void copyUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, idx_t n_cols,
-                         cudaStream_t stream) {
+void copyUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
+{
   idx_t m = n_rows, n = n_cols;
   idx_t k = min(m, n);
   dim3 block(64);
@@ -125,23 +135,21 @@ void copyUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, idx_t n_cols,
  * @param k: dimensionality
  */
 template <typename m_t, typename idx_t = int>
-__global__ void copyVectorToMatrixDiagonal(m_t *vec, m_t *matrix, idx_t m,
-                                           idx_t n, idx_t k) {
+__global__ void copyVectorToMatrixDiagonal(m_t* vec, m_t* matrix, idx_t m, idx_t n, idx_t k)
+{
   idx_t idx = threadIdx.x + blockDim.x * blockIdx.x;
 
-  if (idx < k) {
-    matrix[idx + idx * m] = vec[idx];
-  }
+  if (idx < k) { matrix[idx + idx * m] = vec[idx]; }
 }
 
 template <typename m_t, typename idx_t = int>
-void initializeDiagonalMatrix(m_t *vec, m_t *matrix, idx_t n_rows, idx_t n_cols,
-                              cudaStream_t stream) {
+void initializeDiagonalMatrix(
+  m_t* vec, m_t* matrix, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
+{
   idx_t k = min(n_rows, n_cols);
   dim3 block(64);
   dim3 grid((k + block.x - 1) / block.x);
-  copyVectorToMatrixDiagonal<<<grid, block, 0, stream>>>(vec, matrix, n_rows,
-                                                         n_cols, k);
+  copyVectorToMatrixDiagonal<<<grid, block, 0, stream>>>(vec, matrix, n_rows, n_cols, k);
 }
 
 /**
@@ -151,15 +159,15 @@ void initializeDiagonalMatrix(m_t *vec, m_t *matrix, idx_t n_rows, idx_t n_cols,
  * @param len: size of one side of the matrix
  */
 template <typename m_t, typename idx_t = int>
-__global__ void matrixDiagonalInverse(m_t *in, idx_t len) {
+__global__ void matrixDiagonalInverse(m_t* in, idx_t len)
+{
   idx_t idx = threadIdx.x + blockDim.x * blockIdx.x;
-  if (idx < len) {
-    in[idx + idx * len] = 1.0 / in[idx + idx * len];
-  }
+  if (idx < len) { in[idx + idx * len] = 1.0 / in[idx + idx * len]; }
 }
 
 template <typename m_t, typename idx_t = int>
-void getDiagonalInverseMatrix(m_t *in, idx_t len, cudaStream_t stream) {
+void getDiagonalInverseMatrix(m_t* in, idx_t len, cudaStream_t stream)
+{
   dim3 block(64);
   dim3 grid((len + block.x - 1) / block.x);
   matrixDiagonalInverse<m_t><<<grid, block, 0, stream>>>(in, len);
diff --git a/cpp/include/raft/matrix/math.hpp b/cpp/include/raft/matrix/math.hpp
index e67440019f..df6eb6f489 100644
--- a/cpp/include/raft/matrix/math.hpp
+++ b/cpp/include/raft/matrix/math.hpp
@@ -43,14 +43,18 @@ namespace matrix {
  * @param stream cuda stream
  */
 template <typename math_t>
-void power(math_t *in, math_t *out, math_t scalar, int len,
-           cudaStream_t stream) {
-  auto d_src = in;
+void power(math_t* in, math_t* out, math_t scalar, int len, cudaStream_t stream)
+{
+  auto d_src  = in;
   auto d_dest = out;
 
   raft::linalg::binaryOp(
-    d_dest, d_src, d_src, len,
-    [=] __device__(math_t a, math_t b) { return scalar * a * b; }, stream);
+    d_dest,
+    d_src,
+    d_src,
+    len,
+    [=] __device__(math_t a, math_t b) { return scalar * a * b; },
+    stream);
 }
 
 /**
@@ -61,7 +65,8 @@ void power(math_t *in, math_t *out, math_t scalar, int len,
  * @param stream cuda stream
  */
 template <typename math_t>
-void power(math_t *inout, math_t scalar, int len, cudaStream_t stream) {
+void power(math_t* inout, math_t scalar, int len, cudaStream_t stream)
+{
   power(inout, inout, scalar, len, stream);
 }
 
@@ -72,7 +77,8 @@ void power(math_t *inout, math_t scalar, int len, cudaStream_t stream) {
  * @param stream cuda stream
  */
 template <typename math_t>
-void power(math_t *inout, int len, cudaStream_t stream) {
+void power(math_t* inout, int len, cudaStream_t stream)
+{
   math_t scalar = 1.0;
   power(inout, scalar, len, stream);
 }
@@ -86,7 +92,8 @@ void power(math_t *inout, int len, cudaStream_t stream) {
  * @{
  */
 template <typename math_t>
-void power(math_t *in, math_t *out, int len, cudaStream_t stream) {
+void power(math_t* in, math_t* out, int len, cudaStream_t stream)
+{
   math_t scalar = 1.0;
   power(in, out, scalar, len, stream);
 }
@@ -103,13 +110,20 @@ void power(math_t *in, math_t *out, int len, cudaStream_t stream) {
  * @param set_neg_zero whether to set negative numbers to zero
  */
 template <typename math_t, typename IdxType = int>
-void seqRoot(math_t *in, math_t *out, math_t scalar, IdxType len,
-             cudaStream_t stream, bool set_neg_zero = false) {
-  auto d_src = in;
+void seqRoot(math_t* in,
+             math_t* out,
+             math_t scalar,
+             IdxType len,
+             cudaStream_t stream,
+             bool set_neg_zero = false)
+{
+  auto d_src  = in;
   auto d_dest = out;
 
   raft::linalg::unaryOp(
-    d_dest, d_src, len,
+    d_dest,
+    d_src,
+    len,
     [=] __device__(math_t a) {
       if (set_neg_zero) {
         if (a < math_t(0)) {
@@ -135,8 +149,9 @@ void seqRoot(math_t *in, math_t *out, math_t scalar, IdxType len,
  * @param set_neg_zero whether to set negative numbers to zero
  */
 template <typename math_t, typename IdxType = int>
-void seqRoot(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream,
-             bool set_neg_zero = false) {
+void seqRoot(
+  math_t* inout, math_t scalar, IdxType len, cudaStream_t stream, bool set_neg_zero = false)
+{
   seqRoot(inout, inout, scalar, len, stream, set_neg_zero);
 }
 
@@ -150,22 +165,27 @@ void seqRoot(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream,
  * @param stream cuda stream
  */
 template <typename math_t, typename IdxType = int>
-void seqRoot(math_t *in, math_t *out, IdxType len, cudaStream_t stream) {
+void seqRoot(math_t* in, math_t* out, IdxType len, cudaStream_t stream)
+{
   math_t scalar = 1.0;
   seqRoot(in, out, scalar, len, stream);
 }
 
 template <typename math_t, typename IdxType = int>
-void seqRoot(math_t *inout, IdxType len, cudaStream_t stream) {
+void seqRoot(math_t* inout, IdxType len, cudaStream_t stream)
+{
   math_t scalar = 1.0;
   seqRoot(inout, inout, scalar, len, stream);
 }
 
 template <typename math_t, typename IdxType = int>
-void setSmallValuesZero(math_t *out, const math_t *in, IdxType len,
-                        cudaStream_t stream, math_t thres = 1e-15) {
+void setSmallValuesZero(
+  math_t* out, const math_t* in, IdxType len, cudaStream_t stream, math_t thres = 1e-15)
+{
   raft::linalg::unaryOp(
-    out, in, len,
+    out,
+    in,
+    len,
     [=] __device__(math_t a) {
       if (a <= thres && -a <= thres) {
         return math_t(0);
@@ -186,8 +206,8 @@ void setSmallValuesZero(math_t *out, const math_t *in, IdxType len,
  * @param thres: threshold
  */
 template <typename math_t, typename IdxType = int>
-void setSmallValuesZero(math_t *inout, IdxType len, cudaStream_t stream,
-                        math_t thres = 1e-15) {
+void setSmallValuesZero(math_t* inout, IdxType len, cudaStream_t stream, math_t thres = 1e-15)
+{
   setSmallValuesZero(inout, inout, len, stream, thres);
 }
 
@@ -205,14 +225,21 @@ void setSmallValuesZero(math_t *inout, IdxType len, cudaStream_t stream,
  * @{
  */
 template <typename math_t, typename IdxType = int>
-void reciprocal(math_t *in, math_t *out, math_t scalar, int len,
-                cudaStream_t stream, bool setzero = false,
-                math_t thres = 1e-15) {
-  auto d_src = in;
+void reciprocal(math_t* in,
+                math_t* out,
+                math_t scalar,
+                int len,
+                cudaStream_t stream,
+                bool setzero = false,
+                math_t thres = 1e-15)
+{
+  auto d_src  = in;
   auto d_dest = out;
 
   raft::linalg::unaryOp(
-    d_dest, d_src, len,
+    d_dest,
+    d_src,
+    len,
     [=] __device__(math_t a) {
       if (setzero) {
         if (abs(a) <= thres) {
@@ -239,8 +266,13 @@ void reciprocal(math_t *in, math_t *out, math_t scalar, int len,
  * @param thres: Threshold to avoid dividing by zero (|value| < thres -> result = 0)
  */
 template <typename math_t, typename IdxType = int>
-void reciprocal(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream,
-                bool setzero = false, math_t thres = 1e-15) {
+void reciprocal(math_t* inout,
+                math_t scalar,
+                IdxType len,
+                cudaStream_t stream,
+                bool setzero = false,
+                math_t thres = 1e-15)
+{
   reciprocal(inout, inout, scalar, len, stream, setzero, thres);
 }
 
@@ -253,7 +285,8 @@ void reciprocal(math_t *inout, math_t scalar, IdxType len, cudaStream_t stream,
  * @param stream cuda stream
  */
 template <typename math_t, typename IdxType = int>
-void reciprocal(math_t *inout, IdxType len, cudaStream_t stream) {
+void reciprocal(math_t* inout, IdxType len, cudaStream_t stream)
+{
   math_t scalar = 1.0;
   reciprocal(inout, scalar, len, stream);
 }
@@ -268,14 +301,15 @@ void reciprocal(math_t *inout, IdxType len, cudaStream_t stream) {
  * @param stream cuda stream
  */
 template <typename math_t, typename IdxType = int>
-void reciprocal(math_t *in, math_t *out, IdxType len, cudaStream_t stream) {
+void reciprocal(math_t* in, math_t* out, IdxType len, cudaStream_t stream)
+{
   math_t scalar = 1.0;
   reciprocal(in, out, scalar, len, stream);
 }
 
 template <typename math_t>
-void setValue(math_t *out, const math_t *in, math_t scalar, int len,
-              cudaStream_t stream = 0) {
+void setValue(math_t* out, const math_t* in, math_t scalar, int len, cudaStream_t stream = 0)
+{
   raft::linalg::unaryOp(
     out, in, len, [scalar] __device__(math_t in) { return scalar; }, stream);
 }
@@ -290,18 +324,18 @@ void setValue(math_t *out, const math_t *in, math_t scalar, int len,
  * @param stream cuda stream
  */
 template <typename math_t, typename IdxType = int>
-void ratio(const raft::handle_t &handle, math_t *src, math_t *dest, IdxType len,
-           cudaStream_t stream) {
-  auto d_src = src;
+void ratio(
+  const raft::handle_t& handle, math_t* src, math_t* dest, IdxType len, cudaStream_t stream)
+{
+  auto d_src  = src;
   auto d_dest = dest;
 
   rmm::device_scalar<math_t> d_sum(stream);
-  auto *d_sum_ptr = d_sum.data();
-  auto no_op = [] __device__(math_t in) { return in; };
+  auto* d_sum_ptr = d_sum.data();
+  auto no_op      = [] __device__(math_t in) { return in; };
   raft::linalg::mapThenSumReduce(d_sum_ptr, len, no_op, stream, src);
   raft::linalg::unaryOp(
-    d_dest, d_src, len, [=] __device__(math_t a) { return a / (*d_sum_ptr); },
-    stream);
+    d_dest, d_src, len, [=] __device__(math_t a) { return a / (*d_sum_ptr); }, stream);
 }
 
 /** @} */
@@ -315,8 +349,8 @@ void ratio(const raft::handle_t &handle, math_t *src, math_t *dest, IdxType len,
  * @param stream: cuda stream
  */
 template <typename math_t>
-void argmax(const math_t *in, int n_rows, int n_cols, math_t *out,
-            cudaStream_t stream) {
+void argmax(const math_t* in, int n_rows, int n_cols, math_t* out, cudaStream_t stream)
+{
   detail::argmax(in, n_rows, n_cols, out, stream);
 }
 
@@ -329,25 +363,49 @@ void argmax(const math_t *in, int n_rows, int n_cols, math_t *out,
  * @param stream cuda stream
  */
 template <typename math_t>
-void signFlip(math_t *inout, int n_rows, int n_cols, cudaStream_t stream) {
+void signFlip(math_t* inout, int n_rows, int n_cols, cudaStream_t stream)
+{
   detail::signFlip(inout, n_rows, n_cols, stream);
 }
 
 template <typename Type, typename IdxType = int, int TPB = 256>
-void matrixVectorBinaryMult(Type *data, const Type *vec, IdxType n_row,
-                            IdxType n_col, bool rowMajor, bool bcastAlongRows,
-                            cudaStream_t stream) {
+void matrixVectorBinaryMult(Type* data,
+                            const Type* vec,
+                            IdxType n_row,
+                            IdxType n_col,
+                            bool rowMajor,
+                            bool bcastAlongRows,
+                            cudaStream_t stream)
+{
   raft::linalg::matrixVectorOp(
-    data, data, vec, n_col, n_row, rowMajor, bcastAlongRows,
-    [] __device__(Type a, Type b) { return a * b; }, stream);
+    data,
+    data,
+    vec,
+    n_col,
+    n_row,
+    rowMajor,
+    bcastAlongRows,
+    [] __device__(Type a, Type b) { return a * b; },
+    stream);
 }
 
 template <typename Type, typename IdxType = int, int TPB = 256>
-void matrixVectorBinaryMultSkipZero(Type *data, const Type *vec, IdxType n_row,
-                                    IdxType n_col, bool rowMajor,
-                                    bool bcastAlongRows, cudaStream_t stream) {
+void matrixVectorBinaryMultSkipZero(Type* data,
+                                    const Type* vec,
+                                    IdxType n_row,
+                                    IdxType n_col,
+                                    bool rowMajor,
+                                    bool bcastAlongRows,
+                                    cudaStream_t stream)
+{
   raft::linalg::matrixVectorOp(
-    data, data, vec, n_col, n_row, rowMajor, bcastAlongRows,
+    data,
+    data,
+    vec,
+    n_col,
+    n_row,
+    rowMajor,
+    bcastAlongRows,
     [] __device__(Type a, Type b) {
       if (b == Type(0))
         return a;
@@ -358,22 +416,45 @@ void matrixVectorBinaryMultSkipZero(Type *data, const Type *vec, IdxType n_row,
 }
 
 template <typename Type, typename IdxType = int, int TPB = 256>
-void matrixVectorBinaryDiv(Type *data, const Type *vec, IdxType n_row,
-                           IdxType n_col, bool rowMajor, bool bcastAlongRows,
-                           cudaStream_t stream) {
+void matrixVectorBinaryDiv(Type* data,
+                           const Type* vec,
+                           IdxType n_row,
+                           IdxType n_col,
+                           bool rowMajor,
+                           bool bcastAlongRows,
+                           cudaStream_t stream)
+{
   raft::linalg::matrixVectorOp(
-    data, data, vec, n_col, n_row, rowMajor, bcastAlongRows,
-    [] __device__(Type a, Type b) { return a / b; }, stream);
+    data,
+    data,
+    vec,
+    n_col,
+    n_row,
+    rowMajor,
+    bcastAlongRows,
+    [] __device__(Type a, Type b) { return a / b; },
+    stream);
 }
 
 template <typename Type, typename IdxType = int, int TPB = 256>
-void matrixVectorBinaryDivSkipZero(Type *data, const Type *vec, IdxType n_row,
-                                   IdxType n_col, bool rowMajor,
-                                   bool bcastAlongRows, cudaStream_t stream,
-                                   bool return_zero = false) {
+void matrixVectorBinaryDivSkipZero(Type* data,
+                                   const Type* vec,
+                                   IdxType n_row,
+                                   IdxType n_col,
+                                   bool rowMajor,
+                                   bool bcastAlongRows,
+                                   cudaStream_t stream,
+                                   bool return_zero = false)
+{
   if (return_zero) {
     raft::linalg::matrixVectorOp(
-      data, data, vec, n_col, n_row, rowMajor, bcastAlongRows,
+      data,
+      data,
+      vec,
+      n_col,
+      n_row,
+      rowMajor,
+      bcastAlongRows,
       [] __device__(Type a, Type b) {
         if (raft::myAbs(b) < Type(1e-10))
           return Type(0);
@@ -383,7 +464,13 @@ void matrixVectorBinaryDivSkipZero(Type *data, const Type *vec, IdxType n_row,
       stream);
   } else {
     raft::linalg::matrixVectorOp(
-      data, data, vec, n_col, n_row, rowMajor, bcastAlongRows,
+      data,
+      data,
+      vec,
+      n_col,
+      n_row,
+      rowMajor,
+      bcastAlongRows,
       [] __device__(Type a, Type b) {
         if (raft::myAbs(b) < Type(1e-10))
           return a;
@@ -395,21 +482,45 @@ void matrixVectorBinaryDivSkipZero(Type *data, const Type *vec, IdxType n_row,
 }
 
 template <typename Type, typename IdxType = int, int TPB = 256>
-void matrixVectorBinaryAdd(Type *data, const Type *vec, IdxType n_row,
-                           IdxType n_col, bool rowMajor, bool bcastAlongRows,
-                           cudaStream_t stream) {
+void matrixVectorBinaryAdd(Type* data,
+                           const Type* vec,
+                           IdxType n_row,
+                           IdxType n_col,
+                           bool rowMajor,
+                           bool bcastAlongRows,
+                           cudaStream_t stream)
+{
   raft::linalg::matrixVectorOp(
-    data, data, vec, n_col, n_row, rowMajor, bcastAlongRows,
-    [] __device__(Type a, Type b) { return a + b; }, stream);
+    data,
+    data,
+    vec,
+    n_col,
+    n_row,
+    rowMajor,
+    bcastAlongRows,
+    [] __device__(Type a, Type b) { return a + b; },
+    stream);
 }
 
 template <typename Type, typename IdxType = int, int TPB = 256>
-void matrixVectorBinarySub(Type *data, const Type *vec, IdxType n_row,
-                           IdxType n_col, bool rowMajor, bool bcastAlongRows,
-                           cudaStream_t stream) {
+void matrixVectorBinarySub(Type* data,
+                           const Type* vec,
+                           IdxType n_row,
+                           IdxType n_col,
+                           bool rowMajor,
+                           bool bcastAlongRows,
+                           cudaStream_t stream)
+{
   raft::linalg::matrixVectorOp(
-    data, data, vec, n_col, n_row, rowMajor, bcastAlongRows,
-    [] __device__(Type a, Type b) { return a - b; }, stream);
+    data,
+    data,
+    vec,
+    n_col,
+    n_row,
+    rowMajor,
+    bcastAlongRows,
+    [] __device__(Type a, Type b) { return a - b; },
+    stream);
 }
 
 };  // end namespace matrix
diff --git a/cpp/include/raft/matrix/matrix.hpp b/cpp/include/raft/matrix/matrix.hpp
index 8dd9fbf487..c4cd30b7bc 100644
--- a/cpp/include/raft/matrix/matrix.hpp
+++ b/cpp/include/raft/matrix/matrix.hpp
@@ -47,11 +47,16 @@ using namespace std;
  * @param rowMajor whether the matrix has row major layout
  */
 template <typename m_t, typename idx_array_t = int, typename idx_t = size_t>
-void copyRows(const m_t *in, idx_t n_rows, idx_t n_cols, m_t *out,
-              const idx_array_t *indices, idx_t n_rows_indices,
-              cudaStream_t stream, bool rowMajor = false) {
-  detail::copyRows(in, n_rows, n_cols, out, indices, n_rows_indices, stream,
-                   rowMajor);
+void copyRows(const m_t* in,
+              idx_t n_rows,
+              idx_t n_cols,
+              m_t* out,
+              const idx_array_t* indices,
+              idx_t n_rows_indices,
+              cudaStream_t stream,
+              bool rowMajor = false)
+{
+  detail::copyRows(in, n_rows, n_cols, out, indices, n_rows_indices, stream, rowMajor);
 }
 
 /**
@@ -63,8 +68,8 @@ void copyRows(const m_t *in, idx_t n_rows, idx_t n_cols, m_t *out,
  * @param stream: cuda stream
  */
 template <typename m_t, typename idx_t = int>
-void copy(const m_t *in, m_t *out, idx_t n_rows, idx_t n_cols,
-          cudaStream_t stream) {
+void copy(const m_t* in, m_t* out, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
+{
   raft::copy_async(out, in, n_rows * n_cols, stream);
 }
 
@@ -79,21 +84,21 @@ void copy(const m_t *in, m_t *out, idx_t n_rows, idx_t n_cols,
  * @param stream: cuda stream
  */
 template <typename m_t, typename idx_t = int>
-void truncZeroOrigin(m_t *in, idx_t in_n_rows, m_t *out, idx_t out_n_rows,
-                     idx_t out_n_cols, cudaStream_t stream) {
-  auto m = out_n_rows;
-  auto k = in_n_rows;
-  idx_t size = out_n_rows * out_n_cols;
-  auto d_q = in;
+void truncZeroOrigin(
+  m_t* in, idx_t in_n_rows, m_t* out, idx_t out_n_rows, idx_t out_n_cols, cudaStream_t stream)
+{
+  auto m         = out_n_rows;
+  auto k         = in_n_rows;
+  idx_t size     = out_n_rows * out_n_cols;
+  auto d_q       = in;
   auto d_q_trunc = out;
-  auto counting = thrust::make_counting_iterator<idx_t>(0);
+  auto counting  = thrust::make_counting_iterator<idx_t>(0);
 
-  thrust::for_each(rmm::exec_policy(stream), counting, counting + size,
-                   [=] __device__(idx_t idx) {
-                     idx_t row = idx % m;
-                     idx_t col = idx / m;
-                     d_q_trunc[col * m + row] = d_q[col * k + row];
-                   });
+  thrust::for_each(rmm::exec_policy(stream), counting, counting + size, [=] __device__(idx_t idx) {
+    idx_t row                = idx % m;
+    idx_t col                = idx / m;
+    d_q_trunc[col * m + row] = d_q[col * k + row];
+  });
 }
 
 /**
@@ -105,24 +110,25 @@ void truncZeroOrigin(m_t *in, idx_t in_n_rows, m_t *out, idx_t out_n_rows,
  * @param stream: cuda stream
  */
 template <typename m_t, typename idx_t = int>
-void colReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) {
-  auto n = n_cols;
-  auto m = n_rows;
-  idx_t size = n_rows * n_cols;
-  auto d_q = inout;
+void colReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
+{
+  auto n            = n_cols;
+  auto m            = n_rows;
+  idx_t size        = n_rows * n_cols;
+  auto d_q          = inout;
   auto d_q_reversed = inout;
-  auto counting = thrust::make_counting_iterator<idx_t>(0);
+  auto counting     = thrust::make_counting_iterator<idx_t>(0);
 
-  thrust::for_each(rmm::exec_policy(stream), counting, counting + (size / 2),
-                   [=] __device__(idx_t idx) {
-                     idx_t dest_row = idx % m;
-                     idx_t dest_col = idx / m;
-                     idx_t src_row = dest_row;
-                     idx_t src_col = (n - dest_col) - 1;
-                     m_t temp = (m_t)d_q_reversed[idx];
-                     d_q_reversed[idx] = d_q[src_col * m + src_row];
-                     d_q[src_col * m + src_row] = temp;
-                   });
+  thrust::for_each(
+    rmm::exec_policy(stream), counting, counting + (size / 2), [=] __device__(idx_t idx) {
+      idx_t dest_row             = idx % m;
+      idx_t dest_col             = idx / m;
+      idx_t src_row              = dest_row;
+      idx_t src_col              = (n - dest_col) - 1;
+      m_t temp                   = (m_t)d_q_reversed[idx];
+      d_q_reversed[idx]          = d_q[src_col * m + src_row];
+      d_q[src_col * m + src_row] = temp;
+    });
 }
 
 /**
@@ -134,25 +140,26 @@ void colReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) {
  * @param stream: cuda stream
  */
 template <typename m_t, typename idx_t = int>
-void rowReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) {
-  auto m = n_rows;
-  idx_t size = n_rows * n_cols;
-  auto d_q = inout;
+void rowReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
+{
+  auto m            = n_rows;
+  idx_t size        = n_rows * n_cols;
+  auto d_q          = inout;
   auto d_q_reversed = inout;
-  auto counting = thrust::make_counting_iterator<idx_t>(0);
+  auto counting     = thrust::make_counting_iterator<idx_t>(0);
 
-  thrust::for_each(rmm::exec_policy(stream), counting, counting + (size / 2),
-                   [=] __device__(idx_t idx) {
-                     idx_t dest_row = idx % m;
-                     idx_t dest_col = idx / m;
-                     idx_t src_row = (m - dest_row) - 1;
-                     ;
-                     idx_t src_col = dest_col;
+  thrust::for_each(
+    rmm::exec_policy(stream), counting, counting + (size / 2), [=] __device__(idx_t idx) {
+      idx_t dest_row = idx % m;
+      idx_t dest_col = idx / m;
+      idx_t src_row  = (m - dest_row) - 1;
+      ;
+      idx_t src_col = dest_col;
 
-                     m_t temp = (m_t)d_q_reversed[idx];
-                     d_q_reversed[idx] = d_q[src_col * m + src_row];
-                     d_q[src_col * m + src_row] = temp;
-                   });
+      m_t temp                   = (m_t)d_q_reversed[idx];
+      d_q_reversed[idx]          = d_q[src_col * m + src_row];
+      d_q[src_col * m + src_row] = temp;
+    });
 }
 
 /**
@@ -164,16 +171,19 @@ void rowReverse(m_t *inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) {
  * @param v_separator: vertical separator character
  */
 template <typename m_t, typename idx_t = int>
-void print(const m_t *in, idx_t n_rows, idx_t n_cols, char h_separator = ' ',
-           char v_separator = '\n',
-           cudaStream_t stream = rmm::cuda_stream_default) {
+void print(const m_t* in,
+           idx_t n_rows,
+           idx_t n_cols,
+           char h_separator    = ' ',
+           char v_separator    = '\n',
+           cudaStream_t stream = rmm::cuda_stream_default)
+{
   std::vector<m_t> h_matrix = std::vector<m_t>(n_cols * n_rows);
   raft::update_host(h_matrix.data(), in, n_cols * n_rows, stream);
 
   for (idx_t i = 0; i < n_rows; i++) {
     for (idx_t j = 0; j < n_cols; j++) {
-      printf("%1.4f%c", h_matrix[j * n_rows + i],
-             j < n_cols - 1 ? h_separator : v_separator);
+      printf("%1.4f%c", h_matrix[j * n_rows + i], j < n_cols - 1 ? h_separator : v_separator);
     }
   }
 }
@@ -185,7 +195,8 @@ void print(const m_t *in, idx_t n_rows, idx_t n_cols, char h_separator = ' ',
  * @param n_cols: number of columns of input matrix
  */
 template <typename m_t, typename idx_t = int>
-void printHost(const m_t *in, idx_t n_rows, idx_t n_cols) {
+void printHost(const m_t* in, idx_t n_rows, idx_t n_cols)
+{
   for (idx_t i = 0; i < n_rows; i++) {
     for (idx_t j = 0; j < n_cols; j++) {
       printf("%1.4f ", in[j * n_rows + i]);
@@ -208,8 +219,16 @@ void printHost(const m_t *in, idx_t n_rows, idx_t n_cols) {
  * @param stream: cuda stream
  */
 template <typename m_t, typename idx_t = int>
-void sliceMatrix(m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, idx_t x1,
-                 idx_t y1, idx_t x2, idx_t y2, cudaStream_t stream) {
+void sliceMatrix(m_t* in,
+                 idx_t n_rows,
+                 idx_t n_cols,
+                 m_t* out,
+                 idx_t x1,
+                 idx_t y1,
+                 idx_t x2,
+                 idx_t y2,
+                 cudaStream_t stream)
+{
   detail::sliceMatrix(in, n_rows, n_cols, out, x1, y1, x2, y2, stream);
 }
 
@@ -222,8 +241,8 @@ void sliceMatrix(m_t *in, idx_t n_rows, idx_t n_cols, m_t *out, idx_t x1,
  * @param stream: cuda stream
  */
 template <typename m_t, typename idx_t = int>
-void copyUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, idx_t n_cols,
-                         cudaStream_t stream) {
+void copyUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
+{
   detail::copyUpperTriangular(src, dst, n_rows, n_cols, stream);
 }
 
@@ -236,8 +255,9 @@ void copyUpperTriangular(m_t *src, m_t *dst, idx_t n_rows, idx_t n_cols,
  * @param stream: cuda stream
  */
 template <typename m_t, typename idx_t = int>
-void initializeDiagonalMatrix(m_t *vec, m_t *matrix, idx_t n_rows, idx_t n_cols,
-                              cudaStream_t stream) {
+void initializeDiagonalMatrix(
+  m_t* vec, m_t* matrix, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
+{
   detail::initializeDiagonalMatrix(vec, matrix, n_rows, n_cols, stream);
 }
 
@@ -248,7 +268,8 @@ void initializeDiagonalMatrix(m_t *vec, m_t *matrix, idx_t n_rows, idx_t n_cols,
  * @param stream: cuda stream
  */
 template <typename m_t, typename idx_t = int>
-void getDiagonalInverseMatrix(m_t *in, idx_t len, cudaStream_t stream) {
+void getDiagonalInverseMatrix(m_t* in, idx_t len, cudaStream_t stream)
+{
   detail::getDiagonalInverseMatrix(in, len, stream);
 }
 
@@ -260,12 +281,11 @@ void getDiagonalInverseMatrix(m_t *in, idx_t len, cudaStream_t stream) {
  * @param stream: cuda stream
  */
 template <typename m_t, typename idx_t = int>
-m_t getL2Norm(const raft::handle_t &handle, m_t *in, idx_t size,
-              cudaStream_t stream) {
+m_t getL2Norm(const raft::handle_t& handle, m_t* in, idx_t size, cudaStream_t stream)
+{
   cublasHandle_t cublasH = handle.get_cublas_handle();
-  m_t normval = 0;
-  CUBLAS_CHECK(
-    raft::linalg::cublasnrm2(cublasH, size, in, 1, &normval, stream));
+  m_t normval            = 0;
+  CUBLAS_CHECK(raft::linalg::cublasnrm2(cublasH, size, in, 1, &normval, stream));
   return normval;
 }
 
diff --git a/cpp/include/raft/mr/buffer_base.hpp b/cpp/include/raft/mr/buffer_base.hpp
index 4a2362bf97..38ef59aadf 100644
--- a/cpp/include/raft/mr/buffer_base.hpp
+++ b/cpp/include/raft/mr/buffer_base.hpp
@@ -38,11 +38,11 @@ namespace mr {
 template <typename T, typename AllocatorT>
 class buffer_base {
  public:
-  using size_type = std::size_t;
-  using value_type = T;
-  using iterator = value_type*;
-  using const_iterator = const value_type*;
-  using reference = T&;
+  using size_type       = std::size_t;
+  using value_type      = T;
+  using iterator        = value_type*;
+  using const_iterator  = const value_type*;
+  using reference       = T&;
   using const_reference = const T&;
 
   buffer_base() = delete;
@@ -58,16 +58,12 @@ class buffer_base {
    * @param[in] stream    cuda stream where this allocation operations are async
    * @param[in] n         size of the buffer (in number of elements)
    */
-  buffer_base(std::shared_ptr<AllocatorT> allocator, cudaStream_t stream,
-              size_type n = 0)
-    : data_(nullptr),
-      size_(n),
-      capacity_(n),
-      stream_(stream),
-      allocator_(std::move(allocator)) {
+  buffer_base(std::shared_ptr<AllocatorT> allocator, cudaStream_t stream, size_type n = 0)
+    : data_(nullptr), size_(n), capacity_(n), stream_(stream), allocator_(std::move(allocator))
+  {
     if (capacity_ > 0) {
-      data_ = static_cast<value_type*>(
-        allocator_->allocate(capacity_ * sizeof(value_type), stream_));
+      data_ =
+        static_cast<value_type*>(allocator_->allocate(capacity_ * sizeof(value_type), stream_));
       CUDA_CHECK(cudaStreamSynchronize(stream_));
     }
   }
@@ -100,23 +96,23 @@ class buffer_base {
    * @param[in] new_capacity new capacity (in number of elements)
    * @{
    */
-  void reserve(size_type new_capacity) {
+  void reserve(size_type new_capacity)
+  {
     if (new_capacity > capacity_) {
-      auto* new_data = static_cast<value_type*>(
-        allocator_->allocate(new_capacity * sizeof(value_type), stream_));
-      if (size_ > 0) {
-        raft::copy(new_data, data_, size_, stream_);
-      }
+      auto* new_data =
+        static_cast<value_type*>(allocator_->allocate(new_capacity * sizeof(value_type), stream_));
+      if (size_ > 0) { raft::copy(new_data, data_, size_, stream_); }
       // Only deallocate if we have allocated a pointer
       if (nullptr != data_) {
         allocator_->deallocate(data_, capacity_ * sizeof(value_type), stream_);
       }
-      data_ = new_data;
+      data_     = new_data;
       capacity_ = new_capacity;
     }
   }
 
-  void reserve(size_type new_capacity, cudaStream_t stream) {
+  void reserve(size_type new_capacity, cudaStream_t stream)
+  {
     set_stream(stream);
     reserve(new_capacity);
   }
@@ -128,12 +124,14 @@ class buffer_base {
    * @param[in] new_size new buffer size
    * @{
    */
-  void resize(const size_type new_size) {
+  void resize(const size_type new_size)
+  {
     reserve(new_size);
     size_ = new_size;
   }
 
-  void resize(const size_type new_size, cudaStream_t stream) {
+  void resize(const size_type new_size, cudaStream_t stream)
+  {
     set_stream(stream);
     resize(new_size);
   }
@@ -145,16 +143,18 @@ class buffer_base {
    * If this method is not explicitly called, it will be during the destructor
    * @{
    */
-  void release() {
+  void release()
+  {
     if (nullptr != data_) {
       allocator_->deallocate(data_, capacity_ * sizeof(value_type), stream_);
     }
-    data_ = nullptr;
+    data_     = nullptr;
     capacity_ = 0;
-    size_ = 0;
+    size_     = 0;
   }
 
-  void release(cudaStream_t stream) {
+  void release(cudaStream_t stream)
+  {
     set_stream(stream);
     release();
   }
@@ -194,7 +194,8 @@ class buffer_base {
    * @param[in] stream new cuda stream to be set. If it is the same as the
    *                   current one, then this method will be a no-op.
    */
-  void set_stream(cudaStream_t stream) {
+  void set_stream(cudaStream_t stream)
+  {
     if (stream_ != stream) {
       cudaEvent_t event;
       CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
diff --git a/cpp/include/raft/mr/device/allocator.hpp b/cpp/include/raft/mr/device/allocator.hpp
index 3d1ce38c31..8d306a199f 100644
--- a/cpp/include/raft/mr/device/allocator.hpp
+++ b/cpp/include/raft/mr/device/allocator.hpp
@@ -34,17 +34,20 @@ namespace device {
  * further to the ones listed in `Allocator`:
  * - Allocations may be always on the device that was specified on construction.
  */
-class allocator : public base_allocator {};
+class allocator : public base_allocator {
+};
 
 /** Default device allocator based on the one provided by RMM */
 class default_allocator : public allocator {
  public:
-  void* allocate(std::size_t n, cudaStream_t stream) override {
+  void* allocate(std::size_t n, cudaStream_t stream) override
+  {
     void* ptr = rmm::mr::get_current_device_resource()->allocate(n, stream);
     return ptr;
   }
 
-  void deallocate(void* p, std::size_t n, cudaStream_t stream) override {
+  void deallocate(void* p, std::size_t n, cudaStream_t stream) override
+  {
     rmm::mr::get_current_device_resource()->deallocate(p, n, stream);
   }
 };  // class default_allocator
diff --git a/cpp/include/raft/mr/device/buffer.hpp b/cpp/include/raft/mr/device/buffer.hpp
index 39b5674ce4..2b9d84368f 100644
--- a/cpp/include/raft/mr/device/buffer.hpp
+++ b/cpp/include/raft/mr/device/buffer.hpp
@@ -46,11 +46,11 @@ namespace device {
 template <typename T>
 class buffer : public buffer_base<T, allocator> {
  public:
-  using size_type = typename buffer_base<T, allocator>::size_type;
-  using value_type = typename buffer_base<T, allocator>::value_type;
-  using iterator = typename buffer_base<T, allocator>::iterator;
-  using const_iterator = typename buffer_base<T, allocator>::const_iterator;
-  using reference = typename buffer_base<T, allocator>::reference;
+  using size_type       = typename buffer_base<T, allocator>::size_type;
+  using value_type      = typename buffer_base<T, allocator>::value_type;
+  using iterator        = typename buffer_base<T, allocator>::iterator;
+  using const_iterator  = typename buffer_base<T, allocator>::const_iterator;
+  using reference       = typename buffer_base<T, allocator>::reference;
   using const_reference = typename buffer_base<T, allocator>::const_reference;
 
   buffer() = delete;
@@ -60,7 +60,9 @@ class buffer : public buffer_base<T, allocator> {
   buffer& operator=(const buffer& other) = delete;
 
   buffer(std::shared_ptr<allocator> alloc, cudaStream_t stream, size_type n = 0)
-    : buffer_base<T, device::allocator>(alloc, stream, n) {}
+    : buffer_base<T, device::allocator>(alloc, stream, n)
+  {
+  }
 };  // class buffer
 
 };  // namespace device
diff --git a/cpp/include/raft/mr/host/allocator.hpp b/cpp/include/raft/mr/host/allocator.hpp
index e5b3da24eb..7d31248e7f 100644
--- a/cpp/include/raft/mr/host/allocator.hpp
+++ b/cpp/include/raft/mr/host/allocator.hpp
@@ -35,20 +35,23 @@ namespace host {
  * further to the ones listed in `Allocator`:
  * - Allocations don't need to be zero copy accessible form a device.
  */
-class allocator : public base_allocator {};
+class allocator : public base_allocator {
+};
 
 /** Default cudaMallocHost/cudaFreeHost based host allocator */
 class default_allocator : public allocator {
  public:
-  void* allocate(std::size_t n, cudaStream_t stream) override {
+  void* allocate(std::size_t n, cudaStream_t stream) override
+  {
     void* ptr = nullptr;
     CUDA_CHECK(cudaMallocHost(&ptr, n));
     return ptr;
   }
 
-  void deallocate(void* p, std::size_t n, cudaStream_t stream) override {
-    //Must call _NO_THROW here since this is called frequently from object
-    //destructors which are "nothrow" by default
+  void deallocate(void* p, std::size_t n, cudaStream_t stream) override
+  {
+    // Must call _NO_THROW here since this is called frequently from object
+    // destructors which are "nothrow" by default
     CUDA_CHECK_NO_THROW(cudaFreeHost(p));
   }
 };  // class default_allocator
diff --git a/cpp/include/raft/mr/host/buffer.hpp b/cpp/include/raft/mr/host/buffer.hpp
index 3c505bf2ed..52475ad6ec 100644
--- a/cpp/include/raft/mr/host/buffer.hpp
+++ b/cpp/include/raft/mr/host/buffer.hpp
@@ -48,11 +48,11 @@ namespace host {
 template <typename T>
 class buffer : public buffer_base<T, allocator> {
  public:
-  using size_type = typename buffer_base<T, allocator>::size_type;
-  using value_type = typename buffer_base<T, allocator>::value_type;
-  using iterator = typename buffer_base<T, allocator>::iterator;
-  using const_iterator = typename buffer_base<T, allocator>::const_iterator;
-  using reference = typename buffer_base<T, allocator>::reference;
+  using size_type       = typename buffer_base<T, allocator>::size_type;
+  using value_type      = typename buffer_base<T, allocator>::value_type;
+  using iterator        = typename buffer_base<T, allocator>::iterator;
+  using const_iterator  = typename buffer_base<T, allocator>::const_iterator;
+  using reference       = typename buffer_base<T, allocator>::reference;
   using const_reference = typename buffer_base<T, allocator>::const_reference;
 
   buffer() = delete;
@@ -62,14 +62,15 @@ class buffer : public buffer_base<T, allocator> {
   buffer& operator=(const buffer& other) = delete;
 
   buffer(std::shared_ptr<allocator> alloc, const device::buffer<T>& other)
-    : buffer_base<T, allocator>(alloc, other.get_stream(), other.size()) {
-    if (other.size() > 0) {
-      raft::copy(data_, other.data(), other.size(), other.get_stream());
-    }
+    : buffer_base<T, allocator>(alloc, other.get_stream(), other.size())
+  {
+    if (other.size() > 0) { raft::copy(data_, other.data(), other.size(), other.get_stream()); }
   }
 
   buffer(std::shared_ptr<allocator> alloc, cudaStream_t stream, size_type n = 0)
-    : buffer_base<T, allocator>(alloc, stream, n) {}
+    : buffer_base<T, allocator>(alloc, stream, n)
+  {
+  }
 
   reference operator[](size_type pos) { return data_[pos]; }
 
diff --git a/cpp/include/raft/pow2_utils.cuh b/cpp/include/raft/pow2_utils.cuh
index de5fc46452..56a3192f9f 100644
--- a/cpp/include/raft/pow2_utils.cuh
+++ b/cpp/include/raft/pow2_utils.cuh
@@ -29,14 +29,13 @@ template <auto Value_>
 struct Pow2 {
   typedef decltype(Value_) Type;
   static constexpr Type Value = Value_;
-  static constexpr Type Log2 = log2(Value);
-  static constexpr Type Mask = Value - 1;
+  static constexpr Type Log2  = log2(Value);
+  static constexpr Type Mask  = Value - 1;
 
   static_assert(std::is_integral<Type>::value, "Value must be integral.");
   static_assert(Value && !(Value & Mask), "Value must be power of two.");
 
-#define Pow2_IsRepresentableAs(I) \
-  (std::is_integral<I>::value && Type(I(Value)) == Value)
+#define Pow2_IsRepresentableAs(I) (std::is_integral<I>::value && Type(I(Value)) == Value)
 
   /**
    * Integer division by Value truncated toward zero
@@ -45,10 +44,9 @@ struct Pow2 {
    *  Invariant: `x = Value * quot(x) + rem(x)`
    */
   template <typename I>
-  static constexpr HDI std::enable_if_t<Pow2_IsRepresentableAs(I), I> quot(
-    I x) noexcept {
-    if constexpr (std::is_signed<I>::value)
-      return (x >> I(Log2)) + (x < 0 && (x & I(Mask)));
+  static constexpr HDI std::enable_if_t<Pow2_IsRepresentableAs(I), I> quot(I x) noexcept
+  {
+    if constexpr (std::is_signed<I>::value) return (x >> I(Log2)) + (x < 0 && (x & I(Mask)));
     if constexpr (std::is_unsigned<I>::value) return x >> I(Log2);
   }
 
@@ -59,10 +57,9 @@ struct Pow2 {
    *  Invariant: `x = Value * quot(x) + rem(x)`.
    */
   template <typename I>
-  static constexpr HDI std::enable_if_t<Pow2_IsRepresentableAs(I), I> rem(
-    I x) noexcept {
-    if constexpr (std::is_signed<I>::value)
-      return x < 0 ? -((-x) & I(Mask)) : (x & I(Mask));
+  static constexpr HDI std::enable_if_t<Pow2_IsRepresentableAs(I), I> rem(I x) noexcept
+  {
+    if constexpr (std::is_signed<I>::value) return x < 0 ? -((-x) & I(Mask)) : (x & I(Mask));
     if constexpr (std::is_unsigned<I>::value) return x & I(Mask);
   }
 
@@ -77,8 +74,8 @@ struct Pow2 {
    * compared to normal C++ operators `/` and `%`.
    */
   template <typename I>
-  static constexpr HDI std::enable_if_t<Pow2_IsRepresentableAs(I), I> div(
-    I x) noexcept {
+  static constexpr HDI std::enable_if_t<Pow2_IsRepresentableAs(I), I> div(I x) noexcept
+  {
     return x >> I(Log2);
   }
 
@@ -94,8 +91,8 @@ struct Pow2 {
    * compared to normal C++ operators `/` and `%`.
    */
   template <typename I>
-  static constexpr HDI std::enable_if_t<Pow2_IsRepresentableAs(I), I> mod(
-    I x) noexcept {
+  static constexpr HDI std::enable_if_t<Pow2_IsRepresentableAs(I), I> mod(I x) noexcept
+  {
     return x & I(Mask);
   }
 
@@ -108,16 +105,17 @@ struct Pow2 {
    * NB: for pointers, the alignment is checked in bytes, not in elements.
    */
   template <typename PtrT>
-  static constexpr HDI bool isAligned(PtrT p) noexcept {
+  static constexpr HDI bool isAligned(PtrT p) noexcept
+  {
     Pow2_CHECK_TYPE(PtrT);
     if constexpr (Pow2_IsRepresentableAs(PtrT)) return mod(p) == 0;
-    if constexpr (!Pow2_IsRepresentableAs(PtrT))
-      return mod(reinterpret_cast<Type>(p)) == 0;
+    if constexpr (!Pow2_IsRepresentableAs(PtrT)) return mod(reinterpret_cast<Type>(p)) == 0;
   }
 
   /** Tell whether two pointers have the same address modulo Value. */
   template <typename PtrT, typename PtrS>
-  static constexpr HDI bool areSameAlignOffsets(PtrT a, PtrS b) noexcept {
+  static constexpr HDI bool areSameAlignOffsets(PtrT a, PtrS b) noexcept
+  {
     Pow2_CHECK_TYPE(PtrT);
     Pow2_CHECK_TYPE(PtrS);
     Type x, y;
@@ -134,10 +132,10 @@ struct Pow2 {
 
   /** Get this or next Value-aligned address (in bytes) or integral. */
   template <typename PtrT>
-  static constexpr HDI PtrT roundUp(PtrT p) noexcept {
+  static constexpr HDI PtrT roundUp(PtrT p) noexcept
+  {
     Pow2_CHECK_TYPE(PtrT);
-    if constexpr (Pow2_IsRepresentableAs(PtrT))
-      return p + PtrT(Mask) - mod(p + PtrT(Mask));
+    if constexpr (Pow2_IsRepresentableAs(PtrT)) return p + PtrT(Mask) - mod(p + PtrT(Mask));
     if constexpr (!Pow2_IsRepresentableAs(PtrT)) {
       auto x = reinterpret_cast<Type>(p);
       return reinterpret_cast<PtrT>(x + Mask - mod(x + Mask));
@@ -146,7 +144,8 @@ struct Pow2 {
 
   /** Get this or previous Value-aligned address (in bytes) or integral. */
   template <typename PtrT>
-  static constexpr HDI PtrT roundDown(PtrT p) noexcept {
+  static constexpr HDI PtrT roundDown(PtrT p) noexcept
+  {
     Pow2_CHECK_TYPE(PtrT);
     if constexpr (Pow2_IsRepresentableAs(PtrT)) return p - mod(p);
     if constexpr (!Pow2_IsRepresentableAs(PtrT)) {
diff --git a/cpp/include/raft/random/detail/rng_impl.cuh b/cpp/include/raft/random/detail/rng_impl.cuh
index 654c46bbf9..0f3b58975e 100644
--- a/cpp/include/raft/random/detail/rng_impl.cuh
+++ b/cpp/include/raft/random/detail/rng_impl.cuh
@@ -44,19 +44,20 @@ enum GeneratorType {
 };
 
 template <typename Type>
-DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1,
-                             Type sigma2, Type mu2) {
-  constexpr Type twoPi = Type(2.0) * Type(3.141592654);
+DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1, Type sigma2, Type mu2)
+{
+  constexpr Type twoPi  = Type(2.0) * Type(3.141592654);
   constexpr Type minus2 = -Type(2.0);
-  Type R = raft::mySqrt(minus2 * raft::myLog(val1));
-  Type theta = twoPi * val2;
+  Type R                = raft::mySqrt(minus2 * raft::myLog(val1));
+  Type theta            = twoPi * val2;
   Type s, c;
   raft::mySinCos(theta, s, c);
   val1 = R * c * sigma1 + mu1;
   val2 = R * s * sigma2 + mu2;
 }
 template <typename Type>
-DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1) {
+DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1)
+{
   box_muller_transform<Type>(val1, val2, sigma1, mu1, sigma1, mu1);
 }
 
@@ -67,10 +68,13 @@ DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1) {
 template <typename GenType>
 struct Generator {
   DI Generator(uint64_t seed, uint64_t subsequence, uint64_t offset)
-    : gen(seed, subsequence, offset) {}
+    : gen(seed, subsequence, offset)
+  {
+  }
 
   template <typename Type>
-  DI void next(Type &ret) {
+  DI void next(Type& ret)
+  {
     gen.next(ret);
   }
 
@@ -79,10 +83,9 @@ struct Generator {
   GenType gen;
 };
 
-template <typename OutType, typename MathType, typename GenType,
-          typename LenType, typename Lambda>
-__global__ void randKernel(uint64_t seed, uint64_t offset, OutType *ptr,
-                           LenType len, Lambda randOp) {
+template <typename OutType, typename MathType, typename GenType, typename LenType, typename Lambda>
+__global__ void randKernel(uint64_t seed, uint64_t offset, OutType* ptr, LenType len, Lambda randOp)
+{
   LenType tid = (blockIdx.x * blockDim.x) + threadIdx.x;
   Generator<GenType> gen(seed, (uint64_t)tid, offset);
   const LenType stride = gridDim.x * blockDim.x;
@@ -94,10 +97,10 @@ __global__ void randKernel(uint64_t seed, uint64_t offset, OutType *ptr,
 }
 
 // used for Box-Muller type transformations
-template <typename OutType, typename MathType, typename GenType,
-          typename LenType, typename Lambda2>
-__global__ void rand2Kernel(uint64_t seed, uint64_t offset, OutType *ptr,
-                            LenType len, Lambda2 rand2Op) {
+template <typename OutType, typename MathType, typename GenType, typename LenType, typename Lambda2>
+__global__ void rand2Kernel(
+  uint64_t seed, uint64_t offset, OutType* ptr, LenType len, Lambda2 rand2Op)
+{
   LenType tid = (blockIdx.x * blockDim.x) + threadIdx.x;
   Generator<GenType> gen(seed, (uint64_t)tid, offset);
   const LenType stride = gridDim.x * blockDim.x;
@@ -113,8 +116,9 @@ __global__ void rand2Kernel(uint64_t seed, uint64_t offset, OutType *ptr,
 }
 
 template <typename Type>
-__global__ void constFillKernel(Type *ptr, int len, Type val) {
-  unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x;
+__global__ void constFillKernel(Type* ptr, int len, Type val)
+{
+  unsigned tid          = (blockIdx.x * blockDim.x) + threadIdx.x;
   const unsigned stride = gridDim.x * blockDim.x;
   for (unsigned idx = tid; idx < len; idx += stride) {
     ptr[idx] = val;
@@ -130,7 +134,8 @@ struct PhiloxGenerator {
    * @param subsequence as found in curand docs
    * @param offset as found in curand docs
    */
-  DI PhiloxGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) {
+  DI PhiloxGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset)
+  {
     curand_init(seed, subsequence, offset, &state);
   }
 
@@ -138,21 +143,24 @@ struct PhiloxGenerator {
    * @defgroup NextRand Generate the next random number
    * @{
    */
-  DI void next(float &ret) { ret = curand_uniform(&(this->state)); }
-  DI void next(double &ret) { ret = curand_uniform_double(&(this->state)); }
-  DI void next(uint32_t &ret) { ret = curand(&(this->state)); }
-  DI void next(uint64_t &ret) {
+  DI void next(float& ret) { ret = curand_uniform(&(this->state)); }
+  DI void next(double& ret) { ret = curand_uniform_double(&(this->state)); }
+  DI void next(uint32_t& ret) { ret = curand(&(this->state)); }
+  DI void next(uint64_t& ret)
+  {
     uint32_t a, b;
     next(a);
     next(b);
     ret = (uint64_t)a | ((uint64_t)b << 32);
   }
-  DI void next(int32_t &ret) {
+  DI void next(int32_t& ret)
+  {
     uint32_t val;
     next(val);
     ret = int32_t(val & 0x7fffffff);
   }
-  DI void next(int64_t &ret) {
+  DI void next(int64_t& ret)
+  {
     uint64_t val;
     next(val);
     ret = int64_t(val & 0x7fffffffffffffff);
@@ -173,8 +181,9 @@ struct TapsGenerator {
    * @param subsequence unused
    * @param offset unused
    */
-  DI TapsGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset) {
-    uint64_t delta = (blockIdx.x * blockDim.x) + threadIdx.x;
+  DI TapsGenerator(uint64_t seed, uint64_t subsequence, uint64_t offset)
+  {
+    uint64_t delta  = (blockIdx.x * blockDim.x) + threadIdx.x;
     uint64_t stride = blockDim.x * gridDim.x;
     delta += ((blockIdx.y * blockDim.y) + threadIdx.y) * stride;
     stride *= blockDim.y * gridDim.y;
@@ -187,31 +196,36 @@ struct TapsGenerator {
    * @{
    */
   template <typename Type>
-  DI void next(Type &ret) {
+  DI void next(Type& ret)
+  {
     constexpr double ULL_LARGE = 1.8446744073709551614e19;
     uint64_t val;
     next(val);
     ret = static_cast<Type>(val);
     ret /= static_cast<Type>(ULL_LARGE);
   }
-  DI void next(uint64_t &ret) {
+  DI void next(uint64_t& ret)
+  {
     constexpr uint64_t TAPS = 0x8000100040002000ULL;
-    constexpr int ROUNDS = 128;
+    constexpr int ROUNDS    = 128;
     for (int i = 0; i < ROUNDS; i++)
       state = (state >> 1) ^ (-(state & 1ULL) & TAPS);
     ret = state;
   }
-  DI void next(uint32_t &ret) {
+  DI void next(uint32_t& ret)
+  {
     uint64_t val;
     next(val);
     ret = (uint32_t)val;
   }
-  DI void next(int32_t &ret) {
+  DI void next(int32_t& ret)
+  {
     uint32_t val;
     next(val);
     ret = int32_t(val & 0x7fffffff);
   }
-  DI void next(int64_t &ret) {
+  DI void next(int64_t& ret)
+  {
     uint64_t val;
     next(val);
     ret = int64_t(val & 0x7fffffffffffffff);
@@ -232,46 +246,49 @@ struct Kiss99Generator {
    * @param subsequence unused
    * @param offset unused
    */
-  DI Kiss99Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) {
-    initKiss99(seed);
-  }
+  DI Kiss99Generator(uint64_t seed, uint64_t subsequence, uint64_t offset) { initKiss99(seed); }
 
   /**
    * @defgroup NextRand Generate the next random number
    * @{
    */
   template <typename Type>
-  DI void next(Type &ret) {
+  DI void next(Type& ret)
+  {
     constexpr double U_LARGE = 4.294967295e9;
     uint32_t val;
     next(val);
     ret = static_cast<Type>(val);
     ret /= static_cast<Type>(U_LARGE);
   }
-  DI void next(uint32_t &ret) {
+  DI void next(uint32_t& ret)
+  {
     uint32_t MWC;
-    z = 36969 * (z & 65535) + (z >> 16);
-    w = 18000 * (w & 65535) + (w >> 16);
+    z   = 36969 * (z & 65535) + (z >> 16);
+    w   = 18000 * (w & 65535) + (w >> 16);
     MWC = ((z << 16) + w);
     jsr ^= (jsr << 17);
     jsr ^= (jsr >> 13);
     jsr ^= (jsr << 5);
     jcong = 69069 * jcong + 1234567;
-    MWC = ((MWC ^ jcong) + jsr);
-    ret = MWC;
+    MWC   = ((MWC ^ jcong) + jsr);
+    ret   = MWC;
   }
-  DI void next(uint64_t &ret) {
+  DI void next(uint64_t& ret)
+  {
     uint32_t a, b;
     next(a);
     next(b);
     ret = (uint64_t)a | ((uint64_t)b << 32);
   }
-  DI void next(int32_t &ret) {
+  DI void next(int32_t& ret)
+  {
     uint32_t val;
     next(val);
     ret = int32_t(val & 0x7fffffff);
   }
-  DI void next(int64_t &ret) {
+  DI void next(int64_t& ret)
+  {
     uint64_t val;
     next(val);
     ret = int64_t(val & 0x7fffffffffffffff);
@@ -290,7 +307,8 @@ struct Kiss99Generator {
 
   // This function multiplies 128-bit hash by 128-bit FNV prime and returns lower
   // 128 bits. It uses 32-bit wide multiply only.
-  DI void mulByFnv1a128Prime(uint32_t *h) {
+  DI void mulByFnv1a128Prime(uint32_t* h)
+  {
     typedef union {
       uint32_t u32[2];
       uint64_t u64[1];
@@ -314,12 +332,12 @@ struct Kiss99Generator {
     // h_n[2] = HI(h[1]*p[0]) + LO(h[2]*p[0]) + LO(h[0]*p[2]);
     // h_n[3] = HI(h[2]*p[0]) + HI(h[0]*p[2]) + LO(h[3]*p[0]) + LO(h[1]*p[2]);
     uint32_t carry = 0;
-    h[0] = h0p0.u32[0];
+    h[0]           = h0p0.u32[0];
 
-    h[1] = h0p0.u32[1] + h1p0.u32[0];
+    h[1]  = h0p0.u32[1] + h1p0.u32[0];
     carry = h[1] < h0p0.u32[1] ? 1 : 0;
 
-    h[2] = h1p0.u32[1] + carry;
+    h[2]  = h1p0.u32[1] + carry;
     carry = h[2] < h1p0.u32[1] ? 1 : 0;
     h[2] += h2p0.u32[0];
     carry = h[2] < h2p0.u32[0] ? carry + 1 : carry;
@@ -330,7 +348,8 @@ struct Kiss99Generator {
     return;
   }
 
-  DI void fnv1a128(uint32_t *hash, uint32_t txt) {
+  DI void fnv1a128(uint32_t* hash, uint32_t txt)
+  {
     hash[0] ^= (txt >> 0) & 0xFF;
     mulByFnv1a128Prime(hash);
     hash[0] ^= (txt >> 8) & 0xFF;
@@ -341,7 +360,8 @@ struct Kiss99Generator {
     mulByFnv1a128Prime(hash);
   }
 
-  DI void initKiss99(uint64_t seed) {
+  DI void initKiss99(uint64_t seed)
+  {
     // Initialize hash to 128-bit FNV1a basis
     uint32_t hash[4] = {1653982605UL, 1656234357UL, 129696066UL, 1818371886UL};
 
@@ -356,9 +376,9 @@ struct Kiss99Generator {
     fnv1a128(hash, uint32_t(seed >> 32));
 
     // Initialize KISS99 state with hash
-    z = hash[0];
-    w = hash[1];
-    jsr = hash[2];
+    z     = hash[0];
+    w     = hash[1];
+    jsr   = hash[2];
     jcong = hash[3];
   }
 };
@@ -372,17 +392,20 @@ class RngImpl {
       // simple heuristic to make sure all SMs will be occupied properly
       // and also not too many initialization calls will be made by each thread
       nBlocks(4 * getMultiProcessorCount()),
-      gen() {
+      gen()
+  {
     seed(_s);
   }
 
-  void seed(uint64_t _s) {
+  void seed(uint64_t _s)
+  {
     gen.seed(_s);
     offset = 0;
   }
 
   template <typename IdxT>
-  void affine_transform_params(IdxT n, IdxT &a, IdxT &b) {
+  void affine_transform_params(IdxT n, IdxT& a, IdxT& b)
+  {
     // always keep 'a' to be coprime to 'n'
     a = gen() % n;
     while (gcd(a, n) != 1) {
@@ -394,128 +417,150 @@ class RngImpl {
   }
 
   template <typename Type, typename LenType = int>
-  void uniform(Type *ptr, LenType len, Type start, Type end,
-               cudaStream_t stream) {
+  void uniform(Type* ptr, LenType len, Type start, Type end, cudaStream_t stream)
+  {
     static_assert(std::is_floating_point<Type>::value,
                   "Type for 'uniform' can only be floating point!");
     custom_distribution(
-      ptr, len,
-      [=] __device__(Type val, LenType idx) {
-        return (val * (end - start)) + start;
-      },
+      ptr,
+      len,
+      [=] __device__(Type val, LenType idx) { return (val * (end - start)) + start; },
       stream);
   }
   template <typename IntType, typename LenType = int>
-  void uniformInt(IntType *ptr, LenType len, IntType start, IntType end,
-                  cudaStream_t stream) {
-    static_assert(std::is_integral<IntType>::value,
-                  "Type for 'uniformInt' can only be integer!");
+  void uniformInt(IntType* ptr, LenType len, IntType start, IntType end, cudaStream_t stream)
+  {
+    static_assert(std::is_integral<IntType>::value, "Type for 'uniformInt' can only be integer!");
     custom_distribution(
-      ptr, len,
-      [=] __device__(IntType val, LenType idx) {
-        return (val % (end - start)) + start;
-      },
+      ptr,
+      len,
+      [=] __device__(IntType val, LenType idx) { return (val % (end - start)) + start; },
       stream);
   }
 
   template <typename Type, typename LenType = int>
-  void normal(Type *ptr, LenType len, Type mu, Type sigma,
-              cudaStream_t stream) {
+  void normal(Type* ptr, LenType len, Type mu, Type sigma, cudaStream_t stream)
+  {
     static_assert(std::is_floating_point<Type>::value,
                   "Type for 'normal' can only be floating point!");
     rand2Impl(
-      offset, ptr, len,
+      offset,
+      ptr,
+      len,
       [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) {
         box_muller_transform<Type>(val1, val2, sigma, mu);
       },
-      NumThreads, nBlocks, type, stream);
+      NumThreads,
+      nBlocks,
+      type,
+      stream);
   }
   template <typename IntType, typename LenType = int>
-  void normalInt(IntType *ptr, LenType len, IntType mu, IntType sigma,
-                 cudaStream_t stream) {
-    static_assert(std::is_integral<IntType>::value,
-                  "Type for 'normalInt' can only be integer!");
+  void normalInt(IntType* ptr, LenType len, IntType mu, IntType sigma, cudaStream_t stream)
+  {
+    static_assert(std::is_integral<IntType>::value, "Type for 'normalInt' can only be integer!");
     rand2Impl<IntType, double>(
-      offset, ptr, len,
-      [=] __device__(double &val1, double &val2, LenType idx1, LenType idx2) {
+      offset,
+      ptr,
+      len,
+      [=] __device__(double& val1, double& val2, LenType idx1, LenType idx2) {
         box_muller_transform<double>(val1, val2, sigma, mu);
       },
-      NumThreads, nBlocks, type, stream);
+      NumThreads,
+      nBlocks,
+      type,
+      stream);
   }
 
   template <typename Type, typename LenType = int>
-  void normalTable(Type *ptr, LenType n_rows, LenType n_cols, const Type *mu,
-                   const Type *sigma_vec, Type sigma, cudaStream_t stream) {
+  void normalTable(Type* ptr,
+                   LenType n_rows,
+                   LenType n_cols,
+                   const Type* mu,
+                   const Type* sigma_vec,
+                   Type sigma,
+                   cudaStream_t stream)
+  {
     rand2Impl(
-      offset, ptr, n_rows * n_cols,
+      offset,
+      ptr,
+      n_rows * n_cols,
       [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) {
         // yikes! use fast-int-div
-        auto col1 = idx1 % n_cols;
-        auto col2 = idx2 % n_cols;
+        auto col1  = idx1 % n_cols;
+        auto col2  = idx2 % n_cols;
         auto mean1 = mu[col1];
         auto mean2 = mu[col2];
-        auto sig1 = sigma_vec == nullptr ? sigma : sigma_vec[col1];
-        auto sig2 = sigma_vec == nullptr ? sigma : sigma_vec[col2];
+        auto sig1  = sigma_vec == nullptr ? sigma : sigma_vec[col1];
+        auto sig2  = sigma_vec == nullptr ? sigma : sigma_vec[col2];
         box_muller_transform<Type>(val1, val2, sig1, mean1, sig2, mean2);
       },
-      NumThreads, nBlocks, type, stream);
+      NumThreads,
+      nBlocks,
+      type,
+      stream);
   }
 
   template <typename Type, typename LenType = int>
-  void fill(Type *ptr, LenType len, Type val, cudaStream_t stream) {
-    detail::constFillKernel<Type>
-      <<<nBlocks, NumThreads, 0, stream>>>(ptr, len, val);
+  void fill(Type* ptr, LenType len, Type val, cudaStream_t stream)
+  {
+    detail::constFillKernel<Type><<<nBlocks, NumThreads, 0, stream>>>(ptr, len, val);
     CUDA_CHECK(cudaPeekAtLastError());
   }
 
   template <typename Type, typename OutType = bool, typename LenType = int>
-  void bernoulli(OutType *ptr, LenType len, Type prob, cudaStream_t stream) {
+  void bernoulli(OutType* ptr, LenType len, Type prob, cudaStream_t stream)
+  {
     custom_distribution<OutType, Type>(
-      ptr, len, [=] __device__(Type val, LenType idx) { return val > prob; },
-      stream);
+      ptr, len, [=] __device__(Type val, LenType idx) { return val > prob; }, stream);
   }
 
   template <typename Type, typename LenType = int>
-  void scaled_bernoulli(Type *ptr, LenType len, Type prob, Type scale,
-                        cudaStream_t stream) {
+  void scaled_bernoulli(Type* ptr, LenType len, Type prob, Type scale, cudaStream_t stream)
+  {
     static_assert(std::is_floating_point<Type>::value,
                   "Type for 'scaled_bernoulli' can only be floating point!");
     custom_distribution(
-      ptr, len,
-      [=] __device__(Type val, LenType idx) {
-        return val > prob ? -scale : scale;
-      },
+      ptr,
+      len,
+      [=] __device__(Type val, LenType idx) { return val > prob ? -scale : scale; },
       stream);
   }
 
   template <typename Type, typename LenType = int>
-  void gumbel(Type *ptr, LenType len, Type mu, Type beta, cudaStream_t stream) {
+  void gumbel(Type* ptr, LenType len, Type mu, Type beta, cudaStream_t stream)
+  {
     custom_distribution(
-      ptr, len,
-      [=] __device__(Type val, LenType idx) {
-        return mu - beta * raft::myLog(-raft::myLog(val));
-      },
+      ptr,
+      len,
+      [=] __device__(Type val, LenType idx) { return mu - beta * raft::myLog(-raft::myLog(val)); },
       stream);
   }
 
   template <typename Type, typename LenType = int>
-  void lognormal(Type *ptr, LenType len, Type mu, Type sigma,
-                 cudaStream_t stream) {
+  void lognormal(Type* ptr, LenType len, Type mu, Type sigma, cudaStream_t stream)
+  {
     rand2Impl(
-      offset, ptr, len,
+      offset,
+      ptr,
+      len,
       [=] __device__(Type & val1, Type & val2, LenType idx1, LenType idx2) {
         box_muller_transform<Type>(val1, val2, sigma, mu);
         val1 = raft::myExp(val1);
         val2 = raft::myExp(val2);
       },
-      NumThreads, nBlocks, type, stream);
+      NumThreads,
+      nBlocks,
+      type,
+      stream);
   }
 
   template <typename Type, typename LenType = int>
-  void logistic(Type *ptr, LenType len, Type mu, Type scale,
-                cudaStream_t stream) {
+  void logistic(Type* ptr, LenType len, Type mu, Type scale, cudaStream_t stream)
+  {
     custom_distribution(
-      ptr, len,
+      ptr,
+      len,
       [=] __device__(Type val, LenType idx) {
         constexpr Type one = (Type)1.0;
         return mu - scale * raft::myLog(one / val - one);
@@ -524,9 +569,11 @@ class RngImpl {
   }
 
   template <typename Type, typename LenType = int>
-  void exponential(Type *ptr, LenType len, Type lambda, cudaStream_t stream) {
+  void exponential(Type* ptr, LenType len, Type lambda, cudaStream_t stream)
+  {
     custom_distribution(
-      ptr, len,
+      ptr,
+      len,
       [=] __device__(Type val, LenType idx) {
         constexpr Type one = (Type)1.0;
         return -raft::myLog(one - val) / lambda;
@@ -535,9 +582,11 @@ class RngImpl {
   }
 
   template <typename Type, typename LenType = int>
-  void rayleigh(Type *ptr, LenType len, Type sigma, cudaStream_t stream) {
+  void rayleigh(Type* ptr, LenType len, Type sigma, cudaStream_t stream)
+  {
     custom_distribution(
-      ptr, len,
+      ptr,
+      len,
       [=] __device__(Type val, LenType idx) {
         constexpr Type one = (Type)1.0;
         constexpr Type two = (Type)2.0;
@@ -547,13 +596,14 @@ class RngImpl {
   }
 
   template <typename Type, typename LenType = int>
-  void laplace(Type *ptr, LenType len, Type mu, Type scale,
-               cudaStream_t stream) {
+  void laplace(Type* ptr, LenType len, Type mu, Type scale, cudaStream_t stream)
+  {
     custom_distribution(
-      ptr, len,
+      ptr,
+      len,
       [=] __device__(Type val, LenType idx) {
-        constexpr Type one = (Type)1.0;
-        constexpr Type two = (Type)2.0;
+        constexpr Type one     = (Type)1.0;
+        constexpr Type two     = (Type)2.0;
         constexpr Type oneHalf = (Type)0.5;
         Type out;
         if (val <= oneHalf) {
@@ -567,55 +617,55 @@ class RngImpl {
   }
 
   template <typename DataT, typename WeightsT, typename IdxT = int>
-  void sampleWithoutReplacement(const raft::handle_t &handle, DataT *out,
-                                IdxT *outIdx, const DataT *in,
-                                const WeightsT *wts, IdxT sampledLen, IdxT len,
-                                cudaStream_t stream) {
-    ASSERT(sampledLen <= len,
-           "sampleWithoutReplacement: 'sampledLen' cant be more than 'len'.");
+  void sampleWithoutReplacement(const raft::handle_t& handle,
+                                DataT* out,
+                                IdxT* outIdx,
+                                const DataT* in,
+                                const WeightsT* wts,
+                                IdxT sampledLen,
+                                IdxT len,
+                                cudaStream_t stream)
+  {
+    ASSERT(sampledLen <= len, "sampleWithoutReplacement: 'sampledLen' cant be more than 'len'.");
 
     rmm::device_uvector<WeightsT> expWts(len, stream);
     rmm::device_uvector<WeightsT> sortedWts(len, stream);
     rmm::device_uvector<IdxT> inIdx(len, stream);
     rmm::device_uvector<IdxT> outIdxBuff(len, stream);
-    auto *inIdxPtr = inIdx.data();
+    auto* inIdxPtr = inIdx.data();
     // generate modified weights
     custom_distribution(
-      expWts.data(), len,
+      expWts.data(),
+      len,
       [wts, inIdxPtr] __device__(WeightsT val, IdxT idx) {
-        inIdxPtr[idx] = idx;
+        inIdxPtr[idx]          = idx;
         constexpr WeightsT one = (WeightsT)1.0;
-        auto exp = -raft::myLog(one - val);
-        if (wts != nullptr) {
-          return exp / wts[idx];
-        }
+        auto exp               = -raft::myLog(one - val);
+        if (wts != nullptr) { return exp / wts[idx]; }
         return exp;
       },
       stream);
     ///@todo: use a more efficient partitioning scheme instead of full sort
     // sort the array and pick the top sampledLen items
-    IdxT *outIdxPtr = outIdxBuff.data();
+    IdxT* outIdxPtr = outIdxBuff.data();
     rmm::device_uvector<char> workspace(0, stream);
-    sortPairs(workspace, expWts.data(), sortedWts.data(), inIdxPtr, outIdxPtr,
-              (int)len, stream);
+    sortPairs(workspace, expWts.data(), sortedWts.data(), inIdxPtr, outIdxPtr, (int)len, stream);
     if (outIdx != nullptr) {
-      CUDA_CHECK(cudaMemcpyAsync(outIdx, outIdxPtr, sizeof(IdxT) * sampledLen,
-                                 cudaMemcpyDeviceToDevice, stream));
+      CUDA_CHECK(cudaMemcpyAsync(
+        outIdx, outIdxPtr, sizeof(IdxT) * sampledLen, cudaMemcpyDeviceToDevice, stream));
     }
     raft::scatter<DataT, IdxT>(out, in, outIdxPtr, sampledLen, stream);
   }
 
-  template <typename OutType, typename MathType = OutType,
-            typename LenType = int, typename Lambda>
-  void custom_distribution(OutType *ptr, LenType len, Lambda randOp,
-                           cudaStream_t stream) {
+  template <typename OutType, typename MathType = OutType, typename LenType = int, typename Lambda>
+  void custom_distribution(OutType* ptr, LenType len, Lambda randOp, cudaStream_t stream)
+  {
     randImpl<OutType, MathType, LenType, Lambda>(
       offset, ptr, len, randOp, NumThreads, nBlocks, type, stream);
   }
-  template <typename OutType, typename MathType = OutType,
-            typename LenType = int, typename Lambda>
-  void custom_distribution2(OutType *ptr, LenType len, Lambda randOp,
-                            cudaStream_t stream) {
+  template <typename OutType, typename MathType = OutType, typename LenType = int, typename Lambda>
+  void custom_distribution2(OutType* ptr, LenType len, Lambda randOp, cudaStream_t stream)
+  {
     rand2Impl<OutType, MathType, LenType, Lambda>(
       offset, ptr, len, randOp, NumThreads, nBlocks, type, stream);
   }
@@ -625,10 +675,10 @@ class RngImpl {
   /** generator type */
   GeneratorType type;
   /**
-  * offset is also used to initialize curand state.
-  * Limits period of Philox RNG from (4 * 2^128) to (Blocks * Threads * 2^64),
-  * but is still a large period.
-  */
+   * offset is also used to initialize curand state.
+   * Limits period of Philox RNG from (4 * 2^128) to (Blocks * Threads * 2^64),
+   * but is still a large period.
+   */
   uint64_t offset;
   /** number of blocks to launch */
   int nBlocks;
@@ -638,12 +688,10 @@ class RngImpl {
   static const int NumThreads = 256;
 
   template <bool IsNormal, typename Type, typename LenType>
-  uint64_t _setupSeeds(uint64_t &seed, uint64_t &offset, LenType len,
-                       int nThreads, int nBlocks) {
+  uint64_t _setupSeeds(uint64_t& seed, uint64_t& offset, LenType len, int nThreads, int nBlocks)
+  {
     LenType itemsPerThread = raft::ceildiv(len, LenType(nBlocks * nThreads));
-    if (IsNormal && itemsPerThread % 2 == 1) {
-      ++itemsPerThread;
-    }
+    if (IsNormal && itemsPerThread % 2 == 1) { ++itemsPerThread; }
     // curand uses 2 32b uint's to generate one double
     uint64_t factor = sizeof(Type) / sizeof(float);
     if (factor == 0) ++factor;
@@ -651,72 +699,72 @@ class RngImpl {
     // If not, then generate new seed and start from zero offset
     uint64_t newOffset = offset + LenType(itemsPerThread) * factor;
     if (newOffset < offset) {
-      offset = 0;
-      seed = gen();
+      offset    = 0;
+      seed      = gen();
       newOffset = itemsPerThread * factor;
     }
     return newOffset;
   }
 
-  template <typename OutType, typename MathType = OutType,
-            typename LenType = int, typename Lambda>
-  void randImpl(uint64_t &offset, OutType *ptr, LenType len, Lambda randOp,
-                int nThreads, int nBlocks, GeneratorType type,
-                cudaStream_t stream) {
+  template <typename OutType, typename MathType = OutType, typename LenType = int, typename Lambda>
+  void randImpl(uint64_t& offset,
+                OutType* ptr,
+                LenType len,
+                Lambda randOp,
+                int nThreads,
+                int nBlocks,
+                GeneratorType type,
+                cudaStream_t stream)
+  {
     if (len <= 0) return;
-    uint64_t seed = gen();
-    auto newOffset = _setupSeeds<false, MathType, LenType>(seed, offset, len,
-                                                           nThreads, nBlocks);
+    uint64_t seed  = gen();
+    auto newOffset = _setupSeeds<false, MathType, LenType>(seed, offset, len, nThreads, nBlocks);
     switch (type) {
       case GenPhilox:
-        detail::randKernel<OutType, MathType, detail::PhiloxGenerator, LenType,
-                           Lambda>
+        detail::randKernel<OutType, MathType, detail::PhiloxGenerator, LenType, Lambda>
           <<<nBlocks, nThreads, 0, stream>>>(seed, offset, ptr, len, randOp);
         break;
       case GenTaps:
-        detail::randKernel<OutType, MathType, detail::TapsGenerator, LenType,
-                           Lambda>
+        detail::randKernel<OutType, MathType, detail::TapsGenerator, LenType, Lambda>
           <<<nBlocks, nThreads, 0, stream>>>(seed, offset, ptr, len, randOp);
         break;
       case GenKiss99:
-        detail::randKernel<OutType, MathType, detail::Kiss99Generator, LenType,
-                           Lambda>
+        detail::randKernel<OutType, MathType, detail::Kiss99Generator, LenType, Lambda>
           <<<nBlocks, nThreads, 0, stream>>>(seed, offset, ptr, len, randOp);
         break;
-      default:
-        ASSERT(false, "randImpl: Incorrect generator type! %d", type);
+      default: ASSERT(false, "randImpl: Incorrect generator type! %d", type);
     };
     CUDA_CHECK(cudaGetLastError());
     offset = newOffset;
   }
 
-  template <typename OutType, typename MathType = OutType,
-            typename LenType = int, typename Lambda2>
-  void rand2Impl(uint64_t &offset, OutType *ptr, LenType len, Lambda2 rand2Op,
-                 int nThreads, int nBlocks, GeneratorType type,
-                 cudaStream_t stream) {
+  template <typename OutType, typename MathType = OutType, typename LenType = int, typename Lambda2>
+  void rand2Impl(uint64_t& offset,
+                 OutType* ptr,
+                 LenType len,
+                 Lambda2 rand2Op,
+                 int nThreads,
+                 int nBlocks,
+                 GeneratorType type,
+                 cudaStream_t stream)
+  {
     if (len <= 0) return;
-    auto seed = gen();
-    auto newOffset = _setupSeeds<true, MathType, LenType>(seed, offset, len,
-                                                          nThreads, nBlocks);
+    auto seed      = gen();
+    auto newOffset = _setupSeeds<true, MathType, LenType>(seed, offset, len, nThreads, nBlocks);
     switch (type) {
       case GenPhilox:
-        detail::rand2Kernel<OutType, MathType, detail::PhiloxGenerator, LenType,
-                            Lambda2>
+        detail::rand2Kernel<OutType, MathType, detail::PhiloxGenerator, LenType, Lambda2>
           <<<nBlocks, nThreads, 0, stream>>>(seed, offset, ptr, len, rand2Op);
         break;
       case GenTaps:
-        detail::rand2Kernel<OutType, MathType, detail::TapsGenerator, LenType,
-                            Lambda2>
+        detail::rand2Kernel<OutType, MathType, detail::TapsGenerator, LenType, Lambda2>
           <<<nBlocks, nThreads, 0, stream>>>(seed, offset, ptr, len, rand2Op);
         break;
       case GenKiss99:
-        detail::rand2Kernel<OutType, MathType, detail::Kiss99Generator, LenType,
-                            Lambda2>
+        detail::rand2Kernel<OutType, MathType, detail::Kiss99Generator, LenType, Lambda2>
           <<<nBlocks, nThreads, 0, stream>>>(seed, offset, ptr, len, rand2Op);
         break;
-      default:
-        ASSERT(false, "rand2Impl: Incorrect generator type! %d", type);
+      default: ASSERT(false, "rand2Impl: Incorrect generator type! %d", type);
     };
     CUDA_CHECK(cudaGetLastError());
     offset = newOffset;
diff --git a/cpp/include/raft/random/rng.hpp b/cpp/include/raft/random/rng.hpp
index b6b0911ab0..0cced7c626 100644
--- a/cpp/include/raft/random/rng.hpp
+++ b/cpp/include/raft/random/rng.hpp
@@ -51,12 +51,13 @@ using detail::Kiss99Generator;
  * @{
  */
 template <typename Type>
-DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1,
-                             Type sigma2, Type mu2) {
+DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1, Type sigma2, Type mu2)
+{
   detail::box_muller_transform(val1, val2, sigma1, mu1, sigma1, mu2);
 }
 template <typename Type>
-DI void box_muller_transform(Type &val1, Type &val2, Type sigma1, Type mu1) {
+DI void box_muller_transform(Type& val1, Type& val2, Type sigma1, Type mu1)
+{
   detail::box_muller_transform(val1, val2, sigma1, mu1);
 }
 /** @} */
@@ -92,7 +93,8 @@ class Rng : public detail::RngImpl {
    * @param[out] b intercept parameter
    */
   template <typename IdxT>
-  void affine_transform_params(IdxT n, IdxT &a, IdxT &b) {
+  void affine_transform_params(IdxT n, IdxT& a, IdxT& b)
+  {
     detail::RngImpl::affine_transform_params(n, a, b);
   }
 
@@ -108,13 +110,13 @@ class Rng : public detail::RngImpl {
    * @{
    */
   template <typename Type, typename LenType = int>
-  void uniform(Type *ptr, LenType len, Type start, Type end,
-               cudaStream_t stream) {
+  void uniform(Type* ptr, LenType len, Type start, Type end, cudaStream_t stream)
+  {
     detail::RngImpl::uniform(ptr, len, start, end, stream);
   }
   template <typename IntType, typename LenType = int>
-  void uniformInt(IntType *ptr, LenType len, IntType start, IntType end,
-                  cudaStream_t stream) {
+  void uniformInt(IntType* ptr, LenType len, IntType start, IntType end, cudaStream_t stream)
+  {
     detail::RngImpl::uniformInt(ptr, len, start, end, stream);
   }
   /** @} */
@@ -131,13 +133,13 @@ class Rng : public detail::RngImpl {
    * @{
    */
   template <typename Type, typename LenType = int>
-  void normal(Type *ptr, LenType len, Type mu, Type sigma,
-              cudaStream_t stream) {
+  void normal(Type* ptr, LenType len, Type mu, Type sigma, cudaStream_t stream)
+  {
     detail::RngImpl::normal(ptr, len, mu, sigma, stream);
   }
   template <typename IntType, typename LenType = int>
-  void normalInt(IntType *ptr, LenType len, IntType mu, IntType sigma,
-                 cudaStream_t stream) {
+  void normalInt(IntType* ptr, LenType len, IntType mu, IntType sigma, cudaStream_t stream)
+  {
     detail::RngImpl::normalInt(ptr, len, mu, sigma, stream);
   }
   /** @} */
@@ -163,10 +165,15 @@ class Rng : public detail::RngImpl {
    * @param stream stream where to launch the kernel
    */
   template <typename Type, typename LenType = int>
-  void normalTable(Type *ptr, LenType n_rows, LenType n_cols, const Type *mu,
-                   const Type *sigma_vec, Type sigma, cudaStream_t stream) {
-    detail::RngImpl::normalTable(ptr, n_rows, n_cols, mu, sigma_vec, sigma,
-                                 stream);
+  void normalTable(Type* ptr,
+                   LenType n_rows,
+                   LenType n_cols,
+                   const Type* mu,
+                   const Type* sigma_vec,
+                   Type sigma,
+                   cudaStream_t stream)
+  {
+    detail::RngImpl::normalTable(ptr, n_rows, n_cols, mu, sigma_vec, sigma, stream);
   }
 
   /**
@@ -179,7 +186,8 @@ class Rng : public detail::RngImpl {
    * @param stream stream where to launch the kernel
    */
   template <typename Type, typename LenType = int>
-  void fill(Type *ptr, LenType len, Type val, cudaStream_t stream) {
+  void fill(Type* ptr, LenType len, Type val, cudaStream_t stream)
+  {
     detail::RngImpl::fill(ptr, len, val, stream);
   }
 
@@ -196,7 +204,8 @@ class Rng : public detail::RngImpl {
    * @param[in]  stream stream where to launch the kernel
    */
   template <typename Type, typename OutType = bool, typename LenType = int>
-  void bernoulli(OutType *ptr, LenType len, Type prob, cudaStream_t stream) {
+  void bernoulli(OutType* ptr, LenType len, Type prob, cudaStream_t stream)
+  {
     detail::RngImpl::bernoulli(ptr, len, prob, stream);
   }
 
@@ -211,8 +220,8 @@ class Rng : public detail::RngImpl {
    * @param stream stream where to launch the kernel
    */
   template <typename Type, typename LenType = int>
-  void scaled_bernoulli(Type *ptr, LenType len, Type prob, Type scale,
-                        cudaStream_t stream) {
+  void scaled_bernoulli(Type* ptr, LenType len, Type prob, Type scale, cudaStream_t stream)
+  {
     detail::RngImpl::scaled_bernoulli(ptr, len, prob, scale, stream);
   }
 
@@ -228,7 +237,8 @@ class Rng : public detail::RngImpl {
    * @note https://en.wikipedia.org/wiki/Gumbel_distribution
    */
   template <typename Type, typename LenType = int>
-  void gumbel(Type *ptr, LenType len, Type mu, Type beta, cudaStream_t stream) {
+  void gumbel(Type* ptr, LenType len, Type mu, Type beta, cudaStream_t stream)
+  {
     detail::RngImpl::gumbel(ptr, len, mu, beta, stream);
   }
 
@@ -243,8 +253,8 @@ class Rng : public detail::RngImpl {
    * @param stream stream where to launch the kernel
    */
   template <typename Type, typename LenType = int>
-  void lognormal(Type *ptr, LenType len, Type mu, Type sigma,
-                 cudaStream_t stream) {
+  void lognormal(Type* ptr, LenType len, Type mu, Type sigma, cudaStream_t stream)
+  {
     detail::RngImpl::lognormal(ptr, len, mu, sigma, stream);
   }
 
@@ -259,8 +269,8 @@ class Rng : public detail::RngImpl {
    * @param stream stream where to launch the kernel
    */
   template <typename Type, typename LenType = int>
-  void logistic(Type *ptr, LenType len, Type mu, Type scale,
-                cudaStream_t stream) {
+  void logistic(Type* ptr, LenType len, Type mu, Type scale, cudaStream_t stream)
+  {
     detail::RngImpl::logistic(ptr, len, mu, scale, stream);
   }
 
@@ -274,7 +284,8 @@ class Rng : public detail::RngImpl {
    * @param stream stream where to launch the kernel
    */
   template <typename Type, typename LenType = int>
-  void exponential(Type *ptr, LenType len, Type lambda, cudaStream_t stream) {
+  void exponential(Type* ptr, LenType len, Type lambda, cudaStream_t stream)
+  {
     detail::RngImpl::exponential(ptr, len, lambda, stream);
   }
 
@@ -288,7 +299,8 @@ class Rng : public detail::RngImpl {
    * @param stream stream where to launch the kernel
    */
   template <typename Type, typename LenType = int>
-  void rayleigh(Type *ptr, LenType len, Type sigma, cudaStream_t stream) {
+  void rayleigh(Type* ptr, LenType len, Type sigma, cudaStream_t stream)
+  {
     detail::RngImpl::rayleigh(ptr, len, sigma, stream);
   }
 
@@ -303,8 +315,8 @@ class Rng : public detail::RngImpl {
    * @param stream stream where to launch the kernel
    */
   template <typename Type, typename LenType = int>
-  void laplace(Type *ptr, LenType len, Type mu, Type scale,
-               cudaStream_t stream) {
+  void laplace(Type* ptr, LenType len, Type mu, Type scale, cudaStream_t stream)
+  {
     detail::RngImpl::laplace(ptr, len, mu, scale, stream);
   }
 
@@ -334,12 +346,17 @@ class Rng : public detail::RngImpl {
    * @param stream cuda stream
    */
   template <typename DataT, typename WeightsT, typename IdxT = int>
-  void sampleWithoutReplacement(const raft::handle_t &handle, DataT *out,
-                                IdxT *outIdx, const DataT *in,
-                                const WeightsT *wts, IdxT sampledLen, IdxT len,
-                                cudaStream_t stream) {
-    detail::RngImpl::sampleWithoutReplacement(handle, out, outIdx, in, wts,
-                                              sampledLen, len, stream);
+  void sampleWithoutReplacement(const raft::handle_t& handle,
+                                DataT* out,
+                                IdxT* outIdx,
+                                const DataT* in,
+                                const WeightsT* wts,
+                                IdxT sampledLen,
+                                IdxT len,
+                                cudaStream_t stream)
+  {
+    detail::RngImpl::sampleWithoutReplacement(
+      handle, out, outIdx, in, wts, sampledLen, len, stream);
   }
 
   /**
@@ -357,16 +374,14 @@ class Rng : public detail::RngImpl {
    * @param[in]  stream cuda stream
    * @{
    */
-  template <typename OutType, typename MathType = OutType,
-            typename LenType = int, typename Lambda>
-  void custom_distribution(OutType *ptr, LenType len, Lambda randOp,
-                           cudaStream_t stream) {
+  template <typename OutType, typename MathType = OutType, typename LenType = int, typename Lambda>
+  void custom_distribution(OutType* ptr, LenType len, Lambda randOp, cudaStream_t stream)
+  {
     detail::RngImpl::custom_distribution(ptr, len, randOp, stream);
   }
-  template <typename OutType, typename MathType = OutType,
-            typename LenType = int, typename Lambda>
-  void custom_distribution2(OutType *ptr, LenType len, Lambda randOp,
-                            cudaStream_t stream) {
+  template <typename OutType, typename MathType = OutType, typename LenType = int, typename Lambda>
+  void custom_distribution2(OutType* ptr, LenType len, Lambda randOp, cudaStream_t stream)
+  {
     detail::RngImpl::custom_distribution2(ptr, len, randOp, stream);
   }
   /** @} */
diff --git a/cpp/include/raft/sparse/convert/coo.cuh b/cpp/include/raft/sparse/convert/coo.cuh
index e367550060..5d38bdf4a8 100644
--- a/cpp/include/raft/sparse/convert/coo.cuh
+++ b/cpp/include/raft/sparse/convert/coo.cuh
@@ -37,14 +37,18 @@ namespace sparse {
 namespace convert {
 
 template <typename value_idx = int, int TPB_X = 32>
-__global__ void csr_to_coo_kernel(const value_idx *row_ind, value_idx m,
-                                  value_idx *coo_rows, value_idx nnz) {
+__global__ void csr_to_coo_kernel(const value_idx* row_ind,
+                                  value_idx m,
+                                  value_idx* coo_rows,
+                                  value_idx nnz)
+{
   // row-based matrix 1 thread per row
   value_idx row = (blockIdx.x * TPB_X) + threadIdx.x;
   if (row < m) {
     value_idx start_idx = row_ind[row];
-    value_idx stop_idx = get_stop_idx(row, m, nnz, row_ind);
-    for (value_idx i = start_idx; i < stop_idx; i++) coo_rows[i] = row;
+    value_idx stop_idx  = get_stop_idx(row, m, nnz, row_ind);
+    for (value_idx i = start_idx; i < stop_idx; i++)
+      coo_rows[i] = row;
   }
 }
 
@@ -57,14 +61,14 @@ __global__ void csr_to_coo_kernel(const value_idx *row_ind, value_idx m,
  * @param stream: cuda stream to use
  */
 template <typename value_idx = int, int TPB_X = 32>
-void csr_to_coo(const value_idx *row_ind, value_idx m, value_idx *coo_rows,
-                value_idx nnz, cudaStream_t stream) {
+void csr_to_coo(
+  const value_idx* row_ind, value_idx m, value_idx* coo_rows, value_idx nnz, cudaStream_t stream)
+{
   // @TODO: Use cusparse for this.
   dim3 grid(raft::ceildiv(m, (value_idx)TPB_X), 1, 1);
   dim3 blk(TPB_X, 1, 1);
 
-  csr_to_coo_kernel<value_idx, TPB_X>
-    <<<grid, blk, 0, stream>>>(row_ind, m, coo_rows, nnz);
+  csr_to_coo_kernel<value_idx, TPB_X><<<grid, blk, 0, stream>>>(row_ind, m, coo_rows, nnz);
 
   CUDA_CHECK(cudaGetLastError());
 }
diff --git a/cpp/include/raft/sparse/convert/csr.cuh b/cpp/include/raft/sparse/convert/csr.cuh
index 79b18ebd0a..2569b5d90f 100644
--- a/cpp/include/raft/sparse/convert/csr.cuh
+++ b/cpp/include/raft/sparse/convert/csr.cuh
@@ -43,28 +43,32 @@ namespace sparse {
 namespace convert {
 
 template <typename value_t>
-void coo_to_csr(const raft::handle_t &handle, const int *srcRows,
-                const int *srcCols, const value_t *srcVals, int nnz, int m,
-                int *dst_offsets, int *dstCols, value_t *dstVals) {
-  auto stream = handle.get_stream();
+void coo_to_csr(const raft::handle_t& handle,
+                const int* srcRows,
+                const int* srcCols,
+                const value_t* srcVals,
+                int nnz,
+                int m,
+                int* dst_offsets,
+                int* dstCols,
+                value_t* dstVals)
+{
+  auto stream         = handle.get_stream();
   auto cusparseHandle = handle.get_cusparse_handle();
   rmm::device_uvector<int> dstRows(nnz, stream);
-  CUDA_CHECK(cudaMemcpyAsync(dstRows.data(), srcRows, sizeof(int) * nnz,
-                             cudaMemcpyDeviceToDevice, stream));
-  CUDA_CHECK(cudaMemcpyAsync(dstCols, srcCols, sizeof(int) * nnz,
-                             cudaMemcpyDeviceToDevice, stream));
+  CUDA_CHECK(
+    cudaMemcpyAsync(dstRows.data(), srcRows, sizeof(int) * nnz, cudaMemcpyDeviceToDevice, stream));
+  CUDA_CHECK(
+    cudaMemcpyAsync(dstCols, srcCols, sizeof(int) * nnz, cudaMemcpyDeviceToDevice, stream));
   auto buffSize = raft::sparse::cusparsecoosort_bufferSizeExt(
     cusparseHandle, m, m, nnz, srcRows, srcCols, stream);
   rmm::device_uvector<char> pBuffer(buffSize, stream);
   rmm::device_uvector<int> P(nnz, stream);
-  CUSPARSE_CHECK(
-    cusparseCreateIdentityPermutation(cusparseHandle, nnz, P.data()));
-  raft::sparse::cusparsecoosortByRow(cusparseHandle, m, m, nnz, dstRows.data(),
-                                     dstCols, P.data(), pBuffer.data(), stream);
-  raft::sparse::cusparsegthr(cusparseHandle, nnz, srcVals, dstVals, P.data(),
-                             stream);
-  raft::sparse::cusparsecoo2csr(cusparseHandle, dstRows.data(), nnz, m,
-                                dst_offsets, stream);
+  CUSPARSE_CHECK(cusparseCreateIdentityPermutation(cusparseHandle, nnz, P.data()));
+  raft::sparse::cusparsecoosortByRow(
+    cusparseHandle, m, m, nnz, dstRows.data(), dstCols, P.data(), pBuffer.data(), stream);
+  raft::sparse::cusparsegthr(cusparseHandle, nnz, srcVals, dstVals, P.data(), stream);
+  raft::sparse::cusparsecoo2csr(cusparseHandle, dstRows.data(), nnz, m, dst_offsets, stream);
   CUDA_CHECK(cudaDeviceSynchronize());
 }
 
@@ -83,14 +87,20 @@ void coo_to_csr(const raft::handle_t &handle, const int *srcRows,
  * @param stream cuda stream to use
  * @param fused_op: the fused operation
  */
-template <typename Index_, int TPB_X = 32,
-          typename Lambda = auto(Index_, Index_, Index_)->void>
-void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz,
-                           Index_ batchSize, const bool *adj,
-                           Index_ *row_ind_ptr, cudaStream_t stream,
-                           Lambda fused_op) {
+template <typename Index_, int TPB_X = 32, typename Lambda = auto(Index_, Index_, Index_)->void>
+void csr_adj_graph_batched(const Index_* row_ind,
+                           Index_ total_rows,
+                           Index_ nnz,
+                           Index_ batchSize,
+                           const bool* adj,
+                           Index_* row_ind_ptr,
+                           cudaStream_t stream,
+                           Lambda fused_op)
+{
   op::csr_row_op<Index_, TPB_X>(
-    row_ind, batchSize, nnz,
+    row_ind,
+    batchSize,
+    nnz,
     [fused_op, adj, total_rows, row_ind_ptr, batchSize, nnz] __device__(
       Index_ row, Index_ start_idx, Index_ stop_idx) {
       fused_op(row, start_idx, stop_idx);
@@ -106,14 +116,23 @@ void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz,
     stream);
 }
 
-template <typename Index_, int TPB_X = 32,
-          typename Lambda = auto(Index_, Index_, Index_)->void>
-void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz,
-                           Index_ batchSize, const bool *adj,
-                           Index_ *row_ind_ptr, cudaStream_t stream) {
-  csr_adj_graph_batched(
-    row_ind, total_rows, nnz, batchSize, adj, row_ind_ptr, stream,
-    [] __device__(Index_ row, Index_ start_idx, Index_ stop_idx) {});
+template <typename Index_, int TPB_X = 32, typename Lambda = auto(Index_, Index_, Index_)->void>
+void csr_adj_graph_batched(const Index_* row_ind,
+                           Index_ total_rows,
+                           Index_ nnz,
+                           Index_ batchSize,
+                           const bool* adj,
+                           Index_* row_ind_ptr,
+                           cudaStream_t stream)
+{
+  csr_adj_graph_batched(row_ind,
+                        total_rows,
+                        nnz,
+                        batchSize,
+                        adj,
+                        row_ind_ptr,
+                        stream,
+                        [] __device__(Index_ row, Index_ start_idx, Index_ stop_idx) {});
 }
 
 /**
@@ -129,13 +148,17 @@ void csr_adj_graph_batched(const Index_ *row_ind, Index_ total_rows, Index_ nnz,
  * @param stream cuda stream to use
  * @param fused_op the fused operation
  */
-template <typename Index_, int TPB_X = 32,
-          typename Lambda = auto(Index_, Index_, Index_)->void>
-void csr_adj_graph(const Index_ *row_ind, Index_ total_rows, Index_ nnz,
-                   const bool *adj, Index_ *row_ind_ptr, cudaStream_t stream,
-                   Lambda fused_op) {
-  csr_adj_graph_batched<Index_, TPB_X>(row_ind, total_rows, nnz, total_rows,
-                                       adj, row_ind_ptr, stream, fused_op);
+template <typename Index_, int TPB_X = 32, typename Lambda = auto(Index_, Index_, Index_)->void>
+void csr_adj_graph(const Index_* row_ind,
+                   Index_ total_rows,
+                   Index_ nnz,
+                   const bool* adj,
+                   Index_* row_ind_ptr,
+                   cudaStream_t stream,
+                   Lambda fused_op)
+{
+  csr_adj_graph_batched<Index_, TPB_X>(
+    row_ind, total_rows, nnz, total_rows, adj, row_ind_ptr, stream, fused_op);
 }
 
 /**
@@ -148,8 +171,8 @@ void csr_adj_graph(const Index_ *row_ind, Index_ total_rows, Index_ nnz,
  * @param stream: cuda stream to use
  */
 template <typename T>
-void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m,
-                       cudaStream_t stream) {
+void sorted_coo_to_csr(const T* rows, int nnz, T* row_ind, int m, cudaStream_t stream)
+{
   rmm::device_uvector<T> row_counts(m, stream);
 
   CUDA_CHECK(cudaMemsetAsync(row_counts.data(), 0, m * sizeof(T), stream));
@@ -157,11 +180,9 @@ void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m,
   linalg::coo_degree<32>(rows, nnz, row_counts.data(), stream);
 
   // create csr compressed row index from row counts
-  thrust::device_ptr<T> row_counts_d =
-    thrust::device_pointer_cast(row_counts.data());
-  thrust::device_ptr<T> c_ind_d = thrust::device_pointer_cast(row_ind);
-  exclusive_scan(rmm::exec_policy(stream), row_counts_d, row_counts_d + m,
-                 c_ind_d);
+  thrust::device_ptr<T> row_counts_d = thrust::device_pointer_cast(row_counts.data());
+  thrust::device_ptr<T> c_ind_d      = thrust::device_pointer_cast(row_ind);
+  exclusive_scan(rmm::exec_policy(stream), row_counts_d, row_counts_d + m, c_ind_d);
 }
 
 /**
@@ -172,7 +193,8 @@ void sorted_coo_to_csr(const T *rows, int nnz, T *row_ind, int m,
  * @param stream: cuda stream to use
  */
 template <typename T>
-void sorted_coo_to_csr(COO<T> *coo, int *row_ind, cudaStream_t stream) {
+void sorted_coo_to_csr(COO<T>* coo, int* row_ind, cudaStream_t stream)
+{
   sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, stream);
 }
 
diff --git a/cpp/include/raft/sparse/convert/dense.cuh b/cpp/include/raft/sparse/convert/dense.cuh
index 299f9d36d4..e90882b501 100644
--- a/cpp/include/raft/sparse/convert/dense.cuh
+++ b/cpp/include/raft/sparse/convert/dense.cuh
@@ -37,22 +37,20 @@ namespace sparse {
 namespace convert {
 
 template <typename value_t>
-__global__ void csr_to_dense_warp_per_row_kernel(int n_cols,
-                                                 const value_t *csrVal,
-                                                 const int *csrRowPtr,
-                                                 const int *csrColInd,
-                                                 value_t *a) {
+__global__ void csr_to_dense_warp_per_row_kernel(
+  int n_cols, const value_t* csrVal, const int* csrRowPtr, const int* csrColInd, value_t* a)
+{
   int row = blockIdx.x;
   int tid = threadIdx.x;
 
   int colStart = csrRowPtr[row];
-  int colEnd = csrRowPtr[row + 1];
-  int rowNnz = colEnd - colStart;
+  int colEnd   = csrRowPtr[row + 1];
+  int rowNnz   = colEnd - colStart;
 
   for (int i = tid; i < rowNnz; i += blockDim.x) {
     int colIdx = colStart + i;
     if (colIdx < colEnd) {
-      int col = csrColInd[colIdx];
+      int col               = csrColInd[colIdx];
       a[row * n_cols + col] = csrVal[colIdx];
     }
   }
@@ -77,10 +75,17 @@ __global__ void csr_to_dense_warp_per_row_kernel(int n_cols,
  * @param[in] row_major : Is row-major output desired?
  */
 template <typename value_idx, typename value_t>
-void csr_to_dense(cusparseHandle_t handle, value_idx nrows, value_idx ncols,
-                  const value_idx *csr_indptr, const value_idx *csr_indices,
-                  const value_t *csr_data, value_idx lda, value_t *out,
-                  cudaStream_t stream, bool row_major = true) {
+void csr_to_dense(cusparseHandle_t handle,
+                  value_idx nrows,
+                  value_idx ncols,
+                  const value_idx* csr_indptr,
+                  const value_idx* csr_indices,
+                  const value_t* csr_data,
+                  value_idx lda,
+                  value_t* out,
+                  cudaStream_t stream,
+                  bool row_major = true)
+{
   if (!row_major) {
     /**
      * If we need col-major, use cusparse.
@@ -91,15 +96,13 @@ void csr_to_dense(cusparseHandle_t handle, value_idx nrows, value_idx ncols,
     CUSPARSE_CHECK(cusparseSetMatType(out_mat, CUSPARSE_MATRIX_TYPE_GENERAL));
 
     CUSPARSE_CHECK(raft::sparse::cusparsecsr2dense(
-      handle, nrows, ncols, out_mat, csr_data, csr_indptr, csr_indices, out,
-      lda, stream));
+      handle, nrows, ncols, out_mat, csr_data, csr_indptr, csr_indices, out, lda, stream));
 
     CUSPARSE_CHECK_NO_THROW(cusparseDestroyMatDescr(out_mat));
 
   } else {
     int blockdim = block_dim(ncols);
-    CUDA_CHECK(
-      cudaMemsetAsync(out, 0, nrows * ncols * sizeof(value_t), stream));
+    CUDA_CHECK(cudaMemsetAsync(out, 0, nrows * ncols * sizeof(value_t), stream));
     csr_to_dense_warp_per_row_kernel<<<nrows, blockdim, 0, stream>>>(
       ncols, csr_data, csr_indptr, csr_indices, out);
   }
diff --git a/cpp/include/raft/sparse/coo.cuh b/cpp/include/raft/sparse/coo.cuh
index fa21614f8f..ad1bac1e75 100644
--- a/cpp/include/raft/sparse/coo.cuh
+++ b/cpp/include/raft/sparse/coo.cuh
@@ -66,79 +66,79 @@ class COO {
   Index_Type n_cols;
 
   /**
-    * @param stream: CUDA stream to use
-    */
+   * @param stream: CUDA stream to use
+   */
   COO(cudaStream_t stream)
-    : rows_arr(0, stream),
-      cols_arr(0, stream),
-      vals_arr(0, stream),
-      nnz(0),
-      n_rows(0),
-      n_cols(0) {}
+    : rows_arr(0, stream), cols_arr(0, stream), vals_arr(0, stream), nnz(0), n_rows(0), n_cols(0)
+  {
+  }
 
   /**
-    * @param rows: coo rows array
-    * @param cols: coo cols array
-    * @param vals: coo vals array
-    * @param nnz: size of the rows/cols/vals arrays
-    * @param n_rows: number of rows in the dense matrix
-    * @param n_cols: number of cols in the dense matrix
-    */
-  COO(rmm::device_uvector<Index_Type> &rows,
-      rmm::device_uvector<Index_Type> &cols, rmm::device_uvector<T> &vals,
-      Index_Type nnz, Index_Type n_rows = 0, Index_Type n_cols = 0)
-    : rows_arr(rows),
-      cols_arr(cols),
-      vals_arr(vals),
-      nnz(nnz),
-      n_rows(n_rows),
-      n_cols(n_cols) {}
+   * @param rows: coo rows array
+   * @param cols: coo cols array
+   * @param vals: coo vals array
+   * @param nnz: size of the rows/cols/vals arrays
+   * @param n_rows: number of rows in the dense matrix
+   * @param n_cols: number of cols in the dense matrix
+   */
+  COO(rmm::device_uvector<Index_Type>& rows,
+      rmm::device_uvector<Index_Type>& cols,
+      rmm::device_uvector<T>& vals,
+      Index_Type nnz,
+      Index_Type n_rows = 0,
+      Index_Type n_cols = 0)
+    : rows_arr(rows), cols_arr(cols), vals_arr(vals), nnz(nnz), n_rows(n_rows), n_cols(n_cols)
+  {
+  }
 
   /**
-    * @param stream: CUDA stream to use
-    * @param nnz: size of the rows/cols/vals arrays
-    * @param n_rows: number of rows in the dense matrix
-    * @param n_cols: number of cols in the dense matrix
-    * @param init: initialize arrays with zeros
-    */
-  COO(cudaStream_t stream, Index_Type nnz, Index_Type n_rows = 0,
-      Index_Type n_cols = 0, bool init = true)
+   * @param stream: CUDA stream to use
+   * @param nnz: size of the rows/cols/vals arrays
+   * @param n_rows: number of rows in the dense matrix
+   * @param n_cols: number of cols in the dense matrix
+   * @param init: initialize arrays with zeros
+   */
+  COO(cudaStream_t stream,
+      Index_Type nnz,
+      Index_Type n_rows = 0,
+      Index_Type n_cols = 0,
+      bool init         = true)
     : rows_arr(nnz, stream),
       cols_arr(nnz, stream),
       vals_arr(nnz, stream),
       nnz(nnz),
       n_rows(n_rows),
-      n_cols(n_cols) {
+      n_cols(n_cols)
+  {
     if (init) init_arrays(stream);
   }
 
-  void init_arrays(cudaStream_t stream) {
-    CUDA_CHECK(cudaMemsetAsync(this->rows_arr.data(), 0,
-                               this->nnz * sizeof(Index_Type), stream));
-    CUDA_CHECK(cudaMemsetAsync(this->cols_arr.data(), 0,
-                               this->nnz * sizeof(Index_Type), stream));
-    CUDA_CHECK(
-      cudaMemsetAsync(this->vals_arr.data(), 0, this->nnz * sizeof(T), stream));
+  void init_arrays(cudaStream_t stream)
+  {
+    CUDA_CHECK(cudaMemsetAsync(this->rows_arr.data(), 0, this->nnz * sizeof(Index_Type), stream));
+    CUDA_CHECK(cudaMemsetAsync(this->cols_arr.data(), 0, this->nnz * sizeof(Index_Type), stream));
+    CUDA_CHECK(cudaMemsetAsync(this->vals_arr.data(), 0, this->nnz * sizeof(T), stream));
   }
 
   ~COO() {}
 
   /**
-    * @brief Size should be > 0, with the number of rows
-    * and cols in the dense matrix being > 0.
-    */
-  bool validate_size() const {
+   * @brief Size should be >= 0, with the number of rows
+   * and cols in the dense matrix being >= 0.
+   */
+  bool validate_size() const
+  {
     if (this->nnz < 0 || n_rows < 0 || n_cols < 0) return false;
     return true;
   }
 
   /**
-    * @brief If the underlying arrays have not been set,
-    * return false. Otherwise true.
-    */
-  bool validate_mem() const {
-    if (this->rows_arr.size() == 0 || this->cols_arr.size() == 0 ||
-        this->vals_arr.size() == 0) {
+   * @brief If the underlying arrays have not been set,
+   * return false. Otherwise true.
+   */
+  bool validate_mem() const
+  {
+    if (this->rows_arr.size() == 0 || this->cols_arr.size() == 0 || this->vals_arr.size() == 0) {
       return false;
     }
 
@@ -148,33 +148,30 @@ class COO {
   /*
    * @brief Returns the rows array
    */
-  Index_Type *rows() { return this->rows_arr.data(); }
+  Index_Type* rows() { return this->rows_arr.data(); }
 
   /**
    * @brief Returns the cols array
    */
-  Index_Type *cols() { return this->cols_arr.data(); }
+  Index_Type* cols() { return this->cols_arr.data(); }
 
   /**
    * @brief Returns the vals array
    */
-  T *vals() { return this->vals_arr.data(); }
+  T* vals() { return this->vals_arr.data(); }
 
   /**
-    * @brief Send human-readable state information to output stream
-    */
-  friend std::ostream &operator<<(std::ostream &out,
-                                  const COO<T, Index_Type> &c) {
+   * @brief Send human-readable state information to output stream
+   */
+  friend std::ostream& operator<<(std::ostream& out, const COO<T, Index_Type>& c)
+  {
     if (c.validate_size() && c.validate_mem()) {
       cudaStream_t stream;
       CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
 
-      out << raft::arr2Str(c.rows_arr.data(), c.nnz, "rows", stream)
-          << std::endl;
-      out << raft::arr2Str(c.cols_arr.data(), c.nnz, "cols", stream)
-          << std::endl;
-      out << raft::arr2Str(c.vals_arr.data(), c.nnz, "vals", stream)
-          << std::endl;
+      out << raft::arr2Str(c.rows_arr.data(), c.nnz, "rows", stream) << std::endl;
+      out << raft::arr2Str(c.cols_arr.data(), c.nnz, "cols", stream) << std::endl;
+      out << raft::arr2Str(c.vals_arr.data(), c.nnz, "vals", stream) << std::endl;
       out << "nnz=" << c.nnz << std::endl;
       out << "n_rows=" << c.n_rows << std::endl;
       out << "n_cols=" << c.n_cols << std::endl;
@@ -188,58 +185,59 @@ class COO {
   }
 
   /**
-    * @brief Set the number of rows and cols
-    * @param n_rows: number of rows in the dense matrix
-    * @param n_cols: number of columns in the dense matrix
-    */
-  void setSize(int n_rows, int n_cols) {
+   * @brief Set the number of rows and cols
+   * @param n_rows: number of rows in the dense matrix
+   * @param n_cols: number of columns in the dense matrix
+   */
+  void setSize(int n_rows, int n_cols)
+  {
     this->n_rows = n_rows;
     this->n_cols = n_cols;
   }
 
   /**
-    * @brief Set the number of rows and cols for a square dense matrix
-    * @param n: number of rows and cols
-    */
-  void setSize(int n) {
+   * @brief Set the number of rows and cols for a square dense matrix
+   * @param n: number of rows and cols
+   */
+  void setSize(int n)
+  {
     this->n_rows = n;
     this->n_cols = n;
   }
 
   /**
-    * @brief Allocate the underlying arrays
-    * @param nnz: size of underlying row/col/val arrays
-    * @param init: should values be initialized to 0?
-    * @param stream: CUDA stream to use
-    */
-  void allocate(int nnz, bool init, cudaStream_t stream) {
-    this->allocate(nnz, 0, init, stream);
-  }
+   * @brief Allocate the underlying arrays
+   * @param nnz: size of underlying row/col/val arrays
+   * @param init: should values be initialized to 0?
+   * @param stream: CUDA stream to use
+   */
+  void allocate(int nnz, bool init, cudaStream_t stream) { this->allocate(nnz, 0, init, stream); }
 
   /**
-    * @brief Allocate the underlying arrays
-    * @param nnz: size of the underlying row/col/val arrays
-    * @param size: the number of rows/cols in a square dense matrix
-    * @param init: should values be initialized to 0?
-    * @param stream: CUDA stream to use
-    */
-  void allocate(int nnz, int size, bool init, cudaStream_t stream) {
+   * @brief Allocate the underlying arrays
+   * @param nnz: size of the underlying row/col/val arrays
+   * @param size: the number of rows/cols in a square dense matrix
+   * @param init: should values be initialized to 0?
+   * @param stream: CUDA stream to use
+   */
+  void allocate(int nnz, int size, bool init, cudaStream_t stream)
+  {
     this->allocate(nnz, size, size, init, stream);
   }
 
   /**
-    * @brief Allocate the underlying arrays
-    * @param nnz: size of the underlying row/col/val arrays
-    * @param n_rows: number of rows in the dense matrix
-    * @param n_cols: number of columns in the dense matrix
-    * @param init: should values be initialized to 0?
-    * @param stream: stream to use for init
-    */
-  void allocate(int nnz, int n_rows, int n_cols, bool init,
-                cudaStream_t stream) {
+   * @brief Allocate the underlying arrays
+   * @param nnz: size of the underlying row/col/val arrays
+   * @param n_rows: number of rows in the dense matrix
+   * @param n_cols: number of columns in the dense matrix
+   * @param init: should values be initialized to 0?
+   * @param stream: stream to use for init
+   */
+  void allocate(int nnz, int n_rows, int n_cols, bool init, cudaStream_t stream)
+  {
     this->n_rows = n_rows;
     this->n_cols = n_cols;
-    this->nnz = nnz;
+    this->nnz    = nnz;
 
     this->rows_arr.resize(this->nnz, stream);
     this->cols_arr.resize(this->nnz, stream);
diff --git a/cpp/include/raft/sparse/csr.cuh b/cpp/include/raft/sparse/csr.cuh
index 041aedf41c..f821ce2b98 100644
--- a/cpp/include/raft/sparse/csr.cuh
+++ b/cpp/include/raft/sparse/csr.cuh
@@ -41,57 +41,64 @@ namespace sparse {
 
 struct WeakCCState {
  public:
-  bool *m;
-  WeakCCState(bool *m) : m(m) {}
+  bool* m;
+  WeakCCState(bool* m) : m(m) {}
 };
 
 template <typename Index_, int TPB_X = 256, typename Lambda>
-__global__ void weak_cc_label_device(Index_ *__restrict__ labels,
-                                     const Index_ *__restrict__ row_ind,
-                                     const Index_ *__restrict__ row_ind_ptr,
-                                     Index_ nnz, bool *__restrict__ m,
-                                     Index_ start_vertex_id, Index_ batch_size,
-                                     Index_ N, Lambda filter_op) {
-  Index_ tid = threadIdx.x + blockIdx.x * TPB_X;
+__global__ void weak_cc_label_device(Index_* __restrict__ labels,
+                                     const Index_* __restrict__ row_ind,
+                                     const Index_* __restrict__ row_ind_ptr,
+                                     Index_ nnz,
+                                     bool* __restrict__ m,
+                                     Index_ start_vertex_id,
+                                     Index_ batch_size,
+                                     Index_ N,
+                                     Lambda filter_op)
+{
+  Index_ tid       = threadIdx.x + blockIdx.x * TPB_X;
   Index_ global_id = tid + start_vertex_id;
   if (tid < batch_size && global_id < N) {
     Index_ start = __ldg(row_ind + tid);
 
     Index_ ci, cj;
-    bool ci_mod = false;
-    ci = labels[global_id];
+    bool ci_mod        = false;
+    ci                 = labels[global_id];
     bool ci_allow_prop = filter_op(global_id);
 
     Index_ end = get_stop_idx(tid, batch_size, nnz, row_ind);
     /// TODO: add one element to row_ind and avoid get_stop_idx
     for (Index_ j = start; j < end; j++) {
-      Index_ j_ind = __ldg(row_ind_ptr + j);
-      cj = labels[j_ind];
+      Index_ j_ind       = __ldg(row_ind_ptr + j);
+      cj                 = labels[j_ind];
       bool cj_allow_prop = filter_op(j_ind);
       if (ci < cj && ci_allow_prop) {
         if (sizeof(Index_) == 4)
-          atomicMin((int *)(labels + j_ind), ci);
+          atomicMin((int*)(labels + j_ind), ci);
         else if (sizeof(Index_) == 8)
-          atomicMin((long long int *)(labels + j_ind), ci);
+          atomicMin((long long int*)(labels + j_ind), ci);
         if (cj_allow_prop) *m = true;
       } else if (ci > cj && cj_allow_prop) {
-        ci = cj;
+        ci     = cj;
         ci_mod = true;
       }
     }
     if (ci_mod) {
       if (sizeof(Index_) == 4)
-        atomicMin((int *)(labels + global_id), ci);
+        atomicMin((int*)(labels + global_id), ci);
       else if (sizeof(Index_) == 8)
-        atomicMin((long long int *)(labels + global_id), ci);
+        atomicMin((long long int*)(labels + global_id), ci);
       if (ci_allow_prop) *m = true;
     }
   }
 }
 
 template <typename Index_, int TPB_X = 256, typename Lambda>
-__global__ void weak_cc_init_all_kernel(Index_ *labels, Index_ N,
-                                        Index_ MAX_LABEL, Lambda filter_op) {
+__global__ void weak_cc_init_all_kernel(Index_* labels,
+                                        Index_ N,
+                                        Index_ MAX_LABEL,
+                                        Lambda filter_op)
+{
   Index_ tid = threadIdx.x + blockIdx.x * TPB_X;
   if (tid < N) {
     if (filter_op(tid))
@@ -123,22 +130,25 @@ __global__ void weak_cc_init_all_kernel(Index_ *labels, Index_ N,
  * @param filter_op an optional filtering function to determine which points
  * should get considered for labeling. It gets global indexes (not batch-wide!)
  */
-template <typename Index_, int TPB_X = 256,
-          typename Lambda = auto(Index_)->bool>
-void weak_cc_batched(Index_ *labels, const Index_ *row_ind,
-                     const Index_ *row_ind_ptr, Index_ nnz, Index_ N,
-                     Index_ start_vertex_id, Index_ batch_size,
-                     WeakCCState *state, cudaStream_t stream,
-                     Lambda filter_op) {
-  ASSERT(sizeof(Index_) == 4 || sizeof(Index_) == 8,
-         "Index_ should be 4 or 8 bytes");
+template <typename Index_, int TPB_X = 256, typename Lambda = auto(Index_)->bool>
+void weak_cc_batched(Index_* labels,
+                     const Index_* row_ind,
+                     const Index_* row_ind_ptr,
+                     Index_ nnz,
+                     Index_ N,
+                     Index_ start_vertex_id,
+                     Index_ batch_size,
+                     WeakCCState* state,
+                     cudaStream_t stream,
+                     Lambda filter_op)
+{
+  ASSERT(sizeof(Index_) == 4 || sizeof(Index_) == 8, "Index_ should be 4 or 8 bytes");
 
   bool host_m;
 
   Index_ MAX_LABEL = std::numeric_limits<Index_>::max();
   weak_cc_init_all_kernel<Index_, TPB_X>
-    <<<raft::ceildiv(N, Index_(TPB_X)), TPB_X, 0, stream>>>(
-      labels, N, MAX_LABEL, filter_op);
+    <<<raft::ceildiv(N, Index_(TPB_X)), TPB_X, 0, stream>>>(labels, N, MAX_LABEL, filter_op);
   CUDA_CHECK(cudaPeekAtLastError());
 
   int n_iters = 0;
@@ -147,8 +157,7 @@ void weak_cc_batched(Index_ *labels, const Index_ *row_ind,
 
     weak_cc_label_device<Index_, TPB_X>
       <<<raft::ceildiv(batch_size, Index_(TPB_X)), TPB_X, 0, stream>>>(
-        labels, row_ind, row_ind_ptr, nnz, state->m, start_vertex_id,
-        batch_size, N, filter_op);
+        labels, row_ind, row_ind_ptr, nnz, state->m, start_vertex_id, batch_size, N, filter_op);
     CUDA_CHECK(cudaPeekAtLastError());
 
     //** Updating m *
@@ -180,12 +189,25 @@ void weak_cc_batched(Index_ *labels, const Index_ *row_ind,
  * @param stream the cuda stream to use
  */
 template <typename Index_, int TPB_X = 256>
-void weak_cc_batched(Index_ *labels, const Index_ *row_ind,
-                     const Index_ *row_ind_ptr, Index_ nnz, Index_ N,
-                     Index_ start_vertex_id, Index_ batch_size,
-                     WeakCCState *state, cudaStream_t stream) {
-  weak_cc_batched(labels, row_ind, row_ind_ptr, nnz, N, start_vertex_id,
-                  batch_size, state, stream,
+void weak_cc_batched(Index_* labels,
+                     const Index_* row_ind,
+                     const Index_* row_ind_ptr,
+                     Index_ nnz,
+                     Index_ N,
+                     Index_ start_vertex_id,
+                     Index_ batch_size,
+                     WeakCCState* state,
+                     cudaStream_t stream)
+{
+  weak_cc_batched(labels,
+                  row_ind,
+                  row_ind_ptr,
+                  nnz,
+                  N,
+                  start_vertex_id,
+                  batch_size,
+                  state,
+                  stream,
                   [] __device__(Index_ tid) { return true; });
 }
 
@@ -212,14 +234,18 @@ void weak_cc_batched(Index_ *labels, const Index_ *row_ind,
  * @param filter_op an optional filtering function to determine which points
  * should get considered for labeling. It gets global indexes (not batch-wide!)
  */
-template <typename Index_ = int, int TPB_X = 256,
-          typename Lambda = auto(Index_)->bool>
-void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr,
-             Index_ nnz, Index_ N, cudaStream_t stream, Lambda filter_op) {
+template <typename Index_ = int, int TPB_X = 256, typename Lambda = auto(Index_)->bool>
+void weak_cc(Index_* labels,
+             const Index_* row_ind,
+             const Index_* row_ind_ptr,
+             Index_ nnz,
+             Index_ N,
+             cudaStream_t stream,
+             Lambda filter_op)
+{
   rmm::device_scalar<bool> m(stream);
   WeakCCState state(m.data());
-  weak_cc_batched<Index_, TPB_X>(labels, row_ind, row_ind_ptr, nnz, N, 0, N,
-                                 stream, filter_op);
+  weak_cc_batched<Index_, TPB_X>(labels, row_ind, row_ind_ptr, nnz, N, 0, N, &state, stream, filter_op);
 }
 
 /**
@@ -244,12 +270,17 @@ void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr,
  * @param stream the cuda stream to use
  */
 template <typename Index_, int TPB_X = 256>
-void weak_cc(Index_ *labels, const Index_ *row_ind, const Index_ *row_ind_ptr,
-             Index_ nnz, Index_ N, cudaStream_t stream) {
+void weak_cc(Index_* labels,
+             const Index_* row_ind,
+             const Index_* row_ind_ptr,
+             Index_ nnz,
+             Index_ N,
+             cudaStream_t stream)
+{
   rmm::device_scalar<bool> m(stream);
   WeakCCState state(m.data());
-  weak_cc_batched<Index_, TPB_X>(labels, row_ind, row_ind_ptr, nnz, N, 0, N,
-                                 stream, [](Index_) { return true; });
+  weak_cc_batched<Index_, TPB_X>(
+    labels, row_ind, row_ind_ptr, nnz, N, 0, N, &state, stream, [](Index_) { return true; });
 }
 
 };  // namespace sparse
diff --git a/cpp/include/raft/sparse/cusparse_wrappers.h b/cpp/include/raft/sparse/cusparse_wrappers.h
index d072100672..29a244a962 100644
--- a/cpp/include/raft/sparse/cusparse_wrappers.h
+++ b/cpp/include/raft/sparse/cusparse_wrappers.h
@@ -23,10 +23,9 @@
 //#include <cuml/common/logger.hpp>
 
 #define _CUSPARSE_ERR_TO_STR(err) \
-  case err:                       \
-    return #err;
+  case err: return #err;
 
-//Notes:
+// Notes:
 //(1.) CUDA_VER_10_1_UP aggregates all the CUDA version selection logic;
 //(2.) to enforce a lower version,
 //
@@ -43,16 +42,15 @@ namespace raft {
  * @brief Exception thrown when a cuSparse error is encountered.
  */
 struct cusparse_error : public raft::exception {
-  explicit cusparse_error(char const* const message)
-    : raft::exception(message) {}
-  explicit cusparse_error(std::string const& message)
-    : raft::exception(message) {}
+  explicit cusparse_error(char const* const message) : raft::exception(message) {}
+  explicit cusparse_error(std::string const& message) : raft::exception(message) {}
 };
 
 namespace sparse {
 namespace detail {
 
-inline const char* cusparse_error_to_string(cusparseStatus_t err) {
+inline const char* cusparse_error_to_string(cusparseStatus_t err)
+{
 #if defined(CUDART_VERSION) && CUDART_VERSION >= 10100
   return cusparseGetErrorString(err);
 #else   // CUDART_VERSION
@@ -65,8 +63,7 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err) {
     _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_EXECUTION_FAILED);
     _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_INTERNAL_ERROR);
     _CUSPARSE_ERR_TO_STR(CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED);
-    default:
-      return "CUSPARSE_STATUS_UNKNOWN";
+    default: return "CUSPARSE_STATUS_UNKNOWN";
   };
 #endif  // CUDART_VERSION
 }
@@ -88,8 +85,11 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err) {
     cusparseStatus_t const status = (call);                                  \
     if (CUSPARSE_STATUS_SUCCESS != status) {                                 \
       std::string msg{};                                                     \
-      SET_ERROR_MSG(msg, "cuSparse error encountered at: ",                  \
-                    "call='%s', Reason=%d:%s", #call, status,                \
+      SET_ERROR_MSG(msg,                                                     \
+                    "cuSparse error encountered at: ",                       \
+                    "call='%s', Reason=%d:%s",                               \
+                    #call,                                                   \
+                    status,                                                  \
                     raft::sparse::detail::cusparse_error_to_string(status)); \
       throw raft::cusparse_error(msg);                                       \
     }                                                                        \
@@ -100,13 +100,15 @@ inline const char* cusparse_error_to_string(cusparseStatus_t err) {
 
 //@todo: use logger here once logging is enabled
 /** check for cusparse runtime API errors but do not assert */
-#define CUSPARSE_CHECK_NO_THROW(call)                                  \
-  do {                                                                 \
-    cusparseStatus_t err = call;                                       \
-    if (err != CUSPARSE_STATUS_SUCCESS) {                              \
-      printf("CUSPARSE call='%s' got errorcode=%d err=%s", #call, err, \
-             raft::sparse::detail::cusparse_error_to_string(err));     \
-    }                                                                  \
+#define CUSPARSE_CHECK_NO_THROW(call)                              \
+  do {                                                             \
+    cusparseStatus_t err = call;                                   \
+    if (err != CUSPARSE_STATUS_SUCCESS) {                          \
+      printf("CUSPARSE call='%s' got errorcode=%d err=%s",         \
+             #call,                                                \
+             err,                                                  \
+             raft::sparse::detail::cusparse_error_to_string(err)); \
+    }                                                              \
   } while (0)
 
 namespace raft {
@@ -117,28 +119,34 @@ namespace sparse {
  * @{
  */
 template <typename T>
-cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz, const T* vals,
-                              T* vals_sorted, int* d_P, cudaStream_t stream);
+cusparseStatus_t cusparsegthr(
+  cusparseHandle_t handle, int nnz, const T* vals, T* vals_sorted, int* d_P, cudaStream_t stream);
 template <>
-inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz,
-                                     const double* vals, double* vals_sorted,
-                                     int* d_P, cudaStream_t stream) {
+inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle,
+                                     int nnz,
+                                     const double* vals,
+                                     double* vals_sorted,
+                                     int* d_P,
+                                     cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-  return cusparseDgthr(handle, nnz, vals, vals_sorted, d_P,
-                       CUSPARSE_INDEX_BASE_ZERO);
+  return cusparseDgthr(handle, nnz, vals, vals_sorted, d_P, CUSPARSE_INDEX_BASE_ZERO);
 #pragma GCC diagnostic pop
 }
 template <>
-inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz,
-                                     const float* vals, float* vals_sorted,
-                                     int* d_P, cudaStream_t stream) {
+inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle,
+                                     int nnz,
+                                     const float* vals,
+                                     float* vals_sorted,
+                                     int* d_P,
+                                     cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-  return cusparseSgthr(handle, nnz, vals, vals_sorted, d_P,
-                       CUSPARSE_INDEX_BASE_ZERO);
+  return cusparseSgthr(handle, nnz, vals, vals_sorted, d_P, CUSPARSE_INDEX_BASE_ZERO);
 #pragma GCC diagnostic pop
 }
 /** @} */
@@ -148,15 +156,18 @@ inline cusparseStatus_t cusparsegthr(cusparseHandle_t handle, int nnz,
  * @{
  */
 template <typename T>
-void cusparsecoo2csr(cusparseHandle_t handle, const T* cooRowInd, int nnz,
-                     int m, T* csrRowPtr, cudaStream_t stream);
+void cusparsecoo2csr(
+  cusparseHandle_t handle, const T* cooRowInd, int nnz, int m, T* csrRowPtr, cudaStream_t stream);
 template <>
-inline void cusparsecoo2csr(cusparseHandle_t handle, const int* cooRowInd,
-                            int nnz, int m, int* csrRowPtr,
-                            cudaStream_t stream) {
+inline void cusparsecoo2csr(cusparseHandle_t handle,
+                            const int* cooRowInd,
+                            int nnz,
+                            int m,
+                            int* csrRowPtr,
+                            cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  CUSPARSE_CHECK(cusparseXcoo2csr(handle, cooRowInd, nnz, m, csrRowPtr,
-                                  CUSPARSE_INDEX_BASE_ZERO));
+  CUSPARSE_CHECK(cusparseXcoo2csr(handle, cooRowInd, nnz, m, csrRowPtr, CUSPARSE_INDEX_BASE_ZERO));
 }
 /** @} */
 
@@ -166,30 +177,54 @@ inline void cusparsecoo2csr(cusparseHandle_t handle, const int* cooRowInd,
  */
 template <typename T>
 size_t cusparsecoosort_bufferSizeExt(  // NOLINT
-  cusparseHandle_t handle, int m, int n, int nnz, const T* cooRows,
-  const T* cooCols, cudaStream_t stream);
+  cusparseHandle_t handle,
+  int m,
+  int n,
+  int nnz,
+  const T* cooRows,
+  const T* cooCols,
+  cudaStream_t stream);
 template <>
 inline size_t cusparsecoosort_bufferSizeExt(  // NOLINT
-  cusparseHandle_t handle, int m, int n, int nnz, const int* cooRows,
-  const int* cooCols, cudaStream_t stream) {
+  cusparseHandle_t handle,
+  int m,
+  int n,
+  int nnz,
+  const int* cooRows,
+  const int* cooCols,
+  cudaStream_t stream)
+{
   size_t val;
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  CUSPARSE_CHECK(
-    cusparseXcoosort_bufferSizeExt(handle, m, n, nnz, cooRows, cooCols, &val));
+  CUSPARSE_CHECK(cusparseXcoosort_bufferSizeExt(handle, m, n, nnz, cooRows, cooCols, &val));
   return val;
 }
 
 template <typename T>
 void cusparsecoosortByRow(  // NOLINT
-  cusparseHandle_t handle, int m, int n, int nnz, T* cooRows, T* cooCols, T* P,
-  void* pBuffer, cudaStream_t stream);
+  cusparseHandle_t handle,
+  int m,
+  int n,
+  int nnz,
+  T* cooRows,
+  T* cooCols,
+  T* P,
+  void* pBuffer,
+  cudaStream_t stream);
 template <>
 inline void cusparsecoosortByRow(  // NOLINT
-  cusparseHandle_t handle, int m, int n, int nnz, int* cooRows, int* cooCols,
-  int* P, void* pBuffer, cudaStream_t stream) {
+  cusparseHandle_t handle,
+  int m,
+  int n,
+  int nnz,
+  int* cooRows,
+  int* cooCols,
+  int* P,
+  void* pBuffer,
+  cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  CUSPARSE_CHECK(
-    cusparseXcoosortByRow(handle, m, n, nnz, cooRows, cooCols, P, pBuffer));
+  CUSPARSE_CHECK(cusparseXcoosortByRow(handle, m, n, nnz, cooRows, cooCols, P, pBuffer));
 }
 /** @} */
 
@@ -199,37 +234,67 @@ inline void cusparsecoosortByRow(  // NOLINT
  */
 template <typename T>
 cusparseStatus_t cusparsegemmi(  // NOLINT
-  cusparseHandle_t handle, int m, int n, int k, int nnz, const T* alpha,
-  const T* A, int lda, const T* cscValB, const int* cscColPtrB,
-  const int* cscRowIndB, const T* beta, T* C, int ldc, cudaStream_t stream);
+  cusparseHandle_t handle,
+  int m,
+  int n,
+  int k,
+  int nnz,
+  const T* alpha,
+  const T* A,
+  int lda,
+  const T* cscValB,
+  const int* cscColPtrB,
+  const int* cscRowIndB,
+  const T* beta,
+  T* C,
+  int ldc,
+  cudaStream_t stream);
 template <>
-inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n,
-                                      int k, int nnz, const float* alpha,
-                                      const float* A, int lda,
+inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle,
+                                      int m,
+                                      int n,
+                                      int k,
+                                      int nnz,
+                                      const float* alpha,
+                                      const float* A,
+                                      int lda,
                                       const float* cscValB,
                                       const int* cscColPtrB,
-                                      const int* cscRowIndB, const float* beta,
-                                      float* C, int ldc, cudaStream_t stream) {
+                                      const int* cscRowIndB,
+                                      const float* beta,
+                                      float* C,
+                                      int ldc,
+                                      cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-  return cusparseSgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB,
-                        cscColPtrB, cscRowIndB, beta, C, ldc);
+  return cusparseSgemmi(
+    handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc);
 #pragma GCC diagnostic pop
 }
 template <>
-inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n,
-                                      int k, int nnz, const double* alpha,
-                                      const double* A, int lda,
+inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle,
+                                      int m,
+                                      int n,
+                                      int k,
+                                      int nnz,
+                                      const double* alpha,
+                                      const double* A,
+                                      int lda,
                                       const double* cscValB,
                                       const int* cscColPtrB,
-                                      const int* cscRowIndB, const double* beta,
-                                      double* C, int ldc, cudaStream_t stream) {
+                                      const int* cscRowIndB,
+                                      const double* beta,
+                                      double* C,
+                                      int ldc,
+                                      cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-  return cusparseDgemmi(handle, m, n, k, nnz, alpha, A, lda, cscValB,
-                        cscColPtrB, cscRowIndB, beta, C, ldc);
+  return cusparseDgemmi(
+    handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc);
 #pragma GCC diagnostic pop
 }
 /** @} */
@@ -241,49 +306,94 @@ inline cusparseStatus_t cusparsegemmi(cusparseHandle_t handle, int m, int n,
  */
 template <typename IndexT, typename ValueT>
 cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr,
-                                   int64_t rows, int64_t cols, int64_t nnz,
-                                   IndexT* csrRowOffsets, IndexT* csrColInd,
+                                   int64_t rows,
+                                   int64_t cols,
+                                   int64_t nnz,
+                                   IndexT* csrRowOffsets,
+                                   IndexT* csrColInd,
                                    ValueT* csrValues);
 template <>
 inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr,
-                                          int64_t rows, int64_t cols,
-                                          int64_t nnz, int* csrRowOffsets,
-                                          int* csrColInd, float* csrValues) {
-  return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets,
-                           csrColInd, csrValues, CUSPARSE_INDEX_32I,
-                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO,
+                                          int64_t rows,
+                                          int64_t cols,
+                                          int64_t nnz,
+                                          int* csrRowOffsets,
+                                          int* csrColInd,
+                                          float* csrValues)
+{
+  return cusparseCreateCsr(spMatDescr,
+                           rows,
+                           cols,
+                           nnz,
+                           csrRowOffsets,
+                           csrColInd,
+                           csrValues,
+                           CUSPARSE_INDEX_32I,
+                           CUSPARSE_INDEX_32I,
+                           CUSPARSE_INDEX_BASE_ZERO,
                            CUDA_R_32F);
 }
 template <>
 inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr,
-                                          int64_t rows, int64_t cols,
-                                          int64_t nnz, int* csrRowOffsets,
-                                          int* csrColInd, double* csrValues) {
-  return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets,
-                           csrColInd, csrValues, CUSPARSE_INDEX_32I,
-                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO,
+                                          int64_t rows,
+                                          int64_t cols,
+                                          int64_t nnz,
+                                          int* csrRowOffsets,
+                                          int* csrColInd,
+                                          double* csrValues)
+{
+  return cusparseCreateCsr(spMatDescr,
+                           rows,
+                           cols,
+                           nnz,
+                           csrRowOffsets,
+                           csrColInd,
+                           csrValues,
+                           CUSPARSE_INDEX_32I,
+                           CUSPARSE_INDEX_32I,
+                           CUSPARSE_INDEX_BASE_ZERO,
                            CUDA_R_64F);
 }
 template <>
 inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr,
-                                          int64_t rows, int64_t cols,
-                                          int64_t nnz, int64_t* csrRowOffsets,
+                                          int64_t rows,
+                                          int64_t cols,
+                                          int64_t nnz,
+                                          int64_t* csrRowOffsets,
                                           int64_t* csrColInd,
-                                          float* csrValues) {
-  return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets,
-                           csrColInd, csrValues, CUSPARSE_INDEX_64I,
-                           CUSPARSE_INDEX_64I, CUSPARSE_INDEX_BASE_ZERO,
+                                          float* csrValues)
+{
+  return cusparseCreateCsr(spMatDescr,
+                           rows,
+                           cols,
+                           nnz,
+                           csrRowOffsets,
+                           csrColInd,
+                           csrValues,
+                           CUSPARSE_INDEX_64I,
+                           CUSPARSE_INDEX_64I,
+                           CUSPARSE_INDEX_BASE_ZERO,
                            CUDA_R_32F);
 }
 template <>
 inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr,
-                                          int64_t rows, int64_t cols,
-                                          int64_t nnz, int64_t* csrRowOffsets,
+                                          int64_t rows,
+                                          int64_t cols,
+                                          int64_t nnz,
+                                          int64_t* csrRowOffsets,
                                           int64_t* csrColInd,
-                                          double* csrValues) {
-  return cusparseCreateCsr(spMatDescr, rows, cols, nnz, csrRowOffsets,
-                           csrColInd, csrValues, CUSPARSE_INDEX_64I,
-                           CUSPARSE_INDEX_64I, CUSPARSE_INDEX_BASE_ZERO,
+                                          double* csrValues)
+{
+  return cusparseCreateCsr(spMatDescr,
+                           rows,
+                           cols,
+                           nnz,
+                           csrRowOffsets,
+                           csrColInd,
+                           csrValues,
+                           CUSPARSE_INDEX_64I,
+                           CUSPARSE_INDEX_64I,
+                           CUSPARSE_INDEX_BASE_ZERO,
                            CUDA_R_64F);
 }
 /** @} */
@@ -292,16 +402,19 @@ inline cusparseStatus_t cusparsecreatecsr(cusparseSpMatDescr_t* spMatDescr,
  * @{
  */
 template <typename T>
-cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr,
-                                     int64_t size, T* values);
+cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr, int64_t size, T* values);
 template <>
 inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr,
-                                            int64_t size, float* values) {
+                                            int64_t size,
+                                            float* values)
+{
   return cusparseCreateDnVec(dnVecDescr, size, values, CUDA_R_32F);
 }
 template <>
 inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr,
-                                            int64_t size, double* values) {
+                                            int64_t size,
+                                            double* values)
+{
   return cusparseCreateDnVec(dnVecDescr, size, values, CUDA_R_64F);
 }
 /** @} */
@@ -312,23 +425,30 @@ inline cusparseStatus_t cusparsecreatednvec(cusparseDnVecDescr_t* dnVecDescr,
  */
 template <typename T>
 cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr,
-                                     int64_t rows, int64_t cols, int64_t ld,
-                                     T* values, cusparseOrder_t order);
+                                     int64_t rows,
+                                     int64_t cols,
+                                     int64_t ld,
+                                     T* values,
+                                     cusparseOrder_t order);
 template <>
 inline cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr,
-                                            int64_t rows, int64_t cols,
-                                            int64_t ld, float* values,
-                                            cusparseOrder_t order) {
-  return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_32F,
-                             order);
+                                            int64_t rows,
+                                            int64_t cols,
+                                            int64_t ld,
+                                            float* values,
+                                            cusparseOrder_t order)
+{
+  return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_32F, order);
 }
 template <>
 inline cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr,
-                                            int64_t rows, int64_t cols,
-                                            int64_t ld, double* values,
-                                            cusparseOrder_t order) {
-  return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_64F,
-                             order);
+                                            int64_t rows,
+                                            int64_t cols,
+                                            int64_t ld,
+                                            double* values,
+                                            cusparseOrder_t order)
+{
+  return cusparseCreateDnMat(dnMatDescr, rows, cols, ld, values, CUDA_R_64F, order);
 }
 /** @} */
 
@@ -337,58 +457,89 @@ inline cusparseStatus_t cusparsecreatednmat(cusparseDnMatDescr_t* dnMatDescr,
  * @{
  */
 template <typename T>
-cusparseStatus_t cusparsespmv_buffersize(
-  cusparseHandle_t handle, cusparseOperation_t opA, const T* alpha,
-  const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX,
-  const T* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg,
-  size_t* bufferSize, cudaStream_t stream);
+cusparseStatus_t cusparsespmv_buffersize(cusparseHandle_t handle,
+                                         cusparseOperation_t opA,
+                                         const T* alpha,
+                                         const cusparseSpMatDescr_t matA,
+                                         const cusparseDnVecDescr_t vecX,
+                                         const T* beta,
+                                         const cusparseDnVecDescr_t vecY,
+                                         cusparseSpMVAlg_t alg,
+                                         size_t* bufferSize,
+                                         cudaStream_t stream);
 template <>
-inline cusparseStatus_t cusparsespmv_buffersize(
-  cusparseHandle_t handle, cusparseOperation_t opA, const float* alpha,
-  const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX,
-  const float* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg,
-  size_t* bufferSize, cudaStream_t stream) {
+inline cusparseStatus_t cusparsespmv_buffersize(cusparseHandle_t handle,
+                                                cusparseOperation_t opA,
+                                                const float* alpha,
+                                                const cusparseSpMatDescr_t matA,
+                                                const cusparseDnVecDescr_t vecX,
+                                                const float* beta,
+                                                const cusparseDnVecDescr_t vecY,
+                                                cusparseSpMVAlg_t alg,
+                                                size_t* bufferSize,
+                                                cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseSpMV_bufferSize(handle, opA, alpha, matA, vecX, beta, vecY,
-                                 CUDA_R_32F, alg, bufferSize);
+  return cusparseSpMV_bufferSize(
+    handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_32F, alg, bufferSize);
 }
 template <>
-inline cusparseStatus_t cusparsespmv_buffersize(
-  cusparseHandle_t handle, cusparseOperation_t opA, const double* alpha,
-  const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX,
-  const double* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg,
-  size_t* bufferSize, cudaStream_t stream) {
+inline cusparseStatus_t cusparsespmv_buffersize(cusparseHandle_t handle,
+                                                cusparseOperation_t opA,
+                                                const double* alpha,
+                                                const cusparseSpMatDescr_t matA,
+                                                const cusparseDnVecDescr_t vecX,
+                                                const double* beta,
+                                                const cusparseDnVecDescr_t vecY,
+                                                cusparseSpMVAlg_t alg,
+                                                size_t* bufferSize,
+                                                cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseSpMV_bufferSize(handle, opA, alpha, matA, vecX, beta, vecY,
-                                 CUDA_R_64F, alg, bufferSize);
+  return cusparseSpMV_bufferSize(
+    handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_64F, alg, bufferSize);
 }
 
 template <typename T>
-cusparseStatus_t cusparsespmv(cusparseHandle_t handle, cusparseOperation_t opA,
-                              const T* alpha, const cusparseSpMatDescr_t matA,
-                              const cusparseDnVecDescr_t vecX, const T* beta,
+cusparseStatus_t cusparsespmv(cusparseHandle_t handle,
+                              cusparseOperation_t opA,
+                              const T* alpha,
+                              const cusparseSpMatDescr_t matA,
+                              const cusparseDnVecDescr_t vecX,
+                              const T* beta,
                               const cusparseDnVecDescr_t vecY,
-                              cusparseSpMVAlg_t alg, T* externalBuffer,
+                              cusparseSpMVAlg_t alg,
+                              T* externalBuffer,
                               cudaStream_t stream);
 template <>
-inline cusparseStatus_t cusparsespmv(
-  cusparseHandle_t handle, cusparseOperation_t opA, const float* alpha,
-  const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX,
-  const float* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg,
-  float* externalBuffer, cudaStream_t stream) {
+inline cusparseStatus_t cusparsespmv(cusparseHandle_t handle,
+                                     cusparseOperation_t opA,
+                                     const float* alpha,
+                                     const cusparseSpMatDescr_t matA,
+                                     const cusparseDnVecDescr_t vecX,
+                                     const float* beta,
+                                     const cusparseDnVecDescr_t vecY,
+                                     cusparseSpMVAlg_t alg,
+                                     float* externalBuffer,
+                                     cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_32F,
-                      alg, externalBuffer);
+  return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_32F, alg, externalBuffer);
 }
 template <>
-inline cusparseStatus_t cusparsespmv(
-  cusparseHandle_t handle, cusparseOperation_t opA, const double* alpha,
-  const cusparseSpMatDescr_t matA, const cusparseDnVecDescr_t vecX,
-  const double* beta, const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg,
-  double* externalBuffer, cudaStream_t stream) {
+inline cusparseStatus_t cusparsespmv(cusparseHandle_t handle,
+                                     cusparseOperation_t opA,
+                                     const double* alpha,
+                                     const cusparseSpMatDescr_t matA,
+                                     const cusparseDnVecDescr_t vecX,
+                                     const double* beta,
+                                     const cusparseDnVecDescr_t vecY,
+                                     cusparseSpMVAlg_t alg,
+                                     double* externalBuffer,
+                                     cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_64F,
-                      alg, externalBuffer);
+  return cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, CUDA_R_64F, alg, externalBuffer);
 }
 /** @} */
 #else
@@ -398,29 +549,59 @@ inline cusparseStatus_t cusparsespmv(
  */
 template <typename T>
 cusparseStatus_t cusparsecsrmv(  // NOLINT
-  cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz,
-  const T* alpha, const cusparseMatDescr_t descr, const T* csrVal,
-  const int* csrRowPtr, const int* csrColInd, const T* x, const T* beta, T* y,
+  cusparseHandle_t handle,
+  cusparseOperation_t trans,
+  int m,
+  int n,
+  int nnz,
+  const T* alpha,
+  const cusparseMatDescr_t descr,
+  const T* csrVal,
+  const int* csrRowPtr,
+  const int* csrColInd,
+  const T* x,
+  const T* beta,
+  T* y,
   cudaStream_t stream);
 template <>
-inline cusparseStatus_t cusparsecsrmv(
-  cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz,
-  const float* alpha, const cusparseMatDescr_t descr, const float* csrVal,
-  const int* csrRowPtr, const int* csrColInd, const float* x, const float* beta,
-  float* y, cudaStream_t stream) {
+inline cusparseStatus_t cusparsecsrmv(cusparseHandle_t handle,
+                                      cusparseOperation_t trans,
+                                      int m,
+                                      int n,
+                                      int nnz,
+                                      const float* alpha,
+                                      const cusparseMatDescr_t descr,
+                                      const float* csrVal,
+                                      const int* csrRowPtr,
+                                      const int* csrColInd,
+                                      const float* x,
+                                      const float* beta,
+                                      float* y,
+                                      cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseScsrmv(handle, trans, m, n, nnz, alpha, descr, csrVal,
-                        csrRowPtr, csrColInd, x, beta, y);
+  return cusparseScsrmv(
+    handle, trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y);
 }
 template <>
-inline cusparseStatus_t cusparsecsrmv(
-  cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int nnz,
-  const double* alpha, const cusparseMatDescr_t descr, const double* csrVal,
-  const int* csrRowPtr, const int* csrColInd, const double* x,
-  const double* beta, double* y, cudaStream_t stream) {
+inline cusparseStatus_t cusparsecsrmv(cusparseHandle_t handle,
+                                      cusparseOperation_t trans,
+                                      int m,
+                                      int n,
+                                      int nnz,
+                                      const double* alpha,
+                                      const cusparseMatDescr_t descr,
+                                      const double* csrVal,
+                                      const int* csrRowPtr,
+                                      const int* csrColInd,
+                                      const double* x,
+                                      const double* beta,
+                                      double* y,
+                                      cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseDcsrmv(handle, trans, m, n, nnz, alpha, descr, csrVal,
-                        csrRowPtr, csrColInd, x, beta, y);
+  return cusparseDcsrmv(
+    handle, trans, m, n, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, beta, y);
 }
 /** @} */
 #endif
@@ -431,58 +612,96 @@ inline cusparseStatus_t cusparsecsrmv(
  * @{
  */
 template <typename T>
-cusparseStatus_t cusparsespmm_bufferSize(
-  cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-  const T* alpha, const cusparseSpMatDescr_t matA,
-  const cusparseDnMatDescr_t matB, const T* beta, cusparseDnMatDescr_t matC,
-  cusparseSpMMAlg_t alg, size_t* bufferSize, cudaStream_t stream);
+cusparseStatus_t cusparsespmm_bufferSize(cusparseHandle_t handle,
+                                         cusparseOperation_t opA,
+                                         cusparseOperation_t opB,
+                                         const T* alpha,
+                                         const cusparseSpMatDescr_t matA,
+                                         const cusparseDnMatDescr_t matB,
+                                         const T* beta,
+                                         cusparseDnMatDescr_t matC,
+                                         cusparseSpMMAlg_t alg,
+                                         size_t* bufferSize,
+                                         cudaStream_t stream);
 template <>
-inline cusparseStatus_t cusparsespmm_bufferSize(
-  cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-  const float* alpha, const cusparseSpMatDescr_t matA,
-  const cusparseDnMatDescr_t matB, const float* beta, cusparseDnMatDescr_t matC,
-  cusparseSpMMAlg_t alg, size_t* bufferSize, cudaStream_t stream) {
+inline cusparseStatus_t cusparsespmm_bufferSize(cusparseHandle_t handle,
+                                                cusparseOperation_t opA,
+                                                cusparseOperation_t opB,
+                                                const float* alpha,
+                                                const cusparseSpMatDescr_t matA,
+                                                const cusparseDnMatDescr_t matB,
+                                                const float* beta,
+                                                cusparseDnMatDescr_t matC,
+                                                cusparseSpMMAlg_t alg,
+                                                size_t* bufferSize,
+                                                cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseSpMM_bufferSize(handle, opA, opB, alpha, matA, matB, beta,
-                                 matC, CUDA_R_32F, alg, bufferSize);
+  return cusparseSpMM_bufferSize(
+    handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_32F, alg, bufferSize);
 }
 template <>
-inline cusparseStatus_t cusparsespmm_bufferSize(
-  cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-  const double* alpha, const cusparseSpMatDescr_t matA,
-  const cusparseDnMatDescr_t matB, const double* beta,
-  cusparseDnMatDescr_t matC, cusparseSpMMAlg_t alg, size_t* bufferSize,
-  cudaStream_t stream) {
+inline cusparseStatus_t cusparsespmm_bufferSize(cusparseHandle_t handle,
+                                                cusparseOperation_t opA,
+                                                cusparseOperation_t opB,
+                                                const double* alpha,
+                                                const cusparseSpMatDescr_t matA,
+                                                const cusparseDnMatDescr_t matB,
+                                                const double* beta,
+                                                cusparseDnMatDescr_t matC,
+                                                cusparseSpMMAlg_t alg,
+                                                size_t* bufferSize,
+                                                cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseSpMM_bufferSize(handle, opA, opB, alpha, matA, matB, beta,
-                                 matC, CUDA_R_64F, alg, bufferSize);
+  return cusparseSpMM_bufferSize(
+    handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_64F, alg, bufferSize);
 }
 template <typename T>
-inline cusparseStatus_t cusparsespmm(
-  cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-  const T* alpha, const cusparseSpMatDescr_t matA,
-  const cusparseDnMatDescr_t matB, const T* beta, cusparseDnMatDescr_t matC,
-  cusparseSpMMAlg_t alg, T* externalBuffer, cudaStream_t stream);
+inline cusparseStatus_t cusparsespmm(cusparseHandle_t handle,
+                                     cusparseOperation_t opA,
+                                     cusparseOperation_t opB,
+                                     const T* alpha,
+                                     const cusparseSpMatDescr_t matA,
+                                     const cusparseDnMatDescr_t matB,
+                                     const T* beta,
+                                     cusparseDnMatDescr_t matC,
+                                     cusparseSpMMAlg_t alg,
+                                     T* externalBuffer,
+                                     cudaStream_t stream);
 template <>
-inline cusparseStatus_t cusparsespmm(
-  cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-  const float* alpha, const cusparseSpMatDescr_t matA,
-  const cusparseDnMatDescr_t matB, const float* beta, cusparseDnMatDescr_t matC,
-  cusparseSpMMAlg_t alg, float* externalBuffer, cudaStream_t stream) {
+inline cusparseStatus_t cusparsespmm(cusparseHandle_t handle,
+                                     cusparseOperation_t opA,
+                                     cusparseOperation_t opB,
+                                     const float* alpha,
+                                     const cusparseSpMatDescr_t matA,
+                                     const cusparseDnMatDescr_t matB,
+                                     const float* beta,
+                                     cusparseDnMatDescr_t matC,
+                                     cusparseSpMMAlg_t alg,
+                                     float* externalBuffer,
+                                     cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseSpMM(handle, opA, opB, alpha, matA, matB, beta, matC,
-                      CUDA_R_32F, alg, externalBuffer);
+  return cusparseSpMM(
+    handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_32F, alg, externalBuffer);
 }
 template <>
-inline cusparseStatus_t cusparsespmm(
-  cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB,
-  const double* alpha, const cusparseSpMatDescr_t matA,
-  const cusparseDnMatDescr_t matB, const double* beta,
-  cusparseDnMatDescr_t matC, cusparseSpMMAlg_t alg, double* externalBuffer,
-  cudaStream_t stream) {
+inline cusparseStatus_t cusparsespmm(cusparseHandle_t handle,
+                                     cusparseOperation_t opA,
+                                     cusparseOperation_t opB,
+                                     const double* alpha,
+                                     const cusparseSpMatDescr_t matA,
+                                     const cusparseDnMatDescr_t matB,
+                                     const double* beta,
+                                     cusparseDnMatDescr_t matC,
+                                     cusparseSpMMAlg_t alg,
+                                     double* externalBuffer,
+                                     cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseSpMM(handle, opA, opB, alpha, matA, matB, beta, matC,
-                      CUDA_R_64F, alg, externalBuffer);
+  return cusparseSpMM(
+    handle, opA, opB, alpha, matA, matB, beta, matC, CUDA_R_64F, alg, externalBuffer);
 }
 /** @} */
 #else
@@ -492,31 +711,68 @@ inline cusparseStatus_t cusparsespmm(
  */
 template <typename T>
 cusparseStatus_t cusparsecsrmm(  // NOLINT
-  cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k,
-  int nnz, const T* alpha, const cusparseMatDescr_t descr, const T* csrVal,
-  const int* csrRowPtr, const int* csrColInd, const T* x, const int ldx,
-  const T* beta, T* y, const int ldy, cudaStream_t stream);
+  cusparseHandle_t handle,
+  cusparseOperation_t trans,
+  int m,
+  int n,
+  int k,
+  int nnz,
+  const T* alpha,
+  const cusparseMatDescr_t descr,
+  const T* csrVal,
+  const int* csrRowPtr,
+  const int* csrColInd,
+  const T* x,
+  const int ldx,
+  const T* beta,
+  T* y,
+  const int ldy,
+  cudaStream_t stream);
 template <>
-inline cusparseStatus_t cusparsecsrmm(
-  cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k,
-  int nnz, const float* alpha, const cusparseMatDescr_t descr,
-  const float* csrVal, const int* csrRowPtr, const int* csrColInd,
-  const float* x, const int ldx, const float* beta, float* y, const int ldy,
-  cudaStream_t stream) {
+inline cusparseStatus_t cusparsecsrmm(cusparseHandle_t handle,
+                                      cusparseOperation_t trans,
+                                      int m,
+                                      int n,
+                                      int k,
+                                      int nnz,
+                                      const float* alpha,
+                                      const cusparseMatDescr_t descr,
+                                      const float* csrVal,
+                                      const int* csrRowPtr,
+                                      const int* csrColInd,
+                                      const float* x,
+                                      const int ldx,
+                                      const float* beta,
+                                      float* y,
+                                      const int ldy,
+                                      cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseScsrmm(handle, trans, m, n, k, nnz, alpha, descr, csrVal,
-                        csrRowPtr, csrColInd, x, ldx, beta, y, ldy);
+  return cusparseScsrmm(
+    handle, trans, m, n, k, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, ldx, beta, y, ldy);
 }
 template <>
-inline cusparseStatus_t cusparsecsrmm(
-  cusparseHandle_t handle, cusparseOperation_t trans, int m, int n, int k,
-  int nnz, const double* alpha, const cusparseMatDescr_t descr,
-  const double* csrVal, const int* csrRowPtr, const int* csrColInd,
-  const double* x, const int ldx, const double* beta, double* y, const int ldy,
-  cudaStream_t stream) {
+inline cusparseStatus_t cusparsecsrmm(cusparseHandle_t handle,
+                                      cusparseOperation_t trans,
+                                      int m,
+                                      int n,
+                                      int k,
+                                      int nnz,
+                                      const double* alpha,
+                                      const cusparseMatDescr_t descr,
+                                      const double* csrVal,
+                                      const int* csrRowPtr,
+                                      const int* csrColInd,
+                                      const double* x,
+                                      const int ldx,
+                                      const double* beta,
+                                      double* y,
+                                      const int ldy,
+                                      cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseDcsrmm(handle, trans, m, n, k, nnz, alpha, descr, csrVal,
-                        csrRowPtr, csrColInd, x, ldx, beta, y, ldy);
+  return cusparseDcsrmm(
+    handle, trans, m, n, k, nnz, alpha, descr, csrVal, csrRowPtr, csrColInd, x, ldx, beta, y, ldy);
 }
 /** @} */
 #endif
@@ -527,15 +783,22 @@ inline cusparseStatus_t cusparsecsrmm(
  */
 template <typename T>
 void cusparsecsr2coo(  // NOLINT
-  cusparseHandle_t handle, const int n, const int nnz, const T* csrRowPtr,
-  T* cooRowInd, cudaStream_t stream);
+  cusparseHandle_t handle,
+  const int n,
+  const int nnz,
+  const T* csrRowPtr,
+  T* cooRowInd,
+  cudaStream_t stream);
 template <>
-inline void cusparsecsr2coo(cusparseHandle_t handle, const int n, const int nnz,
-                            const int* csrRowPtr, int* cooRowInd,
-                            cudaStream_t stream) {
+inline void cusparsecsr2coo(cusparseHandle_t handle,
+                            const int n,
+                            const int nnz,
+                            const int* csrRowPtr,
+                            int* cooRowInd,
+                            cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  CUSPARSE_CHECK(cusparseXcsr2coo(handle, csrRowPtr, nnz, n, cooRowInd,
-                                  CUSPARSE_INDEX_BASE_ZERO));
+  CUSPARSE_CHECK(cusparseXcsr2coo(handle, csrRowPtr, nnz, n, cooRowInd, CUSPARSE_INDEX_BASE_ZERO));
 }
 /** @} */
 
@@ -553,7 +816,8 @@ inline void cusparsecsr2coo(cusparseHandle_t handle, const int n, const int nnz,
 // template<>
 inline cusparseStatus_t cusparsesetpointermode(cusparseHandle_t handle,
                                                cusparsePointerMode_t mode,
-                                               cudaStream_t stream) {
+                                               cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
   return cusparseSetPointerMode(handle, mode);
 }
@@ -564,69 +828,203 @@ inline cusparseStatus_t cusparsesetpointermode(cusparseHandle_t handle,
  * @{
  */
 template <typename T>
-cusparseStatus_t cusparsecsrmvex_bufferSize(
-  cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
-  int m, int n, int nnz, const T* alpha, const cusparseMatDescr_t descrA,
-  const T* csrValA, const int* csrRowPtrA, const int* csrColIndA, const T* x,
-  const T* beta, T* y, size_t* bufferSizeInBytes, cudaStream_t stream);
-template <>
-inline cusparseStatus_t cusparsecsrmvex_bufferSize(
-  cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
-  int m, int n, int nnz, const float* alpha, const cusparseMatDescr_t descrA,
-  const float* csrValA, const int* csrRowPtrA, const int* csrColIndA,
-  const float* x, const float* beta, float* y, size_t* bufferSizeInBytes,
-  cudaStream_t stream) {
-  CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseCsrmvEx_bufferSize(
-    handle, alg, transA, m, n, nnz, alpha, CUDA_R_32F, descrA, csrValA,
-    CUDA_R_32F, csrRowPtrA, csrColIndA, x, CUDA_R_32F, beta, CUDA_R_32F, y,
-    CUDA_R_32F, CUDA_R_32F, bufferSizeInBytes);
-}
-template <>
-inline cusparseStatus_t cusparsecsrmvex_bufferSize(
-  cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
-  int m, int n, int nnz, const double* alpha, const cusparseMatDescr_t descrA,
-  const double* csrValA, const int* csrRowPtrA, const int* csrColIndA,
-  const double* x, const double* beta, double* y, size_t* bufferSizeInBytes,
-  cudaStream_t stream) {
-  CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseCsrmvEx_bufferSize(
-    handle, alg, transA, m, n, nnz, alpha, CUDA_R_64F, descrA, csrValA,
-    CUDA_R_64F, csrRowPtrA, csrColIndA, x, CUDA_R_64F, beta, CUDA_R_64F, y,
-    CUDA_R_64F, CUDA_R_64F, bufferSizeInBytes);
+cusparseStatus_t cusparsecsrmvex_bufferSize(cusparseHandle_t handle,
+                                            cusparseAlgMode_t alg,
+                                            cusparseOperation_t transA,
+                                            int m,
+                                            int n,
+                                            int nnz,
+                                            const T* alpha,
+                                            const cusparseMatDescr_t descrA,
+                                            const T* csrValA,
+                                            const int* csrRowPtrA,
+                                            const int* csrColIndA,
+                                            const T* x,
+                                            const T* beta,
+                                            T* y,
+                                            size_t* bufferSizeInBytes,
+                                            cudaStream_t stream);
+template <>
+inline cusparseStatus_t cusparsecsrmvex_bufferSize(cusparseHandle_t handle,
+                                                   cusparseAlgMode_t alg,
+                                                   cusparseOperation_t transA,
+                                                   int m,
+                                                   int n,
+                                                   int nnz,
+                                                   const float* alpha,
+                                                   const cusparseMatDescr_t descrA,
+                                                   const float* csrValA,
+                                                   const int* csrRowPtrA,
+                                                   const int* csrColIndA,
+                                                   const float* x,
+                                                   const float* beta,
+                                                   float* y,
+                                                   size_t* bufferSizeInBytes,
+                                                   cudaStream_t stream)
+{
+  CUSPARSE_CHECK(cusparseSetStream(handle, stream));
+  return cusparseCsrmvEx_bufferSize(handle,
+                                    alg,
+                                    transA,
+                                    m,
+                                    n,
+                                    nnz,
+                                    alpha,
+                                    CUDA_R_32F,
+                                    descrA,
+                                    csrValA,
+                                    CUDA_R_32F,
+                                    csrRowPtrA,
+                                    csrColIndA,
+                                    x,
+                                    CUDA_R_32F,
+                                    beta,
+                                    CUDA_R_32F,
+                                    y,
+                                    CUDA_R_32F,
+                                    CUDA_R_32F,
+                                    bufferSizeInBytes);
+}
+template <>
+inline cusparseStatus_t cusparsecsrmvex_bufferSize(cusparseHandle_t handle,
+                                                   cusparseAlgMode_t alg,
+                                                   cusparseOperation_t transA,
+                                                   int m,
+                                                   int n,
+                                                   int nnz,
+                                                   const double* alpha,
+                                                   const cusparseMatDescr_t descrA,
+                                                   const double* csrValA,
+                                                   const int* csrRowPtrA,
+                                                   const int* csrColIndA,
+                                                   const double* x,
+                                                   const double* beta,
+                                                   double* y,
+                                                   size_t* bufferSizeInBytes,
+                                                   cudaStream_t stream)
+{
+  CUSPARSE_CHECK(cusparseSetStream(handle, stream));
+  return cusparseCsrmvEx_bufferSize(handle,
+                                    alg,
+                                    transA,
+                                    m,
+                                    n,
+                                    nnz,
+                                    alpha,
+                                    CUDA_R_64F,
+                                    descrA,
+                                    csrValA,
+                                    CUDA_R_64F,
+                                    csrRowPtrA,
+                                    csrColIndA,
+                                    x,
+                                    CUDA_R_64F,
+                                    beta,
+                                    CUDA_R_64F,
+                                    y,
+                                    CUDA_R_64F,
+                                    CUDA_R_64F,
+                                    bufferSizeInBytes);
 }
 
 template <typename T>
-cusparseStatus_t cusparsecsrmvex(
-  cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
-  int m, int n, int nnz, const T* alpha, const cusparseMatDescr_t descrA,
-  const T* csrValA, const int* csrRowPtrA, const int* csrColIndA, const T* x,
-  const T* beta, T* y, T* buffer, cudaStream_t stream);
-template <>
-inline cusparseStatus_t cusparsecsrmvex(
-  cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
-  int m, int n, int nnz, const float* alpha, const cusparseMatDescr_t descrA,
-  const float* csrValA, const int* csrRowPtrA, const int* csrColIndA,
-  const float* x, const float* beta, float* y, float* buffer,
-  cudaStream_t stream) {
-  CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseCsrmvEx(handle, alg, transA, m, n, nnz, alpha, CUDA_R_32F,
-                         descrA, csrValA, CUDA_R_32F, csrRowPtrA, csrColIndA, x,
-                         CUDA_R_32F, beta, CUDA_R_32F, y, CUDA_R_32F,
-                         CUDA_R_32F, buffer);
-}
-template <>
-inline cusparseStatus_t cusparsecsrmvex(
-  cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA,
-  int m, int n, int nnz, const double* alpha, const cusparseMatDescr_t descrA,
-  const double* csrValA, const int* csrRowPtrA, const int* csrColIndA,
-  const double* x, const double* beta, double* y, double* buffer,
-  cudaStream_t stream) {
-  CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseCsrmvEx(handle, alg, transA, m, n, nnz, alpha, CUDA_R_64F,
-                         descrA, csrValA, CUDA_R_64F, csrRowPtrA, csrColIndA, x,
-                         CUDA_R_64F, beta, CUDA_R_64F, y, CUDA_R_64F,
-                         CUDA_R_64F, buffer);
+cusparseStatus_t cusparsecsrmvex(cusparseHandle_t handle,
+                                 cusparseAlgMode_t alg,
+                                 cusparseOperation_t transA,
+                                 int m,
+                                 int n,
+                                 int nnz,
+                                 const T* alpha,
+                                 const cusparseMatDescr_t descrA,
+                                 const T* csrValA,
+                                 const int* csrRowPtrA,
+                                 const int* csrColIndA,
+                                 const T* x,
+                                 const T* beta,
+                                 T* y,
+                                 T* buffer,
+                                 cudaStream_t stream);
+template <>
+inline cusparseStatus_t cusparsecsrmvex(cusparseHandle_t handle,
+                                        cusparseAlgMode_t alg,
+                                        cusparseOperation_t transA,
+                                        int m,
+                                        int n,
+                                        int nnz,
+                                        const float* alpha,
+                                        const cusparseMatDescr_t descrA,
+                                        const float* csrValA,
+                                        const int* csrRowPtrA,
+                                        const int* csrColIndA,
+                                        const float* x,
+                                        const float* beta,
+                                        float* y,
+                                        float* buffer,
+                                        cudaStream_t stream)
+{
+  CUSPARSE_CHECK(cusparseSetStream(handle, stream));
+  return cusparseCsrmvEx(handle,
+                         alg,
+                         transA,
+                         m,
+                         n,
+                         nnz,
+                         alpha,
+                         CUDA_R_32F,
+                         descrA,
+                         csrValA,
+                         CUDA_R_32F,
+                         csrRowPtrA,
+                         csrColIndA,
+                         x,
+                         CUDA_R_32F,
+                         beta,
+                         CUDA_R_32F,
+                         y,
+                         CUDA_R_32F,
+                         CUDA_R_32F,
+                         buffer);
+}
+template <>
+inline cusparseStatus_t cusparsecsrmvex(cusparseHandle_t handle,
+                                        cusparseAlgMode_t alg,
+                                        cusparseOperation_t transA,
+                                        int m,
+                                        int n,
+                                        int nnz,
+                                        const double* alpha,
+                                        const cusparseMatDescr_t descrA,
+                                        const double* csrValA,
+                                        const int* csrRowPtrA,
+                                        const int* csrColIndA,
+                                        const double* x,
+                                        const double* beta,
+                                        double* y,
+                                        double* buffer,
+                                        cudaStream_t stream)
+{
+  CUSPARSE_CHECK(cusparseSetStream(handle, stream));
+  return cusparseCsrmvEx(handle,
+                         alg,
+                         transA,
+                         m,
+                         n,
+                         nnz,
+                         alpha,
+                         CUDA_R_64F,
+                         descrA,
+                         csrValA,
+                         CUDA_R_64F,
+                         csrRowPtrA,
+                         csrColIndA,
+                         x,
+                         CUDA_R_64F,
+                         beta,
+                         CUDA_R_64F,
+                         y,
+                         CUDA_R_64F,
+                         CUDA_R_64F,
+                         buffer);
 }
 
 /** @} */
@@ -637,68 +1035,180 @@ inline cusparseStatus_t cusparsecsrmvex(
  */
 
 template <typename T>
-cusparseStatus_t cusparsecsr2csc_bufferSize(
-  cusparseHandle_t handle, int m, int n, int nnz, const T* csrVal,
-  const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr,
-  int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase,
-  cusparseCsr2CscAlg_t alg, size_t* bufferSize, cudaStream_t stream);
+cusparseStatus_t cusparsecsr2csc_bufferSize(cusparseHandle_t handle,
+                                            int m,
+                                            int n,
+                                            int nnz,
+                                            const T* csrVal,
+                                            const int* csrRowPtr,
+                                            const int* csrColInd,
+                                            void* cscVal,
+                                            int* cscColPtr,
+                                            int* cscRowInd,
+                                            cusparseAction_t copyValues,
+                                            cusparseIndexBase_t idxBase,
+                                            cusparseCsr2CscAlg_t alg,
+                                            size_t* bufferSize,
+                                            cudaStream_t stream);
 
 template <>
-inline cusparseStatus_t cusparsecsr2csc_bufferSize(
-  cusparseHandle_t handle, int m, int n, int nnz, const float* csrVal,
-  const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr,
-  int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase,
-  cusparseCsr2CscAlg_t alg, size_t* bufferSize, cudaStream_t stream) {
+inline cusparseStatus_t cusparsecsr2csc_bufferSize(cusparseHandle_t handle,
+                                                   int m,
+                                                   int n,
+                                                   int nnz,
+                                                   const float* csrVal,
+                                                   const int* csrRowPtr,
+                                                   const int* csrColInd,
+                                                   void* cscVal,
+                                                   int* cscColPtr,
+                                                   int* cscRowInd,
+                                                   cusparseAction_t copyValues,
+                                                   cusparseIndexBase_t idxBase,
+                                                   cusparseCsr2CscAlg_t alg,
+                                                   size_t* bufferSize,
+                                                   cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 
-  return cusparseCsr2cscEx2_bufferSize(
-    handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscColPtr,
-    cscRowInd, CUDA_R_32F, copyValues, idxBase, alg, bufferSize);
+  return cusparseCsr2cscEx2_bufferSize(handle,
+                                       m,
+                                       n,
+                                       nnz,
+                                       csrVal,
+                                       csrRowPtr,
+                                       csrColInd,
+                                       cscVal,
+                                       cscColPtr,
+                                       cscRowInd,
+                                       CUDA_R_32F,
+                                       copyValues,
+                                       idxBase,
+                                       alg,
+                                       bufferSize);
 }
 template <>
-inline cusparseStatus_t cusparsecsr2csc_bufferSize(
-  cusparseHandle_t handle, int m, int n, int nnz, const double* csrVal,
-  const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr,
-  int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase,
-  cusparseCsr2CscAlg_t alg, size_t* bufferSize, cudaStream_t stream) {
+inline cusparseStatus_t cusparsecsr2csc_bufferSize(cusparseHandle_t handle,
+                                                   int m,
+                                                   int n,
+                                                   int nnz,
+                                                   const double* csrVal,
+                                                   const int* csrRowPtr,
+                                                   const int* csrColInd,
+                                                   void* cscVal,
+                                                   int* cscColPtr,
+                                                   int* cscRowInd,
+                                                   cusparseAction_t copyValues,
+                                                   cusparseIndexBase_t idxBase,
+                                                   cusparseCsr2CscAlg_t alg,
+                                                   size_t* bufferSize,
+                                                   cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 
-  return cusparseCsr2cscEx2_bufferSize(
-    handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscColPtr,
-    cscRowInd, CUDA_R_64F, copyValues, idxBase, alg, bufferSize);
+  return cusparseCsr2cscEx2_bufferSize(handle,
+                                       m,
+                                       n,
+                                       nnz,
+                                       csrVal,
+                                       csrRowPtr,
+                                       csrColInd,
+                                       cscVal,
+                                       cscColPtr,
+                                       cscRowInd,
+                                       CUDA_R_64F,
+                                       copyValues,
+                                       idxBase,
+                                       alg,
+                                       bufferSize);
 }
 
 template <typename T>
-cusparseStatus_t cusparsecsr2csc(
-  cusparseHandle_t handle, int m, int n, int nnz, const T* csrVal,
-  const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr,
-  int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase,
-  cusparseCsr2CscAlg_t alg, void* buffer, cudaStream_t stream);
+cusparseStatus_t cusparsecsr2csc(cusparseHandle_t handle,
+                                 int m,
+                                 int n,
+                                 int nnz,
+                                 const T* csrVal,
+                                 const int* csrRowPtr,
+                                 const int* csrColInd,
+                                 void* cscVal,
+                                 int* cscColPtr,
+                                 int* cscRowInd,
+                                 cusparseAction_t copyValues,
+                                 cusparseIndexBase_t idxBase,
+                                 cusparseCsr2CscAlg_t alg,
+                                 void* buffer,
+                                 cudaStream_t stream);
 
 template <>
-inline cusparseStatus_t cusparsecsr2csc(
-  cusparseHandle_t handle, int m, int n, int nnz, const float* csrVal,
-  const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr,
-  int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase,
-  cusparseCsr2CscAlg_t alg, void* buffer, cudaStream_t stream) {
+inline cusparseStatus_t cusparsecsr2csc(cusparseHandle_t handle,
+                                        int m,
+                                        int n,
+                                        int nnz,
+                                        const float* csrVal,
+                                        const int* csrRowPtr,
+                                        const int* csrColInd,
+                                        void* cscVal,
+                                        int* cscColPtr,
+                                        int* cscRowInd,
+                                        cusparseAction_t copyValues,
+                                        cusparseIndexBase_t idxBase,
+                                        cusparseCsr2CscAlg_t alg,
+                                        void* buffer,
+                                        cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 
-  return cusparseCsr2cscEx2(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd,
-                            cscVal, cscColPtr, cscRowInd, CUDA_R_32F,
-                            copyValues, idxBase, alg, buffer);
+  return cusparseCsr2cscEx2(handle,
+                            m,
+                            n,
+                            nnz,
+                            csrVal,
+                            csrRowPtr,
+                            csrColInd,
+                            cscVal,
+                            cscColPtr,
+                            cscRowInd,
+                            CUDA_R_32F,
+                            copyValues,
+                            idxBase,
+                            alg,
+                            buffer);
 }
 
 template <>
-inline cusparseStatus_t cusparsecsr2csc(
-  cusparseHandle_t handle, int m, int n, int nnz, const double* csrVal,
-  const int* csrRowPtr, const int* csrColInd, void* cscVal, int* cscColPtr,
-  int* cscRowInd, cusparseAction_t copyValues, cusparseIndexBase_t idxBase,
-  cusparseCsr2CscAlg_t alg, void* buffer, cudaStream_t stream) {
+inline cusparseStatus_t cusparsecsr2csc(cusparseHandle_t handle,
+                                        int m,
+                                        int n,
+                                        int nnz,
+                                        const double* csrVal,
+                                        const int* csrRowPtr,
+                                        const int* csrColInd,
+                                        void* cscVal,
+                                        int* cscColPtr,
+                                        int* cscRowInd,
+                                        cusparseAction_t copyValues,
+                                        cusparseIndexBase_t idxBase,
+                                        cusparseCsr2CscAlg_t alg,
+                                        void* buffer,
+                                        cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 
-  return cusparseCsr2cscEx2(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd,
-                            cscVal, cscColPtr, cscRowInd, CUDA_R_64F,
-                            copyValues, idxBase, alg, buffer);
+  return cusparseCsr2cscEx2(handle,
+                            m,
+                            n,
+                            nnz,
+                            csrVal,
+                            csrRowPtr,
+                            csrColInd,
+                            cscVal,
+                            cscColPtr,
+                            cscRowInd,
+                            CUDA_R_64F,
+                            copyValues,
+                            idxBase,
+                            alg,
+                            buffer);
 }
 
 /** @} */
@@ -709,120 +1219,329 @@ inline cusparseStatus_t cusparsecsr2csc(
  */
 
 template <typename T>
-cusparseStatus_t cusparsecsrgemm2_buffersizeext(
-  cusparseHandle_t handle, int m, int n, int k, const T* alpha, const T* beta,
-  const cusparseMatDescr_t matA, int nnzA, const int* rowindA,
-  const int* indicesA, const cusparseMatDescr_t matB, int nnzB,
-  const int* rowindB, const int* indicesB, const cusparseMatDescr_t matD,
-  int nnzD, const int* rowindD, const int* indicesD, csrgemm2Info_t info,
-  size_t* pBufferSizeInBytes, cudaStream_t stream);
-
-template <>
-inline cusparseStatus_t cusparsecsrgemm2_buffersizeext(
-  cusparseHandle_t handle, int m, int n, int k, const float* alpha,
-  const float* beta, const cusparseMatDescr_t matA, int nnzA,
-  const int* rowindA, const int* indicesA, const cusparseMatDescr_t matB,
-  int nnzB, const int* rowindB, const int* indicesB,
-  const cusparseMatDescr_t matD, int nnzD, const int* rowindD,
-  const int* indicesD, csrgemm2Info_t info, size_t* pBufferSizeInBytes,
-  cudaStream_t stream) {
+cusparseStatus_t cusparsecsrgemm2_buffersizeext(cusparseHandle_t handle,
+                                                int m,
+                                                int n,
+                                                int k,
+                                                const T* alpha,
+                                                const T* beta,
+                                                const cusparseMatDescr_t matA,
+                                                int nnzA,
+                                                const int* rowindA,
+                                                const int* indicesA,
+                                                const cusparseMatDescr_t matB,
+                                                int nnzB,
+                                                const int* rowindB,
+                                                const int* indicesB,
+                                                const cusparseMatDescr_t matD,
+                                                int nnzD,
+                                                const int* rowindD,
+                                                const int* indicesD,
+                                                csrgemm2Info_t info,
+                                                size_t* pBufferSizeInBytes,
+                                                cudaStream_t stream);
+
+template <>
+inline cusparseStatus_t cusparsecsrgemm2_buffersizeext(cusparseHandle_t handle,
+                                                       int m,
+                                                       int n,
+                                                       int k,
+                                                       const float* alpha,
+                                                       const float* beta,
+                                                       const cusparseMatDescr_t matA,
+                                                       int nnzA,
+                                                       const int* rowindA,
+                                                       const int* indicesA,
+                                                       const cusparseMatDescr_t matB,
+                                                       int nnzB,
+                                                       const int* rowindB,
+                                                       const int* indicesB,
+                                                       const cusparseMatDescr_t matD,
+                                                       int nnzD,
+                                                       const int* rowindD,
+                                                       const int* indicesD,
+                                                       csrgemm2Info_t info,
+                                                       size_t* pBufferSizeInBytes,
+                                                       cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-  return cusparseScsrgemm2_bufferSizeExt(
-    handle, m, n, k, alpha, matA, nnzA, rowindA, indicesA, matB, nnzB, rowindB,
-    indicesB, beta, matD, nnzD, rowindD, indicesD, info, pBufferSizeInBytes);
+  return cusparseScsrgemm2_bufferSizeExt(handle,
+                                         m,
+                                         n,
+                                         k,
+                                         alpha,
+                                         matA,
+                                         nnzA,
+                                         rowindA,
+                                         indicesA,
+                                         matB,
+                                         nnzB,
+                                         rowindB,
+                                         indicesB,
+                                         beta,
+                                         matD,
+                                         nnzD,
+                                         rowindD,
+                                         indicesD,
+                                         info,
+                                         pBufferSizeInBytes);
 #pragma GCC diagnostic pop
 }
 
 template <>
-inline cusparseStatus_t cusparsecsrgemm2_buffersizeext(
-  cusparseHandle_t handle, int m, int n, int k, const double* alpha,
-  const double* beta, const cusparseMatDescr_t matA, int nnzA,
-  const int* rowindA, const int* indicesA, const cusparseMatDescr_t matB,
-  int nnzB, const int* rowindB, const int* indicesB,
-  const cusparseMatDescr_t matD, int nnzD, const int* rowindD,
-  const int* indicesD, csrgemm2Info_t info, size_t* pBufferSizeInBytes,
-  cudaStream_t stream) {
+inline cusparseStatus_t cusparsecsrgemm2_buffersizeext(cusparseHandle_t handle,
+                                                       int m,
+                                                       int n,
+                                                       int k,
+                                                       const double* alpha,
+                                                       const double* beta,
+                                                       const cusparseMatDescr_t matA,
+                                                       int nnzA,
+                                                       const int* rowindA,
+                                                       const int* indicesA,
+                                                       const cusparseMatDescr_t matB,
+                                                       int nnzB,
+                                                       const int* rowindB,
+                                                       const int* indicesB,
+                                                       const cusparseMatDescr_t matD,
+                                                       int nnzD,
+                                                       const int* rowindD,
+                                                       const int* indicesD,
+                                                       csrgemm2Info_t info,
+                                                       size_t* pBufferSizeInBytes,
+                                                       cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-  return cusparseDcsrgemm2_bufferSizeExt(
-    handle, m, n, k, alpha, matA, nnzA, rowindA, indicesA, matB, nnzB, rowindB,
-    indicesB, beta, matD, nnzD, rowindD, indicesD, info, pBufferSizeInBytes);
+  return cusparseDcsrgemm2_bufferSizeExt(handle,
+                                         m,
+                                         n,
+                                         k,
+                                         alpha,
+                                         matA,
+                                         nnzA,
+                                         rowindA,
+                                         indicesA,
+                                         matB,
+                                         nnzB,
+                                         rowindB,
+                                         indicesB,
+                                         beta,
+                                         matD,
+                                         nnzD,
+                                         rowindD,
+                                         indicesD,
+                                         info,
+                                         pBufferSizeInBytes);
 #pragma GCC diagnostic pop
 }
 
-inline cusparseStatus_t cusparsecsrgemm2nnz(
-  cusparseHandle_t handle, int m, int n, int k, const cusparseMatDescr_t matA,
-  int nnzA, const int* rowindA, const int* indicesA,
-  const cusparseMatDescr_t matB, int nnzB, const int* rowindB,
-  const int* indicesB, const cusparseMatDescr_t matD, int nnzD,
-  const int* rowindD, const int* indicesD, const cusparseMatDescr_t matC,
-  int* rowindC, int* nnzC, const csrgemm2Info_t info, void* pBuffer,
-  cudaStream_t stream) {
+inline cusparseStatus_t cusparsecsrgemm2nnz(cusparseHandle_t handle,
+                                            int m,
+                                            int n,
+                                            int k,
+                                            const cusparseMatDescr_t matA,
+                                            int nnzA,
+                                            const int* rowindA,
+                                            const int* indicesA,
+                                            const cusparseMatDescr_t matB,
+                                            int nnzB,
+                                            const int* rowindB,
+                                            const int* indicesB,
+                                            const cusparseMatDescr_t matD,
+                                            int nnzD,
+                                            const int* rowindD,
+                                            const int* indicesD,
+                                            const cusparseMatDescr_t matC,
+                                            int* rowindC,
+                                            int* nnzC,
+                                            const csrgemm2Info_t info,
+                                            void* pBuffer,
+                                            cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-  return cusparseXcsrgemm2Nnz(handle, m, n, k, matA, nnzA, rowindA, indicesA,
-                              matB, nnzB, rowindB, indicesB, matD, nnzD,
-                              rowindD, indicesD, matC, rowindC, nnzC, info,
+  return cusparseXcsrgemm2Nnz(handle,
+                              m,
+                              n,
+                              k,
+                              matA,
+                              nnzA,
+                              rowindA,
+                              indicesA,
+                              matB,
+                              nnzB,
+                              rowindB,
+                              indicesB,
+                              matD,
+                              nnzD,
+                              rowindD,
+                              indicesD,
+                              matC,
+                              rowindC,
+                              nnzC,
+                              info,
                               pBuffer);
 #pragma GCC diagnostic pop
 }
 
 template <typename T>
-cusparseStatus_t cusparsecsrgemm2(
-  cusparseHandle_t handle, int m, int n, int k, const T* alpha,
-  const cusparseMatDescr_t descrA, int nnzA, const T* csrValA,
-  const int* csrRowPtrA, const int* csrColIndA, const cusparseMatDescr_t descrB,
-  int nnzB, const T* csrValB, const int* csrRowPtrB, const int* csrColIndB,
-  const T* beta, const cusparseMatDescr_t descrD, int nnzD, const T* csrValD,
-  const int* csrRowPtrD, const int* csrColIndD, const cusparseMatDescr_t descrC,
-  T* csrValC, const int* csrRowPtrC, int* csrColIndC, const csrgemm2Info_t info,
-  void* pBuffer, cudaStream_t stream);
-
-template <>
-inline cusparseStatus_t cusparsecsrgemm2(
-  cusparseHandle_t handle, int m, int n, int k, const float* alpha,
-  const cusparseMatDescr_t descrA, int nnzA, const float* csrValA,
-  const int* csrRowPtrA, const int* csrColIndA, const cusparseMatDescr_t descrB,
-  int nnzB, const float* csrValB, const int* csrRowPtrB, const int* csrColIndB,
-  const float* beta, const cusparseMatDescr_t descrD, int nnzD,
-  const float* csrValD, const int* csrRowPtrD, const int* csrColIndD,
-  const cusparseMatDescr_t descrC, float* csrValC, const int* csrRowPtrC,
-  int* csrColIndC, const csrgemm2Info_t info, void* pBuffer,
-  cudaStream_t stream) {
+cusparseStatus_t cusparsecsrgemm2(cusparseHandle_t handle,
+                                  int m,
+                                  int n,
+                                  int k,
+                                  const T* alpha,
+                                  const cusparseMatDescr_t descrA,
+                                  int nnzA,
+                                  const T* csrValA,
+                                  const int* csrRowPtrA,
+                                  const int* csrColIndA,
+                                  const cusparseMatDescr_t descrB,
+                                  int nnzB,
+                                  const T* csrValB,
+                                  const int* csrRowPtrB,
+                                  const int* csrColIndB,
+                                  const T* beta,
+                                  const cusparseMatDescr_t descrD,
+                                  int nnzD,
+                                  const T* csrValD,
+                                  const int* csrRowPtrD,
+                                  const int* csrColIndD,
+                                  const cusparseMatDescr_t descrC,
+                                  T* csrValC,
+                                  const int* csrRowPtrC,
+                                  int* csrColIndC,
+                                  const csrgemm2Info_t info,
+                                  void* pBuffer,
+                                  cudaStream_t stream);
+
+template <>
+inline cusparseStatus_t cusparsecsrgemm2(cusparseHandle_t handle,
+                                         int m,
+                                         int n,
+                                         int k,
+                                         const float* alpha,
+                                         const cusparseMatDescr_t descrA,
+                                         int nnzA,
+                                         const float* csrValA,
+                                         const int* csrRowPtrA,
+                                         const int* csrColIndA,
+                                         const cusparseMatDescr_t descrB,
+                                         int nnzB,
+                                         const float* csrValB,
+                                         const int* csrRowPtrB,
+                                         const int* csrColIndB,
+                                         const float* beta,
+                                         const cusparseMatDescr_t descrD,
+                                         int nnzD,
+                                         const float* csrValD,
+                                         const int* csrRowPtrD,
+                                         const int* csrColIndD,
+                                         const cusparseMatDescr_t descrC,
+                                         float* csrValC,
+                                         const int* csrRowPtrC,
+                                         int* csrColIndC,
+                                         const csrgemm2Info_t info,
+                                         void* pBuffer,
+                                         cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-  return cusparseScsrgemm2(handle, m, n, k, alpha, descrA, nnzA, csrValA,
-                           csrRowPtrA, csrColIndA, descrB, nnzB, csrValB,
-                           csrRowPtrB, csrColIndB, beta, descrD, nnzD, csrValD,
-                           csrRowPtrD, csrColIndD, descrC, csrValC, csrRowPtrC,
-                           csrColIndC, info, pBuffer);
+  return cusparseScsrgemm2(handle,
+                           m,
+                           n,
+                           k,
+                           alpha,
+                           descrA,
+                           nnzA,
+                           csrValA,
+                           csrRowPtrA,
+                           csrColIndA,
+                           descrB,
+                           nnzB,
+                           csrValB,
+                           csrRowPtrB,
+                           csrColIndB,
+                           beta,
+                           descrD,
+                           nnzD,
+                           csrValD,
+                           csrRowPtrD,
+                           csrColIndD,
+                           descrC,
+                           csrValC,
+                           csrRowPtrC,
+                           csrColIndC,
+                           info,
+                           pBuffer);
 #pragma GCC diagnostic pop
 }
 
 template <>
-inline cusparseStatus_t cusparsecsrgemm2(
-  cusparseHandle_t handle, int m, int n, int k, const double* alpha,
-  const cusparseMatDescr_t descrA, int nnzA, const double* csrValA,
-  const int* csrRowPtrA, const int* csrColIndA, const cusparseMatDescr_t descrB,
-  int nnzB, const double* csrValB, const int* csrRowPtrB, const int* csrColIndB,
-  const double* beta, const cusparseMatDescr_t descrD, int nnzD,
-  const double* csrValD, const int* csrRowPtrD, const int* csrColIndD,
-  const cusparseMatDescr_t descrC, double* csrValC, const int* csrRowPtrC,
-  int* csrColIndC, const csrgemm2Info_t info, void* pBuffer,
-  cudaStream_t stream) {
+inline cusparseStatus_t cusparsecsrgemm2(cusparseHandle_t handle,
+                                         int m,
+                                         int n,
+                                         int k,
+                                         const double* alpha,
+                                         const cusparseMatDescr_t descrA,
+                                         int nnzA,
+                                         const double* csrValA,
+                                         const int* csrRowPtrA,
+                                         const int* csrColIndA,
+                                         const cusparseMatDescr_t descrB,
+                                         int nnzB,
+                                         const double* csrValB,
+                                         const int* csrRowPtrB,
+                                         const int* csrColIndB,
+                                         const double* beta,
+                                         const cusparseMatDescr_t descrD,
+                                         int nnzD,
+                                         const double* csrValD,
+                                         const int* csrRowPtrD,
+                                         const int* csrColIndD,
+                                         const cusparseMatDescr_t descrC,
+                                         double* csrValC,
+                                         const int* csrRowPtrC,
+                                         int* csrColIndC,
+                                         const csrgemm2Info_t info,
+                                         void* pBuffer,
+                                         cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-  return cusparseDcsrgemm2(handle, m, n, k, alpha, descrA, nnzA, csrValA,
-                           csrRowPtrA, csrColIndA, descrB, nnzB, csrValB,
-                           csrRowPtrB, csrColIndB, beta, descrD, nnzD, csrValD,
-                           csrRowPtrD, csrColIndD, descrC, csrValC, csrRowPtrC,
-                           csrColIndC, info, pBuffer);
+  return cusparseDcsrgemm2(handle,
+                           m,
+                           n,
+                           k,
+                           alpha,
+                           descrA,
+                           nnzA,
+                           csrValA,
+                           csrRowPtrA,
+                           csrColIndA,
+                           descrB,
+                           nnzB,
+                           csrValB,
+                           csrRowPtrB,
+                           csrColIndB,
+                           beta,
+                           descrD,
+                           nnzD,
+                           csrValD,
+                           csrRowPtrD,
+                           csrColIndD,
+                           descrC,
+                           csrValC,
+                           csrRowPtrC,
+                           csrColIndC,
+                           info,
+                           pBuffer);
 #pragma GCC diagnostic pop
 }
 
@@ -834,33 +1553,46 @@ inline cusparseStatus_t cusparsecsrgemm2(
  */
 
 template <typename T>
-cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, int m, int n,
+cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle,
+                                   int m,
+                                   int n,
                                    const cusparseMatDescr_t descrA,
-                                   const T* csrValA, const int* csrRowPtrA,
-                                   const int* csrColIndA, T* A, int lda,
+                                   const T* csrValA,
+                                   const int* csrRowPtrA,
+                                   const int* csrColIndA,
+                                   T* A,
+                                   int lda,
                                    cudaStream_t stream);
 
 template <>
-inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, int m, int n,
+inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle,
+                                          int m,
+                                          int n,
                                           const cusparseMatDescr_t descrA,
                                           const float* csrValA,
                                           const int* csrRowPtrA,
-                                          const int* csrColIndA, float* A,
-                                          int lda, cudaStream_t stream) {
+                                          const int* csrColIndA,
+                                          float* A,
+                                          int lda,
+                                          cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseScsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA,
-                            csrColIndA, A, lda);
+  return cusparseScsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A, lda);
 }
 template <>
-inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle, int m, int n,
+inline cusparseStatus_t cusparsecsr2dense(cusparseHandle_t handle,
+                                          int m,
+                                          int n,
                                           const cusparseMatDescr_t descrA,
                                           const double* csrValA,
                                           const int* csrRowPtrA,
-                                          const int* csrColIndA, double* A,
-                                          int lda, cudaStream_t stream) {
+                                          const int* csrColIndA,
+                                          double* A,
+                                          int lda,
+                                          cudaStream_t stream)
+{
   CUSPARSE_CHECK(cusparseSetStream(handle, stream));
-  return cusparseDcsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA,
-                            csrColIndA, A, lda);
+  return cusparseDcsr2dense(handle, m, n, descrA, csrValA, csrRowPtrA, csrColIndA, A, lda);
 }
 
 /** @} */
diff --git a/cpp/include/raft/sparse/distance/common.h b/cpp/include/raft/sparse/distance/common.h
index 1c55412eec..29c823bcdb 100644
--- a/cpp/include/raft/sparse/distance/common.h
+++ b/cpp/include/raft/sparse/distance/common.h
@@ -24,31 +24,31 @@ namespace distance {
 
 template <typename value_idx, typename value_t>
 struct distances_config_t {
-  distances_config_t(const raft::handle_t &handle_) : handle(handle_) {}
+  distances_config_t(const raft::handle_t& handle_) : handle(handle_) {}
 
   // left side
   value_idx a_nrows;
   value_idx a_ncols;
   value_idx a_nnz;
-  value_idx *a_indptr;
-  value_idx *a_indices;
-  value_t *a_data;
+  value_idx* a_indptr;
+  value_idx* a_indices;
+  value_t* a_data;
 
   // right side
   value_idx b_nrows;
   value_idx b_ncols;
   value_idx b_nnz;
-  value_idx *b_indptr;
-  value_idx *b_indices;
-  value_t *b_data;
+  value_idx* b_indptr;
+  value_idx* b_indices;
+  value_t* b_data;
 
-  const raft::handle_t &handle;
+  const raft::handle_t& handle;
 };
 
 template <typename value_t>
 class distances_t {
  public:
-  virtual void compute(value_t *out) {}
+  virtual void compute(value_t* out) {}
   virtual ~distances_t() = default;
 };
 
diff --git a/cpp/include/raft/sparse/distance/detail/bin_distance.cuh b/cpp/include/raft/sparse/distance/detail/bin_distance.cuh
index 3f8c32a20b..4d3b31df9a 100644
--- a/cpp/include/raft/sparse/distance/detail/bin_distance.cuh
+++ b/cpp/include/raft/sparse/distance/detail/bin_distance.cuh
@@ -35,9 +35,11 @@ namespace distance {
 namespace detail {
 // @TODO: Move this into sparse prims (coo_norm)
 template <typename value_idx, typename value_t>
-__global__ void compute_binary_row_norm_kernel(
-  value_t *out, const value_idx *__restrict__ coo_rows,
-  const value_t *__restrict__ data, value_idx nnz) {
+__global__ void compute_binary_row_norm_kernel(value_t* out,
+                                               const value_idx* __restrict__ coo_rows,
+                                               const value_t* __restrict__ data,
+                                               value_idx nnz)
+{
   value_idx i = blockDim.x * blockIdx.x + threadIdx.x;
   if (i < nnz) {
     // We do conditional here only because it's
@@ -49,54 +51,63 @@ __global__ void compute_binary_row_norm_kernel(
 }
 
 template <typename value_idx, typename value_t, typename expansion_f>
-__global__ void compute_binary_warp_kernel(value_t *__restrict__ C,
-                                           const value_t *__restrict__ Q_norms,
-                                           const value_t *__restrict__ R_norms,
-                                           value_idx n_rows, value_idx n_cols,
-                                           expansion_f expansion_func) {
+__global__ void compute_binary_warp_kernel(value_t* __restrict__ C,
+                                           const value_t* __restrict__ Q_norms,
+                                           const value_t* __restrict__ R_norms,
+                                           value_idx n_rows,
+                                           value_idx n_cols,
+                                           expansion_f expansion_func)
+{
   std::size_t tid = blockDim.x * blockIdx.x + threadIdx.x;
-  value_idx i = tid / n_cols;
-  value_idx j = tid % n_cols;
+  value_idx i     = tid / n_cols;
+  value_idx j     = tid % n_cols;
 
   if (i >= n_rows || j >= n_cols) return;
 
-  value_t q_norm = Q_norms[i];
-  value_t r_norm = R_norms[j];
-  value_t dot = C[(size_t)i * n_cols + j];
+  value_t q_norm            = Q_norms[i];
+  value_t r_norm            = R_norms[j];
+  value_t dot               = C[(size_t)i * n_cols + j];
   C[(size_t)i * n_cols + j] = expansion_func(dot, q_norm, r_norm);
 }
 
-template <typename value_idx, typename value_t, typename expansion_f,
-          int tpb = 1024>
-void compute_binary(value_t *C, const value_t *Q_norms, const value_t *R_norms,
-                    value_idx n_rows, value_idx n_cols,
-                    expansion_f expansion_func, cudaStream_t stream) {
+template <typename value_idx, typename value_t, typename expansion_f, int tpb = 1024>
+void compute_binary(value_t* C,
+                    const value_t* Q_norms,
+                    const value_t* R_norms,
+                    value_idx n_rows,
+                    value_idx n_cols,
+                    expansion_f expansion_func,
+                    cudaStream_t stream)
+{
   int blocks = raft::ceildiv<size_t>((size_t)n_rows * n_cols, tpb);
   compute_binary_warp_kernel<<<blocks, tpb, 0, stream>>>(
     C, Q_norms, R_norms, n_rows, n_cols, expansion_func);
 }
 
-template <typename value_idx, typename value_t, typename expansion_f,
-          int tpb = 1024>
-void compute_bin_distance(value_t *out, const value_idx *Q_coo_rows,
-                          const value_t *Q_data, value_idx Q_nnz,
-                          const value_idx *R_coo_rows, const value_t *R_data,
-                          value_idx R_nnz, value_idx m, value_idx n,
-                          cudaStream_t stream, expansion_f expansion_func) {
+template <typename value_idx, typename value_t, typename expansion_f, int tpb = 1024>
+void compute_bin_distance(value_t* out,
+                          const value_idx* Q_coo_rows,
+                          const value_t* Q_data,
+                          value_idx Q_nnz,
+                          const value_idx* R_coo_rows,
+                          const value_t* R_data,
+                          value_idx R_nnz,
+                          value_idx m,
+                          value_idx n,
+                          cudaStream_t stream,
+                          expansion_f expansion_func)
+{
   rmm::device_uvector<value_t> Q_norms(m, stream);
   rmm::device_uvector<value_t> R_norms(n, stream);
-  CUDA_CHECK(
-    cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t)));
-  CUDA_CHECK(
-    cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t)));
+  CUDA_CHECK(cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t)));
+  CUDA_CHECK(cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t)));
 
   compute_binary_row_norm_kernel<<<raft::ceildiv(Q_nnz, tpb), tpb, 0, stream>>>(
     Q_norms.data(), Q_coo_rows, Q_data, Q_nnz);
   compute_binary_row_norm_kernel<<<raft::ceildiv(R_nnz, tpb), tpb, 0, stream>>>(
     R_norms.data(), R_coo_rows, R_data, R_nnz);
 
-  compute_binary(out, Q_norms.data(), R_norms.data(), m, n, expansion_func,
-                 stream);
+  compute_binary(out, Q_norms.data(), R_norms.data(), m, n, expansion_func, stream);
 }
 
 /**
@@ -106,44 +117,51 @@ void compute_bin_distance(value_t *out, const value_idx *Q_coo_rows,
 template <typename value_idx = int, typename value_t = float>
 class jaccard_expanded_distances_t : public distances_t<value_t> {
  public:
-  explicit jaccard_expanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config),
-      workspace(0, config.handle.get_stream()),
-      ip_dists(config) {}
+  explicit jaccard_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config)
+  {
+  }
 
-  void compute(value_t *out_dists) {
+  void compute(value_t* out_dists)
+  {
     ip_dists.compute(out_dists);
 
-    value_idx *b_indices = ip_dists.b_rows_coo();
-    value_t *b_data = ip_dists.b_data_coo();
+    value_idx* b_indices = ip_dists.b_rows_coo();
+    value_t* b_data      = ip_dists.b_data_coo();
 
-    rmm::device_uvector<value_idx> search_coo_rows(
-      config_->a_nnz, config_->handle.get_stream());
-    raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
-                                      search_coo_rows.data(), config_->a_nnz,
+    rmm::device_uvector<value_idx> search_coo_rows(config_->a_nnz, config_->handle.get_stream());
+    raft::sparse::convert::csr_to_coo(config_->a_indptr,
+                                      config_->a_nrows,
+                                      search_coo_rows.data(),
+                                      config_->a_nnz,
                                       config_->handle.get_stream());
 
-    compute_bin_distance(
-      out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz,
-      b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows,
-      config_->handle.get_stream(),
-      [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
-        value_t q_r_union = q_norm + r_norm;
-        value_t denom = q_r_union - dot;
-
-        value_t jacc = ((denom != 0) * dot) / ((denom == 0) + denom);
-
-        // flip the similarity when both rows are 0
-        bool both_empty = q_r_union == 0;
-        return 1 - ((!both_empty * jacc) + both_empty);
-      });
+    compute_bin_distance(out_dists,
+                         search_coo_rows.data(),
+                         config_->a_data,
+                         config_->a_nnz,
+                         b_indices,
+                         b_data,
+                         config_->b_nnz,
+                         config_->a_nrows,
+                         config_->b_nrows,
+                         config_->handle.get_stream(),
+                         [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
+                           value_t q_r_union = q_norm + r_norm;
+                           value_t denom     = q_r_union - dot;
+
+                           value_t jacc = ((denom != 0) * dot) / ((denom == 0) + denom);
+
+                           // flip the similarity when both rows are 0
+                           bool both_empty = q_r_union == 0;
+                           return 1 - ((!both_empty * jacc) + both_empty);
+                         });
   }
 
   ~jaccard_expanded_distances_t() = default;
 
  private:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
   rmm::device_uvector<char> workspace;
   ip_distances_t<value_idx, value_t> ip_dists;
 };
@@ -155,40 +173,47 @@ class jaccard_expanded_distances_t : public distances_t<value_t> {
 template <typename value_idx = int, typename value_t = float>
 class dice_expanded_distances_t : public distances_t<value_t> {
  public:
-  explicit dice_expanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config),
-      workspace(0, config.handle.get_stream()),
-      ip_dists(config) {}
+  explicit dice_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config)
+  {
+  }
 
-  void compute(value_t *out_dists) {
+  void compute(value_t* out_dists)
+  {
     ip_dists.compute(out_dists);
 
-    value_idx *b_indices = ip_dists.b_rows_coo();
-    value_t *b_data = ip_dists.b_data_coo();
+    value_idx* b_indices = ip_dists.b_rows_coo();
+    value_t* b_data      = ip_dists.b_data_coo();
 
-    rmm::device_uvector<value_idx> search_coo_rows(
-      config_->a_nnz, config_->handle.get_stream());
-    raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
-                                      search_coo_rows.data(), config_->a_nnz,
+    rmm::device_uvector<value_idx> search_coo_rows(config_->a_nnz, config_->handle.get_stream());
+    raft::sparse::convert::csr_to_coo(config_->a_indptr,
+                                      config_->a_nrows,
+                                      search_coo_rows.data(),
+                                      config_->a_nnz,
                                       config_->handle.get_stream());
 
-    compute_bin_distance(
-      out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz,
-      b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows,
-      config_->handle.get_stream(),
-      [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
-        value_t q_r_union = q_norm + r_norm;
-        value_t dice = (2 * dot) / q_r_union;
-        bool both_empty = q_r_union == 0;
-        return 1 - ((!both_empty * dice) + both_empty);
-      });
+    compute_bin_distance(out_dists,
+                         search_coo_rows.data(),
+                         config_->a_data,
+                         config_->a_nnz,
+                         b_indices,
+                         b_data,
+                         config_->b_nnz,
+                         config_->a_nrows,
+                         config_->b_nrows,
+                         config_->handle.get_stream(),
+                         [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
+                           value_t q_r_union = q_norm + r_norm;
+                           value_t dice      = (2 * dot) / ((q_r_union == 0) + q_r_union);
+                           bool both_empty   = q_r_union == 0;
+                           return 1 - ((!both_empty * dice) + both_empty);
+                         });
   }
 
   ~dice_expanded_distances_t() = default;
 
  private:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
   rmm::device_uvector<char> workspace;
   ip_distances_t<value_idx, value_t> ip_dists;
 };
diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh
index 83844b8c54..6694d0fc4f 100644
--- a/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh
+++ b/cpp/include/raft/sparse/distance/detail/coo_spmv.cuh
@@ -39,19 +39,29 @@ namespace sparse {
 namespace distance {
 namespace detail {
 
-template <typename value_idx, typename value_t, int threads_per_block = 1024,
-          typename product_f, typename accum_f, typename write_f,
+template <typename value_idx,
+          typename value_t,
+          int threads_per_block = 1024,
+          typename product_f,
+          typename accum_f,
+          typename write_f,
           typename strategy_t>
 inline void balanced_coo_pairwise_generalized_spmv(
-  value_t *out_dists, const distances_config_t<value_idx, value_t> &config_,
-  value_idx *coo_rows_b, product_f product_func, accum_f accum_func,
-  write_f write_func, strategy_t strategy, int chunk_size = 500000) {
-  CUDA_CHECK(cudaMemsetAsync(
-    out_dists, 0, sizeof(value_t) * config_.a_nrows * config_.b_nrows,
-    config_.handle.get_stream()));
-
-  strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func,
-                    chunk_size);
+  value_t* out_dists,
+  const distances_config_t<value_idx, value_t>& config_,
+  value_idx* coo_rows_b,
+  product_f product_func,
+  accum_f accum_func,
+  write_f write_func,
+  strategy_t strategy,
+  int chunk_size = 500000)
+{
+  CUDA_CHECK(cudaMemsetAsync(out_dists,
+                             0,
+                             sizeof(value_t) * config_.a_nrows * config_.b_nrows,
+                             config_.handle.get_stream()));
+
+  strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size);
 };
 
 /**
@@ -87,39 +97,55 @@ inline void balanced_coo_pairwise_generalized_spmv(
  *            this value was found through profiling and represents a reasonable
  *            setting for both large and small densities
  */
-template <typename value_idx, typename value_t, int threads_per_block = 1024,
-          typename product_f, typename accum_f, typename write_f>
+template <typename value_idx,
+          typename value_t,
+          int threads_per_block = 1024,
+          typename product_f,
+          typename accum_f,
+          typename write_f>
 inline void balanced_coo_pairwise_generalized_spmv(
-  value_t *out_dists, const distances_config_t<value_idx, value_t> &config_,
-  value_idx *coo_rows_b, product_f product_func, accum_f accum_func,
-  write_f write_func, int chunk_size = 500000) {
-  CUDA_CHECK(cudaMemsetAsync(
-    out_dists, 0, sizeof(value_t) * config_.a_nrows * config_.b_nrows,
-    config_.handle.get_stream()));
+  value_t* out_dists,
+  const distances_config_t<value_idx, value_t>& config_,
+  value_idx* coo_rows_b,
+  product_f product_func,
+  accum_f accum_func,
+  write_f write_func,
+  int chunk_size = 500000)
+{
+  CUDA_CHECK(cudaMemsetAsync(out_dists,
+                             0,
+                             sizeof(value_t) * config_.a_nrows * config_.b_nrows,
+                             config_.handle.get_stream()));
 
   int max_cols = max_cols_per_block<value_idx, value_t>();
 
   if (max_cols > config_.a_ncols) {
-    dense_smem_strategy<value_idx, value_t, threads_per_block> strategy(
-      config_);
-    strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func,
-                      write_func, chunk_size);
+    dense_smem_strategy<value_idx, value_t, threads_per_block> strategy(config_);
+    strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size);
   } else {
     hash_strategy<value_idx, value_t, threads_per_block> strategy(config_);
-    strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func,
-                      write_func, chunk_size);
+    strategy.dispatch(out_dists, coo_rows_b, product_func, accum_func, write_func, chunk_size);
   }
 };
 
-template <typename value_idx, typename value_t, int threads_per_block = 1024,
-          typename product_f, typename accum_f, typename write_f,
+template <typename value_idx,
+          typename value_t,
+          int threads_per_block = 1024,
+          typename product_f,
+          typename accum_f,
+          typename write_f,
           typename strategy_t>
 inline void balanced_coo_pairwise_generalized_spmv_rev(
-  value_t *out_dists, const distances_config_t<value_idx, value_t> &config_,
-  value_idx *coo_rows_a, product_f product_func, accum_f accum_func,
-  write_f write_func, strategy_t strategy, int chunk_size = 500000) {
-  strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func,
-                        write_func, chunk_size);
+  value_t* out_dists,
+  const distances_config_t<value_idx, value_t>& config_,
+  value_idx* coo_rows_a,
+  product_f product_func,
+  accum_f accum_func,
+  write_f write_func,
+  strategy_t strategy,
+  int chunk_size = 500000)
+{
+  strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size);
 };
 
 /**
@@ -158,24 +184,30 @@ inline void balanced_coo_pairwise_generalized_spmv_rev(
  *            this value was found through profiling and represents a reasonable
  *            setting for both large and small densities
  */
-template <typename value_idx, typename value_t, int threads_per_block = 1024,
-          typename product_f, typename accum_f, typename write_f>
+template <typename value_idx,
+          typename value_t,
+          int threads_per_block = 1024,
+          typename product_f,
+          typename accum_f,
+          typename write_f>
 inline void balanced_coo_pairwise_generalized_spmv_rev(
-  value_t *out_dists, const distances_config_t<value_idx, value_t> &config_,
-  value_idx *coo_rows_a, product_f product_func, accum_f accum_func,
-  write_f write_func, int chunk_size = 500000) {
+  value_t* out_dists,
+  const distances_config_t<value_idx, value_t>& config_,
+  value_idx* coo_rows_a,
+  product_f product_func,
+  accum_f accum_func,
+  write_f write_func,
+  int chunk_size = 500000)
+{
   // try dense first
   int max_cols = max_cols_per_block<value_idx, value_t>();
 
   if (max_cols > config_.b_ncols) {
-    dense_smem_strategy<value_idx, value_t, threads_per_block> strategy(
-      config_);
-    strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func,
-                          write_func, chunk_size);
+    dense_smem_strategy<value_idx, value_t, threads_per_block> strategy(config_);
+    strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size);
   } else {
     hash_strategy<value_idx, value_t, threads_per_block> strategy(config_);
-    strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func,
-                          write_func, chunk_size);
+    strategy.dispatch_rev(out_dists, coo_rows_a, product_func, accum_func, write_func, chunk_size);
   }
 };
 
diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh
index 866ff43224..9bfdd3bad0 100644
--- a/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh
+++ b/cpp/include/raft/sparse/distance/detail/coo_spmv_kernel.cuh
@@ -27,68 +27,88 @@ namespace sparse {
 namespace distance {
 namespace detail {
 /**
-  * Load-balanced sparse-matrix-sparse-matrix multiplication (SPMM) kernel with
-  * sparse-matrix-sparse-vector multiplication layout (SPMV).
-  * This is intended to be scheduled n_chunks_b times for each row of a.
-  * The steps are as follows:
-  *
-  * 1. Load row from A into dense vector in shared memory.
-  *    This can be further chunked in the future if necessary to support larger
-  *    column sizes.
-  * 2. Threads of block all step through chunks of B in parallel.
-  *    When a new row is encountered in row_indices_b, a segmented
-  *    reduction is performed across the warps and then across the
-  *    block and the final value written out to host memory.
-  *
-  * Reference: https://www.icl.utk.edu/files/publications/2020/icl-utk-1421-2020.pdf
-  *
-  * @tparam value_idx index type
-  * @tparam value_t value type
-  * @tparam tpb threads per block configured on launch
-  * @tparam rev if this is true, the reduce/accumulate functions are only
-  *         executed when A[col] == 0.0. when executed before/after !rev
-  *         and A & B are reversed, this allows the full symmetric difference
-  *         and intersection to be computed.
-  * @tparam kv_t data type stored in shared mem cache
-  * @tparam product_f reduce function type (semiring product() function).
-  *                  accepts two arguments of value_t and returns a value_t
-  * @tparam accum_f accumulation function type (semiring sum() function).
-  *                 accepts two arguments of value_t and returns a value_t
-  * @tparam write_f function to write value out. this should be mathematically
-  *                 equivalent to the accumulate function but implemented as
-  *                 an atomic operation on global memory. Accepts two arguments
-  *                 of value_t* and value_t and updates the value given by the
-  *                 pointer.
-  * @param[in] indptrA column pointer array for A
-  * @param[in] indicesA column indices array for A
-  * @param[in] dataA data array for A
-  * @param[in] rowsB coo row array for B
-  * @param[in] indicesB column indices array for B
-  * @param[in] dataB data array for B
-  * @param[in] m number of rows in A
-  * @param[in] n number of rows in B
-  * @param[in] dim number of features
-  * @param[in] nnz_b number of nonzeros in B
-  * @param[out] out array of size m*n
-  * @param[in] n_blocks_per_row number of blocks of B per row of A
-  * @param[in] chunk_size number of nnz for B to use for each row of A
-  * @param[in] buffer_size amount of smem to use for each row of A
-  * @param[in] product_func semiring product() function
-  * @param[in] accum_func semiring sum() function
-  * @param[in] write_func atomic semiring sum() function
-  */
-template <typename strategy_t, typename indptr_it, typename value_idx,
-          typename value_t, bool rev, int tpb, typename product_f,
-          typename accum_f, typename write_f>
-__global__ void balanced_coo_generalized_spmv_kernel(
-  strategy_t strategy, indptr_it indptrA, value_idx *indicesA, value_t *dataA,
-  value_idx nnz_a, value_idx *rowsB, value_idx *indicesB, value_t *dataB,
-  value_idx m, value_idx n, int dim, value_idx nnz_b, value_t *out,
-  int n_blocks_per_row, int chunk_size, value_idx b_ncols,
-  product_f product_func, accum_f accum_func, write_f write_func) {
+ * Load-balanced sparse-matrix-sparse-matrix multiplication (SPMM) kernel with
+ * sparse-matrix-sparse-vector multiplication layout (SPMV).
+ * This is intended to be scheduled n_chunks_b times for each row of a.
+ * The steps are as follows:
+ *
+ * 1. Load row from A into dense vector in shared memory.
+ *    This can be further chunked in the future if necessary to support larger
+ *    column sizes.
+ * 2. Threads of block all step through chunks of B in parallel.
+ *    When a new row is encountered in row_indices_b, a segmented
+ *    reduction is performed across the warps and then across the
+ *    block and the final value written out to global memory.
+ *
+ * Reference: https://www.icl.utk.edu/files/publications/2020/icl-utk-1421-2020.pdf
+ *
+ * @tparam value_idx index type
+ * @tparam value_t value type
+ * @tparam tpb threads per block configured on launch
+ * @tparam rev if this is true, the reduce/accumulate functions are only
+ *         executed when A[col] == 0.0. when executed before/after !rev
+ *         and A & B are reversed, this allows the full symmetric difference
+ *         and intersection to be computed.
+ * @tparam kv_t data type stored in shared mem cache
+ * @tparam product_f reduce function type (semiring product() function).
+ *                  accepts two arguments of value_t and returns a value_t
+ * @tparam accum_f accumulation function type (semiring sum() function).
+ *                 accepts two arguments of value_t and returns a value_t
+ * @tparam write_f function to write value out. this should be mathematically
+ *                 equivalent to the accumulate function but implemented as
+ *                 an atomic operation on global memory. Accepts two arguments
+ *                 of value_t* and value_t and updates the value given by the
+ *                 pointer.
+ * @param[in] indptrA row pointer (indptr) iterator for A
+ * @param[in] indicesA column indices array for A
+ * @param[in] dataA data array for A
+ * @param[in] rowsB coo row array for B
+ * @param[in] indicesB column indices array for B
+ * @param[in] dataB data array for B
+ * @param[in] m number of rows in A
+ * @param[in] n number of rows in B
+ * @param[in] dim number of features
+ * @param[in] nnz_b number of nonzeros in B
+ * @param[out] out array of size m*n
+ * @param[in] n_blocks_per_row number of blocks of B per row of A
+ * @param[in] chunk_size number of nnz for B to use for each row of A
+ * @param[in] buffer_size amount of smem to use for each row of A
+ * @param[in] product_func semiring product() function
+ * @param[in] accum_func semiring sum() function
+ * @param[in] write_func atomic semiring sum() function
+ */
+template <typename strategy_t,
+          typename indptr_it,
+          typename value_idx,
+          typename value_t,
+          bool rev,
+          int tpb,
+          typename product_f,
+          typename accum_f,
+          typename write_f>
+__global__ void balanced_coo_generalized_spmv_kernel(strategy_t strategy,
+                                                     indptr_it indptrA,
+                                                     value_idx* indicesA,
+                                                     value_t* dataA,
+                                                     value_idx nnz_a,
+                                                     value_idx* rowsB,
+                                                     value_idx* indicesB,
+                                                     value_t* dataB,
+                                                     value_idx m,
+                                                     value_idx n,
+                                                     int dim,
+                                                     value_idx nnz_b,
+                                                     value_t* out,
+                                                     int n_blocks_per_row,
+                                                     int chunk_size,
+                                                     value_idx b_ncols,
+                                                     product_f product_func,
+                                                     accum_f accum_func,
+                                                     write_f write_func)
+{
   typedef cub::WarpReduce<value_t> warp_reduce;
 
-  value_idx cur_row_a = indptrA.get_row_idx(n_blocks_per_row);
+  value_idx cur_row_a        = indptrA.get_row_idx(n_blocks_per_row);
   value_idx cur_chunk_offset = blockIdx.x % n_blocks_per_row;
 
   // chunk starting offset
@@ -96,18 +116,17 @@ __global__ void balanced_coo_generalized_spmv_kernel(
   // how many total cols will be processed by this block (should be <= chunk_size * n_threads)
   value_idx active_chunk_size = min(chunk_size * tpb, nnz_b - ind_offset);
 
-  int tid = threadIdx.x;
+  int tid     = threadIdx.x;
   int warp_id = tid / raft::warp_size();
 
   // compute id relative to current warp
   unsigned int lane_id = tid & (raft::warp_size() - 1);
-  value_idx ind = ind_offset + threadIdx.x;
+  value_idx ind        = ind_offset + threadIdx.x;
 
   extern __shared__ char smem[];
 
-  typename strategy_t::smem_type A = (typename strategy_t::smem_type)(smem);
-  typename warp_reduce::TempStorage *temp_storage =
-    (typename warp_reduce::TempStorage *)(A + dim);
+  typename strategy_t::smem_type A                = (typename strategy_t::smem_type)(smem);
+  typename warp_reduce::TempStorage* temp_storage = (typename warp_reduce::TempStorage*)(A + dim);
 
   auto inserter = strategy.init_insert(A, dim);
 
@@ -115,13 +134,12 @@ __global__ void balanced_coo_generalized_spmv_kernel(
 
   value_idx start_offset_a, stop_offset_a;
   bool first_a_chunk, last_a_chunk;
-  indptrA.get_row_offsets(cur_row_a, start_offset_a, stop_offset_a,
-                          n_blocks_per_row, first_a_chunk, last_a_chunk);
+  indptrA.get_row_offsets(
+    cur_row_a, start_offset_a, stop_offset_a, n_blocks_per_row, first_a_chunk, last_a_chunk);
 
   // Convert current row vector in A to dense
   for (int i = tid; i <= (stop_offset_a - start_offset_a); i += blockDim.x) {
-    strategy.insert(inserter, indicesA[start_offset_a + i],
-                    dataA[start_offset_a + i]);
+    strategy.insert(inserter, indicesA[start_offset_a + i], dataA[start_offset_a + i]);
   }
 
   __syncthreads();
@@ -132,34 +150,36 @@ __global__ void balanced_coo_generalized_spmv_kernel(
   if (ind >= nnz_b) return;
 
   value_idx start_index_a = 0, stop_index_a = b_ncols - 1;
-  indptrA.get_indices_boundary(indicesA, cur_row_a, start_offset_a,
-                               stop_offset_a, start_index_a, stop_index_a,
-                               first_a_chunk, last_a_chunk);
+  indptrA.get_indices_boundary(indicesA,
+                               cur_row_a,
+                               start_offset_a,
+                               stop_offset_a,
+                               start_index_a,
+                               stop_index_a,
+                               first_a_chunk,
+                               last_a_chunk);
 
   value_idx cur_row_b = -1;
-  value_t c = 0.0;
+  value_t c           = 0.0;
 
   auto warp_red = warp_reduce(*(temp_storage + warp_id));
 
   if (tid < active_chunk_size) {
     cur_row_b = rowsB[ind];
 
-    auto index_b = indicesB[ind];
-    auto in_bounds =
-      indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b);
+    auto index_b   = indicesB[ind];
+    auto in_bounds = indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b);
 
     if (in_bounds) {
       value_t a_col = strategy.find(finder, index_b);
-      if (!rev || a_col == 0.0) {
-        c = product_func(a_col, dataB[ind]);
-      }
+      if (!rev || a_col == 0.0) { c = product_func(a_col, dataB[ind]); }
     }
   }
 
   // loop through chunks in parallel, reducing when a new row is
   // encountered by each thread
   for (int i = tid; i < active_chunk_size; i += blockDim.x) {
-    value_idx ind_next = ind + blockDim.x;
+    value_idx ind_next   = ind + blockDim.x;
     value_idx next_row_b = -1;
 
     if (i + blockDim.x < active_chunk_size) next_row_b = rowsB[ind_next];
@@ -170,14 +190,13 @@ __global__ void balanced_coo_generalized_spmv_kernel(
       // grab the threads currently participating in loops.
       // because any other threads should have returned already.
       unsigned int peer_group = __match_any_sync(0xffffffff, cur_row_b);
-      bool is_leader = get_lowest_peer(peer_group) == lane_id;
-      value_t v = warp_red.HeadSegmentedReduce(c, is_leader, accum_func);
+      bool is_leader          = get_lowest_peer(peer_group) == lane_id;
+      value_t v               = warp_red.HeadSegmentedReduce(c, is_leader, accum_func);
 
       // thread with lowest lane id among peers writes out
       if (is_leader && v != 0.0) {
         // this conditional should be uniform, since rev is constant
-        size_t idx = !rev ? (size_t)cur_row_a * n + cur_row_b
-                          : (size_t)cur_row_b * m + cur_row_a;
+        size_t idx = !rev ? (size_t)cur_row_a * n + cur_row_b : (size_t)cur_row_b * m + cur_row_a;
         write_func(out + idx, v);
       }
 
@@ -187,15 +206,12 @@ __global__ void balanced_coo_generalized_spmv_kernel(
     if (next_row_b != -1) {
       ind = ind_next;
 
-      auto index_b = indicesB[ind];
-      auto in_bounds =
-        indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b);
+      auto index_b   = indicesB[ind];
+      auto in_bounds = indptrA.check_indices_bounds(start_index_a, stop_index_a, index_b);
       if (in_bounds) {
         value_t a_col = strategy.find(finder, index_b);
 
-        if (!rev || a_col == 0.0) {
-          c = accum_func(c, product_func(a_col, dataB[ind]));
-        }
+        if (!rev || a_col == 0.0) { c = accum_func(c, product_func(a_col, dataB[ind])); }
       }
 
       cur_row_b = next_row_b;
diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/base_strategy.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/base_strategy.cuh
index 4ad3368c4a..9b1dfff022 100644
--- a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/base_strategy.cuh
+++ b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/base_strategy.cuh
@@ -31,58 +31,114 @@ namespace detail {
 template <typename value_idx, typename value_t, int tpb>
 class coo_spmv_strategy {
  public:
-  coo_spmv_strategy(const distances_config_t<value_idx, value_t> &config_)
-    : config(config_) {
+  coo_spmv_strategy(const distances_config_t<value_idx, value_t>& config_) : config(config_)
+  {
     smem = raft::getSharedMemPerBlock();
   }
 
-  template <typename strategy_t, typename indptr_it, typename product_f,
-            typename accum_f, typename write_f>
-  void _dispatch_base(strategy_t &strategy, int smem_dim, indptr_it &a_indptr,
-                      value_t *out_dists, value_idx *coo_rows_b,
-                      product_f product_func, accum_f accum_func,
-                      write_f write_func, int chunk_size, int n_blocks,
-                      int n_blocks_per_row) {
-    CUDA_CHECK(cudaFuncSetCacheConfig(
-      balanced_coo_generalized_spmv_kernel<strategy_t, indptr_it, value_idx,
-                                           value_t, false, tpb, product_f,
-                                           accum_f, write_f>,
-      cudaFuncCachePreferShared));
+  template <typename strategy_t,
+            typename indptr_it,
+            typename product_f,
+            typename accum_f,
+            typename write_f>
+  void _dispatch_base(strategy_t& strategy,
+                      int smem_dim,
+                      indptr_it& a_indptr,
+                      value_t* out_dists,
+                      value_idx* coo_rows_b,
+                      product_f product_func,
+                      accum_f accum_func,
+                      write_f write_func,
+                      int chunk_size,
+                      int n_blocks,
+                      int n_blocks_per_row)
+  {
+    CUDA_CHECK(cudaFuncSetCacheConfig(balanced_coo_generalized_spmv_kernel<strategy_t,
+                                                                           indptr_it,
+                                                                           value_idx,
+                                                                           value_t,
+                                                                           false,
+                                                                           tpb,
+                                                                           product_f,
+                                                                           accum_f,
+                                                                           write_f>,
+                                      cudaFuncCachePreferShared));
 
-    balanced_coo_generalized_spmv_kernel<strategy_t, indptr_it, value_idx,
-                                         value_t, false, tpb>
-      <<<n_blocks, tpb, smem, config.handle.get_stream()>>>(
-        strategy, a_indptr, config.a_indices, config.a_data, config.a_nnz,
-        coo_rows_b, config.b_indices, config.b_data, config.a_nrows,
-        config.b_nrows, smem_dim, config.b_nnz, out_dists, n_blocks_per_row,
-        chunk_size, config.b_ncols, product_func, accum_func, write_func);
+    balanced_coo_generalized_spmv_kernel<strategy_t, indptr_it, value_idx, value_t, false, tpb>
+      <<<n_blocks, tpb, smem, config.handle.get_stream()>>>(strategy,
+                                                            a_indptr,
+                                                            config.a_indices,
+                                                            config.a_data,
+                                                            config.a_nnz,
+                                                            coo_rows_b,
+                                                            config.b_indices,
+                                                            config.b_data,
+                                                            config.a_nrows,
+                                                            config.b_nrows,
+                                                            smem_dim,
+                                                            config.b_nnz,
+                                                            out_dists,
+                                                            n_blocks_per_row,
+                                                            chunk_size,
+                                                            config.b_ncols,
+                                                            product_func,
+                                                            accum_func,
+                                                            write_func);
   }
 
-  template <typename strategy_t, typename indptr_it, typename product_f,
-            typename accum_f, typename write_f>
-  void _dispatch_base_rev(strategy_t &strategy, int smem_dim,
-                          indptr_it &b_indptr, value_t *out_dists,
-                          value_idx *coo_rows_a, product_f product_func,
-                          accum_f accum_func, write_f write_func,
-                          int chunk_size, int n_blocks, int n_blocks_per_row) {
-    CUDA_CHECK(cudaFuncSetCacheConfig(
-      balanced_coo_generalized_spmv_kernel<strategy_t, indptr_it, value_idx,
-                                           value_t, true, tpb, product_f,
-                                           accum_f, write_f>,
-      cudaFuncCachePreferShared));
+  template <typename strategy_t,
+            typename indptr_it,
+            typename product_f,
+            typename accum_f,
+            typename write_f>
+  void _dispatch_base_rev(strategy_t& strategy,
+                          int smem_dim,
+                          indptr_it& b_indptr,
+                          value_t* out_dists,
+                          value_idx* coo_rows_a,
+                          product_f product_func,
+                          accum_f accum_func,
+                          write_f write_func,
+                          int chunk_size,
+                          int n_blocks,
+                          int n_blocks_per_row)
+  {
+    CUDA_CHECK(cudaFuncSetCacheConfig(balanced_coo_generalized_spmv_kernel<strategy_t,
+                                                                           indptr_it,
+                                                                           value_idx,
+                                                                           value_t,
+                                                                           true,
+                                                                           tpb,
+                                                                           product_f,
+                                                                           accum_f,
+                                                                           write_f>,
+                                      cudaFuncCachePreferShared));
 
-    balanced_coo_generalized_spmv_kernel<strategy_t, indptr_it, value_idx,
-                                         value_t, true, tpb>
-      <<<n_blocks, tpb, smem, config.handle.get_stream()>>>(
-        strategy, b_indptr, config.b_indices, config.b_data, config.b_nnz,
-        coo_rows_a, config.a_indices, config.a_data, config.b_nrows,
-        config.a_nrows, smem_dim, config.a_nnz, out_dists, n_blocks_per_row,
-        chunk_size, config.a_ncols, product_func, accum_func, write_func);
+    balanced_coo_generalized_spmv_kernel<strategy_t, indptr_it, value_idx, value_t, true, tpb>
+      <<<n_blocks, tpb, smem, config.handle.get_stream()>>>(strategy,
+                                                            b_indptr,
+                                                            config.b_indices,
+                                                            config.b_data,
+                                                            config.b_nnz,
+                                                            coo_rows_a,
+                                                            config.a_indices,
+                                                            config.a_data,
+                                                            config.b_nrows,
+                                                            config.a_nrows,
+                                                            smem_dim,
+                                                            config.a_nnz,
+                                                            out_dists,
+                                                            n_blocks_per_row,
+                                                            chunk_size,
+                                                            config.a_ncols,
+                                                            product_func,
+                                                            accum_func,
+                                                            write_func);
   }
 
  protected:
   int smem;
-  const distances_config_t<value_idx, value_t> &config;
+  const distances_config_t<value_idx, value_t>& config;
 };
 
 }  // namespace detail
diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/coo_mask_row_iterators.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/coo_mask_row_iterators.cuh
index 0ab7b65ac2..da51767307 100644
--- a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/coo_mask_row_iterators.cuh
+++ b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/coo_mask_row_iterators.cuh
@@ -29,11 +29,15 @@ namespace detail {
 template <typename value_idx>
 class mask_row_it {
  public:
-  mask_row_it(const value_idx *full_indptr_, const value_idx &n_rows_,
-              value_idx *mask_row_idx_ = NULL)
-    : full_indptr(full_indptr_), mask_row_idx(mask_row_idx_), n_rows(n_rows_) {}
+  mask_row_it(const value_idx* full_indptr_,
+              const value_idx& n_rows_,
+              value_idx* mask_row_idx_ = NULL)
+    : full_indptr(full_indptr_), mask_row_idx(mask_row_idx_), n_rows(n_rows_)
+  {
+  }
 
-  __device__ inline value_idx get_row_idx(const int &n_blocks_nnz_b) {
+  __device__ inline value_idx get_row_idx(const int& n_blocks_nnz_b)
+  {
     if (mask_row_idx != NULL) {
       return mask_row_idx[blockIdx.x / n_blocks_nnz_b];
     } else {
@@ -41,37 +45,49 @@ class mask_row_it {
     }
   }
 
-  __device__ inline void get_row_offsets(
-    const value_idx &row_idx, value_idx &start_offset, value_idx &stop_offset,
-    const value_idx &n_blocks_nnz_b, bool &first_a_chunk, bool &last_a_chunk) {
+  __device__ inline void get_row_offsets(const value_idx& row_idx,
+                                         value_idx& start_offset,
+                                         value_idx& stop_offset,
+                                         const value_idx& n_blocks_nnz_b,
+                                         bool& first_a_chunk,
+                                         bool& last_a_chunk)
+  {
     start_offset = full_indptr[row_idx];
-    stop_offset = full_indptr[row_idx + 1] - 1;
+    stop_offset  = full_indptr[row_idx + 1] - 1;
   }
 
-  __device__ constexpr inline void get_indices_boundary(
-    const value_idx *indices, value_idx &indices_len, value_idx &start_offset,
-    value_idx &stop_offset, value_idx &start_index, value_idx &stop_index,
-    bool &first_a_chunk, bool &last_a_chunk) {
+  __device__ constexpr inline void get_indices_boundary(const value_idx* indices,
+                                                        value_idx& indices_len,
+                                                        value_idx& start_offset,
+                                                        value_idx& stop_offset,
+                                                        value_idx& start_index,
+                                                        value_idx& stop_index,
+                                                        bool& first_a_chunk,
+                                                        bool& last_a_chunk)
+  {
     // do nothing;
   }
 
-  __device__ constexpr inline bool check_indices_bounds(
-    value_idx &start_index_a, value_idx &stop_index_a, value_idx &index_b) {
+  __device__ constexpr inline bool check_indices_bounds(value_idx& start_index_a,
+                                                        value_idx& stop_index_a,
+                                                        value_idx& index_b)
+  {
     return true;
   }
 
   const value_idx *full_indptr, &n_rows;
-  value_idx *mask_row_idx;
+  value_idx* mask_row_idx;
 };
 
 template <typename value_idx>
-__global__ void fill_chunk_indices_kernel(value_idx *n_chunks_per_row,
-                                          value_idx *chunk_indices,
-                                          value_idx n_rows) {
+__global__ void fill_chunk_indices_kernel(value_idx* n_chunks_per_row,
+                                          value_idx* chunk_indices,
+                                          value_idx n_rows)
+{
   auto tid = threadIdx.x + blockIdx.x * blockDim.x;
   if (tid < n_rows) {
     auto start = n_chunks_per_row[tid];
-    auto end = n_chunks_per_row[tid + 1];
+    auto end   = n_chunks_per_row[tid + 1];
 
 #pragma unroll
     for (int i = start; i < end; i++) {
@@ -83,73 +99,89 @@ __global__ void fill_chunk_indices_kernel(value_idx *n_chunks_per_row,
 template <typename value_idx>
 class chunked_mask_row_it : public mask_row_it<value_idx> {
  public:
-  chunked_mask_row_it(const value_idx *full_indptr_, const value_idx &n_rows_,
-                      value_idx *mask_row_idx_, int row_chunk_size_,
-                      const value_idx *n_chunks_per_row_,
-                      const value_idx *chunk_indices_,
+  chunked_mask_row_it(const value_idx* full_indptr_,
+                      const value_idx& n_rows_,
+                      value_idx* mask_row_idx_,
+                      int row_chunk_size_,
+                      const value_idx* n_chunks_per_row_,
+                      const value_idx* chunk_indices_,
                       const cudaStream_t stream_)
     : mask_row_it<value_idx>(full_indptr_, n_rows_, mask_row_idx_),
       row_chunk_size(row_chunk_size_),
       n_chunks_per_row(n_chunks_per_row_),
       chunk_indices(chunk_indices_),
-      stream(stream_) {}
+      stream(stream_)
+  {
+  }
 
-  static void init(const value_idx *indptr, const value_idx *mask_row_idx,
-                   const value_idx &n_rows, const int row_chunk_size,
-                   rmm::device_uvector<value_idx> &n_chunks_per_row,
-                   rmm::device_uvector<value_idx> &chunk_indices,
-                   cudaStream_t stream) {
+  static void init(const value_idx* indptr,
+                   const value_idx* mask_row_idx,
+                   const value_idx& n_rows,
+                   const int row_chunk_size,
+                   rmm::device_uvector<value_idx>& n_chunks_per_row,
+                   rmm::device_uvector<value_idx>& chunk_indices,
+                   cudaStream_t stream)
+  {
     auto policy = rmm::exec_policy(stream);
 
     constexpr value_idx first_element = 0;
     n_chunks_per_row.set_element_async(0, first_element, stream);
     n_chunks_per_row_functor chunk_functor(indptr, row_chunk_size);
-    thrust::transform(policy, mask_row_idx, mask_row_idx + n_rows,
-                      n_chunks_per_row.begin() + 1, chunk_functor);
+    thrust::transform(
+      policy, mask_row_idx, mask_row_idx + n_rows, n_chunks_per_row.begin() + 1, chunk_functor);
 
-    thrust::inclusive_scan(policy, n_chunks_per_row.begin() + 1,
-                           n_chunks_per_row.end(),
-                           n_chunks_per_row.begin() + 1);
+    thrust::inclusive_scan(
+      policy, n_chunks_per_row.begin() + 1, n_chunks_per_row.end(), n_chunks_per_row.begin() + 1);
 
-    raft::update_host(&total_row_blocks, n_chunks_per_row.data() + n_rows, 1,
-                      stream);
+    raft::update_host(&total_row_blocks, n_chunks_per_row.data() + n_rows, 1, stream);
 
     fill_chunk_indices(n_rows, n_chunks_per_row, chunk_indices, stream);
   }
 
-  __device__ inline value_idx get_row_idx(const int &n_blocks_nnz_b) {
+  __device__ inline value_idx get_row_idx(const int& n_blocks_nnz_b)
+  {
     return this->mask_row_idx[chunk_indices[blockIdx.x / n_blocks_nnz_b]];
   }
 
-  __device__ inline void get_row_offsets(
-    const value_idx &row_idx, value_idx &start_offset, value_idx &stop_offset,
-    const int &n_blocks_nnz_b, bool &first_a_chunk, bool &last_a_chunk) {
-    auto chunk_index = blockIdx.x / n_blocks_nnz_b;
-    auto chunk_val = chunk_indices[chunk_index];
-    auto prev_n_chunks = n_chunks_per_row[chunk_val];
+  __device__ inline void get_row_offsets(const value_idx& row_idx,
+                                         value_idx& start_offset,
+                                         value_idx& stop_offset,
+                                         const int& n_blocks_nnz_b,
+                                         bool& first_a_chunk,
+                                         bool& last_a_chunk)
+  {
+    auto chunk_index    = blockIdx.x / n_blocks_nnz_b;
+    auto chunk_val      = chunk_indices[chunk_index];
+    auto prev_n_chunks  = n_chunks_per_row[chunk_val];
     auto relative_chunk = chunk_index - prev_n_chunks;
-    first_a_chunk = relative_chunk == 0;
+    first_a_chunk       = relative_chunk == 0;
 
     start_offset = this->full_indptr[row_idx] + relative_chunk * row_chunk_size;
-    stop_offset = start_offset + row_chunk_size;
+    stop_offset  = start_offset + row_chunk_size;
 
     auto final_stop_offset = this->full_indptr[row_idx + 1];
 
     last_a_chunk = stop_offset >= final_stop_offset;
-    stop_offset = last_a_chunk ? final_stop_offset - 1 : stop_offset - 1;
+    stop_offset  = last_a_chunk ? final_stop_offset - 1 : stop_offset - 1;
   }
 
-  __device__ inline void get_indices_boundary(
-    const value_idx *indices, value_idx &row_idx, value_idx &start_offset,
-    value_idx &stop_offset, value_idx &start_index, value_idx &stop_index,
-    bool &first_a_chunk, bool &last_a_chunk) {
+  __device__ inline void get_indices_boundary(const value_idx* indices,
+                                              value_idx& row_idx,
+                                              value_idx& start_offset,
+                                              value_idx& stop_offset,
+                                              value_idx& start_index,
+                                              value_idx& stop_index,
+                                              bool& first_a_chunk,
+                                              bool& last_a_chunk)
+  {
     start_index = first_a_chunk ? start_index : indices[start_offset - 1] + 1;
-    stop_index = last_a_chunk ? stop_index : indices[stop_offset];
+    stop_index  = last_a_chunk ? stop_index : indices[stop_offset];
   }
 
-  __device__ inline bool check_indices_bounds(value_idx &start_index_a,
-                                              value_idx &stop_index_a,
-                                              value_idx &index_b) {
+  __device__ inline bool check_indices_bounds(value_idx& start_index_a,
+                                              value_idx& stop_index_a,
+                                              value_idx& index_b)
+  {
     return (index_b >= start_index_a && index_b <= stop_index_a);
   }
 
@@ -160,30 +192,34 @@ class chunked_mask_row_it : public mask_row_it<value_idx> {
 
   struct n_chunks_per_row_functor {
    public:
-    n_chunks_per_row_functor(const value_idx *indptr_,
-                             value_idx row_chunk_size_)
-      : indptr(indptr_), row_chunk_size(row_chunk_size_) {}
+    n_chunks_per_row_functor(const value_idx* indptr_, value_idx row_chunk_size_)
+      : indptr(indptr_), row_chunk_size(row_chunk_size_)
+    {
+    }
 
-    __host__ __device__ value_idx operator()(const value_idx &i) {
+    __host__ __device__ value_idx operator()(const value_idx& i)
+    {
       auto degree = indptr[i + 1] - indptr[i];
       return raft::ceildiv(degree, (value_idx)row_chunk_size);
     }
 
-    const value_idx *indptr;
+    const value_idx* indptr;
     value_idx row_chunk_size;
   };
 
  private:
-  static void fill_chunk_indices(
-    const value_idx &n_rows, rmm::device_uvector<value_idx> &n_chunks_per_row,
-    rmm::device_uvector<value_idx> &chunk_indices, cudaStream_t stream) {
+  static void fill_chunk_indices(const value_idx& n_rows,
+                                 rmm::device_uvector<value_idx>& n_chunks_per_row,
+                                 rmm::device_uvector<value_idx>& chunk_indices,
+                                 cudaStream_t stream)
+  {
     auto n_threads = std::min(n_rows, 256);
-    auto n_blocks = raft::ceildiv(n_rows, (value_idx)n_threads);
+    auto n_blocks  = raft::ceildiv(n_rows, (value_idx)n_threads);
 
     chunk_indices.resize(total_row_blocks, stream);
 
-    fill_chunk_indices_kernel<value_idx><<<n_blocks, n_threads, 0, stream>>>(
-      n_chunks_per_row.data(), chunk_indices.data(), n_rows);
+    fill_chunk_indices_kernel<value_idx>
+      <<<n_blocks, n_threads, 0, stream>>>(n_chunks_per_row.data(), chunk_indices.data(), n_rows);
   }
 };
 
diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/dense_smem_strategy.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/dense_smem_strategy.cuh
index 79a5f154d0..5a1c152bd0 100644
--- a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/dense_smem_strategy.cuh
+++ b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/dense_smem_strategy.cuh
@@ -26,71 +26,91 @@ namespace detail {
 template <typename value_idx, typename value_t, int tpb>
 class dense_smem_strategy : public coo_spmv_strategy<value_idx, value_t, tpb> {
  public:
-  using smem_type = value_t *;
+  using smem_type   = value_t*;
   using insert_type = smem_type;
-  using find_type = smem_type;
+  using find_type   = smem_type;
 
-  dense_smem_strategy(const distances_config_t<value_idx, value_t> &config_)
-    : coo_spmv_strategy<value_idx, value_t, tpb>(config_) {}
+  dense_smem_strategy(const distances_config_t<value_idx, value_t>& config_)
+    : coo_spmv_strategy<value_idx, value_t, tpb>(config_)
+  {
+  }
 
-  inline static int smem_per_block(int n_cols) {
-    return (n_cols * sizeof(value_t)) +
-           ((1024 / raft::warp_size()) * sizeof(value_t));
+  inline static int smem_per_block(int n_cols)
+  {
+    return (n_cols * sizeof(value_t)) + ((1024 / raft::warp_size()) * sizeof(value_t));
   }
 
   template <typename product_f, typename accum_f, typename write_f>
-  void dispatch(value_t *out_dists, value_idx *coo_rows_b,
-                product_f product_func, accum_f accum_func, write_f write_func,
-                int chunk_size) {
-    auto n_blocks_per_row =
-      raft::ceildiv(this->config.b_nnz, chunk_size * 1024);
-    auto n_blocks = this->config.a_nrows * n_blocks_per_row;
-
-    mask_row_it<value_idx> a_indptr(this->config.a_indptr,
-                                    this->config.a_nrows);
-
-    this->_dispatch_base(*this, this->config.b_ncols, a_indptr, out_dists,
-                         coo_rows_b, product_func, accum_func, write_func,
-                         chunk_size, n_blocks, n_blocks_per_row);
+  void dispatch(value_t* out_dists,
+                value_idx* coo_rows_b,
+                product_f product_func,
+                accum_f accum_func,
+                write_f write_func,
+                int chunk_size)
+  {
+    auto n_blocks_per_row = raft::ceildiv(this->config.b_nnz, chunk_size * 1024);
+    auto n_blocks         = this->config.a_nrows * n_blocks_per_row;
+
+    mask_row_it<value_idx> a_indptr(this->config.a_indptr, this->config.a_nrows);
+
+    this->_dispatch_base(*this,
+                         this->config.b_ncols,
+                         a_indptr,
+                         out_dists,
+                         coo_rows_b,
+                         product_func,
+                         accum_func,
+                         write_func,
+                         chunk_size,
+                         n_blocks,
+                         n_blocks_per_row);
   }
 
   template <typename product_f, typename accum_f, typename write_f>
-  void dispatch_rev(value_t *out_dists, value_idx *coo_rows_a,
-                    product_f product_func, accum_f accum_func,
-                    write_f write_func, int chunk_size) {
-    auto n_blocks_per_row =
-      raft::ceildiv(this->config.a_nnz, chunk_size * 1024);
-    auto n_blocks = this->config.b_nrows * n_blocks_per_row;
-
-    mask_row_it<value_idx> b_indptr(this->config.b_indptr,
-                                    this->config.b_nrows);
-
-    this->_dispatch_base_rev(*this, this->config.a_ncols, b_indptr, out_dists,
-                             coo_rows_a, product_func, accum_func, write_func,
-                             chunk_size, n_blocks, n_blocks_per_row);
+  void dispatch_rev(value_t* out_dists,
+                    value_idx* coo_rows_a,
+                    product_f product_func,
+                    accum_f accum_func,
+                    write_f write_func,
+                    int chunk_size)
+  {
+    auto n_blocks_per_row = raft::ceildiv(this->config.a_nnz, chunk_size * 1024);
+    auto n_blocks         = this->config.b_nrows * n_blocks_per_row;
+
+    mask_row_it<value_idx> b_indptr(this->config.b_indptr, this->config.b_nrows);
+
+    this->_dispatch_base_rev(*this,
+                             this->config.a_ncols,
+                             b_indptr,
+                             out_dists,
+                             coo_rows_a,
+                             product_func,
+                             accum_func,
+                             write_func,
+                             chunk_size,
+                             n_blocks,
+                             n_blocks_per_row);
   }
 
-  __device__ inline insert_type init_insert(smem_type cache,
-                                            const value_idx &cache_size) {
+  __device__ inline insert_type init_insert(smem_type cache, const value_idx& cache_size)
+  {
     for (int k = threadIdx.x; k < cache_size; k += blockDim.x) {
       cache[k] = 0.0;
     }
     return cache;
   }
 
-  __device__ inline void insert(insert_type cache, const value_idx &key,
-                                const value_t &value) {
+  __device__ inline void insert(insert_type cache, const value_idx& key, const value_t& value)
+  {
     cache[key] = value;
   }
 
-  __device__ inline find_type init_find(smem_type cache,
-                                        const value_idx &cache_size) {
+  __device__ inline find_type init_find(smem_type cache, const value_idx& cache_size)
+  {
     return cache;
   }
 
-  __device__ inline value_t find(find_type cache, const value_idx &key) {
-    return cache[key];
-  }
+  __device__ inline value_t find(find_type cache, const value_idx& key) { return cache[key]; }
 };
 
 }  // namespace detail
diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/hash_strategy.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/hash_strategy.cuh
index 5ba2d5c102..4f8637b425 100644
--- a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/hash_strategy.cuh
+++ b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/hash_strategy.cuh
@@ -1,18 +1,18 @@
 /*
-  * Copyright (c) 2021, NVIDIA CORPORATION.
-  *
-  * Licensed under the Apache License, Version 2.0 (the "License");
-  * you may not use this file except in compliance with the License.
-  * You may obtain a copy of the License at
-  *
-  *     http://www.apache.org/licenses/LICENSE-2.0
-  *
-  * Unless required by applicable law or agreed to in writing, software
-  * distributed under the License is distributed on an "AS IS" BASIS,
-  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  * See the License for the specific language governing permissions and
-  * limitations under the License.
-  */
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 
 #pragma once
 
@@ -39,177 +39,238 @@ template <typename value_idx, typename value_t, int tpb>
 class hash_strategy : public coo_spmv_strategy<value_idx, value_t, tpb> {
  public:
   using insert_type =
-    typename cuco::static_map<value_idx, value_t,
-                              cuda::thread_scope_block>::device_mutable_view;
-  using smem_type = typename insert_type::slot_type *;
+    typename cuco::static_map<value_idx, value_t, cuda::thread_scope_block>::device_mutable_view;
+  using smem_type = typename insert_type::slot_type*;
   using find_type =
-    typename cuco::static_map<value_idx, value_t,
-                              cuda::thread_scope_block>::device_view;
+    typename cuco::static_map<value_idx, value_t, cuda::thread_scope_block>::device_view;
 
-  hash_strategy(const distances_config_t<value_idx, value_t> &config_,
-                float capacity_threshold_ = 0.5, int map_size_ = get_map_size())
+  hash_strategy(const distances_config_t<value_idx, value_t>& config_,
+                float capacity_threshold_ = 0.5,
+                int map_size_             = get_map_size())
     : coo_spmv_strategy<value_idx, value_t, tpb>(config_),
       capacity_threshold(capacity_threshold_),
-      map_size(map_size_) {}
+      map_size(map_size_)
+  {
+  }
 
-  void chunking_needed(const value_idx *indptr, const value_idx n_rows,
-                       rmm::device_uvector<value_idx> &mask_indptr,
-                       std::tuple<value_idx, value_idx> &n_rows_divided,
-                       cudaStream_t stream) {
+  void chunking_needed(const value_idx* indptr,
+                       const value_idx n_rows,
+                       rmm::device_uvector<value_idx>& mask_indptr,
+                       std::tuple<value_idx, value_idx>& n_rows_divided,
+                       cudaStream_t stream)
+  {
     auto policy = this->config.handle.get_thrust_policy();
 
-    auto less = thrust::copy_if(
-      policy, thrust::make_counting_iterator(value_idx(0)),
-      thrust::make_counting_iterator(n_rows), mask_indptr.data(),
-      fits_in_hash_table(indptr, 0, capacity_threshold * map_size));
+    auto less                   = thrust::copy_if(policy,
+                                thrust::make_counting_iterator(value_idx(0)),
+                                thrust::make_counting_iterator(n_rows),
+                                mask_indptr.data(),
+                                fits_in_hash_table(indptr, 0, capacity_threshold * map_size));
     std::get<0>(n_rows_divided) = less - mask_indptr.data();
 
     auto more = thrust::copy_if(
-      policy, thrust::make_counting_iterator(value_idx(0)),
-      thrust::make_counting_iterator(n_rows), less,
-      fits_in_hash_table(indptr, capacity_threshold * map_size,
-                         std::numeric_limits<value_idx>::max()));
+      policy,
+      thrust::make_counting_iterator(value_idx(0)),
+      thrust::make_counting_iterator(n_rows),
+      less,
+      fits_in_hash_table(
+        indptr, capacity_threshold * map_size, std::numeric_limits<value_idx>::max()));
     std::get<1>(n_rows_divided) = more - less;
   }
 
   template <typename product_f, typename accum_f, typename write_f>
-  void dispatch(value_t *out_dists, value_idx *coo_rows_b,
-                product_f product_func, accum_f accum_func, write_f write_func,
-                int chunk_size) {
+  void dispatch(value_t* out_dists,
+                value_idx* coo_rows_b,
+                product_f product_func,
+                accum_f accum_func,
+                write_f write_func,
+                int chunk_size)
+  {
     auto n_blocks_per_row = raft::ceildiv(this->config.b_nnz, chunk_size * tpb);
-    rmm::device_uvector<value_idx> mask_indptr(
-      this->config.a_nrows, this->config.handle.get_stream());
+    rmm::device_uvector<value_idx> mask_indptr(this->config.a_nrows,
+                                               this->config.handle.get_stream());
     std::tuple<value_idx, value_idx> n_rows_divided;
 
-    chunking_needed(this->config.a_indptr, this->config.a_nrows, mask_indptr,
-                    n_rows_divided, this->config.handle.get_stream());
+    chunking_needed(this->config.a_indptr,
+                    this->config.a_nrows,
+                    mask_indptr,
+                    n_rows_divided,
+                    this->config.handle.get_stream());
 
     auto less_rows = std::get<0>(n_rows_divided);
     if (less_rows > 0) {
-      mask_row_it<value_idx> less(this->config.a_indptr, less_rows,
-                                  mask_indptr.data());
+      mask_row_it<value_idx> less(this->config.a_indptr, less_rows, mask_indptr.data());
 
       auto n_less_blocks = less_rows * n_blocks_per_row;
-      this->_dispatch_base(*this, map_size, less, out_dists, coo_rows_b,
-                           product_func, accum_func, write_func, chunk_size,
-                           n_less_blocks, n_blocks_per_row);
+      this->_dispatch_base(*this,
+                           map_size,
+                           less,
+                           out_dists,
+                           coo_rows_b,
+                           product_func,
+                           accum_func,
+                           write_func,
+                           chunk_size,
+                           n_less_blocks,
+                           n_blocks_per_row);
     }
 
     auto more_rows = std::get<1>(n_rows_divided);
     if (more_rows > 0) {
-      rmm::device_uvector<value_idx> n_chunks_per_row(
-        more_rows + 1, this->config.handle.get_stream());
-      rmm::device_uvector<value_idx> chunk_indices(
-        0, this->config.handle.get_stream());
-      chunked_mask_row_it<value_idx>::init(
-        this->config.a_indptr, mask_indptr.data() + less_rows, more_rows,
-        capacity_threshold * map_size, n_chunks_per_row, chunk_indices,
-        this->config.handle.get_stream());
-
-      chunked_mask_row_it<value_idx> more(
-        this->config.a_indptr, more_rows, mask_indptr.data() + less_rows,
-        capacity_threshold * map_size, n_chunks_per_row.data(),
-        chunk_indices.data(), this->config.handle.get_stream());
+      rmm::device_uvector<value_idx> n_chunks_per_row(more_rows + 1,
+                                                      this->config.handle.get_stream());
+      rmm::device_uvector<value_idx> chunk_indices(0, this->config.handle.get_stream());
+      chunked_mask_row_it<value_idx>::init(this->config.a_indptr,
+                                           mask_indptr.data() + less_rows,
+                                           more_rows,
+                                           capacity_threshold * map_size,
+                                           n_chunks_per_row,
+                                           chunk_indices,
+                                           this->config.handle.get_stream());
+
+      chunked_mask_row_it<value_idx> more(this->config.a_indptr,
+                                          more_rows,
+                                          mask_indptr.data() + less_rows,
+                                          capacity_threshold * map_size,
+                                          n_chunks_per_row.data(),
+                                          chunk_indices.data(),
+                                          this->config.handle.get_stream());
 
       auto n_more_blocks = more.total_row_blocks * n_blocks_per_row;
-      this->_dispatch_base(*this, map_size, more, out_dists, coo_rows_b,
-                           product_func, accum_func, write_func, chunk_size,
-                           n_more_blocks, n_blocks_per_row);
+      this->_dispatch_base(*this,
+                           map_size,
+                           more,
+                           out_dists,
+                           coo_rows_b,
+                           product_func,
+                           accum_func,
+                           write_func,
+                           chunk_size,
+                           n_more_blocks,
+                           n_blocks_per_row);
     }
   }
 
   template <typename product_f, typename accum_f, typename write_f>
-  void dispatch_rev(value_t *out_dists, value_idx *coo_rows_a,
-                    product_f product_func, accum_f accum_func,
-                    write_f write_func, int chunk_size) {
+  void dispatch_rev(value_t* out_dists,
+                    value_idx* coo_rows_a,
+                    product_f product_func,
+                    accum_f accum_func,
+                    write_f write_func,
+                    int chunk_size)
+  {
     auto n_blocks_per_row = raft::ceildiv(this->config.a_nnz, chunk_size * tpb);
-    rmm::device_uvector<value_idx> mask_indptr(
-      this->config.b_nrows, this->config.handle.get_stream());
+    rmm::device_uvector<value_idx> mask_indptr(this->config.b_nrows,
+                                               this->config.handle.get_stream());
     std::tuple<value_idx, value_idx> n_rows_divided;
 
-    chunking_needed(this->config.b_indptr, this->config.b_nrows, mask_indptr,
-                    n_rows_divided, this->config.handle.get_stream());
+    chunking_needed(this->config.b_indptr,
+                    this->config.b_nrows,
+                    mask_indptr,
+                    n_rows_divided,
+                    this->config.handle.get_stream());
 
     auto less_rows = std::get<0>(n_rows_divided);
     if (less_rows > 0) {
-      mask_row_it<value_idx> less(this->config.b_indptr, less_rows,
-                                  mask_indptr.data());
+      mask_row_it<value_idx> less(this->config.b_indptr, less_rows, mask_indptr.data());
 
       auto n_less_blocks = less_rows * n_blocks_per_row;
-      this->_dispatch_base_rev(*this, map_size, less, out_dists, coo_rows_a,
-                               product_func, accum_func, write_func, chunk_size,
-                               n_less_blocks, n_blocks_per_row);
+      this->_dispatch_base_rev(*this,
+                               map_size,
+                               less,
+                               out_dists,
+                               coo_rows_a,
+                               product_func,
+                               accum_func,
+                               write_func,
+                               chunk_size,
+                               n_less_blocks,
+                               n_blocks_per_row);
     }
 
     auto more_rows = std::get<1>(n_rows_divided);
     if (more_rows > 0) {
-      rmm::device_uvector<value_idx> n_chunks_per_row(
-        more_rows + 1, this->config.handle.get_stream());
-      rmm::device_uvector<value_idx> chunk_indices(
-        0, this->config.handle.get_stream());
-      chunked_mask_row_it<value_idx>::init(
-        this->config.b_indptr, mask_indptr.data() + less_rows, more_rows,
-        capacity_threshold * map_size, n_chunks_per_row, chunk_indices,
-        this->config.handle.get_stream());
-
-      chunked_mask_row_it<value_idx> more(
-        this->config.b_indptr, more_rows, mask_indptr.data() + less_rows,
-        capacity_threshold * map_size, n_chunks_per_row.data(),
-        chunk_indices.data(), this->config.handle.get_stream());
+      rmm::device_uvector<value_idx> n_chunks_per_row(more_rows + 1,
+                                                      this->config.handle.get_stream());
+      rmm::device_uvector<value_idx> chunk_indices(0, this->config.handle.get_stream());
+      chunked_mask_row_it<value_idx>::init(this->config.b_indptr,
+                                           mask_indptr.data() + less_rows,
+                                           more_rows,
+                                           capacity_threshold * map_size,
+                                           n_chunks_per_row,
+                                           chunk_indices,
+                                           this->config.handle.get_stream());
+
+      chunked_mask_row_it<value_idx> more(this->config.b_indptr,
+                                          more_rows,
+                                          mask_indptr.data() + less_rows,
+                                          capacity_threshold * map_size,
+                                          n_chunks_per_row.data(),
+                                          chunk_indices.data(),
+                                          this->config.handle.get_stream());
 
       auto n_more_blocks = more.total_row_blocks * n_blocks_per_row;
-      this->_dispatch_base_rev(*this, map_size, more, out_dists, coo_rows_a,
-                               product_func, accum_func, write_func, chunk_size,
-                               n_more_blocks, n_blocks_per_row);
+      this->_dispatch_base_rev(*this,
+                               map_size,
+                               more,
+                               out_dists,
+                               coo_rows_a,
+                               product_func,
+                               accum_func,
+                               write_func,
+                               chunk_size,
+                               n_more_blocks,
+                               n_blocks_per_row);
     }
   }
 
-  __device__ inline insert_type init_insert(smem_type cache,
-                                            const value_idx &cache_size) {
+  __device__ inline insert_type init_insert(smem_type cache, const value_idx& cache_size)
+  {
     return insert_type::make_from_uninitialized_slots(
       cooperative_groups::this_thread_block(), cache, cache_size, -1, 0);
   }
 
-  __device__ inline void insert(insert_type cache, const value_idx &key,
-                                const value_t &value) {
+  __device__ inline void insert(insert_type cache, const value_idx& key, const value_t& value)
+  {
     auto success = cache.insert(cuco::pair<value_idx, value_t>(key, value));
   }
 
-  __device__ inline find_type init_find(smem_type cache,
-                                        const value_idx &cache_size) {
+  __device__ inline find_type init_find(smem_type cache, const value_idx& cache_size)
+  {
     return find_type(cache, cache_size, -1, 0);
   }
 
-  __device__ inline value_t find(find_type cache, const value_idx &key) {
+  __device__ inline value_t find(find_type cache, const value_idx& key)
+  {
     auto a_pair = cache.find(key);
 
     value_t a_col = 0.0;
-    if (a_pair != cache.end()) {
-      a_col = a_pair->second;
-    }
+    if (a_pair != cache.end()) { a_col = a_pair->second; }
     return a_col;
   }
 
   struct fits_in_hash_table {
    public:
-    fits_in_hash_table(const value_idx *indptr_, value_idx degree_l_,
-                       value_idx degree_r_)
-      : indptr(indptr_), degree_l(degree_l_), degree_r(degree_r_) {}
+    fits_in_hash_table(const value_idx* indptr_, value_idx degree_l_, value_idx degree_r_)
+      : indptr(indptr_), degree_l(degree_l_), degree_r(degree_r_)
+    {
+    }
 
-    __host__ __device__ bool operator()(const value_idx &i) {
+    __host__ __device__ bool operator()(const value_idx& i)
+    {
       auto degree = indptr[i + 1] - indptr[i];
 
       return degree >= degree_l && degree < degree_r;
     }
 
    private:
-    const value_idx *indptr;
+    const value_idx* indptr;
     const value_idx degree_l, degree_r;
   };
 
-  inline static int get_map_size() {
-    return (raft::getSharedMemPerBlock() -
-            ((tpb / raft::warp_size()) * sizeof(value_t))) /
+  inline static int get_map_size()
+  {
+    return (raft::getSharedMemPerBlock() - ((tpb / raft::warp_size()) * sizeof(value_t))) /
            sizeof(typename insert_type::slot_type);
   }
 
diff --git a/cpp/include/raft/sparse/distance/detail/ip_distance.cuh b/cpp/include/raft/sparse/distance/detail/ip_distance.cuh
index 2cd7b670d8..bde979a993 100644
--- a/cpp/include/raft/sparse/distance/detail/ip_distance.cuh
+++ b/cpp/include/raft/sparse/distance/detail/ip_distance.cuh
@@ -42,35 +42,38 @@ template <typename value_idx, typename value_t>
 class ip_distances_t : public distances_t<value_t> {
  public:
   /**
-         * Computes simple sparse inner product distances as sum(x_y * y_k)
-         * @param[in] config specifies inputs, outputs, and sizes
-         */
-  ip_distances_t(const distances_config_t<value_idx, value_t> &config)
-    : config_(&config), coo_rows_b(config.b_nnz, config.handle.get_stream()) {
-    raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows,
-                                      coo_rows_b.data(), config_->b_nnz,
+   * Computes simple sparse inner product distances as sum(x_y * y_k)
+   * @param[in] config specifies inputs, outputs, and sizes
+   */
+  ip_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config), coo_rows_b(config.b_nnz, config.handle.get_stream())
+  {
+    raft::sparse::convert::csr_to_coo(config_->b_indptr,
+                                      config_->b_nrows,
+                                      coo_rows_b.data(),
+                                      config_->b_nnz,
                                       config_->handle.get_stream());
   }
 
   /**
-         * Performs pairwise distance computation and computes output distances
-         * @param out_distances dense output matrix (size a_nrows * b_nrows)
-         */
-  void compute(value_t *out_distances) {
+   * Performs pairwise distance computation and computes output distances
+   * @param out_distances dense output matrix (size a_nrows * b_nrows)
+   */
+  void compute(value_t* out_distances)
+  {
     /**
-               * Compute pairwise distances and return dense matrix in row-major format
-               */
+     * Compute pairwise distances and return dense matrix in row-major format
+     */
     balanced_coo_pairwise_generalized_spmv<value_idx, value_t>(
-      out_distances, *config_, coo_rows_b.data(), Product(), Sum(),
-      AtomicAdd());
+      out_distances, *config_, coo_rows_b.data(), Product(), Sum(), AtomicAdd());
   }
 
-  value_idx *b_rows_coo() { return coo_rows_b.data(); }
+  value_idx* b_rows_coo() { return coo_rows_b.data(); }
 
-  value_t *b_data_coo() { return config_->b_data; }
+  value_t* b_data_coo() { return config_->b_data; }
 
  private:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
   rmm::device_uvector<value_idx> coo_rows_b;
 };
 
diff --git a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh
index f06a15215c..a4a534823f 100644
--- a/cpp/include/raft/sparse/distance/detail/l2_distance.cuh
+++ b/cpp/include/raft/sparse/distance/detail/l2_distance.cuh
@@ -38,35 +38,36 @@ namespace detail {
 
 // @TODO: Move this into sparse prims (coo_norm)
 template <typename value_idx, typename value_t>
-__global__ void compute_row_norm_kernel(value_t *out,
-                                        const value_idx *__restrict__ coo_rows,
-                                        const value_t *__restrict__ data,
-                                        value_idx nnz) {
+__global__ void compute_row_norm_kernel(value_t* out,
+                                        const value_idx* __restrict__ coo_rows,
+                                        const value_t* __restrict__ data,
+                                        value_idx nnz)
+{
   value_idx i = blockDim.x * blockIdx.x + threadIdx.x;
-  if (i < nnz) {
-    atomicAdd(&out[coo_rows[i]], data[i] * data[i]);
-  }
+  if (i < nnz) { atomicAdd(&out[coo_rows[i]], data[i] * data[i]); }
 }
 
 template <typename value_idx, typename value_t>
-__global__ void compute_row_sum_kernel(value_t *out,
-                                       const value_idx *__restrict__ coo_rows,
-                                       const value_t *__restrict__ data,
-                                       value_idx nnz) {
+__global__ void compute_row_sum_kernel(value_t* out,
+                                       const value_idx* __restrict__ coo_rows,
+                                       const value_t* __restrict__ data,
+                                       value_idx nnz)
+{
   value_idx i = blockDim.x * blockIdx.x + threadIdx.x;
-  if (i < nnz) {
-    atomicAdd(&out[coo_rows[i]], data[i]);
-  }
+  if (i < nnz) { atomicAdd(&out[coo_rows[i]], data[i]); }
 }
 
 template <typename value_idx, typename value_t, typename expansion_f>
-__global__ void compute_euclidean_warp_kernel(
-  value_t *__restrict__ C, const value_t *__restrict__ Q_sq_norms,
-  const value_t *__restrict__ R_sq_norms, value_idx n_rows, value_idx n_cols,
-  expansion_f expansion_func) {
+__global__ void compute_euclidean_warp_kernel(value_t* __restrict__ C,
+                                              const value_t* __restrict__ Q_sq_norms,
+                                              const value_t* __restrict__ R_sq_norms,
+                                              value_idx n_rows,
+                                              value_idx n_cols,
+                                              expansion_f expansion_func)
+{
   std::size_t tid = blockDim.x * blockIdx.x + threadIdx.x;
-  value_idx i = tid / n_cols;
-  value_idx j = tid % n_cols;
+  value_idx i     = tid / n_cols;
+  value_idx j     = tid % n_cols;
 
   if (i >= n_rows || j >= n_cols) return;
 
@@ -80,25 +81,29 @@ __global__ void compute_euclidean_warp_kernel(
 }
 
 template <typename value_idx, typename value_t>
-__global__ void compute_correlation_warp_kernel(
-  value_t *__restrict__ C, const value_t *__restrict__ Q_sq_norms,
-  const value_t *__restrict__ R_sq_norms, const value_t *__restrict__ Q_norms,
-  const value_t *__restrict__ R_norms, value_idx n_rows, value_idx n_cols,
-  value_idx n) {
+__global__ void compute_correlation_warp_kernel(value_t* __restrict__ C,
+                                                const value_t* __restrict__ Q_sq_norms,
+                                                const value_t* __restrict__ R_sq_norms,
+                                                const value_t* __restrict__ Q_norms,
+                                                const value_t* __restrict__ R_norms,
+                                                value_idx n_rows,
+                                                value_idx n_cols,
+                                                value_idx n)
+{
   std::size_t tid = blockDim.x * blockIdx.x + threadIdx.x;
-  value_idx i = tid / n_cols;
-  value_idx j = tid % n_cols;
+  value_idx i     = tid / n_cols;
+  value_idx j     = tid % n_cols;
 
   if (i >= n_rows || j >= n_cols) return;
 
-  value_t dot = C[(size_t)i * n_cols + j];
+  value_t dot  = C[(size_t)i * n_cols + j];
   value_t Q_l1 = Q_norms[i];
   value_t R_l1 = R_norms[j];
 
   value_t Q_l2 = Q_sq_norms[i];
   value_t R_l2 = R_sq_norms[j];
 
-  value_t numer = n * dot - (Q_l1 * R_l1);
+  value_t numer   = n * dot - (Q_l1 * R_l1);
   value_t Q_denom = n * Q_l2 - (Q_l1 * Q_l1);
   value_t R_denom = n * R_l2 - (R_l1 * R_l1);
 
@@ -108,56 +113,75 @@ __global__ void compute_correlation_warp_kernel(
   C[(size_t)i * n_cols + j] = val * (fabs(val) >= 0.0001);
 }
 
-template <typename value_idx, typename value_t, int tpb = 256,
-          typename expansion_f>
-void compute_euclidean(value_t *C, const value_t *Q_sq_norms,
-                       const value_t *R_sq_norms, value_idx n_rows,
-                       value_idx n_cols, cudaStream_t stream,
-                       expansion_f expansion_func) {
+template <typename value_idx, typename value_t, int tpb = 256, typename expansion_f>
+void compute_euclidean(value_t* C,
+                       const value_t* Q_sq_norms,
+                       const value_t* R_sq_norms,
+                       value_idx n_rows,
+                       value_idx n_cols,
+                       cudaStream_t stream,
+                       expansion_f expansion_func)
+{
   int blocks = raft::ceildiv<size_t>((size_t)n_rows * n_cols, tpb);
   compute_euclidean_warp_kernel<<<blocks, tpb, 0, stream>>>(
     C, Q_sq_norms, R_sq_norms, n_rows, n_cols, expansion_func);
 }
 
-template <typename value_idx, typename value_t, int tpb = 256,
-          typename expansion_f>
-void compute_l2(value_t *out, const value_idx *Q_coo_rows,
-                const value_t *Q_data, value_idx Q_nnz,
-                const value_idx *R_coo_rows, const value_t *R_data,
-                value_idx R_nnz, value_idx m, value_idx n, cudaStream_t stream,
-                expansion_f expansion_func) {
+template <typename value_idx, typename value_t, int tpb = 256, typename expansion_f>
+void compute_l2(value_t* out,
+                const value_idx* Q_coo_rows,
+                const value_t* Q_data,
+                value_idx Q_nnz,
+                const value_idx* R_coo_rows,
+                const value_t* R_data,
+                value_idx R_nnz,
+                value_idx m,
+                value_idx n,
+                cudaStream_t stream,
+                expansion_f expansion_func)
+{
   rmm::device_uvector<value_t> Q_sq_norms(m, stream);
   rmm::device_uvector<value_t> R_sq_norms(n, stream);
-  CUDA_CHECK(
-    cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t)));
-  CUDA_CHECK(
-    cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t)));
+  CUDA_CHECK(cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t)));
+  CUDA_CHECK(cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t)));
 
   compute_row_norm_kernel<<<raft::ceildiv(Q_nnz, tpb), tpb, 0, stream>>>(
     Q_sq_norms.data(), Q_coo_rows, Q_data, Q_nnz);
   compute_row_norm_kernel<<<raft::ceildiv(R_nnz, tpb), tpb, 0, stream>>>(
     R_sq_norms.data(), R_coo_rows, R_data, R_nnz);
 
-  compute_euclidean(out, Q_sq_norms.data(), R_sq_norms.data(), m, n, stream,
-                    expansion_func);
+  compute_euclidean(out, Q_sq_norms.data(), R_sq_norms.data(), m, n, stream, expansion_func);
 }
 
 template <typename value_idx, typename value_t, int tpb = 256>
-void compute_correlation(value_t *C, const value_t *Q_sq_norms,
-                         const value_t *R_sq_norms, const value_t *Q_norms,
-                         const value_t *R_norms, value_idx n_rows,
-                         value_idx n_cols, value_idx n, cudaStream_t stream) {
+void compute_correlation(value_t* C,
+                         const value_t* Q_sq_norms,
+                         const value_t* R_sq_norms,
+                         const value_t* Q_norms,
+                         const value_t* R_norms,
+                         value_idx n_rows,
+                         value_idx n_cols,
+                         value_idx n,
+                         cudaStream_t stream)
+{
   int blocks = raft::ceildiv<size_t>((size_t)n_rows * n_cols, tpb);
   compute_correlation_warp_kernel<<<blocks, tpb, 0, stream>>>(
     C, Q_sq_norms, R_sq_norms, Q_norms, R_norms, n_rows, n_cols, n);
 }
 
 template <typename value_idx, typename value_t, int tpb = 256>
-void compute_corr(value_t *out, const value_idx *Q_coo_rows,
-                  const value_t *Q_data, value_idx Q_nnz,
-                  const value_idx *R_coo_rows, const value_t *R_data,
-                  value_idx R_nnz, value_idx m, value_idx n, value_idx n_cols,
-                  cudaStream_t stream) {
+void compute_corr(value_t* out,
+                  const value_idx* Q_coo_rows,
+                  const value_t* Q_data,
+                  value_idx Q_nnz,
+                  const value_idx* R_coo_rows,
+                  const value_t* R_data,
+                  value_idx R_nnz,
+                  value_idx m,
+                  value_idx n,
+                  value_idx n_cols,
+                  cudaStream_t stream)
+{
   // sum_sq for std dev
   rmm::device_uvector<value_t> Q_sq_norms(m, stream);
   rmm::device_uvector<value_t> R_sq_norms(n, stream);
@@ -166,15 +190,11 @@ void compute_corr(value_t *out, const value_idx *Q_coo_rows,
   rmm::device_uvector<value_t> Q_norms(m, stream);
   rmm::device_uvector<value_t> R_norms(n, stream);
 
-  CUDA_CHECK(
-    cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t)));
-  CUDA_CHECK(
-    cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t)));
+  CUDA_CHECK(cudaMemsetAsync(Q_sq_norms.data(), 0, Q_sq_norms.size() * sizeof(value_t)));
+  CUDA_CHECK(cudaMemsetAsync(R_sq_norms.data(), 0, R_sq_norms.size() * sizeof(value_t)));
 
-  CUDA_CHECK(
-    cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t)));
-  CUDA_CHECK(
-    cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t)));
+  CUDA_CHECK(cudaMemsetAsync(Q_norms.data(), 0, Q_norms.size() * sizeof(value_t)));
+  CUDA_CHECK(cudaMemsetAsync(R_norms.data(), 0, R_norms.size() * sizeof(value_t)));
 
   compute_row_norm_kernel<<<raft::ceildiv(Q_nnz, tpb), tpb, 0, stream>>>(
     Q_sq_norms.data(), Q_coo_rows, Q_data, Q_nnz);
@@ -186,8 +206,15 @@ void compute_corr(value_t *out, const value_idx *Q_coo_rows,
   compute_row_sum_kernel<<<raft::ceildiv(R_nnz, tpb), tpb, 0, stream>>>(
     R_norms.data(), R_coo_rows, R_data, R_nnz);
 
-  compute_correlation(out, Q_sq_norms.data(), R_sq_norms.data(), Q_norms.data(),
-                      R_norms.data(), m, n, n_cols, stream);
+  compute_correlation(out,
+                      Q_sq_norms.data(),
+                      R_sq_norms.data(),
+                      Q_norms.data(),
+                      R_norms.data(),
+                      m,
+                      n,
+                      n_cols,
+                      stream);
 }
 
 /**
@@ -197,35 +224,44 @@ void compute_corr(value_t *out, const value_idx *Q_coo_rows,
 template <typename value_idx = int, typename value_t = float>
 class l2_expanded_distances_t : public distances_t<value_t> {
  public:
-  explicit l2_expanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config), ip_dists(config) {}
+  explicit l2_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config), ip_dists(config)
+  {
+  }
 
-  void compute(value_t *out_dists) {
+  void compute(value_t* out_dists)
+  {
     ip_dists.compute(out_dists);
 
-    value_idx *b_indices = ip_dists.b_rows_coo();
-    value_t *b_data = ip_dists.b_data_coo();
+    value_idx* b_indices = ip_dists.b_rows_coo();
+    value_t* b_data      = ip_dists.b_data_coo();
 
-    rmm::device_uvector<value_idx> search_coo_rows(
-      config_->a_nnz, config_->handle.get_stream());
-    raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
-                                      search_coo_rows.data(), config_->a_nnz,
+    rmm::device_uvector<value_idx> search_coo_rows(config_->a_nnz, config_->handle.get_stream());
+    raft::sparse::convert::csr_to_coo(config_->a_indptr,
+                                      config_->a_nrows,
+                                      search_coo_rows.data(),
+                                      config_->a_nnz,
                                       config_->handle.get_stream());
 
-    compute_l2(
-      out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz,
-      b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows,
-      config_->handle.get_stream(),
-      [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
-        return -2 * dot + q_norm + r_norm;
-      });
+    compute_l2(out_dists,
+               search_coo_rows.data(),
+               config_->a_data,
+               config_->a_nnz,
+               b_indices,
+               b_data,
+               config_->b_nnz,
+               config_->a_nrows,
+               config_->b_nrows,
+               config_->handle.get_stream(),
+               [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
+                 return -2 * dot + q_norm + r_norm;
+               });
   }
 
   ~l2_expanded_distances_t() = default;
 
  protected:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
   ip_distances_t<value_idx, value_t> ip_dists;
 };
 
@@ -234,18 +270,21 @@ class l2_expanded_distances_t : public distances_t<value_t> {
  * The expanded form is more efficient for sparse data.
  */
 template <typename value_idx = int, typename value_t = float>
-class l2_sqrt_expanded_distances_t
-  : public l2_expanded_distances_t<value_idx, value_t> {
+class l2_sqrt_expanded_distances_t : public l2_expanded_distances_t<value_idx, value_t> {
  public:
-  explicit l2_sqrt_expanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : l2_expanded_distances_t<value_idx, value_t>(config) {}
+  explicit l2_sqrt_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : l2_expanded_distances_t<value_idx, value_t>(config)
+  {
+  }
 
-  void compute(value_t *out_dists) override {
+  void compute(value_t* out_dists) override
+  {
     l2_expanded_distances_t<value_idx, value_t>::compute(out_dists);
     // Sqrt Post-processing
     raft::linalg::unaryOp<value_t>(
-      out_dists, out_dists, this->config_->a_nrows * this->config_->b_nrows,
+      out_dists,
+      out_dists,
+      this->config_->a_nrows * this->config_->b_nrows,
       [] __device__(value_t input) {
         int neg = input < 0 ? -1 : 1;
         return sqrt(abs(input) * neg);
@@ -259,79 +298,96 @@ class l2_sqrt_expanded_distances_t
 template <typename value_idx, typename value_t>
 class correlation_expanded_distances_t : public distances_t<value_t> {
  public:
-  explicit correlation_expanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config), ip_dists(config) {}
+  explicit correlation_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config), ip_dists(config)
+  {
+  }
 
-  void compute(value_t *out_dists) {
+  void compute(value_t* out_dists)
+  {
     ip_dists.compute(out_dists);
 
-    value_idx *b_indices = ip_dists.b_rows_coo();
-    value_t *b_data = ip_dists.b_data_coo();
+    value_idx* b_indices = ip_dists.b_rows_coo();
+    value_t* b_data      = ip_dists.b_data_coo();
 
-    rmm::device_uvector<value_idx> search_coo_rows(
-      config_->a_nnz, config_->handle.get_stream());
-    raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
-                                      search_coo_rows.data(), config_->a_nnz,
+    rmm::device_uvector<value_idx> search_coo_rows(config_->a_nnz, config_->handle.get_stream());
+    raft::sparse::convert::csr_to_coo(config_->a_indptr,
+                                      config_->a_nrows,
+                                      search_coo_rows.data(),
+                                      config_->a_nnz,
                                       config_->handle.get_stream());
 
-    compute_corr(out_dists, search_coo_rows.data(), config_->a_data,
-                 config_->a_nnz, b_indices, b_data, config_->b_nnz,
-                 config_->a_nrows, config_->b_nrows, config_->b_ncols,
+    compute_corr(out_dists,
+                 search_coo_rows.data(),
+                 config_->a_data,
+                 config_->a_nnz,
+                 b_indices,
+                 b_data,
+                 config_->b_nnz,
+                 config_->a_nrows,
+                 config_->b_nrows,
+                 config_->b_ncols,
                  config_->handle.get_stream());
   }
 
   ~correlation_expanded_distances_t() = default;
 
  protected:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
   ip_distances_t<value_idx, value_t> ip_dists;
 };
 
 /**
- * Cosine distance using the expanded form: 1 - ( sum(x_k * y_k) / (sqrt(sum(x_k)^2) * sqrt(sum(y_k)^2)))
- * The expanded form is more efficient for sparse data.
+ * Cosine distance using the expanded form, which is more efficient for sparse data:
+ *   1 - (sum(x_k * y_k) / (sqrt(sum(x_k^2)) * sqrt(sum(y_k^2))))
  */
 template <typename value_idx = int, typename value_t = float>
 class cosine_expanded_distances_t : public distances_t<value_t> {
  public:
-  explicit cosine_expanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config),
-      workspace(0, config.handle.get_stream()),
-      ip_dists(config) {}
+  explicit cosine_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config)
+  {
+  }
 
-  void compute(value_t *out_dists) {
+  void compute(value_t* out_dists)
+  {
     ip_dists.compute(out_dists);
 
-    value_idx *b_indices = ip_dists.b_rows_coo();
-    value_t *b_data = ip_dists.b_data_coo();
+    value_idx* b_indices = ip_dists.b_rows_coo();
+    value_t* b_data      = ip_dists.b_data_coo();
 
-    rmm::device_uvector<value_idx> search_coo_rows(
-      config_->a_nnz, config_->handle.get_stream());
-    raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
-                                      search_coo_rows.data(), config_->a_nnz,
+    rmm::device_uvector<value_idx> search_coo_rows(config_->a_nnz, config_->handle.get_stream());
+    raft::sparse::convert::csr_to_coo(config_->a_indptr,
+                                      config_->a_nrows,
+                                      search_coo_rows.data(),
+                                      config_->a_nnz,
                                       config_->handle.get_stream());
 
-    compute_l2(
-      out_dists, search_coo_rows.data(), config_->a_data, config_->a_nnz,
-      b_indices, b_data, config_->b_nnz, config_->a_nrows, config_->b_nrows,
-      config_->handle.get_stream(),
-      [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
-        value_t norms = sqrt(q_norm) * sqrt(r_norm);
-        // deal with potential for 0 in denominator by forcing 0/1 instead
-        value_t cos = ((norms != 0) * dot) / ((norms == 0) + norms);
-
-        // flip the similarity when both rows are 0
-        bool both_empty = (q_norm == 0) && (r_norm == 0);
-        return 1 - ((!both_empty * cos) + both_empty);
-      });
+    compute_l2(out_dists,
+               search_coo_rows.data(),
+               config_->a_data,
+               config_->a_nnz,
+               b_indices,
+               b_data,
+               config_->b_nnz,
+               config_->a_nrows,
+               config_->b_nrows,
+               config_->handle.get_stream(),
+               [] __device__ __host__(value_t dot, value_t q_norm, value_t r_norm) {
+                 value_t norms = sqrt(q_norm) * sqrt(r_norm);
+                 // deal with potential for 0 in denominator by forcing 0/1 instead
+                 value_t cos = ((norms != 0) * dot) / ((norms == 0) + norms);
+
+                 // flip the similarity when both rows are 0
+                 bool both_empty = (q_norm == 0) && (r_norm == 0);
+                 return 1 - ((!both_empty * cos) + both_empty);
+               });
   }
 
   ~cosine_expanded_distances_t() = default;
 
  private:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
   rmm::device_uvector<char> workspace;
   ip_distances_t<value_idx, value_t> ip_dists;
 };
@@ -348,25 +404,34 @@ class cosine_expanded_distances_t : public distances_t<value_t> {
 template <typename value_idx = int, typename value_t = float>
 class hellinger_expanded_distances_t : public distances_t<value_t> {
  public:
-  explicit hellinger_expanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config), workspace(0, config.handle.get_stream()) {}
+  explicit hellinger_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config), workspace(0, config.handle.get_stream())
+  {
+  }
 
-  void compute(value_t *out_dists) {
+  void compute(value_t* out_dists)
+  {
     rmm::device_uvector<value_idx> coo_rows(max(config_->b_nnz, config_->a_nnz),
                                             config_->handle.get_stream());
 
-    raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows,
-                                      coo_rows.data(), config_->b_nnz,
+    raft::sparse::convert::csr_to_coo(config_->b_indptr,
+                                      config_->b_nrows,
+                                      coo_rows.data(),
+                                      config_->b_nnz,
                                       config_->handle.get_stream());
 
     balanced_coo_pairwise_generalized_spmv<value_idx, value_t>(
-      out_dists, *config_, coo_rows.data(),
-      [] __device__(value_t a, value_t b) { return sqrt(a) * sqrt(b); }, Sum(),
+      out_dists,
+      *config_,
+      coo_rows.data(),
+      [] __device__(value_t a, value_t b) { return sqrt(a) * sqrt(b); },
+      Sum(),
       AtomicAdd());
 
     raft::linalg::unaryOp<value_t>(
-      out_dists, out_dists, config_->a_nrows * config_->b_nrows,
+      out_dists,
+      out_dists,
+      config_->a_nrows * config_->b_nrows,
       [=] __device__(value_t input) {
         // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative
         bool rectifier = (1 - input) > 0;
@@ -378,42 +443,43 @@ class hellinger_expanded_distances_t : public distances_t<value_t> {
   ~hellinger_expanded_distances_t() = default;
 
  private:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
   rmm::device_uvector<char> workspace;
 };
 
 template <typename value_idx = int, typename value_t = float>
 class russelrao_expanded_distances_t : public distances_t<value_t> {
  public:
-  explicit russelrao_expanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config),
-      workspace(0, config.handle.get_stream()),
-      ip_dists(config) {}
+  explicit russelrao_expanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config), workspace(0, config.handle.get_stream()), ip_dists(config)
+  {
+  }
 
-  void compute(value_t *out_dists) {
+  void compute(value_t* out_dists)
+  {
     ip_dists.compute(out_dists);
 
-    value_t n_cols = config_->a_ncols;
+    value_t n_cols     = config_->a_ncols;
     value_t n_cols_inv = 1.0 / n_cols;
     raft::linalg::unaryOp<value_t>(
-      out_dists, out_dists, config_->a_nrows * config_->b_nrows,
+      out_dists,
+      out_dists,
+      config_->a_nrows * config_->b_nrows,
       [=] __device__(value_t input) { return (n_cols - input) * n_cols_inv; },
       config_->handle.get_stream());
 
-    auto exec_policy = rmm::exec_policy(config_->handle.get_stream());
-    auto diags = thrust::counting_iterator<value_idx>(0);
+    auto exec_policy  = rmm::exec_policy(config_->handle.get_stream());
+    auto diags        = thrust::counting_iterator<value_idx>(0);
     value_idx b_nrows = config_->b_nrows;
-    thrust::for_each(exec_policy, diags, diags + config_->a_nrows,
-                     [=] __device__(value_idx input) {
-                       out_dists[input * b_nrows + input] = 0.0;
-                     });
+    thrust::for_each(exec_policy, diags, diags + config_->a_nrows, [=] __device__(value_idx input) {
+      out_dists[input * b_nrows + input] = 0.0;
+    });
   }
 
   ~russelrao_expanded_distances_t() = default;
 
  private:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
   rmm::device_uvector<char> workspace;
   ip_distances_t<value_idx, value_t> ip_dists;
 };
diff --git a/cpp/include/raft/sparse/distance/detail/lp_distance.cuh b/cpp/include/raft/sparse/distance/detail/lp_distance.cuh
index c11369375b..f5e7c75988 100644
--- a/cpp/include/raft/sparse/distance/detail/lp_distance.cuh
+++ b/cpp/include/raft/sparse/distance/detail/lp_distance.cuh
@@ -39,23 +39,33 @@ namespace sparse {
 namespace distance {
 namespace detail {
 
-template <typename value_idx = int, typename value_t = float,
-          typename product_f, typename accum_f, typename write_f>
-void unexpanded_lp_distances(
-  value_t *out_dists, const distances_config_t<value_idx, value_t> *config_,
-  product_f product_func, accum_f accum_func, write_f write_func) {
+template <typename value_idx = int,
+          typename value_t   = float,
+          typename product_f,
+          typename accum_f,
+          typename write_f>
+void unexpanded_lp_distances(value_t* out_dists,
+                             const distances_config_t<value_idx, value_t>* config_,
+                             product_f product_func,
+                             accum_f accum_func,
+                             write_f write_func)
+{
   rmm::device_uvector<value_idx> coo_rows(max(config_->b_nnz, config_->a_nnz),
                                           config_->handle.get_stream());
 
-  raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows,
-                                    coo_rows.data(), config_->b_nnz,
+  raft::sparse::convert::csr_to_coo(config_->b_indptr,
+                                    config_->b_nrows,
+                                    coo_rows.data(),
+                                    config_->b_nnz,
                                     config_->handle.get_stream());
 
   balanced_coo_pairwise_generalized_spmv<value_idx, value_t>(
     out_dists, *config_, coo_rows.data(), product_func, accum_func, write_func);
 
-  raft::sparse::convert::csr_to_coo(config_->a_indptr, config_->a_nrows,
-                                    coo_rows.data(), config_->a_nnz,
+  raft::sparse::convert::csr_to_coo(config_->a_indptr,
+                                    config_->a_nrows,
+                                    coo_rows.data(),
+                                    config_->a_nnz,
                                     config_->handle.get_stream());
 
   balanced_coo_pairwise_generalized_spmv_rev<value_idx, value_t>(
@@ -72,48 +82,51 @@ void unexpanded_lp_distances(
 template <typename value_idx = int, typename value_t = float>
 class l1_unexpanded_distances_t : public distances_t<value_t> {
  public:
-  l1_unexpanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config) {}
+  l1_unexpanded_distances_t(const distances_config_t<value_idx, value_t>& config) : config_(&config)
+  {
+  }
 
-  void compute(value_t *out_dists) {
-    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, AbsDiff(),
-                                                Sum(), AtomicAdd());
+  void compute(value_t* out_dists)
+  {
+    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, AbsDiff(), Sum(), AtomicAdd());
   }
 
  private:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
 };
 
 template <typename value_idx = int, typename value_t = float>
 class l2_unexpanded_distances_t : public distances_t<value_t> {
  public:
-  l2_unexpanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config) {}
+  l2_unexpanded_distances_t(const distances_config_t<value_idx, value_t>& config) : config_(&config)
+  {
+  }
 
-  void compute(value_t *out_dists) {
-    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, SqDiff(),
-                                                Sum(), AtomicAdd());
+  void compute(value_t* out_dists)
+  {
+    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, SqDiff(), Sum(), AtomicAdd());
   }
 
  protected:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
 };
 
 template <typename value_idx = int, typename value_t = float>
-class l2_sqrt_unexpanded_distances_t
-  : public l2_unexpanded_distances_t<value_idx, value_t> {
+class l2_sqrt_unexpanded_distances_t : public l2_unexpanded_distances_t<value_idx, value_t> {
  public:
-  l2_sqrt_unexpanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : l2_unexpanded_distances_t<value_idx, value_t>(config) {}
+  l2_sqrt_unexpanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : l2_unexpanded_distances_t<value_idx, value_t>(config)
+  {
+  }
 
-  void compute(value_t *out_dists) {
+  void compute(value_t* out_dists)
+  {
     l2_unexpanded_distances_t<value_idx, value_t>::compute(out_dists);
     // Sqrt Post-processing
     raft::linalg::unaryOp<value_t>(
-      out_dists, out_dists, this->config_->a_nrows * this->config_->b_nrows,
+      out_dists,
+      out_dists,
+      this->config_->a_nrows * this->config_->b_nrows,
       [] __device__(value_t input) {
         int neg = input < 0 ? -1 : 1;
         return sqrt(abs(input) * neg);
@@ -125,29 +138,33 @@ class l2_sqrt_unexpanded_distances_t
 template <typename value_idx = int, typename value_t = float>
 class linf_unexpanded_distances_t : public distances_t<value_t> {
  public:
-  explicit linf_unexpanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config) {}
+  explicit linf_unexpanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config)
+  {
+  }
 
-  void compute(value_t *out_dists) {
-    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, AbsDiff(),
-                                                Max(), AtomicMax());
+  void compute(value_t* out_dists)
+  {
+    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, AbsDiff(), Max(), AtomicMax());
   }
 
  private:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
 };
 
 template <typename value_idx = int, typename value_t = float>
 class canberra_unexpanded_distances_t : public distances_t<value_t> {
  public:
-  explicit canberra_unexpanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config) {}
+  explicit canberra_unexpanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config)
+  {
+  }
 
-  void compute(value_t *out_dists) {
+  void compute(value_t* out_dists)
+  {
     unexpanded_lp_distances<value_idx, value_t>(
-      out_dists, config_,
+      out_dists,
+      config_,
       [] __device__(value_t a, value_t b) {
         value_t d = fabs(a) + fabs(b);
 
@@ -155,70 +172,82 @@ class canberra_unexpanded_distances_t : public distances_t<value_t> {
         // forcing 1/0 instead
         return ((d != 0) * fabs(a - b)) / (d + (d == 0));
       },
-      Sum(), AtomicAdd());
+      Sum(),
+      AtomicAdd());
   }
 
  private:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
 };
 
 template <typename value_idx = int, typename value_t = float>
 class lp_unexpanded_distances_t : public distances_t<value_t> {
  public:
-  explicit lp_unexpanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config, value_t p_)
-    : config_(&config), p(p_) {}
+  explicit lp_unexpanded_distances_t(const distances_config_t<value_idx, value_t>& config,
+                                     value_t p_)
+    : config_(&config), p(p_)
+  {
+  }
 
-  void compute(value_t *out_dists) {
-    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, PDiff(p),
-                                                Sum(), AtomicAdd());
+  void compute(value_t* out_dists)
+  {
+    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, PDiff(p), Sum(), AtomicAdd());
 
     float one_over_p = 1.0f / p;
     raft::linalg::unaryOp<value_t>(
-      out_dists, out_dists, config_->a_nrows * config_->b_nrows,
+      out_dists,
+      out_dists,
+      config_->a_nrows * config_->b_nrows,
       [=] __device__(value_t input) { return pow(input, one_over_p); },
       config_->handle.get_stream());
   }
 
  private:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
   value_t p;
 };
 
 template <typename value_idx = int, typename value_t = float>
 class hamming_unexpanded_distances_t : public distances_t<value_t> {
  public:
-  explicit hamming_unexpanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config) {}
+  explicit hamming_unexpanded_distances_t(const distances_config_t<value_idx, value_t>& config)
+    : config_(&config)
+  {
+  }
 
-  void compute(value_t *out_dists) {
-    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, NotEqual(),
-                                                Sum(), AtomicAdd());
+  void compute(value_t* out_dists)
+  {
+    unexpanded_lp_distances<value_idx, value_t>(out_dists, config_, NotEqual(), Sum(), AtomicAdd());
 
     value_t n_cols = 1.0 / config_->a_ncols;
     raft::linalg::unaryOp<value_t>(
-      out_dists, out_dists, config_->a_nrows * config_->b_nrows,
+      out_dists,
+      out_dists,
+      config_->a_nrows * config_->b_nrows,
       [=] __device__(value_t input) { return input * n_cols; },
       config_->handle.get_stream());
   }
 
  private:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
 };
 
 template <typename value_idx = int, typename value_t = float>
 class jensen_shannon_unexpanded_distances_t : public distances_t<value_t> {
  public:
   explicit jensen_shannon_unexpanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config) {}
+    const distances_config_t<value_idx, value_t>& config)
+    : config_(&config)
+  {
+  }
 
-  void compute(value_t *out_dists) {
+  void compute(value_t* out_dists)
+  {
     unexpanded_lp_distances<value_idx, value_t>(
-      out_dists, config_,
+      out_dists,
+      config_,
       [] __device__(value_t a, value_t b) {
-        value_t m = 0.5f * (a + b);
+        value_t m   = 0.5f * (a + b);
         bool a_zero = a == 0;
         bool b_zero = b == 0;
 
@@ -228,49 +257,61 @@ class jensen_shannon_unexpanded_distances_t : public distances_t<value_t> {
         bool x_zero = x == 0;
         bool y_zero = y == 0;
 
-        return (-a * (!x_zero * log(x + x_zero))) +
-               (-b * (!y_zero * log(y + y_zero)));
+        return (-a * (!x_zero * log(x + x_zero))) + (-b * (!y_zero * log(y + y_zero)));
       },
-      Sum(), AtomicAdd());
+      Sum(),
+      AtomicAdd());
 
     raft::linalg::unaryOp<value_t>(
-      out_dists, out_dists, config_->a_nrows * config_->b_nrows,
+      out_dists,
+      out_dists,
+      config_->a_nrows * config_->b_nrows,
       [=] __device__(value_t input) { return sqrt(0.5 * input); },
       config_->handle.get_stream());
   }
 
  private:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
 };
 
 template <typename value_idx = int, typename value_t = float>
 class kl_divergence_unexpanded_distances_t : public distances_t<value_t> {
  public:
   explicit kl_divergence_unexpanded_distances_t(
-    const distances_config_t<value_idx, value_t> &config)
-    : config_(&config) {}
+    const distances_config_t<value_idx, value_t>& config)
+    : config_(&config)
+  {
+  }
 
-  void compute(value_t *out_dists) {
+  void compute(value_t* out_dists)
+  {
     rmm::device_uvector<value_idx> coo_rows(max(config_->b_nnz, config_->a_nnz),
                                             config_->handle.get_stream());
 
-    raft::sparse::convert::csr_to_coo(config_->b_indptr, config_->b_nrows,
-                                      coo_rows.data(), config_->b_nnz,
+    raft::sparse::convert::csr_to_coo(config_->b_indptr,
+                                      config_->b_nrows,
+                                      coo_rows.data(),
+                                      config_->b_nnz,
                                       config_->handle.get_stream());
 
     balanced_coo_pairwise_generalized_spmv<value_idx, value_t>(
-      out_dists, *config_, coo_rows.data(),
-      [] __device__(value_t a, value_t b) { return a * log(a / b); }, Sum(),
+      out_dists,
+      *config_,
+      coo_rows.data(),
+      [] __device__(value_t a, value_t b) { return a * log(a / b); },
+      Sum(),
       AtomicAdd());
 
     raft::linalg::unaryOp<value_t>(
-      out_dists, out_dists, config_->a_nrows * config_->b_nrows,
+      out_dists,
+      out_dists,
+      config_->a_nrows * config_->b_nrows,
       [=] __device__(value_t input) { return 0.5 * input; },
       config_->handle.get_stream());
   }
 
  private:
-  const distances_config_t<value_idx, value_t> *config_;
+  const distances_config_t<value_idx, value_t>* config_;
 };
 
 };  // END namespace detail
diff --git a/cpp/include/raft/sparse/distance/detail/operators.cuh b/cpp/include/raft/sparse/distance/detail/operators.cuh
index 9f206095bf..b2c2e2172b 100644
--- a/cpp/include/raft/sparse/distance/detail/operators.cuh
+++ b/cpp/include/raft/sparse/distance/detail/operators.cuh
@@ -25,21 +25,24 @@ namespace detail {
 
 struct Sum {
   template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b)
+  {
     return a + b;
   }
 };
 
 struct NotEqual {
   template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b)
+  {
     return a != b;
   }
 };
 
 struct SqDiff {
   template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b)
+  {
     return (a - b) * (a - b);
   }
 };
@@ -50,44 +53,48 @@ struct PDiff {
   PDiff(float p_) : p(p_) {}
 
   template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b)
+  {
     return pow(a - b, p);
   }
 };
 
 struct Max {
   template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b)
+  {
     return fmax(a, b);
   }
 };
 
 struct AtomicAdd {
   template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t *a,
-                                                         value_t b) {
+  __host__ __device__ __forceinline__ value_t operator()(value_t* a, value_t b)
+  {
     return atomicAdd(a, b);
   }
 };
 
 struct AtomicMax {
   template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t *a,
-                                                         value_t b) {
+  __host__ __device__ __forceinline__ value_t operator()(value_t* a, value_t b)
+  {
     return atomicMax(a, b);
   }
 };
 
 struct Product {
   template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b)
+  {
     return a * b;
   }
 };
 
 struct AbsDiff {
   template <typename value_t>
-  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b) {
+  __host__ __device__ __forceinline__ value_t operator()(value_t a, value_t b)
+  {
     return fabs(a - b);
   }
 };
diff --git a/cpp/include/raft/sparse/distance/detail/utils.cuh b/cpp/include/raft/sparse/distance/detail/utils.cuh
index abfb7d24ea..8c01b33c1e 100644
--- a/cpp/include/raft/sparse/distance/detail/utils.cuh
+++ b/cpp/include/raft/sparse/distance/detail/utils.cuh
@@ -33,10 +33,10 @@ namespace detail {
  * @return the maximum number of columns that can be stored in smem
  */
 template <typename value_idx, typename value_t, int tpb = 1024>
-inline int max_cols_per_block() {
+inline int max_cols_per_block()
+{
   // max cols = (total smem available - cub reduction smem)
-  return (raft::getSharedMemPerBlock() -
-          ((tpb / raft::warp_size()) * sizeof(value_t))) /
+  return (raft::getSharedMemPerBlock() - ((tpb / raft::warp_size()) * sizeof(value_t))) /
          sizeof(value_t);
 }
 
diff --git a/cpp/include/raft/sparse/distance/distance.hpp b/cpp/include/raft/sparse/distance/distance.hpp
index 0aeabe5019..92c08654d2 100644
--- a/cpp/include/raft/sparse/distance/distance.hpp
+++ b/cpp/include/raft/sparse/distance/distance.hpp
@@ -71,90 +71,71 @@ static const std::unordered_set<raft::distance::DistanceType> supportedDistance{
  * @param[out] out dense output array (size A.nrows * B.nrows)
  * @param[in] input_config input argument configuration
  * @param[in] metric distance metric to use
-* @param[in] metric_arg metric argument (used for Minkowski distance)
+ * @param[in] metric_arg metric argument (used for Minkowski distance)
  */
 template <typename value_idx = int, typename value_t = float>
-void pairwiseDistance(value_t *out,
+void pairwiseDistance(value_t* out,
                       distances_config_t<value_idx, value_t> input_config,
-                      raft::distance::DistanceType metric, float metric_arg) {
+                      raft::distance::DistanceType metric,
+                      float metric_arg)
+{
   switch (metric) {
     case raft::distance::DistanceType::L2Expanded:
-      detail::l2_expanded_distances_t<value_idx, value_t>(input_config)
-        .compute(out);
+      detail::l2_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::L2SqrtExpanded:
-      detail::l2_sqrt_expanded_distances_t<value_idx, value_t>(input_config)
-        .compute(out);
+      detail::l2_sqrt_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::InnerProduct:
       detail::ip_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::L2Unexpanded:
-      detail::l2_unexpanded_distances_t<value_idx, value_t>(input_config)
-        .compute(out);
+      detail::l2_unexpanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::L2SqrtUnexpanded:
-      detail::l2_sqrt_unexpanded_distances_t<value_idx, value_t>(input_config)
-        .compute(out);
+      detail::l2_sqrt_unexpanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::L1:
-      detail::l1_unexpanded_distances_t<value_idx, value_t>(input_config)
-        .compute(out);
+      detail::l1_unexpanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::LpUnexpanded:
-      detail::lp_unexpanded_distances_t<value_idx, value_t>(input_config,
-                                                            metric_arg)
-        .compute(out);
+      detail::lp_unexpanded_distances_t<value_idx, value_t>(input_config, metric_arg).compute(out);
       break;
     case raft::distance::DistanceType::Linf:
-      detail::linf_unexpanded_distances_t<value_idx, value_t>(input_config)
-        .compute(out);
+      detail::linf_unexpanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::Canberra:
-      detail::canberra_unexpanded_distances_t<value_idx, value_t>(input_config)
-        .compute(out);
+      detail::canberra_unexpanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::JaccardExpanded:
-      detail::jaccard_expanded_distances_t<value_idx, value_t>(input_config)
-        .compute(out);
+      detail::jaccard_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::CosineExpanded:
-      detail::cosine_expanded_distances_t<value_idx, value_t>(input_config)
-        .compute(out);
+      detail::cosine_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::HellingerExpanded:
-      detail::hellinger_expanded_distances_t<value_idx, value_t>(input_config)
-        .compute(out);
+      detail::hellinger_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::DiceExpanded:
-      detail::dice_expanded_distances_t<value_idx, value_t>(input_config)
-        .compute(out);
+      detail::dice_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::CorrelationExpanded:
-      detail::correlation_expanded_distances_t<value_idx, value_t>(input_config)
-        .compute(out);
+      detail::correlation_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::RusselRaoExpanded:
-      detail::russelrao_expanded_distances_t<value_idx, value_t>(input_config)
-        .compute(out);
+      detail::russelrao_expanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::HammingUnexpanded:
-      detail::hamming_unexpanded_distances_t<value_idx, value_t>(input_config)
-        .compute(out);
+      detail::hamming_unexpanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::JensenShannon:
-      detail::jensen_shannon_unexpanded_distances_t<value_idx, value_t>(
-        input_config)
-        .compute(out);
+      detail::jensen_shannon_unexpanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
     case raft::distance::DistanceType::KLDivergence:
-      detail::kl_divergence_unexpanded_distances_t<value_idx, value_t>(
-        input_config)
-        .compute(out);
+      detail::kl_divergence_unexpanded_distances_t<value_idx, value_t>(input_config).compute(out);
       break;
 
-    default:
-      THROW("Unsupported distance: %d", metric);
+    default: THROW("Unsupported distance: %d", metric);
   }
 }
 
diff --git a/cpp/include/raft/sparse/hierarchy/common.h b/cpp/include/raft/sparse/hierarchy/common.h
index 29f541498b..1738dd7498 100644
--- a/cpp/include/raft/sparse/hierarchy/common.h
+++ b/cpp/include/raft/sparse/hierarchy/common.h
@@ -37,13 +37,15 @@ class linkage_output {
   value_idx n_leaves;
   value_idx n_connected_components;
 
-  value_idx *labels;  // size: m
+  value_idx* labels;  // size: m
 
-  value_idx *children;  // size: (m-1, 2)
+  value_idx* children;  // size: (m-1, 2)
 };
 
-class linkage_output_int_float : public linkage_output<int, float> {};
-class linkage_output__int64_float : public linkage_output<int64_t, float> {};
+class linkage_output_int_float : public linkage_output<int, float> {
+};
+class linkage_output__int64_float : public linkage_output<int64_t, float> {
+};
 
 };  // namespace hierarchy
 };  // namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh
index 4ef2ac43e2..207cca7287 100644
--- a/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh
+++ b/cpp/include/raft/sparse/hierarchy/detail/agglomerative.cuh
@@ -44,31 +44,32 @@ class UnionFind {
   value_idx n_indices;
 
   UnionFind(value_idx N_)
-    : n_indices(2 * N_ - 1),
-      parent(2 * N_ - 1, -1),
-      size(2 * N_ - 1, 1),
-      next_label(N_) {
+    : n_indices(2 * N_ - 1), parent(2 * N_ - 1, -1), size(2 * N_ - 1, 1), next_label(N_)
+  {
     memset(size.data() + N_, 0, (size.size() - N_) * sizeof(value_idx));
   }
 
-  value_idx find(value_idx n) {
+  value_idx find(value_idx n)
+  {
     value_idx p;
     p = n;
 
-    while (parent[n] != -1) n = parent[n];
+    while (parent[n] != -1)
+      n = parent[n];
 
     // path compression
     while (parent[p] != n) {
-      p = parent[p == -1 ? n_indices - 1 : p];
+      p                                   = parent[p == -1 ? n_indices - 1 : p];
       parent[p == -1 ? n_indices - 1 : p] = n;
     }
     return n;
   }
 
-  void perform_union(value_idx m, value_idx n) {
+  void perform_union(value_idx m, value_idx n)
+  {
     size[next_label] = size[m] + size[n];
-    parent[m] = next_label;
-    parent[n] = next_label;
+    parent[m]        = next_label;
+    parent[n]        = next_label;
 
     next_label += 1;
   }
@@ -97,10 +98,15 @@ class UnionFind {
  * @param[out] out_size cluster sizes of output
  */
 template <typename value_idx, typename value_t>
-void build_dendrogram_host(const handle_t &handle, const value_idx *rows,
-                           const value_idx *cols, const value_t *data,
-                           size_t nnz, value_idx *children, value_t *out_delta,
-                           value_idx *out_size) {
+void build_dendrogram_host(const handle_t& handle,
+                           const value_idx* rows,
+                           const value_idx* cols,
+                           const value_t* data,
+                           size_t nnz,
+                           value_idx* children,
+                           value_t* out_delta,
+                           value_idx* out_size)
+{
   auto stream = handle.get_stream();
 
   value_idx n_edges = nnz;
@@ -122,8 +128,8 @@ void build_dendrogram_host(const handle_t &handle, const value_idx *rows,
   UnionFind<value_idx, value_t> U(nnz + 1);
 
   for (std::size_t i = 0; i < nnz; i++) {
-    value_idx a = mst_src_h[i];
-    value_idx b = mst_dst_h[i];
+    value_idx a   = mst_src_h[i];
+    value_idx b   = mst_dst_h[i];
     value_t delta = mst_weights_h[i];
 
     value_idx aa = U.find(a);
@@ -131,10 +137,10 @@ void build_dendrogram_host(const handle_t &handle, const value_idx *rows,
 
     value_idx children_idx = i * 2;
 
-    children_h[children_idx] = aa;
+    children_h[children_idx]     = aa;
     children_h[children_idx + 1] = bb;
-    out_delta_h[i] = delta;
-    out_size_h[i] = U.size[aa] + U.size[bb];
+    out_delta_h[i]               = delta;
+    out_size_h[i]                = U.size[aa] + U.size[bb];
 
     U.perform_union(aa, bb);
   }
@@ -145,13 +151,15 @@ void build_dendrogram_host(const handle_t &handle, const value_idx *rows,
 }
 
 template <typename value_idx>
-__global__ void write_levels_kernel(const value_idx *children,
-                                    value_idx *parents, value_idx n_vertices) {
+__global__ void write_levels_kernel(const value_idx* children,
+                                    value_idx* parents,
+                                    value_idx n_vertices)
+{
   value_idx tid = blockDim.x * blockIdx.x + threadIdx.x;
   if (tid < n_vertices) {
     value_idx level = tid / 2;
     value_idx child = children[tid];
-    parents[child] = level;
+    parents[child]  = level;
   }
 }
 
@@ -167,14 +175,17 @@ __global__ void write_levels_kernel(const value_idx *children,
  * @param labels
  */
 template <typename value_idx>
-__global__ void inherit_labels(const value_idx *children,
-                               const value_idx *levels, std::size_t n_leaves,
-                               value_idx *labels, int cut_level,
-                               value_idx n_vertices) {
+__global__ void inherit_labels(const value_idx* children,
+                               const value_idx* levels,
+                               std::size_t n_leaves,
+                               value_idx* labels,
+                               int cut_level,
+                               value_idx n_vertices)
+{
   value_idx tid = blockDim.x * blockIdx.x + threadIdx.x;
 
   if (tid < n_vertices) {
-    value_idx node = children[tid];
+    value_idx node      = children[tid];
     value_idx cur_level = tid / 2;
 
     /**
@@ -184,12 +195,12 @@ __global__ void inherit_labels(const value_idx *children,
     if (cur_level > cut_level) return;
 
     value_idx cur_parent = node;
-    value_idx label = labels[cur_parent];
+    value_idx label      = labels[cur_parent];
 
     while (label == -1) {
       cur_parent = cur_level + n_leaves;
-      cur_level = levels[cur_parent];
-      label = labels[cur_parent];
+      cur_level  = levels[cur_parent];
+      label      = labels[cur_parent];
     }
 
     labels[node] = label;
@@ -198,15 +209,16 @@ __global__ void inherit_labels(const value_idx *children,
 
 template <typename value_idx>
 struct init_label_roots {
-  init_label_roots(value_idx *labels_) : labels(labels_) {}
+  init_label_roots(value_idx* labels_) : labels(labels_) {}
 
   template <typename Tuple>
-  __host__ __device__ void operator()(Tuple t) {
+  __host__ __device__ void operator()(Tuple t)
+  {
     labels[thrust::get<1>(t)] = thrust::get<0>(t);
   }
 
  private:
-  value_idx *labels;
+  value_idx* labels;
 };
 
 /**
@@ -222,10 +234,13 @@ struct init_label_roots {
  * @param n_leaves
  */
 template <typename value_idx, int tpb = 256>
-void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels,
-                                const value_idx *children, size_t n_clusters,
-                                size_t n_leaves) {
-  auto stream = handle.get_stream();
+void extract_flattened_clusters(const raft::handle_t& handle,
+                                value_idx* labels,
+                                const value_idx* children,
+                                size_t n_clusters,
+                                size_t n_leaves)
+{
+  auto stream        = handle.get_stream();
   auto thrust_policy = handle.get_thrust_policy();
 
   // Handle special case where n_clusters == 1
@@ -243,24 +258,21 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels,
 
     auto n_edges = (n_leaves - 1) * 2;
 
-    thrust::device_ptr<const value_idx> d_ptr =
-      thrust::device_pointer_cast(children);
-    value_idx n_vertices =
-      *(thrust::max_element(thrust_policy, d_ptr, d_ptr + n_edges)) + 1;
+    thrust::device_ptr<const value_idx> d_ptr = thrust::device_pointer_cast(children);
+    value_idx n_vertices = *(thrust::max_element(thrust_policy, d_ptr, d_ptr + n_edges)) + 1;
 
     // Prevent potential infinite loop from labeling disconnected
     // connectivities graph.
     RAFT_EXPECTS(n_leaves > 0, "n_leaves must be positive");
-    RAFT_EXPECTS(static_cast<std::size_t>(n_vertices) ==
-                   static_cast<std::size_t>((n_leaves - 1) * 2),
-                 "Multiple components found in MST or MST is invalid. "
-                 "Cannot find single-linkage solution.");
+    RAFT_EXPECTS(
+      static_cast<std::size_t>(n_vertices) == static_cast<std::size_t>((n_leaves - 1) * 2),
+      "Multiple components found in MST or MST is invalid. "
+      "Cannot find single-linkage solution.");
 
     rmm::device_uvector<value_idx> levels(n_vertices, stream);
 
     value_idx n_blocks = ceildiv(n_vertices, (value_idx)tpb);
-    write_levels_kernel<<<n_blocks, tpb, 0, stream>>>(children, levels.data(),
-                                                      n_vertices);
+    write_levels_kernel<<<n_blocks, tpb, 0, stream>>>(children, levels.data(), n_vertices);
     /**
      * Step 1: Find label roots:
      *
@@ -274,27 +286,26 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels,
     rmm::device_uvector<value_idx> label_roots(child_size, stream);
 
     value_idx children_cpy_start = n_edges - child_size;
-    raft::copy_async(label_roots.data(), children + children_cpy_start,
-                     child_size, stream);
+    raft::copy_async(label_roots.data(), children + children_cpy_start, child_size, stream);
 
-    thrust::sort(thrust_policy, label_roots.data(),
+    thrust::sort(thrust_policy,
+                 label_roots.data(),
                  label_roots.data() + (child_size),
                  thrust::greater<value_idx>());
 
     rmm::device_uvector<value_idx> tmp_labels(n_vertices, stream);
 
     // Init labels to -1
-    thrust::fill(thrust_policy, tmp_labels.data(),
-                 tmp_labels.data() + n_vertices, -1);
+    thrust::fill(thrust_policy, tmp_labels.data(), tmp_labels.data() + n_vertices, -1);
 
     // Write labels for cluster roots to "labels"
     thrust::counting_iterator<uint> first(0);
 
-    auto z_iter = thrust::make_zip_iterator(thrust::make_tuple(
-      first, label_roots.data() + (label_roots.size() - n_clusters)));
+    auto z_iter = thrust::make_zip_iterator(
+      thrust::make_tuple(first, label_roots.data() + (label_roots.size() - n_clusters)));
 
-    thrust::for_each(thrust_policy, z_iter, z_iter + n_clusters,
-                     init_label_roots<value_idx>(tmp_labels.data()));
+    thrust::for_each(
+      thrust_policy, z_iter, z_iter + n_clusters, init_label_roots<value_idx>(tmp_labels.data()));
 
     /**
      * Step 2: Propagate labels by having children iterate through their parents
@@ -304,9 +315,8 @@ void extract_flattened_clusters(const raft::handle_t &handle, value_idx *labels,
      */
     value_idx cut_level = (n_edges / 2) - (n_clusters - 1);
 
-    inherit_labels<<<n_blocks, tpb, 0, stream>>>(children, levels.data(),
-                                                 n_leaves, tmp_labels.data(),
-                                                 cut_level, n_vertices);
+    inherit_labels<<<n_blocks, tpb, 0, stream>>>(
+      children, levels.data(), n_leaves, tmp_labels.data(), cut_level, n_vertices);
 
     // copy tmp labels to actual labels
     raft::copy_async(labels, tmp_labels.data(), n_leaves, stream);
diff --git a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh
index 31e4a0f263..c06c24e100 100644
--- a/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh
+++ b/cpp/include/raft/sparse/hierarchy/detail/connectivities.cuh
@@ -36,14 +36,17 @@ namespace raft {
 namespace hierarchy {
 namespace detail {
 
-template <raft::hierarchy::LinkageDistance dist_type, typename value_idx,
-          typename value_t>
+template <raft::hierarchy::LinkageDistance dist_type, typename value_idx, typename value_t>
 struct distance_graph_impl {
-  void run(const raft::handle_t &handle, const value_t *X, size_t m, size_t n,
+  void run(const raft::handle_t& handle,
+           const value_t* X,
+           size_t m,
+           size_t n,
            raft::distance::DistanceType metric,
-           rmm::device_uvector<value_idx> &indptr,
-           rmm::device_uvector<value_idx> &indices,
-           rmm::device_uvector<value_t> &data, int c);
+           rmm::device_uvector<value_idx>& indptr,
+           rmm::device_uvector<value_idx>& indices,
+           rmm::device_uvector<value_t>& data,
+           int c);
 };
 
 /**
@@ -52,37 +55,41 @@ struct distance_graph_impl {
  * @tparam value_t
  */
 template <typename value_idx, typename value_t>
-struct distance_graph_impl<raft::hierarchy::LinkageDistance::KNN_GRAPH,
-                           value_idx, value_t> {
-  void run(const raft::handle_t &handle, const value_t *X, size_t m, size_t n,
+struct distance_graph_impl<raft::hierarchy::LinkageDistance::KNN_GRAPH, value_idx, value_t> {
+  void run(const raft::handle_t& handle,
+           const value_t* X,
+           size_t m,
+           size_t n,
            raft::distance::DistanceType metric,
-           rmm::device_uvector<value_idx> &indptr,
-           rmm::device_uvector<value_idx> &indices,
-           rmm::device_uvector<value_t> &data, int c) {
-    auto stream = handle.get_stream();
+           rmm::device_uvector<value_idx>& indptr,
+           rmm::device_uvector<value_idx>& indices,
+           rmm::device_uvector<value_t>& data,
+           int c)
+  {
+    auto stream        = handle.get_stream();
     auto thrust_policy = handle.get_thrust_policy();
 
     // Need to symmetrize knn into undirected graph
     raft::sparse::COO<value_t, value_idx> knn_graph_coo(stream);
 
-    raft::sparse::selection::knn_graph(handle, X, m, n, metric, knn_graph_coo,
-                                       c);
+    raft::sparse::selection::knn_graph(handle, X, m, n, metric, knn_graph_coo, c);
 
     indices.resize(knn_graph_coo.nnz, stream);
     data.resize(knn_graph_coo.nnz, stream);
 
     // self-loops get max distance
-    auto transform_in = thrust::make_zip_iterator(thrust::make_tuple(
-      knn_graph_coo.rows(), knn_graph_coo.cols(), knn_graph_coo.vals()));
-
-    thrust::transform(
-      thrust_policy, transform_in, transform_in + knn_graph_coo.nnz,
-      knn_graph_coo.vals(),
-      [=] __device__(const thrust::tuple<value_idx, value_idx, value_t> &tup) {
-        bool self_loop = thrust::get<0>(tup) == thrust::get<1>(tup);
-        return (self_loop * std::numeric_limits<value_t>::max()) +
-               (!self_loop * thrust::get<2>(tup));
-      });
+    auto transform_in = thrust::make_zip_iterator(
+      thrust::make_tuple(knn_graph_coo.rows(), knn_graph_coo.cols(), knn_graph_coo.vals()));
+
+    thrust::transform(thrust_policy,
+                      transform_in,
+                      transform_in + knn_graph_coo.nnz,
+                      knn_graph_coo.vals(),
+                      [=] __device__(const thrust::tuple<value_idx, value_idx, value_t>& tup) {
+                        bool self_loop = thrust::get<0>(tup) == thrust::get<1>(tup);
+                        return (self_loop * std::numeric_limits<value_t>::max()) +
+                               (!self_loop * thrust::get<2>(tup));
+                      });
 
     raft::sparse::convert::sorted_coo_to_csr(
       knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), m + 1, stream);
@@ -90,10 +97,8 @@ struct distance_graph_impl<raft::hierarchy::LinkageDistance::KNN_GRAPH,
     // TODO: Wouldn't need to copy here if we could compute knn
     // graph directly on the device uvectors
     // ref: https://github.com/rapidsai/raft/issues/227
-    raft::copy_async(indices.data(), knn_graph_coo.cols(), knn_graph_coo.nnz,
-                     stream);
-    raft::copy_async(data.data(), knn_graph_coo.vals(), knn_graph_coo.nnz,
-                     stream);
+    raft::copy_async(indices.data(), knn_graph_coo.cols(), knn_graph_coo.nnz, stream);
+    raft::copy_async(data.data(), knn_graph_coo.vals(), knn_graph_coo.nnz, stream);
   }
 };
 
@@ -113,13 +118,17 @@ struct distance_graph_impl<raft::hierarchy::LinkageDistance::KNN_GRAPH,
  * @param[out] c constant 'c' used for nearest neighbors-based distances
  *             which will guarantee k <= log(n) + c
  */
-template <typename value_idx, typename value_t,
-          raft::hierarchy::LinkageDistance dist_type>
-void get_distance_graph(const raft::handle_t &handle, const value_t *X,
-                        size_t m, size_t n, raft::distance::DistanceType metric,
-                        rmm::device_uvector<value_idx> &indptr,
-                        rmm::device_uvector<value_idx> &indices,
-                        rmm::device_uvector<value_t> &data, int c) {
+template <typename value_idx, typename value_t, raft::hierarchy::LinkageDistance dist_type>
+void get_distance_graph(const raft::handle_t& handle,
+                        const value_t* X,
+                        size_t m,
+                        size_t n,
+                        raft::distance::DistanceType metric,
+                        rmm::device_uvector<value_idx>& indptr,
+                        rmm::device_uvector<value_idx>& indices,
+                        rmm::device_uvector<value_t>& data,
+                        int c)
+{
   auto stream = handle.get_stream();
 
   indptr.resize(m + 1, stream);
diff --git a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh
index 6ef6f9879b..0c0b049f11 100644
--- a/cpp/include/raft/sparse/hierarchy/detail/mst.cuh
+++ b/cpp/include/raft/sparse/hierarchy/detail/mst.cuh
@@ -34,9 +34,10 @@ namespace hierarchy {
 namespace detail {
 
 template <typename value_idx, typename value_t>
-void merge_msts(raft::Graph_COO<value_idx, value_idx, value_t> &coo1,
-                raft::Graph_COO<value_idx, value_idx, value_t> &coo2,
-                cudaStream_t stream) {
+void merge_msts(raft::Graph_COO<value_idx, value_idx, value_t>& coo1,
+                raft::Graph_COO<value_idx, value_idx, value_t>& coo2,
+                cudaStream_t stream)
+{
   /** Add edges to existing mst **/
   int final_nnz = coo2.n_edges + coo1.n_edges;
 
@@ -47,12 +48,9 @@ void merge_msts(raft::Graph_COO<value_idx, value_idx, value_t> &coo1,
   /**
    * Construct final edge list
    */
-  raft::copy_async(coo1.src.data() + coo1.n_edges, coo2.src.data(),
-                   coo2.n_edges, stream);
-  raft::copy_async(coo1.dst.data() + coo1.n_edges, coo2.dst.data(),
-                   coo2.n_edges, stream);
-  raft::copy_async(coo1.weights.data() + coo1.n_edges, coo2.weights.data(),
-                   coo2.n_edges, stream);
+  raft::copy_async(coo1.src.data() + coo1.n_edges, coo2.src.data(), coo2.n_edges, stream);
+  raft::copy_async(coo1.dst.data() + coo1.n_edges, coo2.dst.data(), coo2.n_edges, stream);
+  raft::copy_async(coo1.weights.data() + coo1.n_edges, coo2.weights.data(), coo2.n_edges, stream);
 
   coo1.n_edges = final_nnz;
 }
@@ -71,12 +69,16 @@ void merge_msts(raft::Graph_COO<value_idx, value_idx, value_t> &coo1,
  * @return updated MST edge list
  */
 template <typename value_idx, typename value_t, typename red_op>
-void connect_knn_graph(const raft::handle_t &handle, const value_t *X,
-                       raft::Graph_COO<value_idx, value_idx, value_t> &msf,
-                       size_t m, size_t n, value_idx *color,
-                       red_op reduction_op,
-                       raft::distance::DistanceType metric =
-                         raft::distance::DistanceType::L2SqrtExpanded) {
+void connect_knn_graph(
+  const raft::handle_t& handle,
+  const value_t* X,
+  raft::Graph_COO<value_idx, value_idx, value_t>& msf,
+  size_t m,
+  size_t n,
+  value_idx* color,
+  red_op reduction_op,
+  raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded)
+{
   auto stream = handle.get_stream();
 
   raft::sparse::COO<value_t, value_idx> connected_edges(stream);
@@ -90,9 +92,16 @@ void connect_knn_graph(const raft::handle_t &handle, const value_t *X,
 
   // On the second call, we hand the MST the original colors
   // and the new set of edges and let it restart the optimization process
-  auto new_mst = raft::mst::mst<value_idx, value_idx, value_t, double>(
-    handle, indptr2.data(), connected_edges.cols(), connected_edges.vals(), m,
-    connected_edges.nnz, color, stream, false, false);
+  auto new_mst = raft::mst::mst<value_idx, value_idx, value_t, double>(handle,
+                                                                       indptr2.data(),
+                                                                       connected_edges.cols(),
+                                                                       connected_edges.vals(),
+                                                                       m,
+                                                                       connected_edges.nnz,
+                                                                       color,
+                                                                       stream,
+                                                                       false,
+                                                                       false);
 
   merge_msts<value_idx, value_t>(msf, new_mst, stream);
 }
@@ -122,28 +131,34 @@ void connect_knn_graph(const raft::handle_t &handle, const value_t *X,
  *  argument is really just a safeguard against the potential for infinite loops.
  */
 template <typename value_idx, typename value_t, typename red_op>
-void build_sorted_mst(const raft::handle_t &handle, const value_t *X,
-                      const value_idx *indptr, const value_idx *indices,
-                      const value_t *pw_dists, size_t m, size_t n,
-                      value_idx *mst_src, value_idx *mst_dst,
-                      value_t *mst_weight, value_idx *color, size_t nnz,
-                      red_op reduction_op,
-                      raft::distance::DistanceType metric =
-                        raft::distance::DistanceType::L2SqrtExpanded,
-                      int max_iter = 10) {
+void build_sorted_mst(
+  const raft::handle_t& handle,
+  const value_t* X,
+  const value_idx* indptr,
+  const value_idx* indices,
+  const value_t* pw_dists,
+  size_t m,
+  size_t n,
+  value_idx* mst_src,
+  value_idx* mst_dst,
+  value_t* mst_weight,
+  value_idx* color,
+  size_t nnz,
+  red_op reduction_op,
+  raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded,
+  int max_iter                        = 10)
+{
   auto stream = handle.get_stream();
 
   // We want to have MST initialize colors on first call.
   auto mst_coo = raft::mst::mst<value_idx, value_idx, value_t, double>(
-    handle, indptr, indices, pw_dists, (value_idx)m, nnz, color, stream, false,
-    true);
+    handle, indptr, indices, pw_dists, (value_idx)m, nnz, color, stream, false, true);
 
-  int iters = 1;
+  int iters        = 1;
   int n_components = linkage::get_n_components(color, m, stream);
 
   while (n_components > 1 && iters < max_iter) {
-    connect_knn_graph<value_idx, value_t>(handle, X, mst_coo, m, n, color,
-                                          reduction_op);
+    connect_knn_graph<value_idx, value_t>(handle, X, mst_coo, m, n, color, reduction_op);
 
     iters++;
 
@@ -170,9 +185,8 @@ void build_sorted_mst(const raft::handle_t &handle, const value_t *X,
                " or increase 'max_iter'",
                max_iter);
 
-  raft::sparse::op::coo_sort_by_weight(mst_coo.src.data(), mst_coo.dst.data(),
-                                       mst_coo.weights.data(), mst_coo.n_edges,
-                                       stream);
+  raft::sparse::op::coo_sort_by_weight(
+    mst_coo.src.data(), mst_coo.dst.data(), mst_coo.weights.data(), mst_coo.n_edges, stream);
 
   raft::copy_async(mst_src, mst_coo.src.data(), mst_coo.n_edges, stream);
   raft::copy_async(mst_dst, mst_coo.dst.data(), mst_coo.n_edges, stream);
diff --git a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp
index 06fffb8aed..3b6f1347ab 100644
--- a/cpp/include/raft/sparse/hierarchy/single_linkage.hpp
+++ b/cpp/include/raft/sparse/hierarchy/single_linkage.hpp
@@ -44,18 +44,24 @@ static const size_t EMPTY = 0;
  * @param[in] n number of columns in X
  * @param[in] metric distance metrix to use when constructing connectivities graph
  * @param[out] out struct containing output dendrogram and cluster assignments
- * @param[in] c a constant used when constructing connectivities from knn graph. Allows the indirect control
+ * @param[in] c a constant used when constructing connectivities from knn graph. Allows the indirect
+ control
  *            of k. The algorithm will set `k = log(n) + c`
  * @param[in] n_clusters number of clusters to assign data samples
  */
-template <typename value_idx, typename value_t,
+template <typename value_idx,
+          typename value_t,
           LinkageDistance dist_type = LinkageDistance::KNN_GRAPH>
-void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m,
-                    size_t n, raft::distance::DistanceType metric,
-                    linkage_output<value_idx, value_t> *out, int c,
-                    size_t n_clusters) {
-  ASSERT(n_clusters <= m,
-         "n_clusters must be less than or equal to the number of data points");
+void single_linkage(const raft::handle_t& handle,
+                    const value_t* X,
+                    size_t m,
+                    size_t n,
+                    raft::distance::DistanceType metric,
+                    linkage_output<value_idx, value_t>* out,
+                    int c,
+                    size_t n_clusters)
+{
+  ASSERT(n_clusters <= m, "n_clusters must be less than or equal to the number of data points");
 
   auto stream = handle.get_stream();
 
@@ -78,10 +84,20 @@ void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m,
    */
   rmm::device_uvector<value_idx> color(m, stream);
   raft::linkage::FixConnectivitiesRedOp<value_idx, value_t> op(color.data(), m);
-  detail::build_sorted_mst<value_idx, value_t>(
-    handle, X, indptr.data(), indices.data(), pw_dists.data(), m, n,
-    mst_rows.data(), mst_cols.data(), mst_data.data(), color.data(),
-    indices.size(), op, metric);
+  detail::build_sorted_mst<value_idx, value_t>(handle,
+                                               X,
+                                               indptr.data(),
+                                               indices.data(),
+                                               pw_dists.data(),
+                                               m,
+                                               n,
+                                               mst_rows.data(),
+                                               mst_cols.data(),
+                                               mst_data.data(),
+                                               color.data(),
+                                               indices.size(),
+                                               op,
+                                               metric);
 
   pw_dists.release();
 
@@ -93,15 +109,19 @@ void single_linkage(const raft::handle_t &handle, const value_t *X, size_t m,
   rmm::device_uvector<value_t> out_delta(n_edges, stream);
   rmm::device_uvector<value_idx> out_size(n_edges, stream);
   // Create dendrogram
-  detail::build_dendrogram_host<value_idx, value_t>(
-    handle, mst_rows.data(), mst_cols.data(), mst_data.data(), n_edges,
-    out->children, out_delta.data(), out_size.data());
-  detail::extract_flattened_clusters(handle, out->labels, out->children,
-                                     n_clusters, m);
-
-  out->m = m;
-  out->n_clusters = n_clusters;
-  out->n_leaves = m;
+  detail::build_dendrogram_host<value_idx, value_t>(handle,
+                                                    mst_rows.data(),
+                                                    mst_cols.data(),
+                                                    mst_data.data(),
+                                                    n_edges,
+                                                    out->children,
+                                                    out_delta.data(),
+                                                    out_size.data());
+  detail::extract_flattened_clusters(handle, out->labels, out->children, n_clusters, m);
+
+  out->m                      = m;
+  out->n_clusters             = n_clusters;
+  out->n_leaves               = m;
   out->n_connected_components = 1;
 }
 
diff --git a/cpp/include/raft/sparse/linalg/add.cuh b/cpp/include/raft/sparse/linalg/add.cuh
index 7ed627b9e2..0c17d55762 100644
--- a/cpp/include/raft/sparse/linalg/add.cuh
+++ b/cpp/include/raft/sparse/linalg/add.cuh
@@ -40,40 +40,47 @@ namespace sparse {
 namespace linalg {
 
 template <typename T, int TPB_X = 128>
-__global__ void csr_add_calc_row_counts_kernel(
-  const int *a_ind, const int *a_indptr, const T *a_val, int nnz1,
-  const int *b_ind, const int *b_indptr, const T *b_val, int nnz2, int m,
-  int *out_rowcounts) {
+__global__ void csr_add_calc_row_counts_kernel(const int* a_ind,
+                                               const int* a_indptr,
+                                               const T* a_val,
+                                               int nnz1,
+                                               const int* b_ind,
+                                               const int* b_indptr,
+                                               const T* b_val,
+                                               int nnz2,
+                                               int m,
+                                               int* out_rowcounts)
+{
   // loop through columns in each set of rows and
   // calculate number of unique cols across both rows
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
 
   if (row < m) {
     int a_start_idx = a_ind[row];
-    int a_stop_idx = get_stop_idx(row, m, nnz1, a_ind);
+    int a_stop_idx  = get_stop_idx(row, m, nnz1, a_ind);
 
     int b_start_idx = b_ind[row];
-    int b_stop_idx = get_stop_idx(row, m, nnz2, b_ind);
+    int b_stop_idx  = get_stop_idx(row, m, nnz2, b_ind);
 
     /**
-         * Union of columns within each row of A and B so that we can scan through
-         * them, adding their values together.
-         */
+     * Union of columns within each row of A and B so that we can scan through
+     * them, adding their values together.
+     */
     int max_size = (a_stop_idx - a_start_idx) + (b_stop_idx - b_start_idx);
 
-    int *arr = new int[max_size];
+    int* arr        = new int[max_size];
     int cur_arr_idx = 0;
     for (int j = a_start_idx; j < a_stop_idx; j++) {
       arr[cur_arr_idx] = a_indptr[j];
       cur_arr_idx++;
     }
 
-    int arr_size = cur_arr_idx;
+    int arr_size   = cur_arr_idx;
     int final_size = arr_size;
 
     for (int j = b_start_idx; j < b_stop_idx; j++) {
       int cur_col = b_indptr[j];
-      bool found = false;
+      bool found  = false;
       for (int k = 0; k < arr_size; k++) {
         if (arr[k] == cur_col) {
           found = true;
@@ -81,9 +88,7 @@ __global__ void csr_add_calc_row_counts_kernel(
         }
       }
 
-      if (!found) {
-        final_size++;
-      }
+      if (!found) { final_size++; }
     }
 
     out_rowcounts[row] = final_size;
@@ -94,11 +99,19 @@ __global__ void csr_add_calc_row_counts_kernel(
 }
 
 template <typename T, int TPB_X = 128>
-__global__ void csr_add_kernel(const int *a_ind, const int *a_indptr,
-                               const T *a_val, int nnz1, const int *b_ind,
-                               const int *b_indptr, const T *b_val, int nnz2,
-                               int m, int *out_ind, int *out_indptr,
-                               T *out_val) {
+__global__ void csr_add_kernel(const int* a_ind,
+                               const int* a_indptr,
+                               const T* a_val,
+                               int nnz1,
+                               const int* b_ind,
+                               const int* b_indptr,
+                               const T* b_val,
+                               int nnz2,
+                               int m,
+                               int* out_ind,
+                               int* out_indptr,
+                               T* out_val)
+{
   // 1 thread per row
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
 
@@ -109,21 +122,21 @@ __global__ void csr_add_kernel(const int *a_ind, const int *a_indptr,
     int a_stop_idx = get_stop_idx(row, m, nnz1, a_ind);
 
     int b_start_idx = b_ind[row];
-    int b_stop_idx = get_stop_idx(row, m, nnz2, b_ind);
+    int b_stop_idx  = get_stop_idx(row, m, nnz2, b_ind);
 
     int o_idx = out_ind[row];
 
     int cur_o_idx = o_idx;
     for (int j = a_start_idx; j < a_stop_idx; j++) {
       out_indptr[cur_o_idx] = a_indptr[j];
-      out_val[cur_o_idx] = a_val[j];
+      out_val[cur_o_idx]    = a_val[j];
       cur_o_idx++;
     }
 
     int arr_size = cur_o_idx - o_idx;
     for (int j = b_start_idx; j < b_stop_idx; j++) {
       int cur_col = b_indptr[j];
-      bool found = false;
+      bool found  = false;
       for (int k = o_idx; k < o_idx + arr_size; k++) {
         // If we found a match, sum the two values
         if (out_indptr[k] == cur_col) {
@@ -136,7 +149,7 @@ __global__ void csr_add_kernel(const int *a_ind, const int *a_indptr,
       // if we didn't find a match, add the value for b
       if (!found) {
         out_indptr[o_idx + arr_size] = cur_col;
-        out_val[o_idx + arr_size] = b_val[j];
+        out_val[o_idx + arr_size]    = b_val[j];
         arr_size++;
       }
     }
@@ -159,31 +172,35 @@ __global__ void csr_add_kernel(const int *a_ind, const int *a_indptr,
  * @param stream: cuda stream to use
  */
 template <typename T, int TPB_X = 128>
-size_t csr_add_calc_inds(const int *a_ind, const int *a_indptr, const T *a_val,
-                         int nnz1, const int *b_ind, const int *b_indptr,
-                         const T *b_val, int nnz2, int m, int *out_ind,
-                         cudaStream_t stream) {
+size_t csr_add_calc_inds(const int* a_ind,
+                         const int* a_indptr,
+                         const T* a_val,
+                         int nnz1,
+                         const int* b_ind,
+                         const int* b_indptr,
+                         const T* b_val,
+                         int nnz2,
+                         int m,
+                         int* out_ind,
+                         cudaStream_t stream)
+{
   dim3 grid(raft::ceildiv(m, TPB_X), 1, 1);
   dim3 blk(TPB_X, 1, 1);
 
   rmm::device_uvector<int> row_counts(m + 1, stream);
-  CUDA_CHECK(
-    cudaMemsetAsync(row_counts.data(), 0, (m + 1) * sizeof(int), stream));
+  CUDA_CHECK(cudaMemsetAsync(row_counts.data(), 0, (m + 1) * sizeof(int), stream));
 
-  csr_add_calc_row_counts_kernel<T, TPB_X>
-    <<<grid, blk, 0, stream>>>(a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr,
-                               b_val, nnz2, m, row_counts.data());
+  csr_add_calc_row_counts_kernel<T, TPB_X><<<grid, blk, 0, stream>>>(
+    a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, b_val, nnz2, m, row_counts.data());
 
   int cnnz = 0;
   raft::update_host(&cnnz, row_counts.data() + m, 1, stream);
   CUDA_CHECK(cudaStreamSynchronize(stream));
 
   // create csr compressed row index from row counts
-  thrust::device_ptr<int> row_counts_d =
-    thrust::device_pointer_cast(row_counts.data());
-  thrust::device_ptr<int> c_ind_d = thrust::device_pointer_cast(out_ind);
-  exclusive_scan(rmm::exec_policy(stream), row_counts_d, row_counts_d + m,
-                 c_ind_d);
+  thrust::device_ptr<int> row_counts_d = thrust::device_pointer_cast(row_counts.data());
+  thrust::device_ptr<int> c_ind_d      = thrust::device_pointer_cast(out_ind);
+  exclusive_scan(rmm::exec_policy(stream), row_counts_d, row_counts_d + m, c_ind_d);
 
   return cnnz;
 }
@@ -206,16 +223,25 @@ size_t csr_add_calc_inds(const int *a_ind, const int *a_indptr, const T *a_val,
  * @param stream: cuda stream to use
  */
 template <typename T, int TPB_X = 128>
-void csr_add_finalize(const int *a_ind, const int *a_indptr, const T *a_val,
-                      int nnz1, const int *b_ind, const int *b_indptr,
-                      const T *b_val, int nnz2, int m, int *c_ind,
-                      int *c_indptr, T *c_val, cudaStream_t stream) {
+void csr_add_finalize(const int* a_ind,
+                      const int* a_indptr,
+                      const T* a_val,
+                      int nnz1,
+                      const int* b_ind,
+                      const int* b_indptr,
+                      const T* b_val,
+                      int nnz2,
+                      int m,
+                      int* c_ind,
+                      int* c_indptr,
+                      T* c_val,
+                      cudaStream_t stream)
+{
   dim3 grid(raft::ceildiv(m, TPB_X), 1, 1);
   dim3 blk(TPB_X, 1, 1);
 
-  csr_add_kernel<T, TPB_X>
-    <<<grid, blk, 0, stream>>>(a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr,
-                               b_val, nnz2, m, c_ind, c_indptr, c_val);
+  csr_add_kernel<T, TPB_X><<<grid, blk, 0, stream>>>(
+    a_ind, a_indptr, a_val, nnz1, b_ind, b_indptr, b_val, nnz2, m, c_ind, c_indptr, c_val);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
diff --git a/cpp/include/raft/sparse/linalg/degree.cuh b/cpp/include/raft/sparse/linalg/degree.cuh
index ef6a067c39..052f674325 100644
--- a/cpp/include/raft/sparse/linalg/degree.cuh
+++ b/cpp/include/raft/sparse/linalg/degree.cuh
@@ -44,11 +44,10 @@ namespace linalg {
  * @param results array to place results
  */
 template <int TPB_X = 64, typename T = int>
-__global__ void coo_degree_kernel(const T *rows, int nnz, T *results) {
+__global__ void coo_degree_kernel(const T* rows, int nnz, T* results)
+{
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
-  if (row < nnz) {
-    atomicAdd(results + rows[row], (T)1);
-  }
+  if (row < nnz) { atomicAdd(results + rows[row], (T)1); }
 }
 
 /**
@@ -60,7 +59,8 @@ __global__ void coo_degree_kernel(const T *rows, int nnz, T *results) {
  * @param stream: cuda stream to use
  */
 template <int TPB_X = 64, typename T = int>
-void coo_degree(const T *rows, int nnz, T *results, cudaStream_t stream) {
+void coo_degree(const T* rows, int nnz, T* results, cudaStream_t stream)
+{
   dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1);
   dim3 blk_rc(TPB_X, 1, 1);
 
@@ -77,31 +77,28 @@ void coo_degree(const T *rows, int nnz, T *results, cudaStream_t stream) {
  * @param stream: cuda stream to use
  */
 template <int TPB_X = 64, typename T>
-void coo_degree(COO<T> *in, int *results, cudaStream_t stream) {
+void coo_degree(COO<T>* in, int* results, cudaStream_t stream)
+{
   dim3 grid_rc(raft::ceildiv(in->nnz, TPB_X), 1, 1);
   dim3 blk_rc(TPB_X, 1, 1);
 
-  coo_degree_kernel<TPB_X>
-    <<<grid_rc, blk_rc, 0, stream>>>(in->rows(), in->nnz, results);
+  coo_degree_kernel<TPB_X><<<grid_rc, blk_rc, 0, stream>>>(in->rows(), in->nnz, results);
   CUDA_CHECK(cudaGetLastError());
 }
 
 template <int TPB_X = 64, typename T>
-__global__ void coo_degree_nz_kernel(const int *rows, const T *vals, int nnz,
-                                     int *results) {
+__global__ void coo_degree_nz_kernel(const int* rows, const T* vals, int nnz, int* results)
+{
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
-  if (row < nnz && vals[row] != 0.0) {
-    raft::myAtomicAdd(results + rows[row], 1);
-  }
+  if (row < nnz && vals[row] != 0.0) { raft::myAtomicAdd(results + rows[row], 1); }
 }
 
 template <int TPB_X = 64, typename T>
-__global__ void coo_degree_scalar_kernel(const int *rows, const T *vals,
-                                         int nnz, T scalar, int *results) {
+__global__ void coo_degree_scalar_kernel(
+  const int* rows, const T* vals, int nnz, T scalar, int* results)
+{
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
-  if (row < nnz && vals[row] != scalar) {
-    raft::myAtomicAdd(results + rows[row], 1);
-  }
+  if (row < nnz && vals[row] != scalar) { raft::myAtomicAdd(results + rows[row], 1); }
 }
 
 /**
@@ -114,12 +111,12 @@ __global__ void coo_degree_scalar_kernel(const int *rows, const T *vals,
  * @param stream: cuda stream to use
  */
 template <int TPB_X = 64, typename T>
-void coo_degree_scalar(COO<T> *in, T scalar, int *results,
-                       cudaStream_t stream) {
+void coo_degree_scalar(COO<T>* in, T scalar, int* results, cudaStream_t stream)
+{
   dim3 grid_rc(raft::ceildiv(in->nnz, TPB_X), 1, 1);
   dim3 blk_rc(TPB_X, 1, 1);
-  coo_degree_scalar_kernel<TPB_X, T><<<grid_rc, blk_rc, 0, stream>>>(
-    in->rows(), in->vals(), in->nnz, scalar, results);
+  coo_degree_scalar_kernel<TPB_X, T>
+    <<<grid_rc, blk_rc, 0, stream>>>(in->rows(), in->vals(), in->nnz, scalar, results);
   CUDA_CHECK(cudaGetLastError());
 }
 
@@ -135,8 +132,9 @@ void coo_degree_scalar(COO<T> *in, T scalar, int *results,
  * @param stream: cuda stream to use
  */
 template <int TPB_X = 64, typename T>
-void coo_degree_scalar(const int *rows, const T *vals, int nnz, T scalar,
-                       int *results, cudaStream_t stream = 0) {
+void coo_degree_scalar(
+  const int* rows, const T* vals, int nnz, T scalar, int* results, cudaStream_t stream = 0)
+{
   dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1);
   dim3 blk_rc(TPB_X, 1, 1);
   coo_degree_scalar_kernel<TPB_X, T>
@@ -154,12 +152,11 @@ void coo_degree_scalar(const int *rows, const T *vals, int nnz, T scalar,
  * @param stream: cuda stream to use
  */
 template <int TPB_X = 64, typename T>
-void coo_degree_nz(const int *rows, const T *vals, int nnz, int *results,
-                   cudaStream_t stream) {
+void coo_degree_nz(const int* rows, const T* vals, int nnz, int* results, cudaStream_t stream)
+{
   dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1);
   dim3 blk_rc(TPB_X, 1, 1);
-  coo_degree_nz_kernel<TPB_X, T>
-    <<<grid_rc, blk_rc, 0, stream>>>(rows, vals, nnz, results);
+  coo_degree_nz_kernel<TPB_X, T><<<grid_rc, blk_rc, 0, stream>>>(rows, vals, nnz, results);
 }
 
 /**
@@ -171,7 +168,8 @@ void coo_degree_nz(const int *rows, const T *vals, int nnz, int *results,
  * @param stream: cuda stream to use
  */
 template <int TPB_X = 64, typename T>
-void coo_degree_nz(COO<T> *in, int *results, cudaStream_t stream) {
+void coo_degree_nz(COO<T>* in, int* results, cudaStream_t stream)
+{
   dim3 grid_rc(raft::ceildiv(in->nnz, TPB_X), 1, 1);
   dim3 blk_rc(TPB_X, 1, 1);
 
diff --git a/cpp/include/raft/sparse/linalg/norm.cuh b/cpp/include/raft/sparse/linalg/norm.cuh
index bfcd3fd592..59dc5ff3e4 100644
--- a/cpp/include/raft/sparse/linalg/norm.cuh
+++ b/cpp/include/raft/sparse/linalg/norm.cuh
@@ -41,10 +41,12 @@ __global__ void csr_row_normalize_l1_kernel(
   // @TODO: This can be done much more parallel by
   // having threads in a warp compute the sum in parallel
   // over each row and then divide the values in parallel.
-  const int *ia,           // csr row ex_scan (sorted by row)
-  const T *vals, int nnz,  // array of values and number of non-zeros
-  int m,                   // num rows in csr
-  T *result) {             // output array
+  const int* ia,  // csr row ex_scan (sorted by row)
+  const T* vals,
+  int nnz,  // array of values and number of non-zeros
+  int m,    // num rows in csr
+  T* result)
+{  // output array
 
   // row-based matrix 1 thread per row
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
@@ -52,7 +54,7 @@ __global__ void csr_row_normalize_l1_kernel(
   // sum all vals_arr for row and divide each val by sum
   if (row < m) {
     int start_idx = ia[row];
-    int stop_idx = 0;
+    int stop_idx  = 0;
     if (row < m - 1) {
       stop_idx = ia[row + 1];
     } else
@@ -65,7 +67,7 @@ __global__ void csr_row_normalize_l1_kernel(
 
     for (int j = start_idx; j < stop_idx; j++) {
       if (sum != 0.0) {
-        T val = vals[j];
+        T val     = vals[j];
         result[j] = val / sum;
       } else {
         result[j] = 0.0;
@@ -85,18 +87,18 @@ __global__ void csr_row_normalize_l1_kernel(
  * @param stream: cuda stream to use
  */
 template <int TPB_X = 64, typename T>
-void csr_row_normalize_l1(const int *ia,  // csr row ex_scan (sorted by row)
-                          const T *vals,
+void csr_row_normalize_l1(const int* ia,  // csr row ex_scan (sorted by row)
+                          const T* vals,
                           int nnz,  // array of values and number of non-zeros
                           int m,    // num rows in csr
-                          T *result,
-                          cudaStream_t stream) {  // output array
+                          T* result,
+                          cudaStream_t stream)
+{  // output array
 
   dim3 grid(raft::ceildiv(m, TPB_X), 1, 1);
   dim3 blk(TPB_X, 1, 1);
 
-  csr_row_normalize_l1_kernel<TPB_X, T>
-    <<<grid, blk, 0, stream>>>(ia, vals, nnz, m, result);
+  csr_row_normalize_l1_kernel<TPB_X, T><<<grid, blk, 0, stream>>>(ia, vals, nnz, m, result);
   CUDA_CHECK(cudaGetLastError());
 }
 
@@ -105,10 +107,12 @@ __global__ void csr_row_normalize_max_kernel(
   // @TODO: This can be done much more parallel by
   // having threads in a warp compute the sum in parallel
   // over each row and then divide the values in parallel.
-  const int *ia,           // csr row ind array (sorted by row)
-  const T *vals, int nnz,  // array of values and number of non-zeros
-  int m,                   // num total rows in csr
-  T *result) {             // output array
+  const int* ia,  // csr row ind array (sorted by row)
+  const T* vals,
+  int nnz,  // array of values and number of non-zeros
+  int m,    // num total rows in csr
+  T* result)
+{  // output array
 
   // row-based matrix 1 thread per row
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
@@ -116,7 +120,7 @@ __global__ void csr_row_normalize_max_kernel(
   // find max across columns and divide
   if (row < m) {
     int start_idx = ia[row];
-    int stop_idx = 0;
+    int stop_idx  = 0;
     if (row < m - 1) {
       stop_idx = ia[row + 1];
     } else
@@ -130,7 +134,7 @@ __global__ void csr_row_normalize_max_kernel(
     // divide nonzeros in current row by max
     for (int j = start_idx; j < stop_idx; j++) {
       if (max != 0.0 && max > std::numeric_limits<float>::min()) {
-        T val = vals[j];
+        T val     = vals[j];
         result[j] = val / max;
       } else {
         result[j] = 0.0;
@@ -151,16 +155,17 @@ __global__ void csr_row_normalize_max_kernel(
  */
 
 template <int TPB_X = 64, typename T>
-void csr_row_normalize_max(const int *ia,  // csr row ind array (sorted by row)
-                           const T *vals,
+void csr_row_normalize_max(const int* ia,  // csr row ind array (sorted by row)
+                           const T* vals,
                            int nnz,  // array of values and number of non-zeros
                            int m,    // num total rows in csr
-                           T *result, cudaStream_t stream) {
+                           T* result,
+                           cudaStream_t stream)
+{
   dim3 grid(raft::ceildiv(m, TPB_X), 1, 1);
   dim3 blk(TPB_X, 1, 1);
 
-  csr_row_normalize_max_kernel<TPB_X, T>
-    <<<grid, blk, 0, stream>>>(ia, vals, nnz, m, result);
+  csr_row_normalize_max_kernel<TPB_X, T><<<grid, blk, 0, stream>>>(ia, vals, nnz, m, result);
   CUDA_CHECK(cudaGetLastError());
 }
 
diff --git a/cpp/include/raft/sparse/linalg/spectral.cuh b/cpp/include/raft/sparse/linalg/spectral.cuh
index ce0c4bbe6f..a293e359c2 100644
--- a/cpp/include/raft/sparse/linalg/spectral.cuh
+++ b/cpp/include/raft/sparse/linalg/spectral.cuh
@@ -30,15 +30,22 @@ namespace sparse {
 namespace spectral {
 
 template <typename T>
-void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals,
-                   int nnz, int n, int n_components, T *out,
-                   unsigned long long seed = 1234567) {
+void fit_embedding(const raft::handle_t& handle,
+                   int* rows,
+                   int* cols,
+                   T* vals,
+                   int nnz,
+                   int n,
+                   int n_components,
+                   T* out,
+                   unsigned long long seed = 1234567)
+{
   auto stream = handle.get_stream();
   rmm::device_uvector<int> src_offsets(n + 1, stream);
   rmm::device_uvector<int> dst_cols(nnz, stream);
   rmm::device_uvector<T> dst_vals(nnz, stream);
-  convert::coo_to_csr(handle, rows, cols, vals, nnz, n, src_offsets.data(),
-                      dst_cols.data(), dst_vals.data());
+  convert::coo_to_csr(
+    handle, rows, cols, vals, nnz, n, src_offsets.data(), dst_cols.data(), dst_vals.data());
 
   rmm::device_uvector<T> eigVals(n_components + 1, stream);
   rmm::device_uvector<T> eigVecs(n * (n_components + 1), stream);
@@ -52,45 +59,49 @@ void fit_embedding(const raft::handle_t &handle, int *rows, int *cols, T *vals,
   using index_type = int;
   using value_type = T;
 
-  index_type *ro = src_offsets.data();
-  index_type *ci = dst_cols.data();
-  value_type *vs = dst_vals.data();
+  index_type* ro = src_offsets.data();
+  index_type* ci = dst_cols.data();
+  value_type* vs = dst_vals.data();
 
-  raft::matrix::sparse_matrix_t<index_type, value_type> const r_csr_m{
-    handle, ro, ci, vs, n, nnz};
+  raft::matrix::sparse_matrix_t<index_type, value_type> const r_csr_m{handle, ro, ci, vs, n, nnz};
 
-  index_type neigvs = n_components + 1;
-  index_type maxiter = 4000;  //default reset value (when set to 0);
-  value_type tol = 0.01;
-  index_type restart_iter = 15 + neigvs;  //what cugraph is using
+  index_type neigvs       = n_components + 1;
+  index_type maxiter      = 4000;  // default reset value (when set to 0);
+  value_type tol          = 0.01;
+  index_type restart_iter = 15 + neigvs;  // what cugraph is using
 
-  raft::eigen_solver_config_t<index_type, value_type> cfg{neigvs, maxiter,
-                                                          restart_iter, tol};
+  raft::eigen_solver_config_t<index_type, value_type> cfg{neigvs, maxiter, restart_iter, tol};
 
   cfg.seed = seed;
 
   raft::lanczos_solver_t<index_type, value_type> eig_solver{cfg};
 
-  //cluster computation here is irrelevant,
-  //hence define a no-op such solver to
-  //feed partition():
+  // cluster computation here is irrelevant,
+  // hence define such a no-op solver to
+  // feed partition():
   //
   struct no_op_cluster_solver_t {
     using index_type_t = index_type;
-    using size_type_t = index_type;
+    using size_type_t  = index_type;
     using value_type_t = value_type;
 
-    std::pair<value_type_t, index_type_t> solve(
-      handle_t const &handle, size_type_t n_obs_vecs, size_type_t dim,
-      value_type_t const *__restrict__ obs,
-      index_type_t *__restrict__ codes) const {
+    std::pair<value_type_t, index_type_t> solve(handle_t const& handle,
+                                                size_type_t n_obs_vecs,
+                                                size_type_t dim,
+                                                value_type_t const* __restrict__ obs,
+                                                index_type_t* __restrict__ codes) const
+    {
       return std::make_pair<value_type_t, index_type_t>(0, 0);
     }
   };
 
-  raft::spectral::partition(handle, r_csr_m, eig_solver,
-                            no_op_cluster_solver_t{}, labels.data(),
-                            eigVals.data(), eigVecs.data());
+  raft::spectral::partition(handle,
+                            r_csr_m,
+                            eig_solver,
+                            no_op_cluster_solver_t{},
+                            labels.data(),
+                            eigVals.data(),
+                            eigVecs.data());
 
   raft::copy<T>(out, eigVecs.data() + n, n * n_components, stream);
 
diff --git a/cpp/include/raft/sparse/linalg/symmetrize.cuh b/cpp/include/raft/sparse/linalg/symmetrize.cuh
index a6e1027288..ae89e7993c 100644
--- a/cpp/include/raft/sparse/linalg/symmetrize.cuh
+++ b/cpp/include/raft/sparse/linalg/symmetrize.cuh
@@ -47,26 +47,34 @@ namespace linalg {
 // TODO: value_idx param needs to be used for this once FAISS is updated to use float32
 // for indices so that the index types can be uniform
 template <int TPB_X = 128, typename T, typename Lambda>
-__global__ void coo_symmetrize_kernel(int *row_ind, int *rows, int *cols,
-                                      T *vals, int *orows, int *ocols, T *ovals,
-                                      int n, int cnnz, Lambda reduction_op) {
+__global__ void coo_symmetrize_kernel(int* row_ind,
+                                      int* rows,
+                                      int* cols,
+                                      T* vals,
+                                      int* orows,
+                                      int* ocols,
+                                      T* ovals,
+                                      int n,
+                                      int cnnz,
+                                      Lambda reduction_op)
+{
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
 
   if (row < n) {
     int start_idx = row_ind[row];  // each thread processes one row
-    int stop_idx = get_stop_idx(row, n, cnnz, row_ind);
+    int stop_idx  = get_stop_idx(row, n, cnnz, row_ind);
 
-    int row_nnz = 0;
+    int row_nnz       = 0;
     int out_start_idx = start_idx * 2;
 
     for (int idx = 0; idx < stop_idx - start_idx; idx++) {
       int cur_row = rows[idx + start_idx];
       int cur_col = cols[idx + start_idx];
-      T cur_val = vals[idx + start_idx];
+      T cur_val   = vals[idx + start_idx];
 
       int lookup_row = cur_col;
-      int t_start = row_ind[lookup_row];  // Start at
-      int t_stop = get_stop_idx(lookup_row, n, cnnz, row_ind);
+      int t_start    = row_ind[lookup_row];  // Start at
+      int t_stop     = get_stop_idx(lookup_row, n, cnnz, row_ind);
 
       T transpose = 0.0;
 
@@ -77,7 +85,7 @@ __global__ void coo_symmetrize_kernel(int *row_ind, int *rows, int *cols,
         // done in a different thread.
         if (cols[t_idx] == cur_row && rows[t_idx] == cur_col) {
           // If it exists already, set transposed value to existing value
-          transpose = vals[t_idx];
+          transpose   = vals[t_idx];
           found_match = true;
           break;
         }
@@ -123,9 +131,11 @@ __global__ void coo_symmetrize_kernel(int *row_ind, int *rows, int *cols,
  * @param stream: cuda stream to use
  */
 template <int TPB_X = 128, typename T, typename Lambda>
-void coo_symmetrize(COO<T> *in, COO<T> *out,
+void coo_symmetrize(COO<T>* in,
+                    COO<T>* out,
                     Lambda reduction_op,  // two-argument reducer
-                    cudaStream_t stream) {
+                    cudaStream_t stream)
+{
   dim3 grid(raft::ceildiv(in->n_rows, TPB_X), 1, 1);
   dim3 blk(TPB_X, 1, 1);
 
@@ -137,9 +147,16 @@ void coo_symmetrize(COO<T> *in, COO<T> *out,
 
   out->allocate(in->nnz * 2, in->n_rows, in->n_cols, true, stream);
 
-  coo_symmetrize_kernel<TPB_X, T><<<grid, blk, 0, stream>>>(
-    in_row_ind.data(), in->rows(), in->cols(), in->vals(), out->rows(),
-    out->cols(), out->vals(), in->n_rows, in->nnz, reduction_op);
+  coo_symmetrize_kernel<TPB_X, T><<<grid, blk, 0, stream>>>(in_row_ind.data(),
+                                                            in->rows(),
+                                                            in->cols(),
+                                                            in->vals(),
+                                                            out->rows(),
+                                                            out->cols(),
+                                                            out->vals(),
+                                                            in->n_rows,
+                                                            in->nnz,
+                                                            reduction_op);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -155,14 +172,15 @@ void coo_symmetrize(COO<T> *in, COO<T> *out,
  * @param row_sizes2: Input empty row sum 2 array(n) for faster reduction
  */
 template <typename value_idx = int64_t, typename value_t = float>
-__global__ static void symmetric_find_size(const value_t *restrict data,
-                                           const value_idx *restrict indices,
-                                           const value_idx n, const int k,
-                                           value_idx *restrict row_sizes,
-                                           value_idx *restrict row_sizes2) {
+__global__ static void symmetric_find_size(const value_t* restrict data,
+                                           const value_idx* restrict indices,
+                                           const value_idx n,
+                                           const int k,
+                                           value_idx* restrict row_sizes,
+                                           value_idx* restrict row_sizes2)
+{
   const auto row = blockIdx.x * blockDim.x + threadIdx.x;  // for every row
-  const auto j =
-    blockIdx.y * blockDim.y + threadIdx.y;  // for every item in row
+  const auto j   = blockIdx.y * blockDim.y + threadIdx.y;  // for every item in row
   if (row >= n || j >= k) return;
 
   const auto col = indices[row * k + j];
@@ -182,9 +200,11 @@ __global__ static void symmetric_find_size(const value_t *restrict data,
  * @param row_sizes2: Input row sum 2 array(n) for faster reduction
  */
 template <typename value_idx>
-__global__ static void reduce_find_size(const value_idx n, const int k,
-                                        value_idx *restrict row_sizes,
-                                        const value_idx *restrict row_sizes2) {
+__global__ static void reduce_find_size(const value_idx n,
+                                        const int k,
+                                        value_idx* restrict row_sizes,
+                                        const value_idx* restrict row_sizes2)
+{
   const auto i = (blockIdx.x * blockDim.x) + threadIdx.x;
   if (i >= n) return;
   row_sizes[i] += (row_sizes2[i] + k);
@@ -205,20 +225,21 @@ __global__ static void reduce_find_size(const value_idx n, const int k,
  * @param k: Number of n_neighbors
  */
 template <typename value_idx = int64_t, typename value_t = float>
-__global__ static void symmetric_sum(value_idx *restrict edges,
-                                     const value_t *restrict data,
-                                     const value_idx *restrict indices,
-                                     value_t *restrict VAL,
-                                     value_idx *restrict COL,
-                                     value_idx *restrict ROW, const value_idx n,
-                                     const int k) {
+__global__ static void symmetric_sum(value_idx* restrict edges,
+                                     const value_t* restrict data,
+                                     const value_idx* restrict indices,
+                                     value_t* restrict VAL,
+                                     value_idx* restrict COL,
+                                     value_idx* restrict ROW,
+                                     const value_idx n,
+                                     const int k)
+{
   const auto row = blockIdx.x * blockDim.x + threadIdx.x;  // for every row
-  const auto j =
-    blockIdx.y * blockDim.y + threadIdx.y;  // for every item in row
+  const auto j   = blockIdx.y * blockDim.y + threadIdx.y;  // for every item in row
   if (row >= n || j >= k) return;
 
-  const auto col = indices[row * k + j];
-  const auto original = atomicAdd(&edges[row], value_idx(1));
+  const auto col       = indices[row * k + j];
+  const auto original  = atomicAdd(&edges[row], value_idx(1));
   const auto transpose = atomicAdd(&edges[col], value_idx(1));
 
   VAL[transpose] = VAL[original] = data[row * k + j];
@@ -247,27 +268,25 @@ __global__ static void symmetric_sum(value_idx *restrict edges,
  * @param out: Output COO Matrix class
  * @param stream: Input cuda stream
  */
-template <typename value_idx = int64_t, typename value_t = float,
-          int TPB_X = 32, int TPB_Y = 32>
-void from_knn_symmetrize_matrix(const value_idx *restrict knn_indices,
-                                const value_t *restrict knn_dists,
-                                const value_idx n, const int k,
-                                COO<value_t, value_idx> *out,
-                                cudaStream_t stream) {
+template <typename value_idx = int64_t, typename value_t = float, int TPB_X = 32, int TPB_Y = 32>
+void from_knn_symmetrize_matrix(const value_idx* restrict knn_indices,
+                                const value_t* restrict knn_dists,
+                                const value_idx n,
+                                const int k,
+                                COO<value_t, value_idx>* out,
+                                cudaStream_t stream)
+{
   // (1) Find how much space needed in each row
   // We look through all datapoints and increment the count for each row.
   const dim3 threadsPerBlock(TPB_X, TPB_Y);
-  const dim3 numBlocks(raft::ceildiv(n, (value_idx)TPB_X),
-                       raft::ceildiv(k, TPB_Y));
+  const dim3 numBlocks(raft::ceildiv(n, (value_idx)TPB_X), raft::ceildiv(k, TPB_Y));
 
   // Notice n+1 since we can reuse these arrays for transpose_edges, original_edges in step (4)
   rmm::device_uvector<value_idx> row_sizes(n, stream);
-  CUDA_CHECK(
-    cudaMemsetAsync(row_sizes.data(), 0, sizeof(value_idx) * n, stream));
+  CUDA_CHECK(cudaMemsetAsync(row_sizes.data(), 0, sizeof(value_idx) * n, stream));
 
   rmm::device_uvector<value_idx> row_sizes2(n, stream);
-  CUDA_CHECK(
-    cudaMemsetAsync(row_sizes2.data(), 0, sizeof(value_idx) * n, stream));
+  CUDA_CHECK(cudaMemsetAsync(row_sizes2.data(), 0, sizeof(value_idx) * n, stream));
 
   symmetric_find_size<<<numBlocks, threadsPerBlock, 0, stream>>>(
     knn_dists, knn_indices, n, k, row_sizes.data(), row_sizes2.data());
@@ -288,14 +307,12 @@ void from_knn_symmetrize_matrix(const value_idx *restrict knn_indices,
   // This mirrors CSR matrix's row Pointer, were maximum bounds for each row
   // are calculated as the cumulative rolling sum of the previous rows.
   // Notice reusing old row_sizes2 memory
-  value_idx *edges = row_sizes2.data();
-  thrust::device_ptr<value_idx> __edges = thrust::device_pointer_cast(edges);
-  thrust::device_ptr<value_idx> __row_sizes =
-    thrust::device_pointer_cast(row_sizes.data());
+  value_idx* edges                          = row_sizes2.data();
+  thrust::device_ptr<value_idx> __edges     = thrust::device_pointer_cast(edges);
+  thrust::device_ptr<value_idx> __row_sizes = thrust::device_pointer_cast(row_sizes.data());
 
   // Rolling cumulative sum
-  thrust::exclusive_scan(rmm::exec_policy(stream), __row_sizes, __row_sizes + n,
-                         __edges);
+  thrust::exclusive_scan(rmm::exec_policy(stream), __row_sizes, __row_sizes + n, __edges);
 
   // (5) Perform final data + data.T operation in tandem with memcpying
   symmetric_sum<<<numBlocks, threadsPerBlock, 0, stream>>>(
@@ -307,9 +324,15 @@ void from_knn_symmetrize_matrix(const value_idx *restrict knn_indices,
  * Symmetrizes a COO matrix
  */
 template <typename value_idx, typename value_t>
-void symmetrize(const raft::handle_t &handle, const value_idx *rows,
-                const value_idx *cols, const value_t *vals, size_t m, size_t n,
-                size_t nnz, raft::sparse::COO<value_t, value_idx> &out) {
+void symmetrize(const raft::handle_t& handle,
+                const value_idx* rows,
+                const value_idx* cols,
+                const value_t* vals,
+                size_t m,
+                size_t n,
+                size_t nnz,
+                raft::sparse::COO<value_t, value_idx>& out)
+{
   auto stream = handle.get_stream();
 
   // copy rows to cols and cols to rows
@@ -326,13 +349,16 @@ void symmetrize(const raft::handle_t &handle, const value_idx *rows,
   raft::copy_async(symm_vals.data() + nnz, vals, nnz, stream);
 
   // sort COO
-  raft::sparse::op::coo_sort((value_idx)m, (value_idx)n, (value_idx)nnz * 2,
-                             symm_rows.data(), symm_cols.data(),
-                             symm_vals.data(), stream);
-
-  raft::sparse::op::max_duplicates(handle, out, symm_rows.data(),
-                                   symm_cols.data(), symm_vals.data(), nnz * 2,
-                                   m, n);
+  raft::sparse::op::coo_sort((value_idx)m,
+                             (value_idx)n,
+                             (value_idx)nnz * 2,
+                             symm_rows.data(),
+                             symm_cols.data(),
+                             symm_vals.data(),
+                             stream);
+
+  raft::sparse::op::max_duplicates(
+    handle, out, symm_rows.data(), symm_cols.data(), symm_vals.data(), nnz * 2, m, n);
 }
 
 };  // end NAMESPACE linalg
diff --git a/cpp/include/raft/sparse/linalg/transpose.h b/cpp/include/raft/sparse/linalg/transpose.h
index 7ad4b93ec0..e3a9b1fbd9 100644
--- a/cpp/include/raft/sparse/linalg/transpose.h
+++ b/cpp/include/raft/sparse/linalg/transpose.h
@@ -55,27 +55,53 @@ namespace linalg {
  * @param[in] stream : Cuda stream for ordering events
  */
 template <typename value_idx, typename value_t>
-void csr_transpose(cusparseHandle_t handle, const value_idx *csr_indptr,
-                   const value_idx *csr_indices, const value_t *csr_data,
-                   value_idx *csc_indptr, value_idx *csc_indices,
-                   value_t *csc_data, value_idx csr_nrows, value_idx csr_ncols,
-                   value_idx nnz, cudaStream_t stream) {
+void csr_transpose(cusparseHandle_t handle,
+                   const value_idx* csr_indptr,
+                   const value_idx* csr_indices,
+                   const value_t* csr_data,
+                   value_idx* csc_indptr,
+                   value_idx* csc_indices,
+                   value_t* csc_data,
+                   value_idx csr_nrows,
+                   value_idx csr_ncols,
+                   value_idx nnz,
+                   cudaStream_t stream)
+{
   size_t convert_csc_workspace_size = 0;
 
-  CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc_bufferSize(
-    handle, csr_nrows, csr_ncols, nnz, csr_data, csr_indptr, csr_indices,
-    csc_data, csc_indptr, csc_indices, CUSPARSE_ACTION_NUMERIC,
-    CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1,
-    &convert_csc_workspace_size, stream));
+  CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc_bufferSize(handle,
+                                                          csr_nrows,
+                                                          csr_ncols,
+                                                          nnz,
+                                                          csr_data,
+                                                          csr_indptr,
+                                                          csr_indices,
+                                                          csc_data,
+                                                          csc_indptr,
+                                                          csc_indices,
+                                                          CUSPARSE_ACTION_NUMERIC,
+                                                          CUSPARSE_INDEX_BASE_ZERO,
+                                                          CUSPARSE_CSR2CSC_ALG1,
+                                                          &convert_csc_workspace_size,
+                                                          stream));
 
-  rmm::device_uvector<char> convert_csc_workspace(convert_csc_workspace_size,
-                                                  stream);
+  rmm::device_uvector<char> convert_csc_workspace(convert_csc_workspace_size, stream);
 
-  CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc(
-    handle, csr_nrows, csr_ncols, nnz, csr_data, csr_indptr, csr_indices,
-    csc_data, csc_indptr, csc_indices, CUSPARSE_ACTION_NUMERIC,
-    CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1,
-    convert_csc_workspace.data(), stream));
+  CUSPARSE_CHECK(raft::sparse::cusparsecsr2csc(handle,
+                                               csr_nrows,
+                                               csr_ncols,
+                                               nnz,
+                                               csr_data,
+                                               csr_indptr,
+                                               csr_indices,
+                                               csc_data,
+                                               csc_indptr,
+                                               csc_indices,
+                                               CUSPARSE_ACTION_NUMERIC,
+                                               CUSPARSE_INDEX_BASE_ZERO,
+                                               CUSPARSE_CSR2CSC_ALG1,
+                                               convert_csc_workspace.data(),
+                                               stream));
 }
 
 };  // end NAMESPACE linalg
diff --git a/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh b/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh
index f0d30b0cb7..36d426029b 100644
--- a/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh
+++ b/cpp/include/raft/sparse/mst/detail/mst_kernels.cuh
@@ -28,10 +28,16 @@ namespace mst {
 namespace detail {
 
 template <typename vertex_t, typename edge_t, typename alteration_t>
-__global__ void kernel_min_edge_per_vertex(
-  const edge_t* offsets, const vertex_t* indices, const alteration_t* weights,
-  const vertex_t* color, const vertex_t* color_index, edge_t* new_mst_edge,
-  const bool* mst_edge, alteration_t* min_edge_color, const vertex_t v) {
+__global__ void kernel_min_edge_per_vertex(const edge_t* offsets,
+                                           const vertex_t* indices,
+                                           const alteration_t* weights,
+                                           const vertex_t* color,
+                                           const vertex_t* color_index,
+                                           edge_t* new_mst_edge,
+                                           const bool* mst_edge,
+                                           alteration_t* min_edge_color,
+                                           const vertex_t v)
+{
   edge_t tid = threadIdx.x + blockIdx.x * blockDim.x;
 
   unsigned warp_id = tid / 32;
@@ -41,14 +47,14 @@ __global__ void kernel_min_edge_per_vertex(
   __shared__ alteration_t min_edge_weight[32];
   __shared__ vertex_t min_color[32];
 
-  min_edge_index[lane_id] = std::numeric_limits<edge_t>::max();
+  min_edge_index[lane_id]  = std::numeric_limits<edge_t>::max();
   min_edge_weight[lane_id] = std::numeric_limits<alteration_t>::max();
-  min_color[lane_id] = std::numeric_limits<vertex_t>::max();
+  min_color[lane_id]       = std::numeric_limits<vertex_t>::max();
 
   __syncthreads();
 
   vertex_t self_color_idx = color_index[warp_id];
-  vertex_t self_color = color[self_color_idx];
+  vertex_t self_color     = color[self_color_idx];
 
   // find the minimum edge associated per row
   // each thread in warp holds the minimum edge for
@@ -56,20 +62,20 @@ __global__ void kernel_min_edge_per_vertex(
   if (warp_id < v) {
     // one row is associated with one warp
     edge_t row_start = offsets[warp_id];
-    edge_t row_end = offsets[warp_id + 1];
+    edge_t row_end   = offsets[warp_id + 1];
 
     // assuming one warp per row
     // find min for each thread in warp
     for (edge_t e = row_start + lane_id; e < row_end; e += 32) {
       alteration_t curr_edge_weight = weights[e];
-      vertex_t successor_color_idx = color_index[indices[e]];
-      vertex_t successor_color = color[successor_color_idx];
+      vertex_t successor_color_idx  = color_index[indices[e]];
+      vertex_t successor_color      = color[successor_color_idx];
 
       if (!mst_edge[e] && self_color != successor_color) {
         if (curr_edge_weight < min_edge_weight[lane_id]) {
-          min_color[lane_id] = successor_color;
+          min_color[lane_id]       = successor_color;
           min_edge_weight[lane_id] = curr_edge_weight;
-          min_edge_index[lane_id] = e;
+          min_edge_index[lane_id]  = e;
         }
       }
     }
@@ -82,9 +88,9 @@ __global__ void kernel_min_edge_per_vertex(
   for (int offset = 16; offset > 0; offset >>= 1) {
     if (lane_id < offset) {
       if (min_edge_weight[lane_id] > min_edge_weight[lane_id + offset]) {
-        min_color[lane_id] = min_color[lane_id + offset];
+        min_color[lane_id]       = min_color[lane_id + offset];
         min_edge_weight[lane_id] = min_edge_weight[lane_id + offset];
-        min_edge_index[lane_id] = min_edge_index[lane_id + offset];
+        min_edge_index[lane_id]  = min_edge_index[lane_id + offset];
       }
     }
     __syncthreads();
@@ -102,19 +108,26 @@ __global__ void kernel_min_edge_per_vertex(
   }
 }
 
-template <typename vertex_t, typename edge_t, typename weight_t,
-          typename alteration_t>
-__global__ void min_edge_per_supervertex(
-  const vertex_t* color, const vertex_t* color_index, edge_t* new_mst_edge,
-  bool* mst_edge, const vertex_t* indices, const weight_t* weights,
-  const alteration_t* altered_weights, vertex_t* temp_src, vertex_t* temp_dst,
-  weight_t* temp_weights, const alteration_t* min_edge_color, const vertex_t v,
-  bool symmetrize_output) {
+template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
+__global__ void min_edge_per_supervertex(const vertex_t* color,
+                                         const vertex_t* color_index,
+                                         edge_t* new_mst_edge,
+                                         bool* mst_edge,
+                                         const vertex_t* indices,
+                                         const weight_t* weights,
+                                         const alteration_t* altered_weights,
+                                         vertex_t* temp_src,
+                                         vertex_t* temp_dst,
+                                         weight_t* temp_weights,
+                                         const alteration_t* min_edge_color,
+                                         const vertex_t v,
+                                         bool symmetrize_output)
+{
   auto tid = get_1D_idx<vertex_t>();
   if (tid < v) {
     vertex_t vertex_color_idx = color_index[tid];
-    vertex_t vertex_color = color[vertex_color_idx];
-    edge_t edge_idx = new_mst_edge[tid];
+    vertex_t vertex_color     = color[vertex_color_idx];
+    edge_t edge_idx           = new_mst_edge[tid];
 
     // check if valid outgoing edge was found
     // find minimum edge is same as minimum edge of whole supervertex
@@ -129,32 +142,27 @@ __global__ void min_edge_per_supervertex(
         auto dst = indices[edge_idx];
         if (!symmetrize_output) {
           auto dst_edge_idx = new_mst_edge[dst];
-          auto dst_color = color[color_index[dst]];
+          auto dst_color    = color[color_index[dst]];
 
           // vertices added each other
           // only if destination has found an edge
           // the edge points back to source
           // the edge is minimum edge found for dst color
-          if (dst_edge_idx != std::numeric_limits<edge_t>::max() &&
-              indices[dst_edge_idx] == tid &&
+          if (dst_edge_idx != std::numeric_limits<edge_t>::max() && indices[dst_edge_idx] == tid &&
               min_edge_color[dst_color] == altered_weights[dst_edge_idx]) {
-            if (vertex_color > dst_color) {
-              add_edge = false;
-            }
+            if (vertex_color > dst_color) { add_edge = false; }
           }
         }
 
         if (add_edge) {
-          temp_src[tid] = tid;
-          temp_dst[tid] = dst;
-          temp_weights[tid] = weights[edge_idx];
+          temp_src[tid]      = tid;
+          temp_dst[tid]      = dst;
+          temp_weights[tid]  = weights[edge_idx];
           mst_edge[edge_idx] = true;
         }
       }
 
-      if (!add_edge) {
-        new_mst_edge[tid] = std::numeric_limits<edge_t>::max();
-      }
+      if (!add_edge) { new_mst_edge[tid] = std::numeric_limits<edge_t>::max(); }
     }
   }
 }
@@ -162,9 +170,13 @@ __global__ void min_edge_per_supervertex(
 template <typename vertex_t, typename edge_t, typename weight_t>
 __global__ void add_reverse_edge(const edge_t* new_mst_edge,
                                  const vertex_t* indices,
-                                 const weight_t* weights, vertex_t* temp_src,
-                                 vertex_t* temp_dst, weight_t* temp_weights,
-                                 const vertex_t v, bool symmetrize_output) {
+                                 const weight_t* weights,
+                                 vertex_t* temp_src,
+                                 vertex_t* temp_dst,
+                                 weight_t* temp_weights,
+                                 const vertex_t v,
+                                 bool symmetrize_output)
+{
   auto tid = get_1D_idx<vertex_t>();
 
   if (tid < v) {
@@ -186,9 +198,7 @@ __global__ void add_reverse_edge(const edge_t* new_mst_edge,
 
           // if vertices did not pick each other
           // add a reverse edge
-          if (tid != neighbor_vertex_neighbor) {
-            reverse_needed = true;
-          }
+          if (tid != neighbor_vertex_neighbor) { reverse_needed = true; }
         }
       }
 
@@ -197,8 +207,8 @@ __global__ void add_reverse_edge(const edge_t* new_mst_edge,
         // it is assumed the each vertex only picks one valid min edge
         // per cycle
         // hence, we store at index tid + v for the reverse edge scenario
-        temp_src[tid + v] = neighbor_vertex;
-        temp_dst[tid + v] = tid;
+        temp_src[tid + v]     = neighbor_vertex;
+        temp_dst[tid + v]     = tid;
         temp_weights[tid + v] = weights[edge_idx];
       }
     }
@@ -207,11 +217,13 @@ __global__ void add_reverse_edge(const edge_t* new_mst_edge,
 
 // executes for newly added mst edges and updates the colors of both vertices to the lower color
 template <typename vertex_t, typename edge_t>
-__global__ void min_pair_colors(const vertex_t v, const vertex_t* indices,
+__global__ void min_pair_colors(const vertex_t v,
+                                const vertex_t* indices,
                                 const edge_t* new_mst_edge,
                                 const vertex_t* color,
                                 const vertex_t* color_index,
-                                vertex_t* next_color) {
+                                vertex_t* next_color)
+{
   auto i = get_1D_idx<vertex_t>();
 
   if (i < v) {
@@ -220,9 +232,9 @@ __global__ void min_pair_colors(const vertex_t v, const vertex_t* indices,
     if (edge_idx != std::numeric_limits<edge_t>::max()) {
       vertex_t neighbor_vertex = indices[edge_idx];
       // vertex_t self_color = color[i];
-      vertex_t self_color_idx = color_index[i];
-      vertex_t self_color = color[self_color_idx];
-      vertex_t neighbor_color_idx = color_index[neighbor_vertex];
+      vertex_t self_color_idx       = color_index[i];
+      vertex_t self_color           = color[self_color_idx];
+      vertex_t neighbor_color_idx   = color_index[neighbor_vertex];
       vertex_t neighbor_super_color = color[neighbor_color_idx];
 
       // update my own color as source of edge
@@ -238,33 +250,36 @@ __global__ void min_pair_colors(const vertex_t v, const vertex_t* indices,
 
 // for each vertex, update color if it was changed in min_pair_colors kernel
 template <typename vertex_t>
-__global__ void update_colors(const vertex_t v, vertex_t* color,
+__global__ void update_colors(const vertex_t v,
+                              vertex_t* color,
                               const vertex_t* color_index,
-                              const vertex_t* next_color, bool* done) {
+                              const vertex_t* next_color,
+                              bool* done)
+{
   auto i = get_1D_idx<vertex_t>();
 
   if (i < v) {
-    vertex_t self_color = color[i];
+    vertex_t self_color     = color[i];
     vertex_t self_color_idx = color_index[i];
-    vertex_t new_color = next_color[self_color_idx];
+    vertex_t new_color      = next_color[self_color_idx];
 
     // update self color to new smaller color
     if (self_color > new_color) {
       color[i] = new_color;
-      *done = false;
+      *done    = false;
     }
   }
 }
 
 // point vertices to their final color index
 template <typename vertex_t>
-__global__ void final_color_indices(const vertex_t v, const vertex_t* color,
-                                    vertex_t* color_index) {
+__global__ void final_color_indices(const vertex_t v, const vertex_t* color, vertex_t* color_index)
+{
   auto i = get_1D_idx<vertex_t>();
 
   if (i < v) {
     vertex_t self_color_idx = color_index[i];
-    vertex_t self_color = color[self_color_idx];
+    vertex_t self_color     = color[self_color_idx];
 
     // if self color is not equal to self color index,
     // it means self is not supervertex
@@ -272,7 +287,7 @@ __global__ void final_color_indices(const vertex_t v, const vertex_t* color,
     // parent supervertex
     while (self_color_idx != self_color) {
       self_color_idx = color_index[self_color];
-      self_color = color[self_color_idx];
+      self_color     = color[self_color_idx];
     }
 
     // point to new supervertex
@@ -282,22 +297,23 @@ __global__ void final_color_indices(const vertex_t v, const vertex_t* color,
 
 // Alterate the weights, make all undirected edge weight unique while keeping Wuv == Wvu
 // Consider using curand device API instead of precomputed random_values array
-template <typename vertex_t, typename edge_t, typename weight_t,
-          typename alteration_t>
-__global__ void alteration_kernel(const vertex_t v, const edge_t e,
+template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
+__global__ void alteration_kernel(const vertex_t v,
+                                  const edge_t e,
                                   const edge_t* offsets,
                                   const vertex_t* indices,
-                                  const weight_t* weights, alteration_t max,
+                                  const weight_t* weights,
+                                  alteration_t max,
                                   alteration_t* random_values,
-                                  alteration_t* altered_weights) {
+                                  alteration_t* altered_weights)
+{
   auto row = get_1D_idx<vertex_t>();
   if (row < v) {
     auto row_begin = offsets[row];
-    auto row_end = offsets[row + 1];
+    auto row_end   = offsets[row + 1];
     for (auto i = row_begin; i < row_end; i++) {
-      auto column = indices[i];
-      altered_weights[i] =
-        weights[i] + max * (random_values[row] + random_values[column]);
+      auto column        = indices[i];
+      altered_weights[i] = weights[i] + max * (random_values[row] + random_values[column]);
     }
   }
 }
@@ -305,17 +321,15 @@ __global__ void alteration_kernel(const vertex_t v, const edge_t e,
 template <typename vertex_t, typename edge_t>
 __global__ void kernel_count_new_mst_edges(const vertex_t* mst_src,
                                            edge_t* mst_edge_count,
-                                           const vertex_t v) {
+                                           const vertex_t v)
+{
   auto tid = get_1D_idx<vertex_t>();
 
   // count number of new mst edges added
-  bool predicate =
-    tid < v && (mst_src[tid] != std::numeric_limits<vertex_t>::max());
+  bool predicate       = tid < v && (mst_src[tid] != std::numeric_limits<vertex_t>::max());
   vertex_t block_count = __syncthreads_count(predicate);
 
-  if (threadIdx.x == 0 && block_count > 0) {
-    atomicAdd(mst_edge_count, block_count);
-  }
+  if (threadIdx.x == 0 && block_count > 0) { atomicAdd(mst_edge_count, block_count); }
 }
 
 }  // namespace detail
diff --git a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh
index 33b980afcd..5591e15b19 100644
--- a/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh
+++ b/cpp/include/raft/sparse/mst/detail/mst_solver_inl.cuh
@@ -40,21 +40,30 @@ typedef std::chrono::high_resolution_clock Clock;
 
 // curand generator uniform
 inline curandStatus_t curand_generate_uniformX(curandGenerator_t generator,
-                                               float* outputPtr, size_t n) {
+                                               float* outputPtr,
+                                               size_t n)
+{
   return curandGenerateUniform(generator, outputPtr, n);
 }
 inline curandStatus_t curand_generate_uniformX(curandGenerator_t generator,
-                                               double* outputPtr, size_t n) {
+                                               double* outputPtr,
+                                               size_t n)
+{
   return curandGenerateUniformDouble(generator, outputPtr, n);
 }
 
-template <typename vertex_t, typename edge_t, typename weight_t,
-          typename alteration_t>
-MST_solver<vertex_t, edge_t, weight_t, alteration_t>::MST_solver(
-  const raft::handle_t& handle_, const edge_t* offsets_,
-  const vertex_t* indices_, const weight_t* weights_, const vertex_t v_,
-  const edge_t e_, vertex_t* color_, cudaStream_t stream_,
-  bool symmetrize_output_, bool initialize_colors_, int iterations_)
+template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
+MST_solver<vertex_t, edge_t, weight_t, alteration_t>::MST_solver(const raft::handle_t& handle_,
+                                                                 const edge_t* offsets_,
+                                                                 const vertex_t* indices_,
+                                                                 const weight_t* weights_,
+                                                                 const vertex_t v_,
+                                                                 const edge_t e_,
+                                                                 vertex_t* color_,
+                                                                 cudaStream_t stream_,
+                                                                 bool symmetrize_output_,
+                                                                 bool initialize_colors_,
+                                                                 int iterations_)
   : handle(handle_),
     offsets(offsets_),
     indices(indices_),
@@ -76,17 +85,17 @@ MST_solver<vertex_t, edge_t, weight_t, alteration_t>::MST_solver(
     stream(stream_),
     symmetrize_output(symmetrize_output_),
     initialize_colors(initialize_colors_),
-    iterations(iterations_) {
-  max_blocks = handle_.get_device_properties().maxGridSize[0];
+    iterations(iterations_)
+{
+  max_blocks  = handle_.get_device_properties().maxGridSize[0];
   max_threads = handle_.get_device_properties().maxThreadsPerBlock;
-  sm_count = handle_.get_device_properties().multiProcessorCount;
+  sm_count    = handle_.get_device_properties().multiProcessorCount;
 
   mst_edge_count.set_value_to_zero_async(stream);
   prev_mst_edge_count.set_value_to_zero_async(stream);
-  CUDA_CHECK(cudaMemsetAsync(mst_edge.data(), 0, mst_edge.size() * sizeof(bool),
-                             stream));
+  CUDA_CHECK(cudaMemsetAsync(mst_edge.data(), 0, mst_edge.size() * sizeof(bool), stream));
 
-  //Initially, color holds the vertex id as color
+  // Initially, color holds the vertex id as color
   auto policy = handle.get_thrust_policy();
   if (initialize_colors_) {
     thrust::sequence(policy, color.begin(), color.end(), 0);
@@ -97,10 +106,10 @@ MST_solver<vertex_t, edge_t, weight_t, alteration_t>::MST_solver(
   thrust::sequence(policy, next_color.begin(), next_color.end(), 0);
 }
 
-template <typename vertex_t, typename edge_t, typename weight_t,
-          typename alteration_t>
+template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
 raft::Graph_COO<vertex_t, edge_t, weight_t>
-MST_solver<vertex_t, edge_t, weight_t, alteration_t>::solve() {
+MST_solver<vertex_t, edge_t, weight_t, alteration_t>::solve()
+{
   RAFT_EXPECTS(v > 0, "0 vertices");
   RAFT_EXPECTS(e > 0, "0 edges");
   RAFT_EXPECTS(offsets != nullptr, "Null offsets.");
@@ -113,12 +122,13 @@ MST_solver<vertex_t, edge_t, weight_t, alteration_t>::solve() {
 
   // Alterating the weights
   // this is done by identifying the lowest cost edge weight gap that is not 0, call this theta.
-  // For each edge, add noise that is less than theta. That is, generate a random number in the range [0.0, theta) and add it to each edge weight.
+  // For each edge, add noise that is less than theta. That is, generate a random number in the
+  // range [0.0, theta) and add it to each edge weight.
   alteration();
 
 #ifdef MST_TIME
   auto stop = Clock::now();
-  timer0 = duration_us(stop - start);
+  timer0    = duration_us(stop - start);
 #endif
 
   auto max_mst_edges = symmetrize_output ? 2 * v - 2 : v - 1;
@@ -167,8 +177,8 @@ MST_solver<vertex_t, edge_t, weight_t, alteration_t>::solve() {
     if (curr_mst_edge_count == prev_mst_edge_count.value(stream)) {
 #ifdef MST_TIME
       std::cout << "Iterations: " << i << std::endl;
-      std::cout << timer0 << "," << timer1 << "," << timer2 << "," << timer3
-                << "," << timer4 << "," << timer5 << std::endl;
+      std::cout << timer0 << "," << timer1 << "," << timer2 << "," << timer3 << "," << timer4 << ","
+                << timer5 << std::endl;
 #endif
       // exit here when reaching steady state
       break;
@@ -178,8 +188,7 @@ MST_solver<vertex_t, edge_t, weight_t, alteration_t>::solve() {
     start = Clock::now();
 #endif
     // append the newly found MST edges to the final output
-    append_src_dst_pair(mst_result.src.data(), mst_result.dst.data(),
-                        mst_result.weights.data());
+    append_src_dst_pair(mst_result.src.data(), mst_result.dst.data(), mst_result.weights.data());
 #ifdef MST_TIME
     stop = Clock::now();
     timer4 += duration_us(stop - start);
@@ -210,50 +219,46 @@ MST_solver<vertex_t, edge_t, weight_t, alteration_t>::solve() {
 // ||y|-|x||
 template <typename weight_t>
 struct alteration_functor {
-  __host__ __device__ weight_t
-  operator()(const thrust::tuple<weight_t, weight_t>& t) {
+  __host__ __device__ weight_t operator()(const thrust::tuple<weight_t, weight_t>& t)
+  {
     auto x = thrust::get<0>(t);
     auto y = thrust::get<1>(t);
-    x = x < 0 ? -x : x;
-    y = y < 0 ? -y : y;
+    x      = x < 0 ? -x : x;
+    y      = y < 0 ? -y : y;
     return x < y ? y - x : x - y;
   }
 };
 
 // Compute the uper bound for the alteration
-template <typename vertex_t, typename edge_t, typename weight_t,
-          typename alteration_t>
-alteration_t
-MST_solver<vertex_t, edge_t, weight_t, alteration_t>::alteration_max() {
+template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
+alteration_t MST_solver<vertex_t, edge_t, weight_t, alteration_t>::alteration_max()
+{
   auto policy = handle.get_thrust_policy();
   rmm::device_uvector<weight_t> tmp(e, stream);
   thrust::device_ptr<const weight_t> weights_ptr(weights);
   thrust::copy(policy, weights_ptr, weights_ptr + e, tmp.begin());
-  //sort tmp weights
+  // sort tmp weights
   thrust::sort(policy, tmp.begin(), tmp.end());
 
-  //remove duplicates
+  // remove duplicates
   auto new_end = thrust::unique(policy, tmp.begin(), tmp.end());
 
-  //min(a[i+1]-a[i])/2
-  auto begin =
-    thrust::make_zip_iterator(thrust::make_tuple(tmp.begin(), tmp.begin() + 1));
-  auto end =
-    thrust::make_zip_iterator(thrust::make_tuple(new_end - 1, new_end));
-  auto init = tmp.element(1, stream) - tmp.element(0, stream);
-  auto max =
-    thrust::transform_reduce(policy, begin, end, alteration_functor<weight_t>(),
-                             init, thrust::minimum<weight_t>());
+  // min(a[i+1]-a[i])/2
+  auto begin = thrust::make_zip_iterator(thrust::make_tuple(tmp.begin(), tmp.begin() + 1));
+  auto end   = thrust::make_zip_iterator(thrust::make_tuple(new_end - 1, new_end));
+  auto init  = tmp.element(1, stream) - tmp.element(0, stream);
+  auto max   = thrust::transform_reduce(
+    policy, begin, end, alteration_functor<weight_t>(), init, thrust::minimum<weight_t>());
   return max / static_cast<alteration_t>(2);
 }
 
 // Compute the alteration to make all undirected edge weight unique
 // Preserves weights order
-template <typename vertex_t, typename edge_t, typename weight_t,
-          typename alteration_t>
-void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::alteration() {
+template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
+void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::alteration()
+{
   auto nthreads = std::min(v, max_threads);
-  auto nblocks = std::min((v + nthreads - 1) / nthreads, max_blocks);
+  auto nblocks  = std::min((v + nthreads - 1) / nthreads, max_blocks);
 
   // maximum alteration that does not change realtive weights order
   alteration_t max = alteration_max();
@@ -270,34 +275,32 @@ void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::alteration() {
   auto curand_status = curand_generate_uniformX(randGen, rand_values.data(), v);
   RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS, "MST: CURAND failed");
   curand_status = curandDestroyGenerator(randGen);
-  RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS,
-               "MST: CURAND cleanup failed");
+  RAFT_EXPECTS(curand_status == CURAND_STATUS_SUCCESS, "MST: CURAND cleanup failed");
 
-  //Alterate the weights, make all undirected edge weight unique while keeping Wuv == Wvu
+  // Alterate the weights, make all undirected edge weight unique while keeping Wuv == Wvu
   detail::alteration_kernel<<<nblocks, nthreads, 0, stream>>>(
-    v, e, offsets, indices, weights, max, rand_values.data(),
-    altered_weights.data());
+    v, e, offsets, indices, weights, max, rand_values.data(), altered_weights.data());
 }
 
 // updates colors of vertices by propagating the lower color to the higher
-template <typename vertex_t, typename edge_t, typename weight_t,
-          typename alteration_t>
-void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::label_prop(
-  vertex_t* mst_src, vertex_t* mst_dst) {
+template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
+void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::label_prop(vertex_t* mst_src,
+                                                                      vertex_t* mst_dst)
+{
   // update the colors of both ends its until there is no change in colors
   edge_t curr_mst_edge_count = mst_edge_count.value(stream);
 
   auto min_pair_nthreads = std::min(v, (vertex_t)max_threads);
-  auto min_pair_nblocks = std::min(
-    (v + min_pair_nthreads - 1) / min_pair_nthreads, (vertex_t)max_blocks);
+  auto min_pair_nblocks =
+    std::min((v + min_pair_nthreads - 1) / min_pair_nthreads, (vertex_t)max_blocks);
 
   edge_t* new_mst_edge_ptr = new_mst_edge.data();
-  vertex_t* color_ptr = color.data();
+  vertex_t* color_ptr      = color.data();
   vertex_t* next_color_ptr = next_color.data();
 
   rmm::device_scalar<bool> done(stream);
   done.set_value_to_zero_async(stream);
-  bool* done_ptr = done.data();
+  bool* done_ptr      = done.data();
   const bool true_val = true;
 
   auto i = 0;
@@ -312,84 +315,99 @@ void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::label_prop(
     i++;
   }
 
-  detail::
-    final_color_indices<<<min_pair_nblocks, min_pair_nthreads, 0, stream>>>(
-      v, color_ptr, color_index);
+  detail::final_color_indices<<<min_pair_nblocks, min_pair_nthreads, 0, stream>>>(
+    v, color_ptr, color_index);
 #ifdef MST_TIME
   std::cout << "Label prop iterations: " << i << std::endl;
 #endif
 }
 
 // Finds the minimum edge from each vertex to the lowest color
-template <typename vertex_t, typename edge_t, typename weight_t,
-          typename alteration_t>
-void MST_solver<vertex_t, edge_t, weight_t,
-                alteration_t>::min_edge_per_vertex() {
+template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
+void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::min_edge_per_vertex()
+{
   auto policy = handle.get_thrust_policy();
-  thrust::fill(policy, min_edge_color.begin(), min_edge_color.end(),
-               std::numeric_limits<alteration_t>::max());
-  thrust::fill(policy, new_mst_edge.begin(), new_mst_edge.end(),
-               std::numeric_limits<weight_t>::max());
+  thrust::fill(
+    policy, min_edge_color.begin(), min_edge_color.end(), std::numeric_limits<alteration_t>::max());
+  thrust::fill(
+    policy, new_mst_edge.begin(), new_mst_edge.end(), std::numeric_limits<weight_t>::max());
 
   int n_threads = 32;
 
-  vertex_t* color_ptr = color.data();
-  edge_t* new_mst_edge_ptr = new_mst_edge.data();
-  bool* mst_edge_ptr = mst_edge.data();
-  alteration_t* min_edge_color_ptr = min_edge_color.data();
+  vertex_t* color_ptr               = color.data();
+  edge_t* new_mst_edge_ptr          = new_mst_edge.data();
+  bool* mst_edge_ptr                = mst_edge.data();
+  alteration_t* min_edge_color_ptr  = min_edge_color.data();
   alteration_t* altered_weights_ptr = altered_weights.data();
 
-  detail::kernel_min_edge_per_vertex<<<v, n_threads, 0, stream>>>(
-    offsets, indices, altered_weights_ptr, color_ptr, color_index,
-    new_mst_edge_ptr, mst_edge_ptr, min_edge_color_ptr, v);
+  detail::kernel_min_edge_per_vertex<<<v, n_threads, 0, stream>>>(offsets,
+                                                                  indices,
+                                                                  altered_weights_ptr,
+                                                                  color_ptr,
+                                                                  color_index,
+                                                                  new_mst_edge_ptr,
+                                                                  mst_edge_ptr,
+                                                                  min_edge_color_ptr,
+                                                                  v);
 }
 
 // Finds the minimum edge from each supervertex to the lowest color
-template <typename vertex_t, typename edge_t, typename weight_t,
-          typename alteration_t>
-void MST_solver<vertex_t, edge_t, weight_t,
-                alteration_t>::min_edge_per_supervertex() {
+template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
+void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::min_edge_per_supervertex()
+{
   auto nthreads = std::min(v, max_threads);
-  auto nblocks = std::min((v + nthreads - 1) / nthreads, max_blocks);
+  auto nblocks  = std::min((v + nthreads - 1) / nthreads, max_blocks);
 
   auto policy = handle.get_thrust_policy();
-  thrust::fill(policy, temp_src.begin(), temp_src.end(),
-               std::numeric_limits<vertex_t>::max());
+  thrust::fill(policy, temp_src.begin(), temp_src.end(), std::numeric_limits<vertex_t>::max());
 
-  vertex_t* color_ptr = color.data();
-  edge_t* new_mst_edge_ptr = new_mst_edge.data();
-  bool* mst_edge_ptr = mst_edge.data();
-  alteration_t* min_edge_color_ptr = min_edge_color.data();
+  vertex_t* color_ptr               = color.data();
+  edge_t* new_mst_edge_ptr          = new_mst_edge.data();
+  bool* mst_edge_ptr                = mst_edge.data();
+  alteration_t* min_edge_color_ptr  = min_edge_color.data();
   alteration_t* altered_weights_ptr = altered_weights.data();
-  vertex_t* temp_src_ptr = temp_src.data();
-  vertex_t* temp_dst_ptr = temp_dst.data();
-  weight_t* temp_weights_ptr = temp_weights.data();
-
-  detail::min_edge_per_supervertex<<<nblocks, nthreads, 0, stream>>>(
-    color_ptr, color_index, new_mst_edge_ptr, mst_edge_ptr, indices, weights,
-    altered_weights_ptr, temp_src_ptr, temp_dst_ptr, temp_weights_ptr,
-    min_edge_color_ptr, v, symmetrize_output);
+  vertex_t* temp_src_ptr            = temp_src.data();
+  vertex_t* temp_dst_ptr            = temp_dst.data();
+  weight_t* temp_weights_ptr        = temp_weights.data();
+
+  detail::min_edge_per_supervertex<<<nblocks, nthreads, 0, stream>>>(color_ptr,
+                                                                     color_index,
+                                                                     new_mst_edge_ptr,
+                                                                     mst_edge_ptr,
+                                                                     indices,
+                                                                     weights,
+                                                                     altered_weights_ptr,
+                                                                     temp_src_ptr,
+                                                                     temp_dst_ptr,
+                                                                     temp_weights_ptr,
+                                                                     min_edge_color_ptr,
+                                                                     v,
+                                                                     symmetrize_output);
 
   // the above kernel only adds directed mst edges in the case where
   // a pair of vertices don't pick the same min edge between them
   // so, now we add the reverse edge to make it undirected
   if (symmetrize_output) {
-    detail::add_reverse_edge<<<nblocks, nthreads, 0, stream>>>(
-      new_mst_edge_ptr, indices, weights, temp_src_ptr, temp_dst_ptr,
-      temp_weights_ptr, v, symmetrize_output);
+    detail::add_reverse_edge<<<nblocks, nthreads, 0, stream>>>(new_mst_edge_ptr,
+                                                               indices,
+                                                               weights,
+                                                               temp_src_ptr,
+                                                               temp_dst_ptr,
+                                                               temp_weights_ptr,
+                                                               v,
+                                                               symmetrize_output);
   }
 }
 
-template <typename vertex_t, typename edge_t, typename weight_t,
-          typename alteration_t>
-void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::check_termination() {
+template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
+void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::check_termination()
+{
   vertex_t nthreads = std::min(2 * v, (vertex_t)max_threads);
-  vertex_t nblocks =
-    std::min((2 * v + nthreads - 1) / nthreads, (vertex_t)max_blocks);
+  vertex_t nblocks  = std::min((2 * v + nthreads - 1) / nthreads, (vertex_t)max_blocks);
 
   // count number of new mst edges
   edge_t* mst_edge_count_ptr = mst_edge_count.data();
-  vertex_t* temp_src_ptr = temp_src.data();
+  vertex_t* temp_src_ptr     = temp_src.data();
 
   detail::kernel_count_new_mst_edges<<<nblocks, nthreads, 0, stream>>>(
     temp_src_ptr, mst_edge_count_ptr, 2 * v);
@@ -397,36 +415,40 @@ void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::check_termination() {
 
 template <typename vertex_t, typename weight_t>
 struct new_edges_functor {
-  __host__ __device__ bool operator()(
-    const thrust::tuple<vertex_t, vertex_t, weight_t>& t) {
+  __host__ __device__ bool operator()(const thrust::tuple<vertex_t, vertex_t, weight_t>& t)
+  {
     auto src = thrust::get<0>(t);
 
     return src != std::numeric_limits<vertex_t>::max() ? true : false;
   }
 };
 
-template <typename vertex_t, typename edge_t, typename weight_t,
-          typename alteration_t>
+template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
 void MST_solver<vertex_t, edge_t, weight_t, alteration_t>::append_src_dst_pair(
-  vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights) {
+  vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights)
+{
   auto policy = handle.get_thrust_policy();
 
   edge_t curr_mst_edge_count = prev_mst_edge_count.value(stream);
 
   // iterator to end of mst edges added to final output in previous iteration
-  auto src_dst_zip_end = thrust::make_zip_iterator(thrust::make_tuple(
-    mst_src + curr_mst_edge_count, mst_dst + curr_mst_edge_count,
-    mst_weights + curr_mst_edge_count));
+  auto src_dst_zip_end =
+    thrust::make_zip_iterator(thrust::make_tuple(mst_src + curr_mst_edge_count,
+                                                 mst_dst + curr_mst_edge_count,
+                                                 mst_weights + curr_mst_edge_count));
 
   // iterator to new mst edges found
-  auto temp_src_dst_zip_begin = thrust::make_zip_iterator(thrust::make_tuple(
-    temp_src.begin(), temp_dst.begin(), temp_weights.begin()));
+  auto temp_src_dst_zip_begin = thrust::make_zip_iterator(
+    thrust::make_tuple(temp_src.begin(), temp_dst.begin(), temp_weights.begin()));
   auto temp_src_dst_zip_end = thrust::make_zip_iterator(
     thrust::make_tuple(temp_src.end(), temp_dst.end(), temp_weights.end()));
 
   // copy new mst edges to final output
-  thrust::copy_if(policy, temp_src_dst_zip_begin, temp_src_dst_zip_end,
-                  src_dst_zip_end, new_edges_functor<vertex_t, weight_t>());
+  thrust::copy_if(policy,
+                  temp_src_dst_zip_begin,
+                  temp_src_dst_zip_end,
+                  src_dst_zip_end,
+                  new_edges_functor<vertex_t, weight_t>());
 }
 
 }  // namespace mst
diff --git a/cpp/include/raft/sparse/mst/detail/utils.cuh b/cpp/include/raft/sparse/mst/detail/utils.cuh
index 4d5ca6ebe1..97a76e1d50 100644
--- a/cpp/include/raft/sparse/mst/detail/utils.cuh
+++ b/cpp/include/raft/sparse/mst/detail/utils.cuh
@@ -26,32 +26,29 @@ namespace mst {
 namespace detail {
 
 template <typename idx_t>
-__device__ idx_t get_1D_idx() {
+__device__ idx_t get_1D_idx()
+{
   return blockIdx.x * blockDim.x + threadIdx.x;
 }
 
 // somewhat smart vector print
 template <typename T>
-void printv(rmm::device_uvector<T>& vec, const std::string& name = "",
-            const size_t displ = 5) {
+void printv(rmm::device_uvector<T>& vec, const std::string& name = "", const size_t displ = 5)
+{
 #ifdef MST_TIME
   std::cout.precision(15);
   std::cout << name << " size = " << vec.size() << std::endl;
   if (displ < vec.size()) {
-    thrust::copy(vec.begin(), vec.begin() + displ,
-                 std::ostream_iterator<T>(std::cout, " "));
+    thrust::copy(vec.begin(), vec.begin() + displ, std::ostream_iterator<T>(std::cout, " "));
     std::cout << " ... ";
-    thrust::copy(vec.end() - displ, vec.end(),
-                 std::ostream_iterator<T>(std::cout, " "));
+    thrust::copy(vec.end() - displ, vec.end(), std::ostream_iterator<T>(std::cout, " "));
   } else {
-    thrust::copy(vec.begin(), vec.end(),
-                 std::ostream_iterator<T>(std::cout, " "));
+    thrust::copy(vec.begin(), vec.end(), std::ostream_iterator<T>(std::cout, " "));
   }
   std::cout << std::endl << std::endl;
 #endif
 }
-#define duration_us(a) \
-  std::chrono::duration_cast<std::chrono::microseconds>(a).count()
+#define duration_us(a) std::chrono::duration_cast<std::chrono::microseconds>(a).count()
 
 }  // namespace detail
 }  // namespace mst
diff --git a/cpp/include/raft/sparse/mst/mst.cuh b/cpp/include/raft/sparse/mst/mst.cuh
index 10c981445e..b49003467b 100644
--- a/cpp/include/raft/sparse/mst/mst.cuh
+++ b/cpp/include/raft/sparse/mst/mst.cuh
@@ -22,16 +22,30 @@
 namespace raft {
 namespace mst {
 
-template <typename vertex_t, typename edge_t, typename weight_t,
-          typename alteration_t = weight_t>
-raft::Graph_COO<vertex_t, edge_t, weight_t> mst(
-  const raft::handle_t& handle, edge_t const* offsets, vertex_t const* indices,
-  weight_t const* weights, vertex_t const v, edge_t const e, vertex_t* color,
-  cudaStream_t stream, bool symmetrize_output = true,
-  bool initialize_colors = true, int iterations = 0) {
-  MST_solver<vertex_t, edge_t, weight_t, alteration_t> mst_solver(
-    handle, offsets, indices, weights, v, e, color, stream, symmetrize_output,
-    initialize_colors, iterations);
+template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t = weight_t>
+raft::Graph_COO<vertex_t, edge_t, weight_t> mst(const raft::handle_t& handle,
+                                                edge_t const* offsets,
+                                                vertex_t const* indices,
+                                                weight_t const* weights,
+                                                vertex_t const v,
+                                                edge_t const e,
+                                                vertex_t* color,
+                                                cudaStream_t stream,
+                                                bool symmetrize_output = true,
+                                                bool initialize_colors = true,
+                                                int iterations         = 0)
+{
+  MST_solver<vertex_t, edge_t, weight_t, alteration_t> mst_solver(handle,
+                                                                  offsets,
+                                                                  indices,
+                                                                  weights,
+                                                                  v,
+                                                                  e,
+                                                                  color,
+                                                                  stream,
+                                                                  symmetrize_output,
+                                                                  initialize_colors,
+                                                                  iterations);
   return mst_solver.solve();
 }
 
diff --git a/cpp/include/raft/sparse/mst/mst_solver.cuh b/cpp/include/raft/sparse/mst/mst_solver.cuh
index 44b34ee5c7..bae5d77d8e 100644
--- a/cpp/include/raft/sparse/mst/mst_solver.cuh
+++ b/cpp/include/raft/sparse/mst/mst_solver.cuh
@@ -31,20 +31,27 @@ struct Graph_COO {
   edge_t n_edges;
 
   Graph_COO(vertex_t size, cudaStream_t stream)
-    : src(size, stream), dst(size, stream), weights(size, stream) {}
+    : src(size, stream), dst(size, stream), weights(size, stream)
+  {
+  }
 };
 
 namespace mst {
 
-template <typename vertex_t, typename edge_t, typename weight_t,
-          typename alteration_t>
+template <typename vertex_t, typename edge_t, typename weight_t, typename alteration_t>
 class MST_solver {
  public:
-  MST_solver(const raft::handle_t& handle_, const edge_t* offsets_,
-             const vertex_t* indices_, const weight_t* weights_,
-             const vertex_t v_, const edge_t e_, vertex_t* color_,
-             cudaStream_t stream_, bool symmetrize_output_,
-             bool initialize_colors_, int iterations_);
+  MST_solver(const raft::handle_t& handle_,
+             const edge_t* offsets_,
+             const vertex_t* indices_,
+             const weight_t* weights_,
+             const vertex_t v_,
+             const edge_t e_,
+             vertex_t* color_,
+             cudaStream_t stream_,
+             bool symmetrize_output_,
+             bool initialize_colors_,
+             int iterations_);
 
   raft::Graph_COO<vertex_t, edge_t, weight_t> solve();
 
@@ -56,7 +63,7 @@ class MST_solver {
   bool symmetrize_output, initialize_colors;
   int iterations;
 
-  //CSR
+  // CSR
   const edge_t* offsets;
   const vertex_t* indices;
   const weight_t* weights;
@@ -67,20 +74,16 @@ class MST_solver {
   vertex_t max_threads;
   vertex_t sm_count;
 
-  vertex_t* color_index;  // represent each supervertex as a color
-  rmm::device_uvector<alteration_t>
-    min_edge_color;  // minimum incident edge weight per color
-  rmm::device_uvector<edge_t> new_mst_edge;  // new minimum edge per vertex
-  rmm::device_uvector<alteration_t>
-    altered_weights;  // weights to be used for mst
+  vertex_t* color_index;                              // represent each supervertex as a color
+  rmm::device_uvector<alteration_t> min_edge_color;   // minimum incident edge weight per color
+  rmm::device_uvector<edge_t> new_mst_edge;           // new minimum edge per vertex
+  rmm::device_uvector<alteration_t> altered_weights;  // weights to be used for mst
+  rmm::device_scalar<edge_t> mst_edge_count;  // total number of edges added after every iteration
   rmm::device_scalar<edge_t>
-    mst_edge_count;  // total number of edges added after every iteration
-  rmm::device_scalar<edge_t>
-    prev_mst_edge_count;  // total number of edges up to the previous iteration
-  rmm::device_uvector<bool>
-    mst_edge;  // mst output -  true if the edge belongs in mst
+    prev_mst_edge_count;                     // total number of edges up to the previous iteration
+  rmm::device_uvector<bool> mst_edge;        // mst output -  true if the edge belongs in mst
   rmm::device_uvector<vertex_t> next_color;  //  next iteration color
-  rmm::device_uvector<vertex_t> color;  // index of color that vertex points to
+  rmm::device_uvector<vertex_t> color;       // index of color that vertex points to
 
   // new src-dst pairs found per iteration
   rmm::device_uvector<vertex_t> temp_src;
@@ -93,8 +96,7 @@ class MST_solver {
   void check_termination();
   void alteration();
   alteration_t alteration_max();
-  void append_src_dst_pair(vertex_t* mst_src, vertex_t* mst_dst,
-                           weight_t* mst_weights);
+  void append_src_dst_pair(vertex_t* mst_src, vertex_t* mst_dst, weight_t* mst_weights);
 };
 
 }  // namespace mst
diff --git a/cpp/include/raft/sparse/op/filter.cuh b/cpp/include/raft/sparse/op/filter.cuh
index 492058f85f..8bc8c746f9 100644
--- a/cpp/include/raft/sparse/op/filter.cuh
+++ b/cpp/include/raft/sparse/op/filter.cuh
@@ -42,15 +42,23 @@ namespace sparse {
 namespace op {
 
 template <int TPB_X, typename T>
-__global__ void coo_remove_scalar_kernel(const int *rows, const int *cols,
-                                         const T *vals, int nnz, int *crows,
-                                         int *ccols, T *cvals, int *ex_scan,
-                                         int *cur_ex_scan, int m, T scalar) {
+__global__ void coo_remove_scalar_kernel(const int* rows,
+                                         const int* cols,
+                                         const T* vals,
+                                         int nnz,
+                                         int* crows,
+                                         int* ccols,
+                                         T* cvals,
+                                         int* ex_scan,
+                                         int* cur_ex_scan,
+                                         int m,
+                                         T scalar)
+{
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
 
   if (row < m) {
-    int start = cur_ex_scan[row];
-    int stop = get_stop_idx(row, m, nnz, cur_ex_scan);
+    int start       = cur_ex_scan[row];
+    int stop        = get_stop_idx(row, m, nnz, cur_ex_scan);
     int cur_out_idx = ex_scan[row];
 
     for (int idx = start; idx < stop; idx++) {
@@ -82,35 +90,49 @@ __global__ void coo_remove_scalar_kernel(const int *rows, const int *cols,
  * @param stream: cuda stream to use
  */
 template <int TPB_X, typename T>
-void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz,
-                       int *crows, int *ccols, T *cvals, int *cnnz,
-                       int *cur_cnnz, T scalar, int n, cudaStream_t stream) {
+void coo_remove_scalar(const int* rows,
+                       const int* cols,
+                       const T* vals,
+                       int nnz,
+                       int* crows,
+                       int* ccols,
+                       T* cvals,
+                       int* cnnz,
+                       int* cur_cnnz,
+                       T scalar,
+                       int n,
+                       cudaStream_t stream)
+{
   rmm::device_uvector<int> ex_scan(n, stream);
   rmm::device_uvector<int> cur_ex_scan(n, stream);
 
   CUDA_CHECK(cudaMemsetAsync(ex_scan.data(), 0, n * sizeof(int), stream));
   CUDA_CHECK(cudaMemsetAsync(cur_ex_scan.data(), 0, n * sizeof(int), stream));
 
-  thrust::device_ptr<int> dev_cnnz = thrust::device_pointer_cast(cnnz);
-  thrust::device_ptr<int> dev_ex_scan =
-    thrust::device_pointer_cast(ex_scan.data());
-  thrust::exclusive_scan(rmm::exec_policy(stream), dev_cnnz, dev_cnnz + n,
-                         dev_ex_scan);
+  thrust::device_ptr<int> dev_cnnz    = thrust::device_pointer_cast(cnnz);
+  thrust::device_ptr<int> dev_ex_scan = thrust::device_pointer_cast(ex_scan.data());
+  thrust::exclusive_scan(rmm::exec_policy(stream), dev_cnnz, dev_cnnz + n, dev_ex_scan);
   CUDA_CHECK(cudaPeekAtLastError());
 
-  thrust::device_ptr<int> dev_cur_cnnz = thrust::device_pointer_cast(cur_cnnz);
-  thrust::device_ptr<int> dev_cur_ex_scan =
-    thrust::device_pointer_cast(cur_ex_scan.data());
-  thrust::exclusive_scan(rmm::exec_policy(stream), dev_cur_cnnz,
-                         dev_cur_cnnz + n, dev_cur_ex_scan);
+  thrust::device_ptr<int> dev_cur_cnnz    = thrust::device_pointer_cast(cur_cnnz);
+  thrust::device_ptr<int> dev_cur_ex_scan = thrust::device_pointer_cast(cur_ex_scan.data());
+  thrust::exclusive_scan(rmm::exec_policy(stream), dev_cur_cnnz, dev_cur_cnnz + n, dev_cur_ex_scan);
   CUDA_CHECK(cudaPeekAtLastError());
 
   dim3 grid(raft::ceildiv(n, TPB_X), 1, 1);
   dim3 blk(TPB_X, 1, 1);
 
-  coo_remove_scalar_kernel<TPB_X><<<grid, blk, 0, stream>>>(
-    rows, cols, vals, nnz, crows, ccols, cvals, dev_ex_scan.get(),
-    dev_cur_ex_scan.get(), n, scalar);
+  coo_remove_scalar_kernel<TPB_X><<<grid, blk, 0, stream>>>(rows,
+                                                            cols,
+                                                            vals,
+                                                            nnz,
+                                                            crows,
+                                                            ccols,
+                                                            cvals,
+                                                            dev_ex_scan.get(),
+                                                            dev_cur_ex_scan.get(),
+                                                            n,
+                                                            scalar);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -123,33 +145,39 @@ void coo_remove_scalar(const int *rows, const int *cols, const T *vals, int nnz,
  * @param stream: cuda stream to use
  */
 template <int TPB_X, typename T>
-void coo_remove_scalar(COO<T> *in, COO<T> *out, T scalar, cudaStream_t stream) {
+void coo_remove_scalar(COO<T>* in, COO<T>* out, T scalar, cudaStream_t stream)
+{
   rmm::device_uvector<int> row_count_nz(in->n_rows, stream);
   rmm::device_uvector<int> row_count(in->n_rows, stream);
 
-  CUDA_CHECK(
-    cudaMemsetAsync(row_count_nz.data(), 0, in->n_rows * sizeof(int), stream));
-  CUDA_CHECK(
-    cudaMemsetAsync(row_count.data(), 0, in->n_rows * sizeof(int), stream));
+  CUDA_CHECK(cudaMemsetAsync(row_count_nz.data(), 0, in->n_rows * sizeof(int), stream));
+  CUDA_CHECK(cudaMemsetAsync(row_count.data(), 0, in->n_rows * sizeof(int), stream));
 
   linalg::coo_degree<TPB_X>(in->rows(), in->nnz, row_count.data(), stream);
   CUDA_CHECK(cudaPeekAtLastError());
 
-  linalg::coo_degree_scalar<TPB_X>(in->rows(), in->vals(), in->nnz, scalar,
-                                   row_count_nz.data(), stream);
+  linalg::coo_degree_scalar<TPB_X>(
+    in->rows(), in->vals(), in->nnz, scalar, row_count_nz.data(), stream);
   CUDA_CHECK(cudaPeekAtLastError());
 
-  thrust::device_ptr<int> d_row_count_nz =
-    thrust::device_pointer_cast(row_count_nz.data());
-  int out_nnz = thrust::reduce(rmm::exec_policy(stream), d_row_count_nz,
-                               d_row_count_nz + in->n_rows);
+  thrust::device_ptr<int> d_row_count_nz = thrust::device_pointer_cast(row_count_nz.data());
+  int out_nnz =
+    thrust::reduce(rmm::exec_policy(stream), d_row_count_nz, d_row_count_nz + in->n_rows);
 
   out->allocate(out_nnz, in->n_rows, in->n_cols, false, stream);
 
-  coo_remove_scalar<TPB_X, T>(in->rows(), in->cols(), in->vals(), in->nnz,
-                              out->rows(), out->cols(), out->vals(),
-                              row_count_nz.data(), row_count.data(), scalar,
-                              in->n_rows, stream);
+  coo_remove_scalar<TPB_X, T>(in->rows(),
+                              in->cols(),
+                              in->vals(),
+                              in->nnz,
+                              out->rows(),
+                              out->cols(),
+                              out->vals(),
+                              row_count_nz.data(),
+                              row_count.data(),
+                              scalar,
+                              in->n_rows,
+                              stream);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -161,7 +189,8 @@ void coo_remove_scalar(COO<T> *in, COO<T> *out, T scalar, cudaStream_t stream) {
  * @param stream: cuda stream to use
  */
 template <int TPB_X, typename T>
-void coo_remove_zeros(COO<T> *in, COO<T> *out, cudaStream_t stream) {
+void coo_remove_zeros(COO<T>* in, COO<T>* out, cudaStream_t stream)
+{
   coo_remove_scalar<TPB_X, T>(in, out, T(0.0), stream);
 }
 
diff --git a/cpp/include/raft/sparse/op/reduce.cuh b/cpp/include/raft/sparse/op/reduce.cuh
index 09a35720fb..84d584d108 100644
--- a/cpp/include/raft/sparse/op/reduce.cuh
+++ b/cpp/include/raft/sparse/op/reduce.cuh
@@ -44,25 +44,29 @@ namespace sparse {
 namespace op {
 
 template <typename value_idx>
-__global__ void compute_duplicates_diffs_kernel(const value_idx *rows,
-                                                const value_idx *cols,
-                                                value_idx *diff, size_t nnz) {
+__global__ void compute_duplicates_diffs_kernel(const value_idx* rows,
+                                                const value_idx* cols,
+                                                value_idx* diff,
+                                                size_t nnz)
+{
   size_t tid = blockDim.x * blockIdx.x + threadIdx.x;
   if (tid >= nnz) return;
 
   value_idx d = 1;
-  if (tid == 0 || (rows[tid - 1] == rows[tid] && cols[tid - 1] == cols[tid]))
-    d = 0;
+  if (tid == 0 || (rows[tid - 1] == rows[tid] && cols[tid - 1] == cols[tid])) d = 0;
   diff[tid] = d;
 }
 
 template <typename value_idx, typename value_t>
-__global__ void max_duplicates_kernel(const value_idx *src_rows,
-                                      const value_idx *src_cols,
-                                      const value_t *src_vals,
-                                      const value_idx *index,
-                                      value_idx *out_rows, value_idx *out_cols,
-                                      value_t *out_vals, size_t nnz) {
+__global__ void max_duplicates_kernel(const value_idx* src_rows,
+                                      const value_idx* src_cols,
+                                      const value_t* src_vals,
+                                      const value_idx* index,
+                                      value_idx* out_rows,
+                                      value_idx* out_cols,
+                                      value_t* out_vals,
+                                      size_t nnz)
+{
   size_t tid = blockDim.x * blockIdx.x + threadIdx.x;
 
   if (tid < nnz) {
@@ -94,13 +98,13 @@ __global__ void max_duplicates_kernel(const value_idx *src_rows,
  * @param[in] stream cuda ops will be ordered wrt this stream
  */
 template <typename value_idx>
-void compute_duplicates_mask(value_idx *mask, const value_idx *rows,
-                             const value_idx *cols, size_t nnz,
-                             cudaStream_t stream) {
+void compute_duplicates_mask(
+  value_idx* mask, const value_idx* rows, const value_idx* cols, size_t nnz, cudaStream_t stream)
+{
   CUDA_CHECK(cudaMemsetAsync(mask, 0, nnz * sizeof(value_idx), stream));
 
-  compute_duplicates_diffs_kernel<<<raft::ceildiv(nnz, (size_t)256), 256, 0,
-                                    stream>>>(rows, cols, mask, nnz);
+  compute_duplicates_diffs_kernel<<<raft::ceildiv(nnz, (size_t)256), 256, 0, stream>>>(
+    rows, cols, mask, nnz);
 }
 
 /**
@@ -120,11 +124,16 @@ void compute_duplicates_mask(value_idx *mask, const value_idx *rows,
  * @param[in] stream cuda ops will be ordered wrt this stream
  */
 template <typename value_idx, typename value_t>
-void max_duplicates(const raft::handle_t &handle,
-                    raft::sparse::COO<value_t, value_idx> &out,
-                    const value_idx *rows, const value_idx *cols,
-                    const value_t *vals, size_t nnz, size_t m, size_t n) {
-  auto stream = handle.get_stream();
+void max_duplicates(const raft::handle_t& handle,
+                    raft::sparse::COO<value_t, value_idx>& out,
+                    const value_idx* rows,
+                    const value_idx* cols,
+                    const value_t* vals,
+                    size_t nnz,
+                    size_t m,
+                    size_t n)
+{
+  auto stream        = handle.get_stream();
   auto thrust_policy = handle.get_thrust_policy();
 
   // compute diffs & take exclusive scan
@@ -132,8 +141,7 @@ void max_duplicates(const raft::handle_t &handle,
 
   compute_duplicates_mask(diff.data(), rows, cols, nnz, stream);
 
-  thrust::exclusive_scan(thrust_policy, diff.data(), diff.data() + diff.size(),
-                         diff.data());
+  thrust::exclusive_scan(thrust_policy, diff.data(), diff.data() + diff.size(), diff.data());
 
   // compute final size
   value_idx size = 0;
diff --git a/cpp/include/raft/sparse/op/row_op.cuh b/cpp/include/raft/sparse/op/row_op.cuh
index 9e5034dc28..194a878ac1 100644
--- a/cpp/include/raft/sparse/op/row_op.cuh
+++ b/cpp/include/raft/sparse/op/row_op.cuh
@@ -38,12 +38,12 @@ namespace sparse {
 namespace op {
 
 template <typename T, int TPB_X = 256, typename Lambda = auto(T, T, T)->void>
-__global__ void csr_row_op_kernel(const T *row_ind, T n_rows, T nnz,
-                                  Lambda op) {
+__global__ void csr_row_op_kernel(const T* row_ind, T n_rows, T nnz, Lambda op)
+{
   T row = blockIdx.x * TPB_X + threadIdx.x;
   if (row < n_rows) {
     T start_idx = row_ind[row];
-    T stop_idx = row < n_rows - 1 ? row_ind[row + 1] : nnz;
+    T stop_idx  = row < n_rows - 1 ? row_ind[row + 1] : nnz;
     op(row, start_idx, stop_idx);
   }
 }
@@ -59,14 +59,12 @@ __global__ void csr_row_op_kernel(const T *row_ind, T n_rows, T nnz,
  * @param op custom row operation functor accepting the row and beginning index.
  * @param stream cuda stream to use
  */
-template <typename Index_, int TPB_X = 256,
-          typename Lambda = auto(Index_, Index_, Index_)->void>
-void csr_row_op(const Index_ *row_ind, Index_ n_rows, Index_ nnz, Lambda op,
-                cudaStream_t stream) {
+template <typename Index_, int TPB_X = 256, typename Lambda = auto(Index_, Index_, Index_)->void>
+void csr_row_op(const Index_* row_ind, Index_ n_rows, Index_ nnz, Lambda op, cudaStream_t stream)
+{
   dim3 grid(raft::ceildiv(n_rows, Index_(TPB_X)), 1, 1);
   dim3 blk(TPB_X, 1, 1);
-  csr_row_op_kernel<Index_, TPB_X>
-    <<<grid, blk, 0, stream>>>(row_ind, n_rows, nnz, op);
+  csr_row_op_kernel<Index_, TPB_X><<<grid, blk, 0, stream>>>(row_ind, n_rows, nnz, op);
 
   CUDA_CHECK(cudaPeekAtLastError());
 }
diff --git a/cpp/include/raft/sparse/op/slice.h b/cpp/include/raft/sparse/op/slice.h
index 46f4f41879..9bbe04cf34 100644
--- a/cpp/include/raft/sparse/op/slice.h
+++ b/cpp/include/raft/sparse/op/slice.h
@@ -50,10 +50,14 @@ namespace op {
  * @param[in] stream : cuda stream for ordering events
  */
 template <typename value_idx>
-void csr_row_slice_indptr(value_idx start_row, value_idx stop_row,
-                          const value_idx *indptr, value_idx *indptr_out,
-                          value_idx *start_offset, value_idx *stop_offset,
-                          cudaStream_t stream) {
+void csr_row_slice_indptr(value_idx start_row,
+                          value_idx stop_row,
+                          const value_idx* indptr,
+                          value_idx* indptr_out,
+                          value_idx* start_offset,
+                          value_idx* stop_offset,
+                          cudaStream_t stream)
+{
   raft::update_host(start_offset, indptr + start_row, 1, stream);
   raft::update_host(stop_offset, indptr + stop_row + 1, 1, stream);
 
@@ -63,11 +67,12 @@ void csr_row_slice_indptr(value_idx start_row, value_idx stop_row,
 
   // 0-based indexing so we need to add 1 to stop row. Because we want n_rows+1,
   // we add another 1 to stop row.
-  raft::copy_async(indptr_out, indptr + start_row, (stop_row + 2) - start_row,
-                   stream);
+  raft::copy_async(indptr_out, indptr + start_row, (stop_row + 2) - start_row, stream);
 
   raft::linalg::unaryOp<value_idx>(
-    indptr_out, indptr_out, (stop_row + 2) - start_row,
+    indptr_out,
+    indptr_out,
+    (stop_row + 2) - start_row,
     [s_offset] __device__(value_idx input) { return input - s_offset; },
     stream);
 }
@@ -85,12 +90,15 @@ void csr_row_slice_indptr(value_idx start_row, value_idx stop_row,
  * @param[in] stream : cuda stream for ordering events
  */
 template <typename value_idx, typename value_t>
-void csr_row_slice_populate(value_idx start_offset, value_idx stop_offset,
-                            const value_idx *indices, const value_t *data,
-                            value_idx *indices_out, value_t *data_out,
-                            cudaStream_t stream) {
-  raft::copy(indices_out, indices + start_offset, stop_offset - start_offset,
-             stream);
+void csr_row_slice_populate(value_idx start_offset,
+                            value_idx stop_offset,
+                            const value_idx* indices,
+                            const value_t* data,
+                            value_idx* indices_out,
+                            value_t* data_out,
+                            cudaStream_t stream)
+{
+  raft::copy(indices_out, indices + start_offset, stop_offset - start_offset, stream);
   raft::copy(data_out, data + start_offset, stop_offset - start_offset, stream);
 }
 
diff --git a/cpp/include/raft/sparse/op/sort.h b/cpp/include/raft/sparse/op/sort.h
index c40801a0b1..d397bce780 100644
--- a/cpp/include/raft/sparse/op/sort.h
+++ b/cpp/include/raft/sparse/op/sort.h
@@ -38,7 +38,8 @@ namespace op {
 
 struct TupleComp {
   template <typename one, typename two>
-  __host__ __device__ bool operator()(const one &t1, const two &t2) {
+  __host__ __device__ bool operator()(const one& t1, const two& t2)
+  {
     // sort first by each sample's color,
     if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
     if (thrust::get<0>(t1) > thrust::get<0>(t2)) return false;
@@ -61,13 +62,12 @@ struct TupleComp {
  * @param stream: cuda stream to use
  */
 template <typename T>
-void coo_sort(int m, int n, int nnz, int *rows, int *cols, T *vals,
-              cudaStream_t stream) {
+void coo_sort(int m, int n, int nnz, int* rows, int* cols, T* vals, cudaStream_t stream)
+{
   auto coo_indices = thrust::make_zip_iterator(thrust::make_tuple(rows, cols));
 
   // get all the colors in contiguous locations so we can map them to warps.
-  thrust::sort_by_key(rmm::exec_policy(stream), coo_indices, coo_indices + nnz,
-                      vals, TupleComp());
+  thrust::sort_by_key(rmm::exec_policy(stream), coo_indices, coo_indices + nnz, vals, TupleComp());
 }
 
 /**
@@ -77,9 +77,9 @@ void coo_sort(int m, int n, int nnz, int *rows, int *cols, T *vals,
  * @param stream: the cuda stream to use
  */
 template <typename T>
-void coo_sort(COO<T> *const in, cudaStream_t stream) {
-  coo_sort<T>(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(),
-              in->vals(), stream);
+void coo_sort(COO<T>* const in, cudaStream_t stream)
+{
+  coo_sort<T>(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), in->vals(), stream);
 }
 
 /**
@@ -93,8 +93,9 @@ void coo_sort(COO<T> *const in, cudaStream_t stream) {
  * @param[in] stream cuda stream for which to order cuda operations
  */
 template <typename value_idx, typename value_t>
-void coo_sort_by_weight(value_idx *rows, value_idx *cols, value_t *data,
-                        value_idx nnz, cudaStream_t stream) {
+void coo_sort_by_weight(
+  value_idx* rows, value_idx* cols, value_t* data, value_idx nnz, cudaStream_t stream)
+{
   thrust::device_ptr<value_t> t_data = thrust::device_pointer_cast(data);
 
   auto first = thrust::make_zip_iterator(thrust::make_tuple(rows, cols));
diff --git a/cpp/include/raft/sparse/selection/connect_components.cuh b/cpp/include/raft/sparse/selection/connect_components.cuh
index 5313b81192..8edb0e8b43 100644
--- a/cpp/include/raft/sparse/selection/connect_components.cuh
+++ b/cpp/include/raft/sparse/selection/connect_components.cuh
@@ -59,17 +59,20 @@ struct KeyValuePair {
   __host__ __device__ __forceinline__ KeyValuePair() {}
 
   /// Copy Constructor
-  __host__ __device__ __forceinline__
-  KeyValuePair(cub::KeyValuePair<_Key, _Value> kvp)
-    : key(kvp.key), value(kvp.value) {}
+  __host__ __device__ __forceinline__ KeyValuePair(cub::KeyValuePair<_Key, _Value> kvp)
+    : key(kvp.key), value(kvp.value)
+  {
+  }
 
   /// Constructor
-  __host__ __device__ __forceinline__ KeyValuePair(Key const &key,
-                                                   Value const &value)
-    : key(key), value(value) {}
+  __host__ __device__ __forceinline__ KeyValuePair(Key const& key, Value const& value)
+    : key(key), value(value)
+  {
+  }
 
   /// Inequality operator
-  __host__ __device__ __forceinline__ bool operator!=(const KeyValuePair &b) {
+  __host__ __device__ __forceinline__ bool operator!=(const KeyValuePair& b)
+  {
     return (value != b.value) || (key != b.key);
   }
 };
@@ -83,31 +86,32 @@ struct KeyValuePair {
  */
 template <typename value_idx, typename value_t>
 struct FixConnectivitiesRedOp {
-  value_idx *colors;
+  value_idx* colors;
   value_idx m;
 
-  FixConnectivitiesRedOp(value_idx *colors_, value_idx m_)
-    : colors(colors_), m(m_){};
+  FixConnectivitiesRedOp(value_idx* colors_, value_idx m_) : colors(colors_), m(m_){};
 
   typedef typename cub::KeyValuePair<value_idx, value_t> KVP;
-  DI void operator()(value_idx rit, KVP *out, const KVP &other) {
-    if (rit < m && other.value < out->value &&
-        colors[rit] != colors[other.key]) {
-      out->key = other.key;
+  DI void operator()(value_idx rit, KVP* out, const KVP& other)
+  {
+    if (rit < m && other.value < out->value && colors[rit] != colors[other.key]) {
+      out->key   = other.key;
       out->value = other.value;
     }
   }
 
-  DI KVP operator()(value_idx rit, const KVP &a, const KVP &b) {
+  DI KVP operator()(value_idx rit, const KVP& a, const KVP& b)
+  {
     if (rit < m && a.value < b.value && colors[rit] != colors[a.key]) {
       return a;
     } else
       return b;
   }
 
-  DI void init(value_t *out, value_t maxVal) { *out = maxVal; }
-  DI void init(KVP *out, value_t maxVal) {
-    out->key = -1;
+  DI void init(value_t* out, value_t maxVal) { *out = maxVal; }
+  DI void init(KVP* out, value_t maxVal)
+  {
+    out->key   = -1;
     out->value = maxVal;
   }
 };
@@ -119,7 +123,8 @@ struct FixConnectivitiesRedOp {
  */
 struct TupleComp {
   template <typename one, typename two>
-  __host__ __device__ bool operator()(const one &t1, const two &t2) {
+  __host__ __device__ bool operator()(const one& t1, const two& t2)
+  {
     // sort first by each sample's color,
     if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
     if (thrust::get<0>(t1) > thrust::get<0>(t2)) return false;
@@ -137,13 +142,9 @@ template <typename LabelT, typename DataT>
 struct CubKVPMinReduce {
   typedef cub::KeyValuePair<LabelT, DataT> KVP;
 
-  DI KVP operator()(LabelT rit, const KVP &a, const KVP &b) {
-    return b.value < a.value ? b : a;
-  }
+  DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { return b.value < a.value ? b : a; }
 
-  DI KVP operator()(const KVP &a, const KVP &b) {
-    return b.value < a.value ? b : a;
-  }
+  DI KVP operator()(const KVP& a, const KVP& b) { return b.value < a.value ? b : a; }
 
 };  // KVPMinReduce
 
@@ -158,11 +159,10 @@ struct CubKVPMinReduce {
  * @return total number of components
  */
 template <typename value_idx>
-value_idx get_n_components(value_idx *colors, size_t n_rows,
-                           cudaStream_t stream) {
+value_idx get_n_components(value_idx* colors, size_t n_rows, cudaStream_t stream)
+{
   rmm::device_uvector<value_idx> map_ids(0, stream);
-  int num_clusters =
-    raft::label::getUniquelabels(map_ids, colors, n_rows, stream);
+  int num_clusters = raft::label::getUniquelabels(map_ids, colors, n_rows, stream);
   return num_clusters;
 }
 
@@ -173,11 +173,12 @@ value_idx get_n_components(value_idx *colors, size_t n_rows,
  */
 template <typename value_idx, typename value_t>
 struct LookupColorOp {
-  value_idx *colors;
+  value_idx* colors;
 
-  LookupColorOp(value_idx *colors_) : colors(colors_) {}
+  LookupColorOp(value_idx* colors_) : colors(colors_) {}
 
-  DI value_idx operator()(const cub::KeyValuePair<value_idx, value_t> &kvp) {
+  DI value_idx operator()(const cub::KeyValuePair<value_idx, value_t>& kvp)
+  {
     return colors[kvp.key];
   }
 };
@@ -187,7 +188,8 @@ struct LookupColorOp {
  * the given array of components
  * @tparam value_idx
  * @tparam value_t
- * @param[out] kvp mapping of closest neighbor vertex and distance for each vertex in the given array of components
+ * @param[out] kvp mapping of closest neighbor vertex and distance for each vertex in the given
+ * array of components
  * @param[out] nn_colors components of nearest neighbors for each vertex
  * @param[in] colors components of each vertex
  * @param[in] X original dense data
@@ -196,24 +198,38 @@ struct LookupColorOp {
  * @param[in] stream cuda stream for which to order cuda operations
  */
 template <typename value_idx, typename value_t, typename red_op>
-void perform_1nn(cub::KeyValuePair<value_idx, value_t> *kvp,
-                 value_idx *nn_colors, value_idx *colors, const value_t *X,
-                 size_t n_rows, size_t n_cols, cudaStream_t stream,
-                 red_op reduction_op) {
+void perform_1nn(cub::KeyValuePair<value_idx, value_t>* kvp,
+                 value_idx* nn_colors,
+                 value_idx* colors,
+                 const value_t* X,
+                 size_t n_rows,
+                 size_t n_cols,
+                 cudaStream_t stream,
+                 red_op reduction_op)
+{
   rmm::device_uvector<int> workspace(n_rows, stream);
   rmm::device_uvector<value_t> x_norm(n_rows, stream);
 
-  raft::linalg::rowNorm(x_norm.data(), X, n_cols, n_rows, raft::linalg::L2Norm,
-                        true, stream);
-
-  raft::distance::fusedL2NN<value_t, cub::KeyValuePair<value_idx, value_t>,
-                            value_idx>(
-    kvp, X, X, x_norm.data(), x_norm.data(), n_rows, n_rows, n_cols,
-    workspace.data(), reduction_op, reduction_op, true, true, stream);
+  raft::linalg::rowNorm(x_norm.data(), X, n_cols, n_rows, raft::linalg::L2Norm, true, stream);
+
+  raft::distance::fusedL2NN<value_t, cub::KeyValuePair<value_idx, value_t>, value_idx>(
+    kvp,
+    X,
+    X,
+    x_norm.data(),
+    x_norm.data(),
+    n_rows,
+    n_rows,
+    n_cols,
+    workspace.data(),
+    reduction_op,
+    reduction_op,
+    true,
+    true,
+    stream);
 
   LookupColorOp<value_idx, value_t> extract_colors_op(colors);
-  thrust::transform(rmm::exec_policy(stream), kvp, kvp + n_rows, nn_colors,
-                    extract_colors_op);
+  thrust::transform(rmm::exec_policy(stream), kvp, kvp + n_rows, nn_colors, extract_colors_op);
 }
 
 /**
@@ -229,27 +245,33 @@ void perform_1nn(cub::KeyValuePair<value_idx, value_t> *kvp,
  * @param stream stream for which to order CUDA operations
  */
 template <typename value_idx, typename value_t>
-void sort_by_color(value_idx *colors, value_idx *nn_colors,
-                   cub::KeyValuePair<value_idx, value_t> *kvp,
-                   value_idx *src_indices, size_t n_rows, cudaStream_t stream) {
+void sort_by_color(value_idx* colors,
+                   value_idx* nn_colors,
+                   cub::KeyValuePair<value_idx, value_t>* kvp,
+                   value_idx* src_indices,
+                   size_t n_rows,
+                   cudaStream_t stream)
+{
   thrust::counting_iterator<value_idx> arg_sort_iter(0);
-  thrust::copy(rmm::exec_policy(stream), arg_sort_iter, arg_sort_iter + n_rows,
-               src_indices);
+  thrust::copy(rmm::exec_policy(stream), arg_sort_iter, arg_sort_iter + n_rows, src_indices);
 
-  auto keys = thrust::make_zip_iterator(thrust::make_tuple(
-    colors, nn_colors, (raft::linkage::KeyValuePair<value_idx, value_t> *)kvp));
+  auto keys = thrust::make_zip_iterator(
+    thrust::make_tuple(colors, nn_colors, (raft::linkage::KeyValuePair<value_idx, value_t>*)kvp));
   auto vals = thrust::make_zip_iterator(thrust::make_tuple(src_indices));
 
   // get all the colors in contiguous locations so we can map them to warps.
-  thrust::sort_by_key(rmm::exec_policy(stream), keys, keys + n_rows, vals,
-                      TupleComp());
+  thrust::sort_by_key(rmm::exec_policy(stream), keys, keys + n_rows, vals, TupleComp());
 }
 
 template <typename value_idx, typename value_t>
-__global__ void min_components_by_color_kernel(
-  value_idx *out_rows, value_idx *out_cols, value_t *out_vals,
-  const value_idx *out_index, const value_idx *indices,
-  const cub::KeyValuePair<value_idx, value_t> *kvp, size_t nnz) {
+__global__ void min_components_by_color_kernel(value_idx* out_rows,
+                                               value_idx* out_cols,
+                                               value_t* out_vals,
+                                               const value_idx* out_index,
+                                               const value_idx* indices,
+                                               const cub::KeyValuePair<value_idx, value_t>* kvp,
+                                               size_t nnz)
+{
   size_t tid = blockDim.x * blockIdx.x + threadIdx.x;
 
   if (tid >= nnz) return;
@@ -278,19 +300,20 @@ __global__ void min_components_by_color_kernel(
  * @param[in] stream cuda stream for which to order cuda operations
  */
 template <typename value_idx, typename value_t>
-void min_components_by_color(raft::sparse::COO<value_t, value_idx> &coo,
-                             const value_idx *out_index,
-                             const value_idx *indices,
-                             const cub::KeyValuePair<value_idx, value_t> *kvp,
-                             size_t nnz, cudaStream_t stream) {
+void min_components_by_color(raft::sparse::COO<value_t, value_idx>& coo,
+                             const value_idx* out_index,
+                             const value_idx* indices,
+                             const cub::KeyValuePair<value_idx, value_t>* kvp,
+                             size_t nnz,
+                             cudaStream_t stream)
+{
   /**
    * Arrays should be ordered by: colors_indptr->colors_n->kvp.value
    * so the last element of each column in the input CSR should be
    * the min.
    */
-  min_components_by_color_kernel<<<raft::ceildiv(nnz, (size_t)256), 256, 0,
-                                   stream>>>(coo.rows(), coo.cols(), coo.vals(),
-                                             out_index, indices, kvp, nnz);
+  min_components_by_color_kernel<<<raft::ceildiv(nnz, (size_t)256), 256, 0, stream>>>(
+    coo.rows(), coo.cols(), coo.vals(), out_index, indices, kvp, nnz);
 }
 
 /**
@@ -312,12 +335,16 @@ void min_components_by_color(raft::sparse::COO<value_t, value_idx> &coo,
  * @param[in] n_cols number of cols in X
  */
 template <typename value_idx, typename value_t, typename red_op>
-void connect_components(const raft::handle_t &handle,
-                        raft::sparse::COO<value_t, value_idx> &out,
-                        const value_t *X, const value_idx *orig_colors,
-                        size_t n_rows, size_t n_cols, red_op reduction_op,
-                        raft::distance::DistanceType metric =
-                          raft::distance::DistanceType::L2SqrtExpanded) {
+void connect_components(
+  const raft::handle_t& handle,
+  raft::sparse::COO<value_t, value_idx>& out,
+  const value_t* X,
+  const value_idx* orig_colors,
+  size_t n_rows,
+  size_t n_cols,
+  red_op reduction_op,
+  raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded)
+{
   auto stream = handle.get_stream();
 
   RAFT_EXPECTS(metric == raft::distance::DistanceType::L2SqrtExpanded,
@@ -328,8 +355,7 @@ void connect_components(const raft::handle_t &handle,
   raft::copy_async(colors.data(), orig_colors, n_rows, stream);
 
   // Normalize colors so they are drawn from a monotonically increasing set
-  raft::label::make_monotonic(colors.data(), colors.data(), n_rows, stream,
-                              true);
+  raft::label::make_monotonic(colors.data(), colors.data(), n_rows, stream, true);
 
   value_idx n_components = get_n_components(colors.data(), n_rows, stream);
 
@@ -338,36 +364,42 @@ void connect_components(const raft::handle_t &handle,
    * is guaranteed to be != color of its nearest neighbor.
    */
   rmm::device_uvector<value_idx> nn_colors(n_rows, stream);
-  rmm::device_uvector<cub::KeyValuePair<value_idx, value_t>> temp_inds_dists(
-    n_rows, stream);
+  rmm::device_uvector<cub::KeyValuePair<value_idx, value_t>> temp_inds_dists(n_rows, stream);
   rmm::device_uvector<value_idx> src_indices(n_rows, stream);
 
-  perform_1nn(temp_inds_dists.data(), nn_colors.data(), colors.data(), X,
-              n_rows, n_cols, stream, reduction_op);
+  perform_1nn(temp_inds_dists.data(),
+              nn_colors.data(),
+              colors.data(),
+              X,
+              n_rows,
+              n_cols,
+              stream,
+              reduction_op);
 
   /**
    * Sort data points by color (neighbors are not sorted)
    */
   // max_color + 1 = number of connected components
   // sort nn_colors by key w/ original colors
-  sort_by_color(colors.data(), nn_colors.data(), temp_inds_dists.data(),
-                src_indices.data(), n_rows, stream);
+  sort_by_color(
+    colors.data(), nn_colors.data(), temp_inds_dists.data(), src_indices.data(), n_rows, stream);
 
   /**
    * Take the min for any duplicate colors
    */
   // Compute mask of duplicates
   rmm::device_uvector<value_idx> out_index(n_rows + 1, stream);
-  raft::sparse::op::compute_duplicates_mask(out_index.data(), colors.data(),
-                                            nn_colors.data(), n_rows, stream);
+  raft::sparse::op::compute_duplicates_mask(
+    out_index.data(), colors.data(), nn_colors.data(), n_rows, stream);
 
-  thrust::exclusive_scan(handle.get_thrust_policy(), out_index.data(),
-                         out_index.data() + out_index.size(), out_index.data());
+  thrust::exclusive_scan(handle.get_thrust_policy(),
+                         out_index.data(),
+                         out_index.data() + out_index.size(),
+                         out_index.data());
 
   // compute final size
   value_idx size = 0;
-  raft::update_host(&size, out_index.data() + (out_index.size() - 1), 1,
-                    stream);
+  raft::update_host(&size, out_index.data() + (out_index.size() - 1), 1, stream);
   CUDA_CHECK(cudaStreamSynchronize(stream));
 
   size++;
@@ -375,14 +407,14 @@ void connect_components(const raft::handle_t &handle,
   raft::sparse::COO<value_t, value_idx> min_edges(stream);
   min_edges.allocate(size, n_rows, n_rows, true, stream);
 
-  min_components_by_color(min_edges, out_index.data(), src_indices.data(),
-                          temp_inds_dists.data(), n_rows, stream);
+  min_components_by_color(
+    min_edges, out_index.data(), src_indices.data(), temp_inds_dists.data(), n_rows, stream);
 
   /**
    * Symmetrize resulting edge list
    */
-  raft::sparse::linalg::symmetrize(handle, min_edges.rows(), min_edges.cols(),
-                                   min_edges.vals(), n_rows, n_rows, size, out);
+  raft::sparse::linalg::symmetrize(
+    handle, min_edges.rows(), min_edges.cols(), min_edges.vals(), n_rows, n_rows, size, out);
 }
 
 };  // end namespace linkage
diff --git a/cpp/include/raft/sparse/selection/knn.cuh b/cpp/include/raft/sparse/selection/knn.cuh
index b796b63dc8..8486abd863 100644
--- a/cpp/include/raft/sparse/selection/knn.cuh
+++ b/cpp/include/raft/sparse/selection/knn.cuh
@@ -38,9 +38,11 @@ namespace selection {
 
 template <typename value_idx, typename value_t>
 struct csr_batcher_t {
-  csr_batcher_t(value_idx batch_size, value_idx n_rows,
-                const value_idx *csr_indptr, const value_idx *csr_indices,
-                const value_t *csr_data)
+  csr_batcher_t(value_idx batch_size,
+                value_idx n_rows,
+                const value_idx* csr_indptr,
+                const value_idx* csr_indices,
+                const value_t* csr_data)
     : batch_start_(0),
       batch_stop_(0),
       batch_rows_(0),
@@ -50,32 +52,42 @@ struct csr_batcher_t {
       csr_indices_(csr_indices),
       csr_data_(csr_data),
       batch_csr_start_offset_(0),
-      batch_csr_stop_offset_(0) {}
+      batch_csr_stop_offset_(0)
+  {
+  }
 
-  void set_batch(int batch_num) {
+  void set_batch(int batch_num)
+  {
     batch_start_ = batch_num * batch_size_;
-    batch_stop_ = batch_start_ + batch_size_ - 1;  // zero-based indexing
+    batch_stop_  = batch_start_ + batch_size_ - 1;  // zero-based indexing
 
-    if (batch_stop_ >= total_rows_)
-      batch_stop_ = total_rows_ - 1;  // zero-based indexing
+    if (batch_stop_ >= total_rows_) batch_stop_ = total_rows_ - 1;  // zero-based indexing
 
     batch_rows_ = (batch_stop_ - batch_start_) + 1;
   }
 
-  value_idx get_batch_csr_indptr_nnz(value_idx *batch_indptr,
-                                     cudaStream_t stream) {
-    raft::sparse::op::csr_row_slice_indptr(
-      batch_start_, batch_stop_, csr_indptr_, batch_indptr,
-      &batch_csr_start_offset_, &batch_csr_stop_offset_, stream);
+  value_idx get_batch_csr_indptr_nnz(value_idx* batch_indptr, cudaStream_t stream)
+  {
+    raft::sparse::op::csr_row_slice_indptr(batch_start_,
+                                           batch_stop_,
+                                           csr_indptr_,
+                                           batch_indptr,
+                                           &batch_csr_start_offset_,
+                                           &batch_csr_stop_offset_,
+                                           stream);
 
     return batch_csr_stop_offset_ - batch_csr_start_offset_;
   }
 
-  void get_batch_csr_indices_data(value_idx *csr_indices, value_t *csr_data,
-                                  cudaStream_t stream) {
-    raft::sparse::op::csr_row_slice_populate(
-      batch_csr_start_offset_, batch_csr_stop_offset_, csr_indices_, csr_data_,
-      csr_indices, csr_data, stream);
+  void get_batch_csr_indices_data(value_idx* csr_indices, value_t* csr_data, cudaStream_t stream)
+  {
+    raft::sparse::op::csr_row_slice_populate(batch_csr_start_offset_,
+                                             batch_csr_stop_offset_,
+                                             csr_indices_,
+                                             csr_data_,
+                                             csr_indices,
+                                             csr_data,
+                                             stream);
   }
 
   value_idx batch_rows() const { return batch_rows_; }
@@ -92,9 +104,9 @@ struct csr_batcher_t {
 
   value_idx total_rows_;
 
-  const value_idx *csr_indptr_;
-  const value_idx *csr_indices_;
-  const value_t *csr_data_;
+  const value_idx* csr_indptr_;
+  const value_idx* csr_indices_;
+  const value_t* csr_data_;
 
   value_idx batch_csr_start_offset_;
   value_idx batch_csr_stop_offset_;
@@ -103,18 +115,26 @@ struct csr_batcher_t {
 template <typename value_idx, typename value_t>
 class sparse_knn_t {
  public:
-  sparse_knn_t(const value_idx *idxIndptr_, const value_idx *idxIndices_,
-               const value_t *idxData_, size_t idxNNZ_, int n_idx_rows_,
-               int n_idx_cols_, const value_idx *queryIndptr_,
-               const value_idx *queryIndices_, const value_t *queryData_,
-               size_t queryNNZ_, int n_query_rows_, int n_query_cols_,
-               value_idx *output_indices_, value_t *output_dists_, int k_,
-               const raft::handle_t &handle_,
-               size_t batch_size_index_ = 2 << 14,  // approx 1M
-               size_t batch_size_query_ = 2 << 14,
-               raft::distance::DistanceType metric_ =
-                 raft::distance::DistanceType::L2Expanded,
-               float metricArg_ = 0)
+  sparse_knn_t(const value_idx* idxIndptr_,
+               const value_idx* idxIndices_,
+               const value_t* idxData_,
+               size_t idxNNZ_,
+               int n_idx_rows_,
+               int n_idx_cols_,
+               const value_idx* queryIndptr_,
+               const value_idx* queryIndices_,
+               const value_t* queryData_,
+               size_t queryNNZ_,
+               int n_query_rows_,
+               int n_query_cols_,
+               value_idx* output_indices_,
+               value_t* output_dists_,
+               int k_,
+               const raft::handle_t& handle_,
+               size_t batch_size_index_             = 2 << 14,  // approx 1M
+               size_t batch_size_query_             = 2 << 14,
+               raft::distance::DistanceType metric_ = raft::distance::DistanceType::L2Expanded,
+               float metricArg_                     = 0)
     : idxIndptr(idxIndptr_),
       idxIndices(idxIndices_),
       idxData(idxData_),
@@ -134,9 +154,12 @@ class sparse_knn_t {
       batch_size_index(batch_size_index_),
       batch_size_query(batch_size_query_),
       metric(metric_),
-      metricArg(metricArg_) {}
+      metricArg(metricArg_)
+  {
+  }
 
-  void run() {
+  void run()
+  {
     using namespace raft::sparse;
 
     int n_batches_query = raft::ceildiv((size_t)n_query_rows, batch_size_query);
@@ -147,37 +170,33 @@ class sparse_knn_t {
 
     for (int i = 0; i < n_batches_query; i++) {
       /**
-        * Compute index batch info
-        */
+       * Compute index batch info
+       */
       query_batcher.set_batch(i);
 
       /**
-        * Slice CSR to rows in batch
-        */
+       * Slice CSR to rows in batch
+       */
 
-      rmm::device_uvector<value_idx> query_batch_indptr(
-        query_batcher.batch_rows() + 1, handle.get_stream());
+      rmm::device_uvector<value_idx> query_batch_indptr(query_batcher.batch_rows() + 1,
+                                                        handle.get_stream());
 
-      value_idx n_query_batch_nnz = query_batcher.get_batch_csr_indptr_nnz(
-        query_batch_indptr.data(), handle.get_stream());
+      value_idx n_query_batch_nnz =
+        query_batcher.get_batch_csr_indptr_nnz(query_batch_indptr.data(), handle.get_stream());
 
-      rmm::device_uvector<value_idx> query_batch_indices(n_query_batch_nnz,
-                                                         handle.get_stream());
-      rmm::device_uvector<value_t> query_batch_data(n_query_batch_nnz,
-                                                    handle.get_stream());
+      rmm::device_uvector<value_idx> query_batch_indices(n_query_batch_nnz, handle.get_stream());
+      rmm::device_uvector<value_t> query_batch_data(n_query_batch_nnz, handle.get_stream());
 
-      query_batcher.get_batch_csr_indices_data(query_batch_indices.data(),
-                                               query_batch_data.data(),
-                                               handle.get_stream());
+      query_batcher.get_batch_csr_indices_data(
+        query_batch_indices.data(), query_batch_data.data(), handle.get_stream());
 
       // A 3-partition temporary merge space to scale the batching. 2 parts for subsequent
       // batches and 1 space for the results of the merge, which get copied back to the top
-      rmm::device_uvector<value_idx> merge_buffer_indices(0,
-                                                          handle.get_stream());
+      rmm::device_uvector<value_idx> merge_buffer_indices(0, handle.get_stream());
       rmm::device_uvector<value_t> merge_buffer_dists(0, handle.get_stream());
 
-      value_t *dists_merge_buffer_ptr;
-      value_idx *indices_merge_buffer_ptr;
+      value_t* dists_merge_buffer_ptr;
+      value_idx* indices_merge_buffer_ptr;
 
       int n_batches_idx = raft::ceildiv((size_t)n_idx_rows, batch_size_index);
       csr_batcher_t<value_idx, value_t> idx_batcher(
@@ -186,22 +205,19 @@ class sparse_knn_t {
       for (int j = 0; j < n_batches_idx; j++) {
         idx_batcher.set_batch(j);
 
-        merge_buffer_indices.resize(query_batcher.batch_rows() * k * 3,
-                                    handle.get_stream());
-        merge_buffer_dists.resize(query_batcher.batch_rows() * k * 3,
-                                  handle.get_stream());
+        merge_buffer_indices.resize(query_batcher.batch_rows() * k * 3, handle.get_stream());
+        merge_buffer_dists.resize(query_batcher.batch_rows() * k * 3, handle.get_stream());
 
         /**
-          * Slice CSR to rows in batch
-        */
-        rmm::device_uvector<value_idx> idx_batch_indptr(
-          idx_batcher.batch_rows() + 1, handle.get_stream());
-        rmm::device_uvector<value_idx> idx_batch_indices(0,
-                                                         handle.get_stream());
+         * Slice CSR to rows in batch
+         */
+        rmm::device_uvector<value_idx> idx_batch_indptr(idx_batcher.batch_rows() + 1,
+                                                        handle.get_stream());
+        rmm::device_uvector<value_idx> idx_batch_indices(0, handle.get_stream());
         rmm::device_uvector<value_t> idx_batch_data(0, handle.get_stream());
 
-        value_idx idx_batch_nnz = idx_batcher.get_batch_csr_indptr_nnz(
-          idx_batch_indptr.data(), handle.get_stream());
+        value_idx idx_batch_nnz =
+          idx_batcher.get_batch_csr_indptr_nnz(idx_batch_indptr.data(), handle.get_stream());
 
         idx_batch_indices.resize(idx_batch_nnz, handle.get_stream());
         idx_batch_data.resize(idx_batch_nnz, handle.get_stream());
@@ -210,111 +226,126 @@ class sparse_knn_t {
           idx_batch_indices.data(), idx_batch_data.data(), handle.get_stream());
 
         /**
-           * Compute distances
-           */
-        size_t dense_size =
-          idx_batcher.batch_rows() * query_batcher.batch_rows();
-        rmm::device_uvector<value_t> batch_dists(dense_size,
-                                                 handle.get_stream());
-
-        CUDA_CHECK(cudaMemset(batch_dists.data(), 0,
-                              batch_dists.size() * sizeof(value_t)));
-
-        compute_distances(idx_batcher, query_batcher, idx_batch_nnz,
-                          n_query_batch_nnz, idx_batch_indptr.data(),
-                          idx_batch_indices.data(), idx_batch_data.data(),
-                          query_batch_indptr.data(), query_batch_indices.data(),
-                          query_batch_data.data(), batch_dists.data());
+         * Compute distances
+         */
+        size_t dense_size = idx_batcher.batch_rows() * query_batcher.batch_rows();
+        rmm::device_uvector<value_t> batch_dists(dense_size, handle.get_stream());
+
+        CUDA_CHECK(cudaMemset(batch_dists.data(), 0, batch_dists.size() * sizeof(value_t)));
+
+        compute_distances(idx_batcher,
+                          query_batcher,
+                          idx_batch_nnz,
+                          n_query_batch_nnz,
+                          idx_batch_indptr.data(),
+                          idx_batch_indices.data(),
+                          idx_batch_data.data(),
+                          query_batch_indptr.data(),
+                          query_batch_indices.data(),
+                          query_batch_data.data(),
+                          batch_dists.data());
 
         // Build batch indices array
-        rmm::device_uvector<value_idx> batch_indices(batch_dists.size(),
-                                                     handle.get_stream());
+        rmm::device_uvector<value_idx> batch_indices(batch_dists.size(), handle.get_stream());
 
         // populate batch indices array
-        value_idx batch_rows = query_batcher.batch_rows(),
-                  batch_cols = idx_batcher.batch_rows();
+        value_idx batch_rows = query_batcher.batch_rows(), batch_cols = idx_batcher.batch_rows();
 
-        iota_fill(batch_indices.data(), batch_rows, batch_cols,
-                  handle.get_stream());
+        iota_fill(batch_indices.data(), batch_rows, batch_cols, handle.get_stream());
 
         /**
          * Perform k-selection on batch & merge with other k-selections
          */
         size_t merge_buffer_offset = batch_rows * k;
-        dists_merge_buffer_ptr =
-          merge_buffer_dists.data() + merge_buffer_offset;
-        indices_merge_buffer_ptr =
-          merge_buffer_indices.data() + merge_buffer_offset;
-
-        perform_k_selection(idx_batcher, query_batcher, batch_dists.data(),
-                            batch_indices.data(), dists_merge_buffer_ptr,
+        dists_merge_buffer_ptr     = merge_buffer_dists.data() + merge_buffer_offset;
+        indices_merge_buffer_ptr   = merge_buffer_indices.data() + merge_buffer_offset;
+
+        perform_k_selection(idx_batcher,
+                            query_batcher,
+                            batch_dists.data(),
+                            batch_indices.data(),
+                            dists_merge_buffer_ptr,
                             indices_merge_buffer_ptr);
 
-        value_t *dists_merge_buffer_tmp_ptr = dists_merge_buffer_ptr;
-        value_idx *indices_merge_buffer_tmp_ptr = indices_merge_buffer_ptr;
+        value_t* dists_merge_buffer_tmp_ptr     = dists_merge_buffer_ptr;
+        value_idx* indices_merge_buffer_tmp_ptr = indices_merge_buffer_ptr;
 
         // Merge results of difference batches if necessary
         if (idx_batcher.batch_start() > 0) {
-          size_t merge_buffer_tmp_out = batch_rows * k * 2;
-          dists_merge_buffer_tmp_ptr =
-            merge_buffer_dists.data() + merge_buffer_tmp_out;
-          indices_merge_buffer_tmp_ptr =
-            merge_buffer_indices.data() + merge_buffer_tmp_out;
-
-          merge_batches(idx_batcher, query_batcher, merge_buffer_dists.data(),
-                        merge_buffer_indices.data(), dists_merge_buffer_tmp_ptr,
+          size_t merge_buffer_tmp_out  = batch_rows * k * 2;
+          dists_merge_buffer_tmp_ptr   = merge_buffer_dists.data() + merge_buffer_tmp_out;
+          indices_merge_buffer_tmp_ptr = merge_buffer_indices.data() + merge_buffer_tmp_out;
+
+          merge_batches(idx_batcher,
+                        query_batcher,
+                        merge_buffer_dists.data(),
+                        merge_buffer_indices.data(),
+                        dists_merge_buffer_tmp_ptr,
                         indices_merge_buffer_tmp_ptr);
         }
 
         // copy merged output back into merge buffer partition for next iteration
         raft::copy_async<value_idx>(merge_buffer_indices.data(),
                                     indices_merge_buffer_tmp_ptr,
-                                    batch_rows * k, handle.get_stream());
+                                    batch_rows * k,
+                                    handle.get_stream());
         raft::copy_async<value_t>(merge_buffer_dists.data(),
-                                  dists_merge_buffer_tmp_ptr, batch_rows * k,
+                                  dists_merge_buffer_tmp_ptr,
+                                  batch_rows * k,
                                   handle.get_stream());
       }
 
       // Copy final merged batch to output array
-      raft::copy_async<value_idx>(
-        output_indices + (rows_processed * k), merge_buffer_indices.data(),
-        query_batcher.batch_rows() * k, handle.get_stream());
-      raft::copy_async<value_t>(
-        output_dists + (rows_processed * k), merge_buffer_dists.data(),
-        query_batcher.batch_rows() * k, handle.get_stream());
+      raft::copy_async<value_idx>(output_indices + (rows_processed * k),
+                                  merge_buffer_indices.data(),
+                                  query_batcher.batch_rows() * k,
+                                  handle.get_stream());
+      raft::copy_async<value_t>(output_dists + (rows_processed * k),
+                                merge_buffer_dists.data(),
+                                query_batcher.batch_rows() * k,
+                                handle.get_stream());
 
       rows_processed += query_batcher.batch_rows();
     }
   }
 
  private:
-  void merge_batches(csr_batcher_t<value_idx, value_t> &idx_batcher,
-                     csr_batcher_t<value_idx, value_t> &query_batcher,
-                     value_t *merge_buffer_dists,
-                     value_idx *merge_buffer_indices, value_t *out_dists,
-                     value_idx *out_indices) {
+  void merge_batches(csr_batcher_t<value_idx, value_t>& idx_batcher,
+                     csr_batcher_t<value_idx, value_t>& query_batcher,
+                     value_t* merge_buffer_dists,
+                     value_idx* merge_buffer_indices,
+                     value_t* out_dists,
+                     value_idx* out_indices)
+  {
     // build translation buffer to shift resulting indices by the batch
     std::vector<value_idx> id_ranges;
     id_ranges.push_back(0);
     id_ranges.push_back(idx_batcher.batch_start());
 
     rmm::device_uvector<value_idx> trans(id_ranges.size(), handle.get_stream());
-    raft::update_device(trans.data(), id_ranges.data(), id_ranges.size(),
-                        handle.get_stream());
+    raft::update_device(trans.data(), id_ranges.data(), id_ranges.size(), handle.get_stream());
 
     // combine merge buffers only if there's more than 1 partition to combine
-    raft::spatial::knn::knn_merge_parts(
-      merge_buffer_dists, merge_buffer_indices, out_dists, out_indices,
-      query_batcher.batch_rows(), 2, k, handle.get_stream(), trans.data());
+    raft::spatial::knn::knn_merge_parts(merge_buffer_dists,
+                                        merge_buffer_indices,
+                                        out_dists,
+                                        out_indices,
+                                        query_batcher.batch_rows(),
+                                        2,
+                                        k,
+                                        handle.get_stream(),
+                                        trans.data());
   }
 
   void perform_k_selection(csr_batcher_t<value_idx, value_t> idx_batcher,
                            csr_batcher_t<value_idx, value_t> query_batcher,
-                           value_t *batch_dists, value_idx *batch_indices,
-                           value_t *out_dists, value_idx *out_indices) {
+                           value_t* batch_dists,
+                           value_idx* batch_indices,
+                           value_t* out_dists,
+                           value_idx* out_indices)
+  {
     // populate batch indices array
-    value_idx batch_rows = query_batcher.batch_rows(),
-              batch_cols = idx_batcher.batch_rows();
+    value_idx batch_rows = query_batcher.batch_rows(), batch_cols = idx_batcher.batch_rows();
 
     // build translation buffer to shift resulting indices by the batch
     std::vector<value_idx> id_ranges;
@@ -329,52 +360,60 @@ class sparse_knn_t {
     if (metric == raft::distance::DistanceType::InnerProduct) ascending = false;
 
     // kernel to slice first (min) k cols and copy into batched merge buffer
-    raft::spatial::knn::select_k(batch_dists, batch_indices, batch_rows,
-                                 batch_cols, out_dists, out_indices, ascending,
-                                 n_neighbors, handle.get_stream());
+    raft::spatial::knn::select_k(batch_dists,
+                                 batch_indices,
+                                 batch_rows,
+                                 batch_cols,
+                                 out_dists,
+                                 out_indices,
+                                 ascending,
+                                 n_neighbors,
+                                 handle.get_stream());
   }
 
-  void compute_distances(csr_batcher_t<value_idx, value_t> &idx_batcher,
-                         csr_batcher_t<value_idx, value_t> &query_batcher,
-                         size_t idx_batch_nnz, size_t query_batch_nnz,
-                         value_idx *idx_batch_indptr,
-                         value_idx *idx_batch_indices, value_t *idx_batch_data,
-                         value_idx *query_batch_indptr,
-                         value_idx *query_batch_indices,
-                         value_t *query_batch_data, value_t *batch_dists) {
+  void compute_distances(csr_batcher_t<value_idx, value_t>& idx_batcher,
+                         csr_batcher_t<value_idx, value_t>& query_batcher,
+                         size_t idx_batch_nnz,
+                         size_t query_batch_nnz,
+                         value_idx* idx_batch_indptr,
+                         value_idx* idx_batch_indices,
+                         value_t* idx_batch_data,
+                         value_idx* query_batch_indptr,
+                         value_idx* query_batch_indices,
+                         value_t* query_batch_data,
+                         value_t* batch_dists)
+  {
     /**
      * Compute distances
      */
-    raft::sparse::distance::distances_config_t<value_idx, value_t> dist_config(
-      handle);
+    raft::sparse::distance::distances_config_t<value_idx, value_t> dist_config(handle);
     dist_config.b_nrows = idx_batcher.batch_rows();
     dist_config.b_ncols = n_idx_cols;
-    dist_config.b_nnz = idx_batch_nnz;
+    dist_config.b_nnz   = idx_batch_nnz;
 
-    dist_config.b_indptr = idx_batch_indptr;
+    dist_config.b_indptr  = idx_batch_indptr;
     dist_config.b_indices = idx_batch_indices;
-    dist_config.b_data = idx_batch_data;
+    dist_config.b_data    = idx_batch_data;
 
     dist_config.a_nrows = query_batcher.batch_rows();
     dist_config.a_ncols = n_query_cols;
-    dist_config.a_nnz = query_batch_nnz;
+    dist_config.a_nnz   = query_batch_nnz;
 
-    dist_config.a_indptr = query_batch_indptr;
+    dist_config.a_indptr  = query_batch_indptr;
     dist_config.a_indices = query_batch_indices;
-    dist_config.a_data = query_batch_data;
+    dist_config.a_data    = query_batch_data;
 
     if (raft::sparse::distance::supportedDistance.find(metric) ==
         raft::sparse::distance::supportedDistance.end())
       THROW("DistanceType not supported: %d", metric);
 
-    raft::sparse::distance::pairwiseDistance(batch_dists, dist_config, metric,
-                                             metricArg);
+    raft::sparse::distance::pairwiseDistance(batch_dists, dist_config, metric, metricArg);
   }
 
   const value_idx *idxIndptr, *idxIndices, *queryIndptr, *queryIndices;
-  value_idx *output_indices;
+  value_idx* output_indices;
   const value_t *idxData, *queryData;
-  value_t *output_dists;
+  value_t* output_dists;
 
   size_t idxNNZ, queryNNZ, batch_size_index, batch_size_query;
 
@@ -384,50 +423,74 @@ class sparse_knn_t {
 
   int n_idx_rows, n_idx_cols, n_query_rows, n_query_cols, k;
 
-  const raft::handle_t &handle;
+  const raft::handle_t& handle;
 };
 
 /**
-   * Search the sparse kNN for the k-nearest neighbors of a set of sparse query vectors
-   * using some distance implementation
-   * @param[in] idxIndptr csr indptr of the index matrix (size n_idx_rows + 1)
-   * @param[in] idxIndices csr column indices array of the index matrix (size n_idx_nnz)
-   * @param[in] idxData csr data array of the index matrix (size idxNNZ)
-   * @param[in] idxNNA number of non-zeros for sparse index matrix
-   * @param[in] n_idx_rows number of data samples in index matrix
-   * @param[in] queryIndptr csr indptr of the query matrix (size n_query_rows + 1)
-   * @param[in] queryIndices csr indices array of the query matrix (size queryNNZ)
-   * @param[in] queryData csr data array of the query matrix (size queryNNZ)
-   * @param[in] queryNNZ number of non-zeros for sparse query matrix
-   * @param[in] n_query_rows number of data samples in query matrix
-   * @param[in] n_query_cols number of features in query matrix
-   * @param[out] output_indices dense matrix for output indices (size n_query_rows * k)
-   * @param[out] output_dists dense matrix for output distances (size n_query_rows * k)
-   * @param[in] k the number of neighbors to query
-   * @param[in] handle.get_stream() CUDA handle.get_stream() to order operations with respect to
-   * @param[in] batch_size_index maximum number of rows to use from index matrix per batch
-   * @param[in] batch_size_query maximum number of rows to use from query matrix per batch
-   * @param[in] metric distance metric/measure to use
-   * @param[in] metricArg potential argument for metric (currently unused)
-   */
+ * Search the sparse kNN for the k-nearest neighbors of a set of sparse query vectors
+ * using some distance implementation
+ * @param[in] idxIndptr csr indptr of the index matrix (size n_idx_rows + 1)
+ * @param[in] idxIndices csr column indices array of the index matrix (size idxNNZ)
+ * @param[in] idxData csr data array of the index matrix (size idxNNZ)
+ * @param[in] idxNNZ number of non-zeros for sparse index matrix
+ * @param[in] n_idx_rows number of data samples in index matrix
+ * @param[in] queryIndptr csr indptr of the query matrix (size n_query_rows + 1)
+ * @param[in] queryIndices csr indices array of the query matrix (size queryNNZ)
+ * @param[in] queryData csr data array of the query matrix (size queryNNZ)
+ * @param[in] queryNNZ number of non-zeros for sparse query matrix
+ * @param[in] n_query_rows number of data samples in query matrix
+ * @param[in] n_query_cols number of features in query matrix
+ * @param[out] output_indices dense matrix for output indices (size n_query_rows * k)
+ * @param[out] output_dists dense matrix for output distances (size n_query_rows * k)
+ * @param[in] k the number of neighbors to query
+ * @param[in] handle raft handle whose CUDA stream (handle.get_stream()) orders the operations
+ * @param[in] batch_size_index maximum number of rows to use from index matrix per batch
+ * @param[in] batch_size_query maximum number of rows to use from query matrix per batch
+ * @param[in] metric distance metric/measure to use
+ * @param[in] metricArg potential argument for metric (currently unused)
+ */
 template <typename value_idx = int, typename value_t = float, int TPB_X = 32>
-void brute_force_knn(const value_idx *idxIndptr, const value_idx *idxIndices,
-                     const value_t *idxData, size_t idxNNZ, int n_idx_rows,
-                     int n_idx_cols, const value_idx *queryIndptr,
-                     const value_idx *queryIndices, const value_t *queryData,
-                     size_t queryNNZ, int n_query_rows, int n_query_cols,
-                     value_idx *output_indices, value_t *output_dists, int k,
-                     const raft::handle_t &handle,
-                     size_t batch_size_index = 2 << 14,  // approx 1M
-                     size_t batch_size_query = 2 << 14,
-                     raft::distance::DistanceType metric =
-                       raft::distance::DistanceType::L2Expanded,
-                     float metricArg = 0) {
-  sparse_knn_t<value_idx, value_t>(
-    idxIndptr, idxIndices, idxData, idxNNZ, n_idx_rows, n_idx_cols, queryIndptr,
-    queryIndices, queryData, queryNNZ, n_query_rows, n_query_cols,
-    output_indices, output_dists, k, handle, batch_size_index, batch_size_query,
-    metric, metricArg)
+void brute_force_knn(const value_idx* idxIndptr,
+                     const value_idx* idxIndices,
+                     const value_t* idxData,
+                     size_t idxNNZ,
+                     int n_idx_rows,
+                     int n_idx_cols,
+                     const value_idx* queryIndptr,
+                     const value_idx* queryIndices,
+                     const value_t* queryData,
+                     size_t queryNNZ,
+                     int n_query_rows,
+                     int n_query_cols,
+                     value_idx* output_indices,
+                     value_t* output_dists,
+                     int k,
+                     const raft::handle_t& handle,
+                     size_t batch_size_index             = 2 << 14,  // approx 1M
+                     size_t batch_size_query             = 2 << 14,
+                     raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded,
+                     float metricArg                     = 0)
+{
+  sparse_knn_t<value_idx, value_t>(idxIndptr,
+                                   idxIndices,
+                                   idxData,
+                                   idxNNZ,
+                                   n_idx_rows,
+                                   n_idx_cols,
+                                   queryIndptr,
+                                   queryIndices,
+                                   queryData,
+                                   queryNNZ,
+                                   n_query_rows,
+                                   n_query_cols,
+                                   output_indices,
+                                   output_dists,
+                                   k,
+                                   handle,
+                                   batch_size_index,
+                                   batch_size_query,
+                                   metric,
+                                   metricArg)
     .run();
 }
 
diff --git a/cpp/include/raft/sparse/selection/knn_graph.cuh b/cpp/include/raft/sparse/selection/knn_graph.cuh
index 3df1c77081..f13c43c306 100644
--- a/cpp/include/raft/sparse/selection/knn_graph.cuh
+++ b/cpp/include/raft/sparse/selection/knn_graph.cuh
@@ -45,31 +45,34 @@ namespace selection {
  * @param m
  */
 template <typename value_idx>
-__global__ void fill_indices(value_idx *indices, size_t m, size_t nnz) {
+__global__ void fill_indices(value_idx* indices, size_t m, size_t nnz)
+{
   value_idx tid = (blockIdx.x * blockDim.x) + threadIdx.x;
   if (tid >= nnz) return;
-  value_idx v = tid / m;
+  value_idx v  = tid / m;
   indices[tid] = v;
 }
 
 template <typename value_idx>
-value_idx build_k(value_idx n_samples, int c) {
+value_idx build_k(value_idx n_samples, int c)
+{
   // from "kNN-MST-Agglomerative: A fast & scalable graph-based data clustering
   // approach on GPU"
-  return min(n_samples,
-             max((value_idx)2, (value_idx)floor(log2(n_samples)) + c));
+  return min(n_samples, max((value_idx)2, (value_idx)floor(log2(n_samples)) + c));
 }
 
 template <typename in_t, typename out_t>
-__global__ void conv_indices_kernel(in_t *inds, out_t *out, size_t nnz) {
+__global__ void conv_indices_kernel(in_t* inds, out_t* out, size_t nnz)
+{
   size_t tid = blockDim.x * blockIdx.x + threadIdx.x;
   if (tid >= nnz) return;
-  out_t v = inds[tid];
+  out_t v  = inds[tid];
   out[tid] = v;
 }
 
 template <typename in_t, typename out_t, int tpb = 256>
-void conv_indices(in_t *inds, out_t *out, size_t size, cudaStream_t stream) {
+void conv_indices(in_t* inds, out_t* out, size_t size, cudaStream_t stream)
+{
   size_t blocks = ceildiv(size, (size_t)tpb);
   conv_indices_kernel<<<blocks, tpb, 0, stream>>>(inds, out, size);
 }
@@ -92,9 +95,14 @@ void conv_indices(in_t *inds, out_t *out, size_t size, cudaStream_t stream) {
  * @param c
  */
 template <typename value_idx = int, typename value_t = float>
-void knn_graph(const handle_t &handle, const value_t *X, size_t m, size_t n,
+void knn_graph(const handle_t& handle,
+               const value_t* X,
+               size_t m,
+               size_t n,
                raft::distance::DistanceType metric,
-               raft::sparse::COO<value_t, value_idx> &out, int c = 15) {
+               raft::sparse::COO<value_t, value_idx>& out,
+               int c = 15)
+{
   int k = build_k(m, c);
 
   auto stream = handle.get_stream();
@@ -108,8 +116,8 @@ void knn_graph(const handle_t &handle, const value_t *X, size_t m, size_t n,
   size_t blocks = ceildiv(nnz, (size_t)256);
   fill_indices<value_idx><<<blocks, 256, 0, stream>>>(rows.data(), k, nnz);
 
-  std::vector<value_t *> inputs;
-  inputs.push_back(const_cast<value_t *>(X));
+  std::vector<value_t*> inputs;
+  inputs.push_back(const_cast<value_t*>(X));
 
   std::vector<int> sizes;
   sizes.push_back(m);
@@ -119,15 +127,25 @@ void knn_graph(const handle_t &handle, const value_t *X, size_t m, size_t n,
   rmm::device_uvector<int64_t> int64_indices(nnz, stream);
 
   uint32_t knn_start = curTimeMillis();
-  raft::spatial::knn::brute_force_knn(
-    handle, inputs, sizes, n, const_cast<value_t *>(X), m, int64_indices.data(),
-    data.data(), k, true, true, nullptr, metric);
+  raft::spatial::knn::brute_force_knn(handle,
+                                      inputs,
+                                      sizes,
+                                      n,
+                                      const_cast<value_t*>(X),
+                                      m,
+                                      int64_indices.data(),
+                                      data.data(),
+                                      k,
+                                      true,
+                                      true,
+                                      nullptr,
+                                      metric);
 
   // convert from current knn's 64-bit to 32-bit.
   conv_indices(int64_indices.data(), indices.data(), nnz, stream);
 
-  raft::sparse::linalg::symmetrize(handle, rows.data(), indices.data(),
-                                   data.data(), m, k, nnz, out);
+  raft::sparse::linalg::symmetrize(
+    handle, rows.data(), indices.data(), data.data(), m, k, nnz, out);
 }
 
 };  // namespace selection
diff --git a/cpp/include/raft/sparse/utils.h b/cpp/include/raft/sparse/utils.h
index 63578bf1f3..56e8832e0a 100644
--- a/cpp/include/raft/sparse/utils.h
+++ b/cpp/include/raft/sparse/utils.h
@@ -26,7 +26,8 @@ namespace sparse {
  * @param[in] ncols number of blocks to quantize
  */
 template <typename value_idx>
-inline int block_dim(value_idx ncols) {
+inline int block_dim(value_idx ncols)
+{
   int blockdim;
   if (ncols <= 32)
     blockdim = 32;
@@ -54,9 +55,9 @@ inline int block_dim(value_idx ncols) {
  * @return
  */
 template <typename G>
-__device__ __inline__ unsigned int __match_any_sync(unsigned int init_mask,
-                                                    G key) {
-  unsigned int mask = __ballot_sync(init_mask, true);
+__device__ __inline__ unsigned int __match_any_sync(unsigned int init_mask, G key)
+{
+  unsigned int mask       = __ballot_sync(init_mask, true);
   unsigned int peer_group = 0;
   bool is_peer;
 
@@ -77,12 +78,14 @@ __device__ __inline__ unsigned int __match_any_sync(unsigned int init_mask,
 }
 #endif
 
-__device__ __inline__ unsigned int get_lowest_peer(unsigned int peer_group) {
+__device__ __inline__ unsigned int get_lowest_peer(unsigned int peer_group)
+{
   return __ffs(peer_group) - 1;
 }
 
 template <typename value_idx>
-__global__ void iota_fill_block_kernel(value_idx *indices, value_idx ncols) {
+__global__ void iota_fill_block_kernel(value_idx* indices, value_idx ncols)
+{
   int row = blockIdx.x;
   int tid = threadIdx.x;
 
@@ -92,15 +95,16 @@ __global__ void iota_fill_block_kernel(value_idx *indices, value_idx ncols) {
 }
 
 template <typename value_idx>
-void iota_fill(value_idx *indices, value_idx nrows, value_idx ncols,
-               cudaStream_t stream) {
+void iota_fill(value_idx* indices, value_idx nrows, value_idx ncols, cudaStream_t stream)
+{
   int blockdim = block_dim(ncols);
 
   iota_fill_block_kernel<<<nrows, blockdim, 0, stream>>>(indices, ncols);
 }
 
 template <typename T>
-__device__ int get_stop_idx(T row, T m, T nnz, const T *ind) {
+__device__ int get_stop_idx(T row, T m, T nnz, const T* ind)
+{
   int stop_idx = 0;
   if (row < (m - 1))
     stop_idx = ind[row + 1];
diff --git a/cpp/include/raft/spatial/knn/ann.hpp b/cpp/include/raft/spatial/knn/ann.hpp
index 2cdf9bf4f5..e8cc85256d 100644
--- a/cpp/include/raft/spatial/knn/ann.hpp
+++ b/cpp/include/raft/spatial/knn/ann.hpp
@@ -42,14 +42,16 @@ namespace knn {
  * @param[in] D the dimensionality of the index array
  */
 template <typename value_idx = int>
-inline void approx_knn_build_index(raft::handle_t &handle,
-                                   raft::spatial::knn::knnIndex *index,
-                                   knnIndexParam *params,
+inline void approx_knn_build_index(raft::handle_t& handle,
+                                   raft::spatial::knn::knnIndex* index,
+                                   knnIndexParam* params,
                                    raft::distance::DistanceType metric,
-                                   float metricArg, float *index_array,
-                                   value_idx n, value_idx D) {
-  detail::approx_knn_build_index(handle, index, params, metric, metricArg,
-                                 index_array, n, D);
+                                   float metricArg,
+                                   float* index_array,
+                                   value_idx n,
+                                   value_idx D)
+{
+  detail::approx_knn_build_index(handle, index, params, metric, metricArg, index_array, n, D);
 }
 
 /**
@@ -66,12 +68,15 @@ inline void approx_knn_build_index(raft::handle_t &handle,
  * @param[in] n number of rows in the query array
  */
 template <typename value_idx = int>
-inline void approx_knn_search(raft::handle_t &handle, float *distances,
-                              int64_t *indices,
-                              raft::spatial::knn::knnIndex *index, value_idx k,
-                              float *query_array, value_idx n) {
-  detail::approx_knn_search(handle, distances, indices, index, k, query_array,
-                            n);
+inline void approx_knn_search(raft::handle_t& handle,
+                              float* distances,
+                              int64_t* indices,
+                              raft::spatial::knn::knnIndex* index,
+                              value_idx k,
+                              float* query_array,
+                              value_idx n)
+{
+  detail::approx_knn_search(handle, distances, indices, index, k, query_array, n);
 }
 
 }  // namespace knn
diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.h
index 6a6c7751c2..573a23181d 100644
--- a/cpp/include/raft/spatial/knn/ann_common.h
+++ b/cpp/include/raft/spatial/knn/ann_common.h
@@ -26,13 +26,14 @@ namespace spatial {
 namespace knn {
 
 struct knnIndex {
-  faiss::gpu::GpuIndex *index;
+  faiss::gpu::GpuIndex* index;
   raft::distance::DistanceType metric;
   float metricArg;
 
-  faiss::gpu::StandardGpuResources *gpu_res;
+  faiss::gpu::StandardGpuResources* gpu_res;
   int device;
-  ~knnIndex() {
+  ~knnIndex()
+  {
     delete index;
     delete gpu_res;
   }
@@ -57,7 +58,8 @@ struct IVFParam : knnIndexParam {
   int nprobe;
 };
 
-struct IVFFlatParam : IVFParam {};
+struct IVFFlatParam : IVFParam {
+};
 
 struct IVFPQParam : IVFParam {
   int M;
diff --git a/cpp/include/raft/spatial/knn/ball_cover.hpp b/cpp/include/raft/spatial/knn/ball_cover.hpp
index a98473f186..cb2b9e99cd 100644
--- a/cpp/include/raft/spatial/knn/ball_cover.hpp
+++ b/cpp/include/raft/spatial/knn/ball_cover.hpp
@@ -28,12 +28,11 @@ namespace raft {
 namespace spatial {
 namespace knn {
 
-template <typename value_idx = std::int64_t, typename value_t,
-          typename value_int = std::uint32_t>
-void rbc_build_index(const raft::handle_t &handle,
-                     BallCoverIndex<value_idx, value_t, value_int> &index) {
-  ASSERT(index.n == 2,
-         "Random ball cover currently only works in 2-dimensions");
+template <typename value_idx = std::int64_t, typename value_t, typename value_int = std::uint32_t>
+void rbc_build_index(const raft::handle_t& handle,
+                     BallCoverIndex<value_idx, value_t, value_int>& index)
+{
+  ASSERT(index.n == 2, "Random ball cover currently only works in 2-dimensions");
   if (index.metric == raft::distance::DistanceType::Haversine) {
     detail::rbc_build_index(handle, index, detail::HaversineFunc());
   } else if (index.metric == raft::distance::DistanceType::L2SqrtExpanded ||
@@ -74,23 +73,23 @@ void rbc_build_index(const raft::handle_t &handle,
  *               many datasets can still have great recall even by only
  *               looking in the closest landmark.
  */
-template <typename value_idx = std::int64_t, typename value_t,
-          typename value_int = std::uint32_t>
-void rbc_all_knn_query(const raft::handle_t &handle,
-                       BallCoverIndex<value_idx, value_t, value_int> &index,
-                       value_int k, value_idx *inds, value_t *dists,
-                       bool perform_post_filtering = true, float weight = 1.0) {
-  ASSERT(index.n == 2,
-         "Random ball cover currently only works in 2-dimensions");
+template <typename value_idx = std::int64_t, typename value_t, typename value_int = std::uint32_t>
+void rbc_all_knn_query(const raft::handle_t& handle,
+                       BallCoverIndex<value_idx, value_t, value_int>& index,
+                       value_int k,
+                       value_idx* inds,
+                       value_t* dists,
+                       bool perform_post_filtering = true,
+                       float weight                = 1.0)
+{
+  ASSERT(index.n == 2, "Random ball cover currently only works in 2-dimensions");
   if (index.metric == raft::distance::DistanceType::Haversine) {
-    detail::rbc_all_knn_query(handle, index, k, inds, dists,
-                              detail::HaversineFunc(), perform_post_filtering,
-                              weight);
+    detail::rbc_all_knn_query(
+      handle, index, k, inds, dists, detail::HaversineFunc(), perform_post_filtering, weight);
   } else if (index.metric == raft::distance::DistanceType::L2SqrtExpanded ||
              index.metric == raft::distance::DistanceType::L2SqrtUnexpanded) {
-    detail::rbc_all_knn_query(handle, index, k, inds, dists,
-                              detail::EuclideanFunc(), perform_post_filtering,
-                              weight);
+    detail::rbc_all_knn_query(
+      handle, index, k, inds, dists, detail::EuclideanFunc(), perform_post_filtering, weight);
   } else {
     RAFT_FAIL("Metric not supported");
   }
@@ -127,23 +126,40 @@ void rbc_all_knn_query(const raft::handle_t &handle,
  *               looking in the closest landmark.
  * @param[in] n_query_pts number of query points
  */
-template <typename value_idx = std::int64_t, typename value_t,
-          typename value_int = std::uint32_t>
-void rbc_knn_query(const raft::handle_t &handle,
-                   BallCoverIndex<value_idx, value_t, value_int> &index,
-                   value_int k, const value_t *query, value_int n_query_pts,
-                   value_idx *inds, value_t *dists,
-                   bool perform_post_filtering = true, float weight = 1.0) {
-  ASSERT(index.n == 2,
-         "Random ball cover currently only works in 2-dimensions");
+template <typename value_idx = std::int64_t, typename value_t, typename value_int = std::uint32_t>
+void rbc_knn_query(const raft::handle_t& handle,
+                   BallCoverIndex<value_idx, value_t, value_int>& index,
+                   value_int k,
+                   const value_t* query,
+                   value_int n_query_pts,
+                   value_idx* inds,
+                   value_t* dists,
+                   bool perform_post_filtering = true,
+                   float weight                = 1.0)
+{
+  ASSERT(index.n == 2, "Random ball cover currently only works in 2-dimensions");
   if (index.metric == raft::distance::DistanceType::Haversine) {
-    detail::rbc_knn_query(handle, index, k, query, n_query_pts, inds, dists,
-                          detail::HaversineFunc(), perform_post_filtering,
+    detail::rbc_knn_query(handle,
+                          index,
+                          k,
+                          query,
+                          n_query_pts,
+                          inds,
+                          dists,
+                          detail::HaversineFunc(),
+                          perform_post_filtering,
                           weight);
   } else if (index.metric == raft::distance::DistanceType::L2SqrtExpanded ||
              index.metric == raft::distance::DistanceType::L2SqrtUnexpanded) {
-    detail::rbc_knn_query(handle, index, k, query, n_query_pts, inds, dists,
-                          detail::EuclideanFunc(), perform_post_filtering,
+    detail::rbc_knn_query(handle,
+                          index,
+                          k,
+                          query,
+                          n_query_pts,
+                          inds,
+                          dists,
+                          detail::EuclideanFunc(),
+                          perform_post_filtering,
                           weight);
   } else {
     RAFT_FAIL("Metric not supported");
diff --git a/cpp/include/raft/spatial/knn/ball_cover_common.h b/cpp/include/raft/spatial/knn/ball_cover_common.h
index ca614bb0cb..e38124edb6 100644
--- a/cpp/include/raft/spatial/knn/ball_cover_common.h
+++ b/cpp/include/raft/spatial/knn/ball_cover_common.h
@@ -34,12 +34,13 @@ namespace knn {
  * @tparam value_t
  * @tparam value_int
  */
-template <typename value_idx, typename value_t,
-          typename value_int = std::uint32_t>
+template <typename value_idx, typename value_t, typename value_int = std::uint32_t>
 class BallCoverIndex {
  public:
-  explicit BallCoverIndex(const raft::handle_t &handle_, const value_t *X_,
-                          value_int m_, value_int n_,
+  explicit BallCoverIndex(const raft::handle_t& handle_,
+                          const value_t* X_,
+                          value_int m_,
+                          value_int n_,
                           raft::distance::DistanceType metric_)
     : handle(handle_),
       X(X_),
@@ -47,37 +48,39 @@ class BallCoverIndex {
       n(n_),
       metric(metric_),
       /**
-      * the sqrt() here makes the sqrt(m)^2 a linear-time lower bound
-      *
-      * Total memory footprint of index: (2 * sqrt(m)) + (n * sqrt(m)) + (2 * m)
-      */
+       * the sqrt() here makes the sqrt(m)^2 a linear-time lower bound
+       *
+       * Total memory footprint of index: (2 * sqrt(m)) + (n * sqrt(m)) + (2 * m)
+       */
       n_landmarks(sqrt(m_)),
       R_indptr(sqrt(m_) + 1, handle.get_stream()),
       R_1nn_cols(m_, handle.get_stream()),
       R_1nn_dists(m_, handle.get_stream()),
       R(sqrt(m_) * n_, handle.get_stream()),
       R_radius(sqrt(m_), handle.get_stream()),
-      index_trained(false) {}
+      index_trained(false)
+  {
+  }
 
-  value_idx *get_R_indptr() { return R_indptr.data(); }
-  value_idx *get_R_1nn_cols() { return R_1nn_cols.data(); }
-  value_t *get_R_1nn_dists() { return R_1nn_dists.data(); }
-  value_t *get_R_radius() { return R_radius.data(); }
-  value_t *get_R() { return R.data(); }
-  const value_t *get_X() { return X; }
+  value_idx* get_R_indptr() { return R_indptr.data(); }
+  value_idx* get_R_1nn_cols() { return R_1nn_cols.data(); }
+  value_t* get_R_1nn_dists() { return R_1nn_dists.data(); }
+  value_t* get_R_radius() { return R_radius.data(); }
+  value_t* get_R() { return R.data(); }
+  const value_t* get_X() { return X; }
 
   bool is_index_trained() const { return index_trained; };
 
   // This should only be set by internal functions
   void set_index_trained() { index_trained = true; }
 
-  const raft::handle_t &handle;
+  const raft::handle_t& handle;
 
   const value_int m;
   const value_int n;
   const value_int n_landmarks;
 
-  const value_t *X;
+  const value_t* X;
 
   raft::distance::DistanceType metric;
 
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
index 980001f166..7f4e4511d2 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized_faiss.cuh
@@ -55,90 +55,84 @@ namespace spatial {
 namespace knn {
 namespace detail {
 
-inline faiss::ScalarQuantizer::QuantizerType build_faiss_qtype(
-  QuantizerType qtype) {
+inline faiss::ScalarQuantizer::QuantizerType build_faiss_qtype(QuantizerType qtype)
+{
   switch (qtype) {
-    case QuantizerType::QT_8bit:
-      return faiss::ScalarQuantizer::QuantizerType::QT_8bit;
+    case QuantizerType::QT_8bit: return faiss::ScalarQuantizer::QuantizerType::QT_8bit;
     case QuantizerType::QT_8bit_uniform:
       return faiss::ScalarQuantizer::QuantizerType::QT_8bit_uniform;
     case QuantizerType::QT_4bit_uniform:
       return faiss::ScalarQuantizer::QuantizerType::QT_4bit_uniform;
-    case QuantizerType::QT_fp16:
-      return faiss::ScalarQuantizer::QuantizerType::QT_fp16;
+    case QuantizerType::QT_fp16: return faiss::ScalarQuantizer::QuantizerType::QT_fp16;
     case QuantizerType::QT_8bit_direct:
       return faiss::ScalarQuantizer::QuantizerType::QT_8bit_direct;
-    case QuantizerType::QT_6bit:
-      return faiss::ScalarQuantizer::QuantizerType::QT_6bit;
-    default:
-      return (faiss::ScalarQuantizer::QuantizerType)qtype;
+    case QuantizerType::QT_6bit: return faiss::ScalarQuantizer::QuantizerType::QT_6bit;
+    default: return (faiss::ScalarQuantizer::QuantizerType)qtype;
   }
 }
 
 template <typename IntType = int>
-void approx_knn_ivfflat_build_index(knnIndex *index, IVFParam *params,
-                                    raft::distance::DistanceType metric,
-                                    IntType n, IntType D) {
+void approx_knn_ivfflat_build_index(
+  knnIndex* index, IVFParam* params, raft::distance::DistanceType metric, IntType n, IntType D)
+{
   faiss::gpu::GpuIndexIVFFlatConfig config;
-  config.device = index->device;
+  config.device                  = index->device;
   faiss::MetricType faiss_metric = build_faiss_metric(metric);
-  faiss::gpu::GpuIndexIVFFlat *faiss_index = new faiss::gpu::GpuIndexIVFFlat(
-    index->gpu_res, D, params->nlist, faiss_metric, config);
+  faiss::gpu::GpuIndexIVFFlat* faiss_index =
+    new faiss::gpu::GpuIndexIVFFlat(index->gpu_res, D, params->nlist, faiss_metric, config);
   faiss_index->setNumProbes(params->nprobe);
   index->index = faiss_index;
 }
 
 template <typename IntType = int>
-void approx_knn_ivfpq_build_index(knnIndex *index, IVFPQParam *params,
-                                  raft::distance::DistanceType metric,
-                                  IntType n, IntType D) {
+void approx_knn_ivfpq_build_index(
+  knnIndex* index, IVFPQParam* params, raft::distance::DistanceType metric, IntType n, IntType D)
+{
   faiss::gpu::GpuIndexIVFPQConfig config;
-  config.device = index->device;
-  config.usePrecomputedTables = params->usePrecomputedTables;
-  config.interleavedLayout = params->n_bits != 8;
-  faiss::MetricType faiss_metric = build_faiss_metric(metric);
-  faiss::gpu::GpuIndexIVFPQ *faiss_index =
-    new faiss::gpu::GpuIndexIVFPQ(index->gpu_res, D, params->nlist, params->M,
-                                  params->n_bits, faiss_metric, config);
+  config.device                          = index->device;
+  config.usePrecomputedTables            = params->usePrecomputedTables;
+  config.interleavedLayout               = params->n_bits != 8;
+  faiss::MetricType faiss_metric         = build_faiss_metric(metric);
+  faiss::gpu::GpuIndexIVFPQ* faiss_index = new faiss::gpu::GpuIndexIVFPQ(
+    index->gpu_res, D, params->nlist, params->M, params->n_bits, faiss_metric, config);
   faiss_index->setNumProbes(params->nprobe);
   index->index = faiss_index;
 }
 
 template <typename IntType = int>
-void approx_knn_ivfsq_build_index(knnIndex *index, IVFSQParam *params,
-                                  raft::distance::DistanceType metric,
-                                  IntType n, IntType D) {
+void approx_knn_ivfsq_build_index(
+  knnIndex* index, IVFSQParam* params, raft::distance::DistanceType metric, IntType n, IntType D)
+{
   faiss::gpu::GpuIndexIVFScalarQuantizerConfig config;
-  config.device = index->device;
-  faiss::MetricType faiss_metric = build_faiss_metric(metric);
-  faiss::ScalarQuantizer::QuantizerType faiss_qtype =
-    build_faiss_qtype(params->qtype);
-  faiss::gpu::GpuIndexIVFScalarQuantizer *faiss_index =
-    new faiss::gpu::GpuIndexIVFScalarQuantizer(index->gpu_res, D, params->nlist,
-                                               faiss_qtype, faiss_metric,
-                                               params->encodeResidual);
+  config.device                                       = index->device;
+  faiss::MetricType faiss_metric                      = build_faiss_metric(metric);
+  faiss::ScalarQuantizer::QuantizerType faiss_qtype   = build_faiss_qtype(params->qtype);
+  faiss::gpu::GpuIndexIVFScalarQuantizer* faiss_index = new faiss::gpu::GpuIndexIVFScalarQuantizer(
+    index->gpu_res, D, params->nlist, faiss_qtype, faiss_metric, params->encodeResidual);
   faiss_index->setNumProbes(params->nprobe);
   index->index = faiss_index;
 }
 
 template <typename IntType = int>
-void approx_knn_build_index(raft::handle_t &handle,
-                            raft::spatial::knn::knnIndex *index,
-                            raft::spatial::knn::knnIndexParam *params,
+void approx_knn_build_index(raft::handle_t& handle,
+                            raft::spatial::knn::knnIndex* index,
+                            raft::spatial::knn::knnIndexParam* params,
                             raft::distance::DistanceType metric,
-                            float metricArg, float *index_array, IntType n,
-                            IntType D) {
+                            float metricArg,
+                            float* index_array,
+                            IntType n,
+                            IntType D)
+{
   int device;
   CUDA_CHECK(cudaGetDevice(&device));
 
-  faiss::gpu::StandardGpuResources *gpu_res =
-    new faiss::gpu::StandardGpuResources();
+  faiss::gpu::StandardGpuResources* gpu_res = new faiss::gpu::StandardGpuResources();
   gpu_res->noTempMemory();
   gpu_res->setDefaultStream(device, handle.get_stream());
-  index->gpu_res = gpu_res;
-  index->device = device;
-  index->index = nullptr;
-  index->metric = metric;
+  index->gpu_res   = gpu_res;
+  index->device    = device;
+  index->index     = nullptr;
+  index->metric    = metric;
   index->metricArg = metricArg;
 
   // perform preprocessing
@@ -148,21 +142,20 @@ void approx_knn_build_index(raft::handle_t &handle,
 
   query_metric_processor->preprocess(index_array);
 
-  if (dynamic_cast<IVFFlatParam *>(params)) {
-    IVFFlatParam *IVFFlat_param = dynamic_cast<IVFFlatParam *>(params);
+  if (dynamic_cast<IVFFlatParam*>(params)) {
+    IVFFlatParam* IVFFlat_param = dynamic_cast<IVFFlatParam*>(params);
     approx_knn_ivfflat_build_index(index, IVFFlat_param, metric, n, D);
     std::vector<float> h_index_array(n * D);
-    raft::update_host(h_index_array.data(), index_array, h_index_array.size(),
-                      handle.get_stream());
+    raft::update_host(h_index_array.data(), index_array, h_index_array.size(), handle.get_stream());
     query_metric_processor->revert(index_array);
     index->index->train(n, h_index_array.data());
     index->index->add(n, h_index_array.data());
   } else {
-    if (dynamic_cast<IVFPQParam *>(params)) {
-      IVFPQParam *IVFPQ_param = dynamic_cast<IVFPQParam *>(params);
+    if (dynamic_cast<IVFPQParam*>(params)) {
+      IVFPQParam* IVFPQ_param = dynamic_cast<IVFPQParam*>(params);
       approx_knn_ivfpq_build_index(index, IVFPQ_param, metric, n, D);
-    } else if (dynamic_cast<IVFSQParam *>(params)) {
-      IVFSQParam *IVFSQ_param = dynamic_cast<IVFSQParam *>(params);
+    } else if (dynamic_cast<IVFSQParam*>(params)) {
+      IVFSQParam* IVFSQ_param = dynamic_cast<IVFSQParam*>(params);
       approx_knn_ivfsq_build_index(index, IVFSQ_param, metric, n, D);
     } else {
       ASSERT(index->index, "KNN index could not be initialized");
@@ -175,13 +168,17 @@ void approx_knn_build_index(raft::handle_t &handle,
 }
 
 template <typename IntType = int>
-void approx_knn_search(raft::handle_t &handle, float *distances,
-                       int64_t *indices, raft::spatial::knn::knnIndex *index,
-                       IntType k, float *query_array, IntType n) {
+void approx_knn_search(raft::handle_t& handle,
+                       float* distances,
+                       int64_t* indices,
+                       raft::spatial::knn::knnIndex* index,
+                       IntType k,
+                       float* query_array,
+                       IntType n)
+{
   // perform preprocessing
   std::unique_ptr<MetricProcessor<float>> query_metric_processor =
-    create_processor<float>(index->metric, n, index->index->d, k, false,
-                            handle.get_stream());
+    create_processor<float>(index->metric, n, index->index->d, k, false, handle.get_stream());
 
   query_metric_processor->preprocess(query_array);
   index->index->search(n, query_array, k, distances, indices);
@@ -192,13 +189,14 @@ void approx_knn_search(raft::handle_t &handle, float *distances,
       index->metric == raft::distance::DistanceType::L2SqrtUnexpanded ||
       index->metric == raft::distance::DistanceType::LpUnexpanded) {
     /**
-  * post-processing
-  */
+     * post-processing
+     */
     float p = 0.5;  // standard l2
-    if (index->metric == raft::distance::DistanceType::LpUnexpanded)
-      p = 1.0 / index->metricArg;
+    if (index->metric == raft::distance::DistanceType::LpUnexpanded) p = 1.0 / index->metricArg;
     raft::linalg::unaryOp<float>(
-      distances, distances, n * k,
+      distances,
+      distances,
+      n * k,
       [p] __device__(float input) { return powf(input, p); },
       handle.get_stream());
   }
diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover.cuh
index 7354fa3497..7b54c3d25b 100644
--- a/cpp/include/raft/spatial/knn/detail/ball_cover.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ball_cover.cuh
@@ -60,34 +60,43 @@ namespace detail {
  * @param handle
  * @param index
  */
-template <typename value_idx, typename value_t,
-          typename value_int = std::uint32_t>
-void sample_landmarks(const raft::handle_t &handle,
-                      BallCoverIndex<value_idx, value_t, value_int> &index) {
-  rmm::device_uvector<value_idx> R_1nn_cols2(index.n_landmarks,
-                                             handle.get_stream());
+template <typename value_idx, typename value_t, typename value_int = std::uint32_t>
+void sample_landmarks(const raft::handle_t& handle,
+                      BallCoverIndex<value_idx, value_t, value_int>& index)
+{
+  rmm::device_uvector<value_idx> R_1nn_cols2(index.n_landmarks, handle.get_stream());
   rmm::device_uvector<value_t> R_1nn_ones(index.m, handle.get_stream());
-  rmm::device_uvector<value_idx> R_indices(index.n_landmarks,
-                                           handle.get_stream());
+  rmm::device_uvector<value_idx> R_indices(index.n_landmarks, handle.get_stream());
 
-  thrust::sequence(handle.get_thrust_policy(), index.get_R_1nn_cols(),
-                   index.get_R_1nn_cols() + index.m, (value_idx)0);
+  thrust::sequence(handle.get_thrust_policy(),
+                   index.get_R_1nn_cols(),
+                   index.get_R_1nn_cols() + index.m,
+                   (value_idx)0);
 
-  thrust::fill(handle.get_thrust_policy(), R_1nn_ones.data(),
-               R_1nn_ones.data() + R_1nn_ones.size(), 1.0);
+  thrust::fill(
+    handle.get_thrust_policy(), R_1nn_ones.data(), R_1nn_ones.data() + R_1nn_ones.size(), 1.0);
 
   /**
- * 1. Randomly sample sqrt(n) points from X
- */
+   * 1. Randomly sample sqrt(n) points from X
+   */
   auto rng = raft::random::Rng(12345);
-  rng.sampleWithoutReplacement(handle, R_indices.data(), R_1nn_cols2.data(),
-                               index.get_R_1nn_cols(), R_1nn_ones.data(),
-                               (value_idx)index.n_landmarks, (value_idx)index.m,
+  rng.sampleWithoutReplacement(handle,
+                               R_indices.data(),
+                               R_1nn_cols2.data(),
+                               index.get_R_1nn_cols(),
+                               R_1nn_ones.data(),
+                               (value_idx)index.n_landmarks,
+                               (value_idx)index.m,
                                handle.get_stream());
 
-  raft::matrix::copyRows<value_t, value_idx, size_t>(
-    index.get_X(), index.m, index.n, index.get_R(), R_1nn_cols2.data(),
-    index.n_landmarks, handle.get_stream(), true);
+  raft::matrix::copyRows<value_t, value_idx, size_t>(index.get_X(),
+                                                     index.m,
+                                                     index.n,
+                                                     index.get_R(),
+                                                     R_1nn_cols2.data(),
+                                                     index.n_landmarks,
+                                                     handle.get_stream(),
+                                                     true);
 }
 
 /**
@@ -100,35 +109,34 @@ void sample_landmarks(const raft::handle_t &handle,
  * @param k
  * @param index
  */
-template <typename value_idx, typename value_t,
-          typename value_int = std::uint32_t>
-void construct_landmark_1nn(
-  const raft::handle_t &handle, const value_idx *R_knn_inds_ptr,
-  const value_t *R_knn_dists_ptr, value_int k,
-  BallCoverIndex<value_idx, value_t, value_int> &index) {
+template <typename value_idx, typename value_t, typename value_int = std::uint32_t>
+void construct_landmark_1nn(const raft::handle_t& handle,
+                            const value_idx* R_knn_inds_ptr,
+                            const value_t* R_knn_dists_ptr,
+                            value_int k,
+                            BallCoverIndex<value_idx, value_t, value_int>& index)
+{
   rmm::device_uvector<value_idx> R_1nn_inds(index.m, handle.get_stream());
 
-  value_idx *R_1nn_inds_ptr = R_1nn_inds.data();
-  value_t *R_1nn_dists_ptr = index.get_R_1nn_dists();
+  value_idx* R_1nn_inds_ptr = R_1nn_inds.data();
+  value_t* R_1nn_dists_ptr  = index.get_R_1nn_dists();
 
   auto idxs = thrust::make_counting_iterator<value_idx>(0);
-  thrust::for_each(handle.get_thrust_policy(), idxs, idxs + index.m,
-                   [=] __device__(value_idx i) {
-                     R_1nn_inds_ptr[i] = R_knn_inds_ptr[i * k];
-                     R_1nn_dists_ptr[i] = R_knn_dists_ptr[i * k];
-                   });
+  thrust::for_each(handle.get_thrust_policy(), idxs, idxs + index.m, [=] __device__(value_idx i) {
+    R_1nn_inds_ptr[i]  = R_knn_inds_ptr[i * k];
+    R_1nn_dists_ptr[i] = R_knn_dists_ptr[i * k];
+  });
 
-  auto keys = thrust::make_zip_iterator(
-    thrust::make_tuple(R_1nn_inds.data(), index.get_R_1nn_dists()));
+  auto keys =
+    thrust::make_zip_iterator(thrust::make_tuple(R_1nn_inds.data(), index.get_R_1nn_dists()));
 
   // group neighborhoods for each reference landmark and sort each group by distance
-  thrust::sort_by_key(handle.get_thrust_policy(), keys, keys + index.m,
-                      index.get_R_1nn_cols(), NNComp());
+  thrust::sort_by_key(
+    handle.get_thrust_policy(), keys, keys + index.m, index.get_R_1nn_cols(), NNComp());
 
   // convert to CSR for fast lookup
   raft::sparse::convert::sorted_coo_to_csr(
-    R_1nn_inds.data(), index.m, index.get_R_indptr(), index.n_landmarks + 1,
-    handle.get_stream());
+    R_1nn_inds.data(), index.m, index.get_R_indptr(), index.n_landmarks + 1, handle.get_stream());
 }
 
 /**
@@ -144,20 +152,33 @@ void construct_landmark_1nn(
  * @param R_knn_inds
  * @param R_knn_dists
  */
-template <typename value_idx, typename value_t,
-          typename value_int = std::uint32_t>
-void k_closest_landmarks(const raft::handle_t &handle,
-                         BallCoverIndex<value_idx, value_t, value_int> &index,
-                         const value_t *query_pts, value_int n_query_pts,
-                         value_int k, value_idx *R_knn_inds,
-                         value_t *R_knn_dists) {
-  std::vector<value_t *> input = {index.get_R()};
+template <typename value_idx, typename value_t, typename value_int = std::uint32_t>
+void k_closest_landmarks(const raft::handle_t& handle,
+                         BallCoverIndex<value_idx, value_t, value_int>& index,
+                         const value_t* query_pts,
+                         value_int n_query_pts,
+                         value_int k,
+                         value_idx* R_knn_inds,
+                         value_t* R_knn_dists)
+{
+  std::vector<value_t*> input      = {index.get_R()};
   std::vector<std::uint32_t> sizes = {index.n_landmarks};
 
-  brute_force_knn_impl<std::uint32_t, std::int64_t>(
-    input, sizes, index.n, const_cast<value_t *>(query_pts), n_query_pts,
-    R_knn_inds, R_knn_dists, k, handle.get_stream(), nullptr, 0, true, true,
-    nullptr, index.metric);
+  brute_force_knn_impl<std::uint32_t, std::int64_t>(input,
+                                                    sizes,
+                                                    index.n,
+                                                    const_cast<value_t*>(query_pts),
+                                                    n_query_pts,
+                                                    R_knn_inds,
+                                                    R_knn_dists,
+                                                    k,
+                                                    handle.get_stream(),
+                                                    nullptr,
+                                                    0,
+                                                    true,
+                                                    true,
+                                                    nullptr,
+                                                    index.metric);
 }
 
 /**
@@ -168,21 +189,21 @@ void k_closest_landmarks(const raft::handle_t &handle,
  * @param handle
  * @param index
  */
-template <typename value_idx, typename value_t,
-          typename value_int = std::uint32_t>
-void compute_landmark_radii(
-  const raft::handle_t &handle,
-  BallCoverIndex<value_idx, value_t, value_int> &index) {
+template <typename value_idx, typename value_t, typename value_int = std::uint32_t>
+void compute_landmark_radii(const raft::handle_t& handle,
+                            BallCoverIndex<value_idx, value_t, value_int>& index)
+{
   auto entries = thrust::make_counting_iterator<value_idx>(0);
 
-  const value_idx *R_indptr_ptr = index.get_R_indptr();
-  const value_t *R_1nn_dists_ptr = index.get_R_1nn_dists();
-  value_t *R_radius_ptr = index.get_R_radius();
-  thrust::for_each(handle.get_thrust_policy(), entries,
+  const value_idx* R_indptr_ptr  = index.get_R_indptr();
+  const value_t* R_1nn_dists_ptr = index.get_R_1nn_dists();
+  value_t* R_radius_ptr          = index.get_R_radius();
+  thrust::for_each(handle.get_thrust_policy(),
+                   entries,
                    entries + index.n_landmarks,
                    [=] __device__(value_idx input) {
                      value_idx last_row_idx = R_indptr_ptr[input + 1] - 1;
-                     R_radius_ptr[input] = R_1nn_dists_ptr[last_row_idx];
+                     R_radius_ptr[input]    = R_1nn_dists_ptr[last_row_idx];
                    });
 }
 
@@ -196,23 +217,51 @@ void compute_landmark_radii(
  * marking the distance to be computed between x, y only
  * if knn[k].distance >= d(x_i, R_k) + d(R_k, y)
  */
-template <typename value_idx, typename value_t,
-          typename value_int = std::uint32_t, typename dist_func>
-void perform_rbc_query(const raft::handle_t &handle,
-                       BallCoverIndex<value_idx, value_t, value_int> &index,
-                       const value_t *query, value_int n_query_pts,
-                       std::uint32_t k, const value_idx *R_knn_inds,
-                       const value_t *R_knn_dists, dist_func dfunc,
-                       value_idx *inds, value_t *dists,
-                       value_int *dists_counter, value_int *post_dists_counter,
-                       float weight = 1.0, bool perform_post_filtering = true) {
+template <typename value_idx,
+          typename value_t,
+          typename value_int = std::uint32_t,
+          typename dist_func>
+void perform_rbc_query(const raft::handle_t& handle,
+                       BallCoverIndex<value_idx, value_t, value_int>& index,
+                       const value_t* query,
+                       value_int n_query_pts,
+                       std::uint32_t k,
+                       const value_idx* R_knn_inds,
+                       const value_t* R_knn_dists,
+                       dist_func dfunc,
+                       value_idx* inds,
+                       value_t* dists,
+                       value_int* dists_counter,
+                       value_int* post_dists_counter,
+                       float weight                = 1.0,
+                       bool perform_post_filtering = true)
+{
   // Compute nearest k for each neighborhood in each closest R
-  rbc_low_dim_pass_one(handle, index, query, n_query_pts, k, R_knn_inds,
-                       R_knn_dists, dfunc, inds, dists, weight, dists_counter);
+  rbc_low_dim_pass_one(handle,
+                       index,
+                       query,
+                       n_query_pts,
+                       k,
+                       R_knn_inds,
+                       R_knn_dists,
+                       dfunc,
+                       inds,
+                       dists,
+                       weight,
+                       dists_counter);
 
   if (perform_post_filtering) {
-    rbc_low_dim_pass_two(handle, index, query, n_query_pts, k, R_knn_inds,
-                         R_knn_dists, dfunc, inds, dists, weight,
+    rbc_low_dim_pass_two(handle,
+                         index,
+                         query,
+                         n_query_pts,
+                         k,
+                         R_knn_inds,
+                         R_knn_dists,
+                         dfunc,
+                         inds,
+                         dists,
+                         weight,
                          post_dists_counter);
   }
 }
@@ -228,13 +277,15 @@ void perform_rbc_query(const raft::handle_t &handle,
  * query which is useful for algorithms that need to perform
  * A * A.T.
  */
-template <typename value_idx = std::int64_t, typename value_t,
-          typename value_int = std::uint32_t, typename distance_func>
-void rbc_build_index(const raft::handle_t &handle,
-                     BallCoverIndex<value_idx, value_t, value_int> &index,
-                     distance_func dfunc) {
-  ASSERT(index.n == 2,
-         "only 2d vectors are supported in current implementation");
+template <typename value_idx = std::int64_t,
+          typename value_t,
+          typename value_int = std::uint32_t,
+          typename distance_func>
+void rbc_build_index(const raft::handle_t& handle,
+                     BallCoverIndex<value_idx, value_t, value_int>& index,
+                     distance_func dfunc)
+{
+  ASSERT(index.n == 2, "only 2d vectors are supported in current implementation");
   ASSERT(!index.is_index_trained(), "index cannot be previously trained");
 
   rmm::device_uvector<value_idx> R_knn_inds(index.m, handle.get_stream());
@@ -249,8 +300,8 @@ void rbc_build_index(const raft::handle_t &handle,
    * 2. Perform knn = bfknn(X, R, k)
    */
   value_int k = 1;
-  k_closest_landmarks(handle, index, index.get_X(), index.m, k,
-                      R_knn_inds.data(), R_knn_dists.data());
+  k_closest_landmarks(
+    handle, index, index.get_X(), index.m, k, R_knn_inds.data(), R_knn_dists.data());
 
   /**
    * 3. Create L_r = knn[:,0].T (CSR)
@@ -258,8 +309,7 @@ void rbc_build_index(const raft::handle_t &handle,
    * Slice closest neighboring R
    * Secondary sort by (R_knn_inds, R_knn_dists)
    */
-  construct_landmark_1nn(handle, R_knn_inds.data(), R_knn_dists.data(), k,
-                         index);
+  construct_landmark_1nn(handle, R_knn_inds.data(), R_knn_dists.data(), k, index);
 
   /**
    * Compute radius of each R for filtering: p(q, r) <= p(q, q_r) + radius(r)
@@ -271,16 +321,21 @@ void rbc_build_index(const raft::handle_t &handle,
 /**
  * Performs an all neighbors knn query (e.g. index == query)
  */
-template <typename value_idx = std::int64_t, typename value_t,
-          typename value_int = std::uint32_t, typename distance_func>
-void rbc_all_knn_query(const raft::handle_t &handle,
-                       BallCoverIndex<value_idx, value_t, value_int> &index,
-                       value_int k, value_idx *inds, value_t *dists,
+template <typename value_idx = std::int64_t,
+          typename value_t,
+          typename value_int = std::uint32_t,
+          typename distance_func>
+void rbc_all_knn_query(const raft::handle_t& handle,
+                       BallCoverIndex<value_idx, value_t, value_int>& index,
+                       value_int k,
+                       value_idx* inds,
+                       value_t* dists,
                        distance_func dfunc,
                        // approximate nn options
-                       bool perform_post_filtering = true, float weight = 1.0) {
-  ASSERT(index.n == 2,
-         "only 2d vectors are supported in current implementation");
+                       bool perform_post_filtering = true,
+                       float weight                = 1.0)
+{
+  ASSERT(index.n == 2, "only 2d vectors are supported in current implementation");
   ASSERT(index.n_landmarks >= k, "number of landmark samples must be >= k");
   ASSERT(!index.is_index_trained(), "index cannot be previously trained");
 
@@ -289,22 +344,30 @@ void rbc_all_knn_query(const raft::handle_t &handle,
 
   // For debugging / verification. Remove before releasing
   rmm::device_uvector<value_int> dists_counter(index.m, handle.get_stream());
-  rmm::device_uvector<value_int> post_dists_counter(index.m,
-                                                    handle.get_stream());
+  rmm::device_uvector<value_int> post_dists_counter(index.m, handle.get_stream());
 
   sample_landmarks<value_idx, value_t>(handle, index);
 
-  k_closest_landmarks(handle, index, index.get_X(), index.m, k,
-                      R_knn_inds.data(), R_knn_dists.data());
+  k_closest_landmarks(
+    handle, index, index.get_X(), index.m, k, R_knn_inds.data(), R_knn_dists.data());
 
-  construct_landmark_1nn(handle, R_knn_inds.data(), R_knn_dists.data(), k,
-                         index);
+  construct_landmark_1nn(handle, R_knn_inds.data(), R_knn_dists.data(), k, index);
 
   compute_landmark_radii(handle, index);
 
-  perform_rbc_query(handle, index, index.get_X(), index.m, k, R_knn_inds.data(),
-                    R_knn_dists.data(), dfunc, inds, dists,
-                    dists_counter.data(), post_dists_counter.data(), weight,
+  perform_rbc_query(handle,
+                    index,
+                    index.get_X(),
+                    index.m,
+                    k,
+                    R_knn_inds.data(),
+                    R_knn_dists.data(),
+                    dfunc,
+                    inds,
+                    dists,
+                    dists_counter.data(),
+                    post_dists_counter.data(),
+                    weight,
                     perform_post_filtering);
 }
 
@@ -312,35 +375,50 @@ void rbc_all_knn_query(const raft::handle_t &handle,
  * Performs a knn query against an index. This assumes the index has
  * already been built.
  */
-template <typename value_idx = std::int64_t, typename value_t,
-          typename value_int = std::uint32_t, typename distance_func>
-void rbc_knn_query(const raft::handle_t &handle,
-                   BallCoverIndex<value_idx, value_t, value_int> &index,
-                   value_int k, const value_t *query, value_int n_query_pts,
-                   value_idx *inds, value_t *dists, distance_func dfunc,
+template <typename value_idx = std::int64_t,
+          typename value_t,
+          typename value_int = std::uint32_t,
+          typename distance_func>
+void rbc_knn_query(const raft::handle_t& handle,
+                   BallCoverIndex<value_idx, value_t, value_int>& index,
+                   value_int k,
+                   const value_t* query,
+                   value_int n_query_pts,
+                   value_idx* inds,
+                   value_t* dists,
+                   distance_func dfunc,
                    // approximate nn options
-                   bool perform_post_filtering = true, float weight = 1.0) {
-  ASSERT(index.n == 2,
-         "only 2d vectors are supported in current implementation");
+                   bool perform_post_filtering = true,
+                   float weight                = 1.0)
+{
+  ASSERT(index.n == 2, "only 2d vectors are supported in current implementation");
   ASSERT(index.n_landmarks >= k, "number of landmark samples must be >= k");
   ASSERT(index.is_index_trained(), "index must be previously trained");
 
   rmm::device_uvector<value_idx> R_knn_inds(k * index.m, handle.get_stream());
   rmm::device_uvector<value_t> R_knn_dists(k * index.m, handle.get_stream());
 
-  k_closest_landmarks(handle, index, query, n_query_pts, k, R_knn_inds.data(),
-                      R_knn_dists.data());
+  k_closest_landmarks(handle, index, query, n_query_pts, k, R_knn_inds.data(), R_knn_dists.data());
 
   // For debugging / verification. Remove before releasing
   rmm::device_uvector<value_int> dists_counter(index.m, handle.get_stream());
-  rmm::device_uvector<value_int> post_dists_counter(index.m,
-                                                    handle.get_stream());
-  thrust::fill(handle.get_thrust_policy(), post_dists_counter.data(),
-               post_dists_counter.data() + index.m, 0);
-
-  perform_rbc_query(handle, index, query, n_query_pts, k, R_knn_inds.data(),
-                    R_knn_dists.data(), dfunc, inds, dists,
-                    dists_counter.data(), post_dists_counter.data(), weight,
+  rmm::device_uvector<value_int> post_dists_counter(index.m, handle.get_stream());
+  thrust::fill(
+    handle.get_thrust_policy(), post_dists_counter.data(), post_dists_counter.data() + index.m, 0);
+
+  perform_rbc_query(handle,
+                    index,
+                    query,
+                    n_query_pts,
+                    k,
+                    R_knn_inds.data(),
+                    R_knn_dists.data(),
+                    dfunc,
+                    inds,
+                    dists,
+                    dists_counter.data(),
+                    post_dists_counter.data(),
+                    weight,
                     perform_post_filtering);
 }
 
diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover/common.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover/common.cuh
index c6cb679408..181dad1a90 100644
--- a/cpp/include/raft/spatial/knn/detail/ball_cover/common.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ball_cover/common.cuh
@@ -27,7 +27,8 @@ namespace detail {
 
 struct NNComp {
   template <typename one, typename two>
-  __host__ __device__ bool operator()(const one &t1, const two &t2) {
+  __host__ __device__ bool operator()(const one& t1, const two& t2)
+  {
     // sort first by each sample's reference landmark,
     if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
     if (thrust::get<0>(t1) > thrust::get<0>(t2)) return false;
@@ -39,17 +40,20 @@ struct NNComp {
 
 struct HaversineFunc {
   template <typename value_t, typename value_int = std::uint32_t>
-  __device__ __host__ __forceinline__ value_t
-  operator()(const value_t *a, const value_t *b, const value_int n_dims) {
-    return raft::spatial::knn::detail::compute_haversine(a[0], b[0], a[1],
-                                                         b[1]);
+  __device__ __host__ __forceinline__ value_t operator()(const value_t* a,
+                                                         const value_t* b,
+                                                         const value_int n_dims)
+  {
+    return raft::spatial::knn::detail::compute_haversine(a[0], b[0], a[1], b[1]);
   }
 };
 
 struct EuclideanFunc {
   template <typename value_t, typename value_int = std::uint32_t>
-  __device__ __host__ __forceinline__ value_t
-  operator()(const value_t *a, const value_t *b, const value_int n_dims) {
+  __device__ __host__ __forceinline__ value_t operator()(const value_t* a,
+                                                         const value_t* b,
+                                                         const value_int n_dims)
+  {
     value_t sum_sq = 0;
     for (value_int i = 0; i < n_dims; ++i) {
       value_t diff = a[i] - b[i];
@@ -63,7 +67,8 @@ struct EuclideanFunc {
 /**
  * Zeros the bit at location h in a one-hot encoded 32-bit int array
  */
-__device__ inline void _zero_bit(std::uint32_t *arr, std::uint32_t h) {
+__device__ inline void _zero_bit(std::uint32_t* arr, std::uint32_t h)
+{
   int bit = h % 32;
   int idx = h / 32;
 
@@ -71,7 +76,7 @@ __device__ inline void _zero_bit(std::uint32_t *arr, std::uint32_t h) {
   std::uint32_t old = arr[idx];
   do {
     assumed = old;
-    old = atomicCAS(arr + idx, assumed, assumed & ~(1 << bit));
+    old     = atomicCAS(arr + idx, assumed, assumed & ~(1 << bit));
   } while (assumed != old);
 }
 
@@ -79,7 +84,8 @@ __device__ inline void _zero_bit(std::uint32_t *arr, std::uint32_t h) {
  * Returns whether or not bit at location h is nonzero in a one-hot
  * encoded 32-bit in array.
  */
-__device__ inline bool _get_val(std::uint32_t *arr, std::uint32_t h) {
+__device__ inline bool _get_val(std::uint32_t* arr, std::uint32_t h)
+{
   int bit = h % 32;
   int idx = h / 32;
   return (arr[idx] & (1 << bit)) > 0;
diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh
index 4a476274dd..5d28258f7a 100644
--- a/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ball_cover/registers.cuh
@@ -58,14 +58,24 @@ namespace detail {
  * @param output
  * @param weight
  */
-template <typename value_idx, typename value_t,
-          typename value_int = std::uint32_t, int tpb = 32,
+template <typename value_idx,
+          typename value_t,
+          typename value_int = std::uint32_t,
+          int tpb            = 32,
           typename distance_func>
-__global__ void perform_post_filter_registers(
-  const value_t *X, value_int n_cols, const value_idx *R_knn_inds,
-  const value_t *R_knn_dists, const value_t *R_radius, const value_t *landmarks,
-  int n_landmarks, value_int bitset_size, value_int k, distance_func dfunc,
-  std::uint32_t *output, float weight = 1.0) {
+__global__ void perform_post_filter_registers(const value_t* X,
+                                              value_int n_cols,
+                                              const value_idx* R_knn_inds,
+                                              const value_t* R_knn_dists,
+                                              const value_t* R_radius,
+                                              const value_t* landmarks,
+                                              int n_landmarks,
+                                              value_int bitset_size,
+                                              value_int k,
+                                              distance_func dfunc,
+                                              std::uint32_t* output,
+                                              float weight = 1.0)
+{
   // allocate array of size n_landmarks / 32 ints
   extern __shared__ std::uint32_t shared_mem[];
 
@@ -98,8 +108,7 @@ __global__ void perform_post_filter_registers(
   for (value_int l = threadIdx.x; l < n_landmarks; l += tpb) {
     // compute p(q, r)
     value_t dist = dfunc(local_x_ptr, landmarks + (n_cols * l), n_cols);
-    if (dist > weight * (closest_R_dist + R_radius[l]) ||
-        dist > 3 * closest_R_dist) {
+    if (dist > weight * (closest_R_dist + R_radius[l]) || dist > 3 * closest_R_dist) {
       _zero_bit(shared_mem, l);
     }
   }
@@ -136,38 +145,58 @@ __global__ void perform_post_filter_registers(
  * @param k
  * @param dist_counter
  */
-template <typename value_idx, typename value_t,
-          typename value_int = std::uint32_t,
-          typename bitset_type = std::uint32_t, typename dist_func,
-          int warp_q = 32, int thread_q = 2, int tpb = 128, int col_q = 2>
-__global__ void compute_final_dists_registers(
-  const value_t *X_index, const value_t *X, const value_int n_cols,
-  bitset_type *bitset, value_int bitset_size, const value_t *R_knn_dists,
-  const value_idx *R_indptr, const value_idx *R_1nn_inds,
-  const value_t *R_1nn_dists, value_idx *knn_inds, value_t *knn_dists,
-  value_int n_landmarks, value_int k, dist_func dfunc,
-  value_int *dist_counter) {
+template <typename value_idx,
+          typename value_t,
+          typename value_int   = std::uint32_t,
+          typename bitset_type = std::uint32_t,
+          typename dist_func,
+          int warp_q   = 32,
+          int thread_q = 2,
+          int tpb      = 128,
+          int col_q    = 2>
+__global__ void compute_final_dists_registers(const value_t* X_index,
+                                              const value_t* X,
+                                              const value_int n_cols,
+                                              bitset_type* bitset,
+                                              value_int bitset_size,
+                                              const value_t* R_knn_dists,
+                                              const value_idx* R_indptr,
+                                              const value_idx* R_1nn_inds,
+                                              const value_t* R_1nn_dists,
+                                              value_idx* knn_inds,
+                                              value_t* knn_dists,
+                                              value_int n_landmarks,
+                                              value_int k,
+                                              dist_func dfunc,
+                                              value_int* dist_counter)
+{
   static constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize;
 
   __shared__ value_t shared_memK[kNumWarps * warp_q];
-  __shared__ faiss::gpu::KeyValuePair<value_t, value_idx>
-    shared_memV[kNumWarps * warp_q];
+  __shared__ faiss::gpu::KeyValuePair<value_t, value_idx> shared_memV[kNumWarps * warp_q];
 
-  const value_t *x_ptr = X + (n_cols * blockIdx.x);
+  const value_t* x_ptr = X + (n_cols * blockIdx.x);
   value_t local_x_ptr[col_q];
   for (value_int j = 0; j < n_cols; ++j) {
     local_x_ptr[j] = x_ptr[j];
   }
 
-  faiss::gpu::KeyValueBlockSelect<value_t, value_idx, false,
-                                  faiss::gpu::Comparator<value_t>, warp_q,
-                                  thread_q, tpb>
+  faiss::gpu::KeyValueBlockSelect<value_t,
+                                  value_idx,
+                                  false,
+                                  faiss::gpu::Comparator<value_t>,
+                                  warp_q,
+                                  thread_q,
+                                  tpb>
     heap(faiss::gpu::Limits<value_t>::getMax(),
-         faiss::gpu::Limits<value_t>::getMax(), -1, shared_memK, shared_memV,
+         faiss::gpu::Limits<value_t>::getMax(),
+         -1,
+         shared_memK,
+         shared_memV,
          k);
 
   const value_int n_k = faiss::gpu::utils::roundDown(k, faiss::gpu::kWarpSize);
-  value_int i = threadIdx.x;
+  value_int i         = threadIdx.x;
   for (; i < n_k; i += tpb) {
     value_idx ind = knn_inds[blockIdx.x * k + i];
     heap.add(knn_dists[blockIdx.x * k + i], R_knn_dists[ind * k], ind);
@@ -185,33 +214,31 @@ __global__ void compute_final_dists_registers(
     // candidate
     if (_get_val(bitset + (blockIdx.x * bitset_size), cur_R_ind)) {
       value_idx R_start_offset = R_indptr[cur_R_ind];
-      value_idx R_stop_offset = R_indptr[cur_R_ind + 1];
-      value_idx R_size = R_stop_offset - R_start_offset;
+      value_idx R_stop_offset  = R_indptr[cur_R_ind + 1];
+      value_idx R_size         = R_stop_offset - R_start_offset;
 
       // Loop through R's neighborhood in parallel
 
       // Round R_size to the nearest warp threads so they can
       // all be computing in parallel.
 
-      const value_int limit =
-        faiss::gpu::utils::roundDown(R_size, faiss::gpu::kWarpSize);
+      const value_int limit = faiss::gpu::utils::roundDown(R_size, faiss::gpu::kWarpSize);
 
       i = threadIdx.x;
       for (; i < limit; i += tpb) {
         value_idx cur_candidate_ind = R_1nn_inds[R_start_offset + i];
-        value_t cur_candidate_dist = R_1nn_dists[R_start_offset + i];
-        value_t z = heap.warpKTopRDist == 0.00
-                      ? 0.0
-                      : (abs(heap.warpKTop - heap.warpKTopRDist) *
-                           abs(heap.warpKTopRDist - cur_candidate_dist) -
-                         heap.warpKTop * cur_candidate_dist) /
-                          heap.warpKTopRDist;
-        z = isnan(z) ? 0.0 : z;
+        value_t cur_candidate_dist  = R_1nn_dists[R_start_offset + i];
+        value_t z                   = heap.warpKTopRDist == 0.00 ? 0.0
+                                                                 : (abs(heap.warpKTop - heap.warpKTopRDist) *
+                                                    abs(heap.warpKTopRDist - cur_candidate_dist) -
+                                                  heap.warpKTop * cur_candidate_dist) /
+                                                   heap.warpKTopRDist;
+        z                           = isnan(z) ? 0.0 : z;
         // If lower bound on distance could possibly be in
         // the closest k neighbors, compute it and add to k-select
         value_t dist = std::numeric_limits<value_t>::max();
         if (z <= heap.warpKTop) {
-          const value_t *y_ptr = X_index + (n_cols * cur_candidate_ind);
+          const value_t* y_ptr = X_index + (n_cols * cur_candidate_ind);
           value_t local_y_ptr[col_q];
           for (value_int j = 0; j < n_cols; ++j) {
             local_y_ptr[j] = y_ptr[j];
@@ -226,21 +253,20 @@ __global__ void compute_final_dists_registers(
       // second round guarantees to be only a single warp.
       if (i < R_size) {
         value_idx cur_candidate_ind = R_1nn_inds[R_start_offset + i];
-        value_t cur_candidate_dist = R_1nn_dists[R_start_offset + i];
+        value_t cur_candidate_dist  = R_1nn_dists[R_start_offset + i];
 
-        value_t z = heap.warpKTopRDist == 0.00
-                      ? 0.0
-                      : (abs(heap.warpKTop - heap.warpKTopRDist) *
-                           abs(heap.warpKTopRDist - cur_candidate_dist) -
-                         heap.warpKTop * cur_candidate_dist) /
-                          heap.warpKTopRDist;
+        value_t z = heap.warpKTopRDist == 0.00 ? 0.0
+                                               : (abs(heap.warpKTop - heap.warpKTopRDist) *
+                                                    abs(heap.warpKTopRDist - cur_candidate_dist) -
+                                                  heap.warpKTop * cur_candidate_dist) /
+                                                   heap.warpKTopRDist;
 
         z = isnan(z) ? 0.0 : z;
         // If lower bound on distance could possibly be in
         // the closest k neighbors, compute it and add to k-select
         value_t dist = std::numeric_limits<value_t>::max();
         if (z <= heap.warpKTop) {
-          const value_t *y_ptr = X_index + (n_cols * cur_candidate_ind);
+          const value_t* y_ptr = X_index + (n_cols * cur_candidate_ind);
           value_t local_y_ptr[col_q];
           for (value_int j = 0; j < n_cols; ++j) {
             local_y_ptr[j] = y_ptr[j];
@@ -257,7 +283,7 @@ __global__ void compute_final_dists_registers(
 
   for (value_int i = threadIdx.x; i < k; i += tpb) {
     knn_dists[blockIdx.x * k + i] = shared_memK[i];
-    knn_inds[blockIdx.x * k + i] = shared_memV[i].value;
+    knn_inds[blockIdx.x * k + i]  = shared_memV[i].value;
   }
 }
 
@@ -278,28 +304,41 @@ __global__ void compute_final_dists_registers(
  * @param R_1nn_cols
  * @param R_1nn_dists
  */
-template <typename value_idx = std::int64_t, typename value_t, int warp_q = 32,
-          int thread_q = 2, int tpb = 128, int col_q = 2,
-          typename value_int = std::uint32_t, typename distance_func>
-__global__ void block_rbc_kernel_registers(
-  const value_t *X_index, const value_t *X,
-  value_int n_cols,  // n_cols should be 2 or 3 dims
-  const value_idx *R_knn_inds, const value_t *R_knn_dists, value_int m,
-  value_int k, const value_idx *R_indptr, const value_idx *R_1nn_cols,
-  const value_t *R_1nn_dists, value_idx *out_inds, value_t *out_dists,
-  value_int *dist_counter, value_t *R_radius, distance_func dfunc,
-  float weight = 1.0) {
+template <typename value_idx = std::int64_t,
+          typename value_t,
+          int warp_q         = 32,
+          int thread_q       = 2,
+          int tpb            = 128,
+          int col_q          = 2,
+          typename value_int = std::uint32_t,
+          typename distance_func>
+__global__ void block_rbc_kernel_registers(const value_t* X_index,
+                                           const value_t* X,
+                                           value_int n_cols,  // n_cols should be 2 or 3 dims
+                                           const value_idx* R_knn_inds,
+                                           const value_t* R_knn_dists,
+                                           value_int m,
+                                           value_int k,
+                                           const value_idx* R_indptr,
+                                           const value_idx* R_1nn_cols,
+                                           const value_t* R_1nn_dists,
+                                           value_idx* out_inds,
+                                           value_t* out_dists,
+                                           value_int* dist_counter,
+                                           value_t* R_radius,
+                                           distance_func dfunc,
+                                           float weight = 1.0)
+{
   static constexpr value_int kNumWarps = tpb / faiss::gpu::kWarpSize;
 
   __shared__ value_t shared_memK[kNumWarps * warp_q];
-  __shared__ faiss::gpu::KeyValuePair<value_t, value_idx>
-    shared_memV[kNumWarps * warp_q];
+  __shared__ faiss::gpu::KeyValuePair<value_t, value_idx> shared_memV[kNumWarps * warp_q];
 
   // TODO: Separate kernels for different widths:
   // 1. Very small (between 3 and 32) just use registers for columns of "blockIdx.x"
   // 2. Can fit comfortably in shared memory (32 to a few thousand?)
   // 3. Load each time individually.
-  const value_t *x_ptr = X + (n_cols * blockIdx.x);
+  const value_t* x_ptr = X + (n_cols * blockIdx.x);
 
   // Use registers only for 2d or 3d
   value_t local_x_ptr[col_q];
@@ -308,11 +347,18 @@ __global__ void block_rbc_kernel_registers(
   }
 
   // Each warp works on 1 R
-  faiss::gpu::KeyValueBlockSelect<value_t, value_idx, false,
-                                  faiss::gpu::Comparator<value_t>, warp_q,
-                                  thread_q, tpb>
+  faiss::gpu::KeyValueBlockSelect<value_t,
+                                  value_idx,
+                                  false,
+                                  faiss::gpu::Comparator<value_t>,
+                                  warp_q,
+                                  thread_q,
+                                  tpb>
     heap(faiss::gpu::Limits<value_t>::getMax(),
-         faiss::gpu::Limits<value_t>::getMax(), -1, shared_memK, shared_memV,
+         faiss::gpu::Limits<value_t>::getMax(),
+         -1,
+         shared_memK,
+         shared_memV,
          k);
 
   value_t min_R_dist = R_knn_dists[blockIdx.x * k + (k - 1)];
@@ -327,7 +373,7 @@ __global__ void block_rbc_kernel_registers(
   // determining if the distance could even potentially be in the heap.
   for (value_int cur_k = 0; cur_k < k; ++cur_k) {
     // index and distance to current blockIdx.x's closest landmark
-    value_t cur_R_dist = R_knn_dists[blockIdx.x * k + cur_k];
+    value_t cur_R_dist  = R_knn_dists[blockIdx.x * k + cur_k];
     value_idx cur_R_ind = R_knn_inds[blockIdx.x * k + cur_k];
 
     // Equation (2) in Cayton's paper- prune out R's which are > 3 * p(q, r_q)
@@ -336,38 +382,37 @@ __global__ void block_rbc_kernel_registers(
 
     // The whole warp should iterate through the elements in the current R
     value_idx R_start_offset = R_indptr[cur_R_ind];
-    value_idx R_stop_offset = R_indptr[cur_R_ind + 1];
+    value_idx R_stop_offset  = R_indptr[cur_R_ind + 1];
 
     value_idx R_size = R_stop_offset - R_start_offset;
 
-    value_int limit =
-      faiss::gpu::utils::roundDown(R_size, faiss::gpu::kWarpSize);
-    value_int i = threadIdx.x;
+    value_int limit = faiss::gpu::utils::roundDown(R_size, faiss::gpu::kWarpSize);
+    value_int i     = threadIdx.x;
     for (; i < limit; i += tpb) {
       // Index and distance of current candidate's nearest landmark
       value_idx cur_candidate_ind = R_1nn_cols[R_start_offset + i];
-      value_t cur_candidate_dist = R_1nn_dists[R_start_offset + i];
+      value_t cur_candidate_dist  = R_1nn_dists[R_start_offset + i];
 
       // Take 2 landmarks l_1 and l_2 where l_1 is the furthest point in the heap
       // and l_2 is the current landmark R. s is the current data point and
       // t is the new candidate data point. We know that:
-      // d(s, t) cannot possibly be any smaller than | d(s, l_1) - d(l_1, l_2) | * | d(l_1, l_2) - d(l_2, t) | - d(s, l_1) * d(l_2, t)
+      // d(s, t) cannot possibly be any smaller than | d(s, l_1) - d(l_1, l_2) | * | d(l_1, l_2) -
+      // d(l_2, t) | - d(s, l_1) * d(l_2, t)
 
-      // Therefore, if d(s, t) >= d(s, l_1) from the computation above, we know that the distance to the candidate point
-      // cannot possibly be in the nearest neighbors. However, if d(s, t) < d(s, l_1) then we should compute the
-      // distance because it's possible it could be smaller.
+      // Therefore, if d(s, t) >= d(s, l_1) from the computation above, we know that the distance to
+      // the candidate point cannot possibly be in the nearest neighbors. However, if d(s, t) < d(s,
+      // l_1) then we should compute the distance because it's possible it could be smaller.
       //
-      value_t z = heap.warpKTopRDist == 0.00
-                    ? 0.0
-                    : (abs(heap.warpKTop - heap.warpKTopRDist) *
-                         abs(heap.warpKTopRDist - cur_candidate_dist) -
-                       heap.warpKTop * cur_candidate_dist) /
-                        heap.warpKTopRDist;
-
-      z = isnan(z) ? 0.0 : z;
+      value_t z = heap.warpKTopRDist == 0.00 ? 0.0
+                                             : (abs(heap.warpKTop - heap.warpKTopRDist) *
+                                                  abs(heap.warpKTopRDist - cur_candidate_dist) -
+                                                heap.warpKTop * cur_candidate_dist) /
+                                                 heap.warpKTopRDist;
+
+      z            = isnan(z) ? 0.0 : z;
       value_t dist = std::numeric_limits<value_t>::max();
       if (i < k || z <= heap.warpKTop) {
-        const value_t *y_ptr = X_index + (n_cols * cur_candidate_ind);
+        const value_t* y_ptr = X_index + (n_cols * cur_candidate_ind);
         value_t local_y_ptr[col_q];
         for (value_int j = 0; j < n_cols; ++j) {
           local_y_ptr[j] = y_ptr[j];
@@ -381,18 +426,17 @@ __global__ void block_rbc_kernel_registers(
 
     if (i < R_size) {
       value_idx cur_candidate_ind = R_1nn_cols[R_start_offset + i];
-      value_t cur_candidate_dist = R_1nn_dists[R_start_offset + i];
-      value_t z = heap.warpKTopRDist == 0.0
-                    ? 0.0
-                    : (abs(heap.warpKTop - heap.warpKTopRDist) *
-                         abs(heap.warpKTopRDist - cur_candidate_dist) -
-                       heap.warpKTop * cur_candidate_dist) /
-                        heap.warpKTopRDist;
-
-      z = isnan(z) ? 0.0 : z;
+      value_t cur_candidate_dist  = R_1nn_dists[R_start_offset + i];
+      value_t z                   = heap.warpKTopRDist == 0.0 ? 0.0
+                                                              : (abs(heap.warpKTop - heap.warpKTopRDist) *
+                                                 abs(heap.warpKTopRDist - cur_candidate_dist) -
+                                               heap.warpKTop * cur_candidate_dist) /
+                                                heap.warpKTopRDist;
+
+      z            = isnan(z) ? 0.0 : z;
       value_t dist = std::numeric_limits<value_t>::max();
       if (i < k || z <= heap.warpKTop) {
-        const value_t *y_ptr = X_index + (n_cols * cur_candidate_ind);
+        const value_t* y_ptr = X_index + (n_cols * cur_candidate_ind);
         value_t local_y_ptr[col_q];
         for (value_int j = 0; j < n_cols; ++j) {
           local_y_ptr[j] = y_ptr[j];
@@ -411,124 +455,327 @@ __global__ void block_rbc_kernel_registers(
 
   for (int i = threadIdx.x; i < k; i += tpb) {
     out_dists[blockIdx.x * k + i] = shared_memK[i];
-    out_inds[blockIdx.x * k + i] = shared_memV[i].value;
+    out_inds[blockIdx.x * k + i]  = shared_memV[i].value;
   }
 }
 
-template <typename value_idx, typename value_t,
-          typename value_int = std::uint32_t, typename dist_func>
-void rbc_low_dim_pass_one(const raft::handle_t &handle,
-                          BallCoverIndex<value_idx, value_t, value_int> &index,
-                          const value_t *query, const value_int n_query_rows,
-                          value_int k, const value_idx *R_knn_inds,
-                          const value_t *R_knn_dists, dist_func dfunc,
-                          value_idx *inds, value_t *dists, float weight,
-                          value_int *dists_counter) {
+template <typename value_idx,
+          typename value_t,
+          typename value_int = std::uint32_t,
+          typename dist_func>
+void rbc_low_dim_pass_one(const raft::handle_t& handle,
+                          BallCoverIndex<value_idx, value_t, value_int>& index,
+                          const value_t* query,
+                          const value_int n_query_rows,
+                          value_int k,
+                          const value_idx* R_knn_inds,
+                          const value_t* R_knn_dists,
+                          dist_func dfunc,
+                          value_idx* inds,
+                          value_t* dists,
+                          float weight,
+                          value_int* dists_counter)
+{
   if (k <= 32)
     block_rbc_kernel_registers<value_idx, value_t, 32, 2, 128, 2, value_int>
-      <<<n_query_rows, 128, 0, handle.get_stream()>>>(
-        index.get_X(), query, index.n, R_knn_inds, R_knn_dists, index.m, k,
-        index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(),
-        inds, dists, dists_counter, index.get_R_radius(), dfunc, weight);
+      <<<n_query_rows, 128, 0, handle.get_stream()>>>(index.get_X(),
+                                                      query,
+                                                      index.n,
+                                                      R_knn_inds,
+                                                      R_knn_dists,
+                                                      index.m,
+                                                      k,
+                                                      index.get_R_indptr(),
+                                                      index.get_R_1nn_cols(),
+                                                      index.get_R_1nn_dists(),
+                                                      inds,
+                                                      dists,
+                                                      dists_counter,
+                                                      index.get_R_radius(),
+                                                      dfunc,
+                                                      weight);
 
   else if (k <= 64)
     block_rbc_kernel_registers<value_idx, value_t, 64, 3, 128, 2, value_int>
-      <<<n_query_rows, 128, 0, handle.get_stream()>>>(
-        index.get_X(), query, index.n, R_knn_inds, R_knn_dists, index.m, k,
-        index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(),
-        inds, dists, dists_counter, index.get_R_radius(), dfunc, weight);
+      <<<n_query_rows, 128, 0, handle.get_stream()>>>(index.get_X(),
+                                                      query,
+                                                      index.n,
+                                                      R_knn_inds,
+                                                      R_knn_dists,
+                                                      index.m,
+                                                      k,
+                                                      index.get_R_indptr(),
+                                                      index.get_R_1nn_cols(),
+                                                      index.get_R_1nn_dists(),
+                                                      inds,
+                                                      dists,
+                                                      dists_counter,
+                                                      index.get_R_radius(),
+                                                      dfunc,
+                                                      weight);
   else if (k <= 128)
     block_rbc_kernel_registers<value_idx, value_t, 128, 3, 128, 2, value_int>
-      <<<n_query_rows, 128, 0, handle.get_stream()>>>(
-        index.get_X(), query, index.n, R_knn_inds, R_knn_dists, index.m, k,
-        index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(),
-        inds, dists, dists_counter, index.get_R_radius(), dfunc, weight);
+      <<<n_query_rows, 128, 0, handle.get_stream()>>>(index.get_X(),
+                                                      query,
+                                                      index.n,
+                                                      R_knn_inds,
+                                                      R_knn_dists,
+                                                      index.m,
+                                                      k,
+                                                      index.get_R_indptr(),
+                                                      index.get_R_1nn_cols(),
+                                                      index.get_R_1nn_dists(),
+                                                      inds,
+                                                      dists,
+                                                      dists_counter,
+                                                      index.get_R_radius(),
+                                                      dfunc,
+                                                      weight);
 
   else if (k <= 256)
     block_rbc_kernel_registers<value_idx, value_t, 256, 4, 128, 2, value_int>
-      <<<n_query_rows, 128, 0, handle.get_stream()>>>(
-        index.get_X(), query, index.n, R_knn_inds, R_knn_dists, index.m, k,
-        index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(),
-        inds, dists, dists_counter, index.get_R_radius(), dfunc, weight);
+      <<<n_query_rows, 128, 0, handle.get_stream()>>>(index.get_X(),
+                                                      query,
+                                                      index.n,
+                                                      R_knn_inds,
+                                                      R_knn_dists,
+                                                      index.m,
+                                                      k,
+                                                      index.get_R_indptr(),
+                                                      index.get_R_1nn_cols(),
+                                                      index.get_R_1nn_dists(),
+                                                      inds,
+                                                      dists,
+                                                      dists_counter,
+                                                      index.get_R_radius(),
+                                                      dfunc,
+                                                      weight);
 
   else if (k <= 512)
     block_rbc_kernel_registers<value_idx, value_t, 512, 8, 64, 2, value_int>
-      <<<n_query_rows, 64, 0, handle.get_stream()>>>(
-        index.get_X(), query, index.n, R_knn_inds, R_knn_dists, index.m, k,
-        index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(),
-        inds, dists, dists_counter, index.get_R_radius(), dfunc, weight);
+      <<<n_query_rows, 64, 0, handle.get_stream()>>>(index.get_X(),
+                                                     query,
+                                                     index.n,
+                                                     R_knn_inds,
+                                                     R_knn_dists,
+                                                     index.m,
+                                                     k,
+                                                     index.get_R_indptr(),
+                                                     index.get_R_1nn_cols(),
+                                                     index.get_R_1nn_dists(),
+                                                     inds,
+                                                     dists,
+                                                     dists_counter,
+                                                     index.get_R_radius(),
+                                                     dfunc,
+                                                     weight);
 
   else if (k <= 1024)
     block_rbc_kernel_registers<value_idx, value_t, 1024, 8, 64, 2, value_int>
-      <<<n_query_rows, 64, 0, handle.get_stream()>>>(
-        index.get_X(), query, index.n, R_knn_inds, R_knn_dists, index.m, k,
-        index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(),
-        inds, dists, dists_counter, index.get_R_radius(), dfunc, weight);
+      <<<n_query_rows, 64, 0, handle.get_stream()>>>(index.get_X(),
+                                                     query,
+                                                     index.n,
+                                                     R_knn_inds,
+                                                     R_knn_dists,
+                                                     index.m,
+                                                     k,
+                                                     index.get_R_indptr(),
+                                                     index.get_R_1nn_cols(),
+                                                     index.get_R_1nn_dists(),
+                                                     inds,
+                                                     dists,
+                                                     dists_counter,
+                                                     index.get_R_radius(),
+                                                     dfunc,
+                                                     weight);
 }
 
-template <typename value_idx, typename value_t,
-          typename value_int = std::uint32_t, typename dist_func>
-void rbc_low_dim_pass_two(const raft::handle_t &handle,
-                          BallCoverIndex<value_idx, value_t, value_int> &index,
-                          const value_t *query, const value_int n_query_rows,
-                          value_int k, const value_idx *R_knn_inds,
-                          const value_t *R_knn_dists, dist_func dfunc,
-                          value_idx *inds, value_t *dists, float weight,
-                          value_int *post_dists_counter) {
+template <typename value_idx,
+          typename value_t,
+          typename value_int = std::uint32_t,
+          typename dist_func>
+void rbc_low_dim_pass_two(const raft::handle_t& handle,
+                          BallCoverIndex<value_idx, value_t, value_int>& index,
+                          const value_t* query,
+                          const value_int n_query_rows,
+                          value_int k,
+                          const value_idx* R_knn_inds,
+                          const value_t* R_knn_dists,
+                          dist_func dfunc,
+                          value_idx* inds,
+                          value_t* dists,
+                          float weight,
+                          value_int* post_dists_counter)
+{
   const value_int bitset_size = ceil(index.n_landmarks / 32.0);
 
-  rmm::device_uvector<std::uint32_t> bitset(bitset_size * index.m,
-                                            handle.get_stream());
+  rmm::device_uvector<std::uint32_t> bitset(bitset_size * index.m, handle.get_stream());
 
   perform_post_filter_registers<value_idx, value_t, value_int, 128, dist_func>
-    <<<n_query_rows, 128, bitset_size * sizeof(std::uint32_t),
-       handle.get_stream()>>>(index.get_X(), index.n, R_knn_inds, R_knn_dists,
-                              index.get_R_radius(), index.get_R(),
-                              index.n_landmarks, bitset_size, k, dfunc,
-                              bitset.data(), weight);
+    <<<n_query_rows, 128, bitset_size * sizeof(std::uint32_t), handle.get_stream()>>>(
+      index.get_X(),
+      index.n,
+      R_knn_inds,
+      R_knn_dists,
+      index.get_R_radius(),
+      index.get_R(),
+      index.n_landmarks,
+      bitset_size,
+      k,
+      dfunc,
+      bitset.data(),
+      weight);
 
   if (k <= 32)
-    compute_final_dists_registers<value_idx, value_t, value_int, std::uint32_t,
-                                  dist_func, 32, 2, 128, 2>
-      <<<n_query_rows, 128, 0, handle.get_stream()>>>(
-        index.get_X(), query, index.n, bitset.data(), bitset_size, R_knn_dists,
-        index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(),
-        inds, dists, index.n_landmarks, k, dfunc, post_dists_counter);
+    compute_final_dists_registers<value_idx,
+                                  value_t,
+                                  value_int,
+                                  std::uint32_t,
+                                  dist_func,
+                                  32,
+                                  2,
+                                  128,
+                                  2>
+      <<<n_query_rows, 128, 0, handle.get_stream()>>>(index.get_X(),
+                                                      query,
+                                                      index.n,
+                                                      bitset.data(),
+                                                      bitset_size,
+                                                      R_knn_dists,
+                                                      index.get_R_indptr(),
+                                                      index.get_R_1nn_cols(),
+                                                      index.get_R_1nn_dists(),
+                                                      inds,
+                                                      dists,
+                                                      index.n_landmarks,
+                                                      k,
+                                                      dfunc,
+                                                      post_dists_counter);
   else if (k <= 64)
-    compute_final_dists_registers<value_idx, value_t, value_int, std::uint32_t,
-                                  dist_func, 64, 3, 128, 2>
-      <<<n_query_rows, 128, 0, handle.get_stream()>>>(
-        index.get_X(), query, index.n, bitset.data(), bitset_size, R_knn_dists,
-        index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(),
-        inds, dists, index.n_landmarks, k, dfunc, post_dists_counter);
+    compute_final_dists_registers<value_idx,
+                                  value_t,
+                                  value_int,
+                                  std::uint32_t,
+                                  dist_func,
+                                  64,
+                                  3,
+                                  128,
+                                  2>
+      <<<n_query_rows, 128, 0, handle.get_stream()>>>(index.get_X(),
+                                                      query,
+                                                      index.n,
+                                                      bitset.data(),
+                                                      bitset_size,
+                                                      R_knn_dists,
+                                                      index.get_R_indptr(),
+                                                      index.get_R_1nn_cols(),
+                                                      index.get_R_1nn_dists(),
+                                                      inds,
+                                                      dists,
+                                                      index.n_landmarks,
+                                                      k,
+                                                      dfunc,
+                                                      post_dists_counter);
   else if (k <= 128)
-    compute_final_dists_registers<value_idx, value_t, value_int, std::uint32_t,
-                                  dist_func, 128, 3, 128, 2>
-      <<<n_query_rows, 128, 0, handle.get_stream()>>>(
-        index.get_X(), query, index.n, bitset.data(), bitset_size, R_knn_dists,
-        index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(),
-        inds, dists, index.n_landmarks, k, dfunc, post_dists_counter);
+    compute_final_dists_registers<value_idx,
+                                  value_t,
+                                  value_int,
+                                  std::uint32_t,
+                                  dist_func,
+                                  128,
+                                  3,
+                                  128,
+                                  2>
+      <<<n_query_rows, 128, 0, handle.get_stream()>>>(index.get_X(),
+                                                      query,
+                                                      index.n,
+                                                      bitset.data(),
+                                                      bitset_size,
+                                                      R_knn_dists,
+                                                      index.get_R_indptr(),
+                                                      index.get_R_1nn_cols(),
+                                                      index.get_R_1nn_dists(),
+                                                      inds,
+                                                      dists,
+                                                      index.n_landmarks,
+                                                      k,
+                                                      dfunc,
+                                                      post_dists_counter);
   else if (k <= 256)
-    compute_final_dists_registers<value_idx, value_t, value_int, std::uint32_t,
-                                  dist_func, 256, 4, 128, 2>
-      <<<n_query_rows, 128, 0, handle.get_stream()>>>(
-        index.get_X(), query, index.n, bitset.data(), bitset_size, R_knn_dists,
-        index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(),
-        inds, dists, index.n_landmarks, k, dfunc, post_dists_counter);
+    compute_final_dists_registers<value_idx,
+                                  value_t,
+                                  value_int,
+                                  std::uint32_t,
+                                  dist_func,
+                                  256,
+                                  4,
+                                  128,
+                                  2>
+      <<<n_query_rows, 128, 0, handle.get_stream()>>>(index.get_X(),
+                                                      query,
+                                                      index.n,
+                                                      bitset.data(),
+                                                      bitset_size,
+                                                      R_knn_dists,
+                                                      index.get_R_indptr(),
+                                                      index.get_R_1nn_cols(),
+                                                      index.get_R_1nn_dists(),
+                                                      inds,
+                                                      dists,
+                                                      index.n_landmarks,
+                                                      k,
+                                                      dfunc,
+                                                      post_dists_counter);
   else if (k <= 512)
-    compute_final_dists_registers<value_idx, value_t, value_int, std::uint32_t,
-                                  dist_func, 512, 8, 64, 2>
-      <<<n_query_rows, 64, 0, handle.get_stream()>>>(
-        index.get_X(), query, index.n, bitset.data(), bitset_size, R_knn_dists,
-        index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(),
-        inds, dists, index.n_landmarks, k, dfunc, post_dists_counter);
+    compute_final_dists_registers<value_idx,
+                                  value_t,
+                                  value_int,
+                                  std::uint32_t,
+                                  dist_func,
+                                  512,
+                                  8,
+                                  64,
+                                  2>
+      <<<n_query_rows, 64, 0, handle.get_stream()>>>(index.get_X(),
+                                                     query,
+                                                     index.n,
+                                                     bitset.data(),
+                                                     bitset_size,
+                                                     R_knn_dists,
+                                                     index.get_R_indptr(),
+                                                     index.get_R_1nn_cols(),
+                                                     index.get_R_1nn_dists(),
+                                                     inds,
+                                                     dists,
+                                                     index.n_landmarks,
+                                                     k,
+                                                     dfunc,
+                                                     post_dists_counter);
   else if (k <= 1024)
-    compute_final_dists_registers<value_idx, value_t, value_int, std::uint32_t,
-                                  dist_func, 1024, 8, 64, 2>
-      <<<n_query_rows, 64, 0, handle.get_stream()>>>(
-        index.get_X(), query, index.n, bitset.data(), bitset_size, R_knn_dists,
-        index.get_R_indptr(), index.get_R_1nn_cols(), index.get_R_1nn_dists(),
-        inds, dists, index.n_landmarks, k, dfunc, post_dists_counter);
+    compute_final_dists_registers<value_idx,
+                                  value_t,
+                                  value_int,
+                                  std::uint32_t,
+                                  dist_func,
+                                  1024,
+                                  8,
+                                  64,
+                                  2>
+      <<<n_query_rows, 64, 0, handle.get_stream()>>>(index.get_X(),
+                                                     query,
+                                                     index.n,
+                                                     bitset.data(),
+                                                     bitset_size,
+                                                     R_knn_dists,
+                                                     index.get_R_indptr(),
+                                                     index.get_R_1nn_cols(),
+                                                     index.get_R_1nn_dists(),
+                                                     inds,
+                                                     dists,
+                                                     index.n_landmarks,
+                                                     k,
+                                                     dfunc,
+                                                     post_dists_counter);
 }
 
 };  // namespace detail
diff --git a/cpp/include/raft/spatial/knn/detail/block_select_faiss.cuh b/cpp/include/raft/spatial/knn/detail/block_select_faiss.cuh
index d2f7bc2210..a53a5b03e6 100644
--- a/cpp/include/raft/spatial/knn/detail/block_select_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/block_select_faiss.cuh
@@ -25,15 +25,19 @@ namespace gpu {
 
 // `Dir` true, produce largest values.
 // `Dir` false, produce smallest values.
-template <typename K, typename V, bool Dir, typename Comp, int NumWarpQ,
-          int NumThreadQ, int ThreadsPerBlock>
+template <typename K,
+          typename V,
+          bool Dir,
+          typename Comp,
+          int NumWarpQ,
+          int NumThreadQ,
+          int ThreadsPerBlock>
 struct KeyValueBlockSelect {
-  static constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;
+  static constexpr int kNumWarps          = ThreadsPerBlock / kWarpSize;
   static constexpr int kTotalWarpSortSize = NumWarpQ;
 
-  __device__ inline KeyValueBlockSelect(K initKVal, K initVKey, V initVVal,
-                                        K* smemK, KeyValuePair<K, V>* smemV,
-                                        int k)
+  __device__ inline KeyValueBlockSelect(
+    K initKVal, K initVKey, V initVVal, K* smemK, KeyValuePair<K, V>* smemV, int k)
     : initK(initKVal),
       initVk(initVKey),
       initVv(initVVal),
@@ -42,53 +46,55 @@ struct KeyValueBlockSelect {
       warpKTopRDist(initKVal),
       sharedK(smemK),
       sharedV(smemV),
-      kMinus1(k - 1) {
-    static_assert(utils::isPowerOf2(ThreadsPerBlock),
-                  "threads must be a power-of-2");
+      kMinus1(k - 1)
+  {
+    static_assert(utils::isPowerOf2(ThreadsPerBlock), "threads must be a power-of-2");
     static_assert(utils::isPowerOf2(NumWarpQ), "warp queue must be power-of-2");
 
     // Fill the per-thread queue keys with the default value
 #pragma unroll
     for (int i = 0; i < NumThreadQ; ++i) {
-      threadK[i] = initK;
-      threadV[i].key = initVk;
+      threadK[i]       = initK;
+      threadV[i].key   = initVk;
       threadV[i].value = initVv;
     }
 
     int laneId = getLaneId();
     int warpId = threadIdx.x / kWarpSize;
-    warpK = sharedK + warpId * kTotalWarpSortSize;
-    warpV = sharedV + warpId * kTotalWarpSortSize;
+    warpK      = sharedK + warpId * kTotalWarpSortSize;
+    warpV      = sharedV + warpId * kTotalWarpSortSize;
 
     // Fill warp queue (only the actual queue space is fine, not where
     // we write the per-thread queues for merging)
     for (int i = laneId; i < NumWarpQ; i += kWarpSize) {
-      warpK[i] = initK;
-      warpV[i].key = initVk;
+      warpK[i]       = initK;
+      warpV[i].key   = initVk;
       warpV[i].value = initVv;
     }
 
     warpFence();
   }
 
-  __device__ inline void addThreadQ(K k, K vk, V vv) {
+  __device__ inline void addThreadQ(K k, K vk, V vv)
+  {
     if (Dir ? Comp::gt(k, warpKTop) : Comp::lt(k, warpKTop)) {
       // Rotate right
 #pragma unroll
       for (int i = NumThreadQ - 1; i > 0; --i) {
-        threadK[i] = threadK[i - 1];
-        threadV[i].key = threadV[i - 1].key;
+        threadK[i]       = threadK[i - 1];
+        threadV[i].key   = threadV[i - 1].key;
         threadV[i].value = threadV[i - 1].value;
       }
 
-      threadK[0] = k;
-      threadV[0].key = vk;
+      threadK[0]       = k;
+      threadV[0].key   = vk;
       threadV[0].value = vv;
       ++numVals;
     }
   }
 
-  __device__ inline void checkThreadQ() {
+  __device__ inline void checkThreadQ()
+  {
     bool needSort = (numVals == NumThreadQ);
 
 #if CUDA_VERSION >= 9000
@@ -111,13 +117,13 @@ struct KeyValueBlockSelect {
 
 #pragma unroll
     for (int i = 0; i < NumThreadQ; ++i) {
-      threadK[i] = initK;
-      threadV[i].key = initVk;
+      threadK[i]       = initK;
+      threadV[i].key   = initVk;
       threadV[i].value = initVv;
     }
 
     // We have to beat at least this element
-    warpKTop = warpK[kMinus1];
+    warpKTop      = warpK[kMinus1];
     warpKTopRDist = warpV[kMinus1].key;
 
     warpFence();
@@ -126,7 +132,8 @@ struct KeyValueBlockSelect {
   /// This function handles sorting and merging together the
   /// per-thread queues with the warp-wide queue, creating a sorted
   /// list across both
-  __device__ inline void mergeWarpQ() {
+  __device__ inline void mergeWarpQ()
+  {
     int laneId = getLaneId();
 
     // Sort all of the per-thread queues
@@ -138,8 +145,8 @@ struct KeyValueBlockSelect {
 
 #pragma unroll
     for (int i = 0; i < kNumWarpQRegisters; ++i) {
-      warpKRegisters[i] = warpK[i * kWarpSize + laneId];
-      warpVRegisters[i].key = warpV[i * kWarpSize + laneId].key;
+      warpKRegisters[i]       = warpK[i * kWarpSize + laneId];
+      warpVRegisters[i].key   = warpV[i * kWarpSize + laneId].key;
       warpVRegisters[i].value = warpV[i * kWarpSize + laneId].value;
     }
 
@@ -148,15 +155,14 @@ struct KeyValueBlockSelect {
     // The warp queue is already sorted, and now that we've sorted the
     // per-thread queue, merge both sorted lists together, producing
     // one sorted list
-    warpMergeAnyRegistersKVP<K, V, kNumWarpQRegisters, NumThreadQ, !Dir, Comp,
-                             false>(warpKRegisters, warpVRegisters, threadK,
-                                    threadV);
+    warpMergeAnyRegistersKVP<K, V, kNumWarpQRegisters, NumThreadQ, !Dir, Comp, false>(
+      warpKRegisters, warpVRegisters, threadK, threadV);
 
     // Write back out the warp queue
 #pragma unroll
     for (int i = 0; i < kNumWarpQRegisters; ++i) {
-      warpK[i * kWarpSize + laneId] = warpKRegisters[i];
-      warpV[i * kWarpSize + laneId].key = warpVRegisters[i].key;
+      warpK[i * kWarpSize + laneId]       = warpKRegisters[i];
+      warpV[i * kWarpSize + laneId].key   = warpVRegisters[i].key;
       warpV[i * kWarpSize + laneId].value = warpVRegisters[i].value;
     }
 
@@ -165,12 +171,14 @@ struct KeyValueBlockSelect {
 
   /// WARNING: all threads in a warp must participate in this.
   /// Otherwise, you must call the constituent parts separately.
-  __device__ inline void add(K k, K vk, V vv) {
+  __device__ inline void add(K k, K vk, V vv)
+  {
     addThreadQ(k, vk, vv);
     checkThreadQ();
   }
 
-  __device__ inline void reduce() {
+  __device__ inline void reduce()
+  {
     // Have all warps dump and merge their queues; this will produce
     // the final per-warp results
     mergeWarpQ();
@@ -182,8 +190,8 @@ struct KeyValueBlockSelect {
     // All warp queues are contiguous in smem.
     // Now, we have kNumWarps lists of NumWarpQ elements.
     // This is a power of 2.
-    FinalBlockMerge<kNumWarps, ThreadsPerBlock, K, KeyValuePair<K, V>, NumWarpQ,
-                    Dir, Comp>::merge(sharedK, sharedV);
+    FinalBlockMerge<kNumWarps, ThreadsPerBlock, K, KeyValuePair<K, V>, NumWarpQ, Dir, Comp>::merge(
+      sharedK, sharedV);
 
     // The block-wide merge has a trailing syncthreads
   }
diff --git a/cpp/include/raft/spatial/knn/detail/common_faiss.h b/cpp/include/raft/spatial/knn/detail/common_faiss.h
index 0c0398a336..5618186dfc 100644
--- a/cpp/include/raft/spatial/knn/detail/common_faiss.h
+++ b/cpp/include/raft/spatial/knn/detail/common_faiss.h
@@ -27,37 +27,26 @@ namespace spatial {
 namespace knn {
 namespace detail {
 
-inline faiss::MetricType build_faiss_metric(
-  raft::distance::DistanceType metric) {
+inline faiss::MetricType build_faiss_metric(raft::distance::DistanceType metric)
+{
   switch (metric) {
     case raft::distance::DistanceType::CosineExpanded:
       return faiss::MetricType::METRIC_INNER_PRODUCT;
     case raft::distance::DistanceType::CorrelationExpanded:
       return faiss::MetricType::METRIC_INNER_PRODUCT;
-    case raft::distance::DistanceType::L2Expanded:
-      return faiss::MetricType::METRIC_L2;
-    case raft::distance::DistanceType::L2Unexpanded:
-      return faiss::MetricType::METRIC_L2;
-    case raft::distance::DistanceType::L2SqrtExpanded:
-      return faiss::MetricType::METRIC_L2;
-    case raft::distance::DistanceType::L2SqrtUnexpanded:
-      return faiss::MetricType::METRIC_L2;
-    case raft::distance::DistanceType::L1:
-      return faiss::MetricType::METRIC_L1;
-    case raft::distance::DistanceType::InnerProduct:
-      return faiss::MetricType::METRIC_INNER_PRODUCT;
-    case raft::distance::DistanceType::LpUnexpanded:
-      return faiss::MetricType::METRIC_Lp;
-    case raft::distance::DistanceType::Linf:
-      return faiss::MetricType::METRIC_Linf;
-    case raft::distance::DistanceType::Canberra:
-      return faiss::MetricType::METRIC_Canberra;
-    case raft::distance::DistanceType::BrayCurtis:
-      return faiss::MetricType::METRIC_BrayCurtis;
+    case raft::distance::DistanceType::L2Expanded: return faiss::MetricType::METRIC_L2;
+    case raft::distance::DistanceType::L2Unexpanded: return faiss::MetricType::METRIC_L2;
+    case raft::distance::DistanceType::L2SqrtExpanded: return faiss::MetricType::METRIC_L2;
+    case raft::distance::DistanceType::L2SqrtUnexpanded: return faiss::MetricType::METRIC_L2;
+    case raft::distance::DistanceType::L1: return faiss::MetricType::METRIC_L1;
+    case raft::distance::DistanceType::InnerProduct: return faiss::MetricType::METRIC_INNER_PRODUCT;
+    case raft::distance::DistanceType::LpUnexpanded: return faiss::MetricType::METRIC_Lp;
+    case raft::distance::DistanceType::Linf: return faiss::MetricType::METRIC_Linf;
+    case raft::distance::DistanceType::Canberra: return faiss::MetricType::METRIC_Canberra;
+    case raft::distance::DistanceType::BrayCurtis: return faiss::MetricType::METRIC_BrayCurtis;
     case raft::distance::DistanceType::JensenShannon:
       return faiss::MetricType::METRIC_JensenShannon;
-    default:
-      THROW("MetricType not supported: %d", metric);
+    default: THROW("MetricType not supported: %d", metric);
   }
 }
 
diff --git a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh
index f774d9d1ea..47fc62066d 100644
--- a/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh
+++ b/cpp/include/raft/spatial/knn/detail/fused_l2_knn.cuh
@@ -29,19 +29,21 @@ namespace knn {
 namespace detail {
 
 template <typename Policy, typename Pair, typename myWarpSelect, typename IdxT>
-DI void loadAllWarpQShmem(myWarpSelect &heapArr, Pair *shDumpKV, const IdxT m,
-                          const unsigned int numOfNN) {
+DI void loadAllWarpQShmem(myWarpSelect& heapArr,
+                          Pair* shDumpKV,
+                          const IdxT m,
+                          const unsigned int numOfNN)
+{
   const int lid = raft::laneId();
 #pragma unroll
   for (int i = 0; i < Policy::AccRowsPerTh; ++i) {
-    const auto rowId =
-      (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows;
+    const auto rowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows;
     if (rowId < m) {
 #pragma unroll
       for (int j = 0; j < heapArr[i]->kNumWarpQRegisters; ++j) {
         const int idx = j * warpSize + lid;
         if (idx < numOfNN) {
-          Pair KVPair = shDumpKV[rowId * numOfNN + idx];
+          Pair KVPair          = shDumpKV[rowId * numOfNN + idx];
           heapArr[i]->warpV[j] = KVPair.key;
           heapArr[i]->warpK[j] = KVPair.value;
         }
@@ -51,14 +53,17 @@ DI void loadAllWarpQShmem(myWarpSelect &heapArr, Pair *shDumpKV, const IdxT m,
 }
 
 template <typename Policy, typename Pair, typename myWarpSelect>
-DI void loadWarpQShmem(myWarpSelect &heapArr, Pair *shDumpKV, const int rowId,
-                       const unsigned int numOfNN) {
+DI void loadWarpQShmem(myWarpSelect& heapArr,
+                       Pair* shDumpKV,
+                       const int rowId,
+                       const unsigned int numOfNN)
+{
   const int lid = raft::laneId();
 #pragma unroll
   for (int j = 0; j < heapArr->kNumWarpQRegisters; ++j) {
     const int idx = j * warpSize + lid;
     if (idx < numOfNN) {
-      Pair KVPair = shDumpKV[rowId * numOfNN + idx];
+      Pair KVPair       = shDumpKV[rowId * numOfNN + idx];
       heapArr->warpV[j] = KVPair.key;
       heapArr->warpK[j] = KVPair.value;
     }
@@ -66,25 +71,31 @@ DI void loadWarpQShmem(myWarpSelect &heapArr, Pair *shDumpKV, const int rowId,
 }
 
 template <typename Policy, typename Pair, typename myWarpSelect, typename IdxT>
-DI void storeWarpQShmem(myWarpSelect &heapArr, Pair *shDumpKV, const IdxT rowId,
-                        const unsigned int numOfNN) {
+DI void storeWarpQShmem(myWarpSelect& heapArr,
+                        Pair* shDumpKV,
+                        const IdxT rowId,
+                        const unsigned int numOfNN)
+{
   const int lid = raft::laneId();
 
 #pragma unroll
   for (int j = 0; j < heapArr->kNumWarpQRegisters; ++j) {
     const int idx = j * warpSize + lid;
     if (idx < numOfNN) {
-      Pair otherKV = Pair(heapArr->warpV[j], heapArr->warpK[j]);
+      Pair otherKV                    = Pair(heapArr->warpV[j], heapArr->warpK[j]);
       shDumpKV[rowId * numOfNN + idx] = otherKV;
     }
   }
 }
 
-template <typename Policy, typename Pair, typename myWarpSelect, typename IdxT,
-          typename OutT>
-DI void storeWarpQGmem(myWarpSelect &heapArr, OutT *out_dists, IdxT *out_inds,
-                       const IdxT m, const unsigned int numOfNN,
-                       const IdxT starty) {
+template <typename Policy, typename Pair, typename myWarpSelect, typename IdxT, typename OutT>
+DI void storeWarpQGmem(myWarpSelect& heapArr,
+                       OutT* out_dists,
+                       IdxT* out_inds,
+                       const IdxT m,
+                       const unsigned int numOfNN,
+                       const IdxT starty)
+{
   const int lid = raft::laneId();
 #pragma unroll
   for (int i = 0; i < Policy::AccRowsPerTh; ++i) {
@@ -95,18 +106,21 @@ DI void storeWarpQGmem(myWarpSelect &heapArr, OutT *out_dists, IdxT *out_inds,
         const auto idx = j * warpSize + lid;
         if (idx < numOfNN) {
           out_dists[gmemRowId * numOfNN + idx] = heapArr[i]->warpK[j];
-          out_inds[gmemRowId * numOfNN + idx] = (IdxT)heapArr[i]->warpV[j];
+          out_inds[gmemRowId * numOfNN + idx]  = (IdxT)heapArr[i]->warpV[j];
         }
       }
     }
   }
 }
 
-template <typename Policy, typename Pair, typename myWarpSelect, typename IdxT,
-          typename OutT>
-DI void loadPrevTopKsGmemWarpQ(myWarpSelect &heapArr, OutT *out_dists,
-                               IdxT *out_inds, const IdxT m,
-                               const unsigned int numOfNN, const IdxT starty) {
+template <typename Policy, typename Pair, typename myWarpSelect, typename IdxT, typename OutT>
+DI void loadPrevTopKsGmemWarpQ(myWarpSelect& heapArr,
+                               OutT* out_dists,
+                               IdxT* out_inds,
+                               const IdxT m,
+                               const unsigned int numOfNN,
+                               const IdxT starty)
+{
   const int lid = raft::laneId();
 #pragma unroll
   for (int i = 0; i < Policy::AccRowsPerTh; ++i) {
@@ -121,17 +135,17 @@ DI void loadPrevTopKsGmemWarpQ(myWarpSelect &heapArr, OutT *out_dists,
         }
       }
       auto constexpr kLaneWarpKTop = heapArr[i]->kNumWarpQRegisters - 1;
-      heapArr[i]->warpKTop =
-        raft::shfl(heapArr[i]->warpK[kLaneWarpKTop], heapArr[i]->kLane);
+      heapArr[i]->warpKTop = raft::shfl(heapArr[i]->warpK[kLaneWarpKTop], heapArr[i]->kLane);
     }
   }
 }
 
 template <typename Pair, int NumWarpQRegs, typename myWarpSelect>
-DI void updateSortedWarpQ(myWarpSelect &heapArr, Pair *allWarpTopKs, int rowId,
-                          int finalNumVals, int startId = 0) {
+DI void updateSortedWarpQ(
+  myWarpSelect& heapArr, Pair* allWarpTopKs, int rowId, int finalNumVals, int startId = 0)
+{
   constexpr uint32_t mask = 0xffffffffu;
-  const int lid = raft::laneId();
+  const int lid           = raft::laneId();
   // calculate srcLane such that tid 0 -> 31, 1 -> 0,... 31 -> 30.
   // warp around 0 to 31 required for NN > 32
   const auto srcLane = (warpSize + (lid - 1)) & (warpSize - 1);
@@ -140,12 +154,11 @@ DI void updateSortedWarpQ(myWarpSelect &heapArr, Pair *allWarpTopKs, int rowId,
     Pair KVPair = allWarpTopKs[rowId * (256) + k];
 #pragma unroll
     for (int i = 0; i < NumWarpQRegs; i++) {
-      unsigned activeLanes =
-        __ballot_sync(mask, KVPair.value < heapArr->warpK[i]);
+      unsigned activeLanes = __ballot_sync(mask, KVPair.value < heapArr->warpK[i]);
       if (activeLanes) {
         Pair tempKV;
-        tempKV.value = raft::shfl(heapArr->warpK[i], srcLane);
-        tempKV.key = raft::shfl(heapArr->warpV[i], srcLane);
+        tempKV.value               = raft::shfl(heapArr->warpK[i], srcLane);
+        tempKV.key                 = raft::shfl(heapArr->warpV[i], srcLane);
         const auto firstActiveLane = __ffs(activeLanes) - 1;
         if (firstActiveLane == lid) {
           heapArr->warpK[i] = KVPair.value;
@@ -168,43 +181,60 @@ DI void updateSortedWarpQ(myWarpSelect &heapArr, Pair *allWarpTopKs, int rowId,
   }
 }
 
-template <bool useNorms, typename DataT, typename AccT, typename OutT,
-          typename IdxT, typename Policy, typename CoreLambda,
-          typename FinalLambda, int NumWarpQ, int NumThreadQ,
-          bool usePrevTopKs = false, bool isRowMajor = true>
-__global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN(
-  const DataT *x, const DataT *y, const DataT *_xn, const DataT *_yn,
-  const IdxT m, const IdxT n, const IdxT k, const IdxT lda, const IdxT ldb,
-  const IdxT ldd, CoreLambda core_op, FinalLambda fin_op, bool sqrt,
-  unsigned int numOfNN, int *mutexes, OutT *out_dists, IdxT *out_inds) {
+template <bool useNorms,
+          typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          typename Policy,
+          typename CoreLambda,
+          typename FinalLambda,
+          int NumWarpQ,
+          int NumThreadQ,
+          bool usePrevTopKs = false,
+          bool isRowMajor   = true>
+__global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN(const DataT* x,
+                                                                  const DataT* y,
+                                                                  const DataT* _xn,
+                                                                  const DataT* _yn,
+                                                                  const IdxT m,
+                                                                  const IdxT n,
+                                                                  const IdxT k,
+                                                                  const IdxT lda,
+                                                                  const IdxT ldb,
+                                                                  const IdxT ldd,
+                                                                  CoreLambda core_op,
+                                                                  FinalLambda fin_op,
+                                                                  bool sqrt,
+                                                                  unsigned int numOfNN,
+                                                                  int* mutexes,
+                                                                  OutT* out_dists,
+                                                                  IdxT* out_inds)
+{
   extern __shared__ char smem[];
 
   typedef cub::KeyValuePair<uint32_t, AccT> Pair;
   constexpr auto identity = std::numeric_limits<AccT>::max();
-  constexpr auto keyMax = std::numeric_limits<uint32_t>::max();
-  constexpr auto Dir = false;
-  typedef faiss::gpu::WarpSelect<
-    AccT, uint32_t, Dir, faiss::gpu::Comparator<AccT>, NumWarpQ, NumThreadQ, 32>
-    myWarpSelect;
-
-  auto rowEpilog_lambda = [m, n, numOfNN, out_dists, out_inds,
-                           mutexes] __device__(IdxT gridStrideY) {
-    if (gridDim.x == 1) {
-      return;
-    }
+  constexpr auto keyMax   = std::numeric_limits<uint32_t>::max();
+  constexpr auto Dir      = false;
+  typedef faiss::gpu::
+    WarpSelect<AccT, uint32_t, Dir, faiss::gpu::Comparator<AccT>, NumWarpQ, NumThreadQ, 32>
+      myWarpSelect;
 
-    volatile int *mutex = mutexes;
+  auto rowEpilog_lambda = [m, n, numOfNN, out_dists, out_inds, mutexes] __device__(
+                            IdxT gridStrideY) {
+    if (gridDim.x == 1) { return; }
 
-    Pair *shDumpKV = nullptr;
+    volatile int* mutex = mutexes;
+
+    Pair* shDumpKV = nullptr;
     if (useNorms) {
-      shDumpKV =
-        (Pair *)(&smem[Policy::SmemSize +
-                       ((Policy::Mblk + Policy::Nblk) * sizeof(DataT))]);
+      shDumpKV = (Pair*)(&smem[Policy::SmemSize + ((Policy::Mblk + Policy::Nblk) * sizeof(DataT))]);
     } else {
-      shDumpKV = (Pair *)(&smem[Policy::SmemSize]);
+      shDumpKV = (Pair*)(&smem[Policy::SmemSize]);
     }
 
-    const int lid = threadIdx.x % warpSize;
+    const int lid     = threadIdx.x % warpSize;
     const IdxT starty = gridStrideY + (threadIdx.x / Policy::AccThCols);
 
     //  0 -> consumer done consuming the buffer.
@@ -215,7 +245,7 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN(
       auto cta_processed = 0;
       myWarpSelect heapArr1(identity, keyMax, numOfNN);
       myWarpSelect heapArr2(identity, keyMax, numOfNN);
-      myWarpSelect *heapArr[] = {&heapArr1, &heapArr2};
+      myWarpSelect* heapArr[] = {&heapArr1, &heapArr2};
       __syncwarp();
 
       loadAllWarpQShmem<Policy, Pair>(heapArr, &shDumpKV[0], m, numOfNN);
@@ -224,7 +254,7 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN(
         if (threadIdx.x == 0) {
           int32_t old = -3;
           while (old != -1) {
-            old = atomicCAS((int *)&mutex[gridStrideY / Policy::Mblk], -2, -1);
+            old = atomicCAS((int*)&mutex[gridStrideY / Policy::Mblk], -2, -1);
           }
           __threadfence();
         }
@@ -232,18 +262,17 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN(
 
 #pragma unroll
         for (int i = 0; i < Policy::AccRowsPerTh; ++i) {
-          const auto rowId = starty + i * Policy::AccThRows;
-          const auto shMemRowId =
-            (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows;
+          const auto rowId      = starty + i * Policy::AccThRows;
+          const auto shMemRowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows;
 #pragma unroll
           for (int j = 0; j < heapArr[i]->kNumWarpQRegisters; ++j) {
             Pair otherKV;
-            otherKV.value = identity;
-            otherKV.key = keyMax;
+            otherKV.value  = identity;
+            otherKV.key    = keyMax;
             const auto idx = j * warpSize + lid;
             if (idx < numOfNN && rowId < m) {
-              otherKV.value = out_dists[rowId * numOfNN + idx];
-              otherKV.key = (uint32_t)out_inds[rowId * numOfNN + idx];
+              otherKV.value                        = out_dists[rowId * numOfNN + idx];
+              otherKV.key                          = (uint32_t)out_inds[rowId * numOfNN + idx];
               shDumpKV[shMemRowId * numOfNN + idx] = otherKV;
             }
           }
@@ -260,19 +289,16 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN(
 
 #pragma unroll
         for (int i = 0; i < Policy::AccRowsPerTh; ++i) {
-          const auto rowId = starty + i * Policy::AccThRows;
-          const auto shMemRowId =
-            (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows;
+          const auto rowId      = starty + i * Policy::AccThRows;
+          const auto shMemRowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows;
           if (rowId < m) {
 #pragma unroll
             for (int j = 0; j < heapArr[i]->kNumWarpQRegisters; ++j) {
               Pair otherKV;
-              otherKV.value = identity;
-              otherKV.key = keyMax;
+              otherKV.value  = identity;
+              otherKV.key    = keyMax;
               const auto idx = j * warpSize + lid;
-              if (idx < numOfNN) {
-                otherKV = shDumpKV[shMemRowId * numOfNN + idx];
-              }
+              if (idx < numOfNN) { otherKV = shDumpKV[shMemRowId * numOfNN + idx]; }
               heapArr[i]->add(otherKV.value, otherKV.key);
             }
           }
@@ -284,20 +310,17 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN(
         const auto rowId = starty + i * Policy::AccThRows;
         if (rowId < m) {
           bool needSort = (heapArr[i]->numVals > 0);
-          needSort = __any_sync(0xffffffff, needSort);
-          if (needSort) {
-            heapArr[i]->reduce();
-          }
+          needSort      = __any_sync(0xffffffff, needSort);
+          if (needSort) { heapArr[i]->reduce(); }
         }
       }
-      storeWarpQGmem<Policy, Pair>(heapArr, out_dists, out_inds, m, numOfNN,
-                                   starty);
+      storeWarpQGmem<Policy, Pair>(heapArr, out_dists, out_inds, m, numOfNN, starty);
     } else {
       if (threadIdx.x == 0) {
-        int32_t old = -1;
+        int32_t old    = -1;
         int32_t blkIdX = (int32_t)blockIdx.x;
         while (old != blkIdX) {
-          old = atomicCAS((int *)&mutex[gridStrideY / Policy::Mblk], 0, blkIdX);
+          old = atomicCAS((int*)&mutex[gridStrideY / Policy::Mblk], 0, blkIdX);
         }
         __threadfence();
       }
@@ -305,14 +328,13 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN(
 
 #pragma unroll
       for (int i = 0; i < Policy::AccRowsPerTh; ++i) {
-        const auto rowId = starty + i * Policy::AccThRows;
-        const auto shMemRowId =
-          (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows;
+        const auto rowId      = starty + i * Policy::AccThRows;
+        const auto shMemRowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows;
         if (rowId < m) {
           for (int idx = lid; idx < numOfNN; idx += warpSize) {
-            Pair KVPair = shDumpKV[shMemRowId * numOfNN + idx];
+            Pair KVPair                      = shDumpKV[shMemRowId * numOfNN + idx];
             out_dists[rowId * numOfNN + idx] = KVPair.value;
-            out_inds[rowId * numOfNN + idx] = (IdxT)KVPair.key;
+            out_inds[rowId * numOfNN + idx]  = (IdxT)KVPair.key;
           }
         }
       }
@@ -328,7 +350,9 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN(
   // epilogue operation lambda for final value calculation
   auto epilog_lambda = [numOfNN, m, n, ldd, out_dists, out_inds] __device__(
                          AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh],
-                         DataT * regxn, DataT * regyn, IdxT gridStrideX,
+                         DataT * regxn,
+                         DataT * regyn,
+                         IdxT gridStrideX,
                          IdxT gridStrideY) {
     if (useNorms) {
 #pragma unroll
@@ -340,36 +364,34 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN(
       }
     }
 
-    Pair *shDumpKV = nullptr;
+    Pair* shDumpKV = nullptr;
     if (useNorms) {
       constexpr size_t shmemSize =
         Policy::SmemSize + ((Policy::Mblk + Policy::Nblk) * sizeof(DataT));
-      shDumpKV = (Pair *)(&smem[shmemSize]);
+      shDumpKV = (Pair*)(&smem[shmemSize]);
     } else {
-      shDumpKV = (Pair *)(&smem[Policy::SmemSize]);
+      shDumpKV = (Pair*)(&smem[Policy::SmemSize]);
     }
 
     constexpr uint32_t mask = 0xffffffffu;
-    const IdxT starty = gridStrideY + (threadIdx.x / Policy::AccThCols);
-    const IdxT startx = gridStrideX + (threadIdx.x % Policy::AccThCols);
-    const int lid = raft::laneId();
+    const IdxT starty       = gridStrideY + (threadIdx.x / Policy::AccThCols);
+    const IdxT startx       = gridStrideX + (threadIdx.x % Policy::AccThCols);
+    const int lid           = raft::laneId();
 
     myWarpSelect heapArr1(identity, keyMax, numOfNN);
     myWarpSelect heapArr2(identity, keyMax, numOfNN);
-    myWarpSelect *heapArr[] = {&heapArr1, &heapArr2};
+    myWarpSelect* heapArr[] = {&heapArr1, &heapArr2};
     if (usePrevTopKs) {
       if (gridStrideX == blockIdx.x * Policy::Nblk) {
-        loadPrevTopKsGmemWarpQ<Policy, Pair>(heapArr, out_dists, out_inds, m,
-                                             numOfNN, starty);
+        loadPrevTopKsGmemWarpQ<Policy, Pair>(heapArr, out_dists, out_inds, m, numOfNN, starty);
       }
     }
 
     if (gridStrideX > blockIdx.x * Policy::Nblk) {
 #pragma unroll
       for (int i = 0; i < Policy::AccRowsPerTh; ++i) {
-        const auto rowId =
-          (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows;
-        Pair tempKV = shDumpKV[(rowId * numOfNN) + numOfNN - 1];
+        const auto rowId     = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows;
+        Pair tempKV          = shDumpKV[(rowId * numOfNN) + numOfNN - 1];
         heapArr[i]->warpKTop = tempKV.value;
       }
 
@@ -378,16 +400,14 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN(
       int anyWarpTopKs = 0;
 #pragma unroll
       for (int i = 0; i < Policy::AccRowsPerTh; ++i) {
-        const auto rowId = starty + i * Policy::AccThRows;
+        const auto rowId   = starty + i * Policy::AccThRows;
         numValsWarpTopK[i] = 0;
         if (rowId < m) {
 #pragma unroll
           for (int j = 0; j < Policy::AccColsPerTh; ++j) {
             const auto colId = startx + j * Policy::AccThCols;
             if (colId < ldd) {
-              if (acc[i][j] < heapArr[i]->warpKTop) {
-                numValsWarpTopK[i]++;
-              }
+              if (acc[i][j] < heapArr[i]->warpKTop) { numValsWarpTopK[i]++; }
             }
           }
           anyWarpTopKs += numValsWarpTopK[i];
@@ -395,24 +415,21 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN(
       }
       anyWarpTopKs = __syncthreads_or(anyWarpTopKs > 0);
       if (anyWarpTopKs) {
-        Pair *allWarpTopKs = (Pair *)(&smem[0]);
+        Pair* allWarpTopKs = (Pair*)(&smem[0]);
         uint32_t needScanSort[Policy::AccRowsPerTh];
 
 #pragma unroll
         for (int i = 0; i < Policy::AccRowsPerTh; ++i) {
           const auto gmemRowId = starty + i * Policy::AccThRows;
-          needScanSort[i] = 0;
+          needScanSort[i]      = 0;
           if (gmemRowId < m) {
-            int myVals = numValsWarpTopK[i];
+            int myVals      = numValsWarpTopK[i];
             needScanSort[i] = __ballot_sync(mask, myVals > 0);
             if (needScanSort[i]) {
 #pragma unroll
               for (unsigned int k = 1; k <= 16; k *= 2) {
-                const unsigned int n =
-                  __shfl_up_sync(mask, numValsWarpTopK[i], k);
-                if (lid >= k) {
-                  numValsWarpTopK[i] += n;
-                }
+                const unsigned int n = __shfl_up_sync(mask, numValsWarpTopK[i], k);
+                if (lid >= k) { numValsWarpTopK[i] += n; }
               }
             }
             // As each thread will know its total vals to write.
@@ -421,8 +438,7 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN(
           }
 
           if (needScanSort[i]) {
-            const auto rowId =
-              (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows;
+            const auto rowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows;
             if (gmemRowId < m) {
               if (needScanSort[i] & ((uint32_t)1 << lid)) {
 #pragma unroll
@@ -430,17 +446,15 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN(
                   const auto colId = startx + j * Policy::AccThCols;
                   if (colId < ldd) {
                     if (acc[i][j] < heapArr[i]->warpKTop) {
-                      Pair otherKV = {colId, acc[i][j]};
-                      allWarpTopKs[rowId * (256) + numValsWarpTopK[i]] =
-                        otherKV;
+                      Pair otherKV                                     = {colId, acc[i][j]};
+                      allWarpTopKs[rowId * (256) + numValsWarpTopK[i]] = otherKV;
                       numValsWarpTopK[i]++;
                     }
                   }
                 }
               }
               const int finalNumVals = raft::shfl(numValsWarpTopK[i], 31);
-              loadWarpQShmem<Policy, Pair>(heapArr[i], &shDumpKV[0], rowId,
-                                           numOfNN);
+              loadWarpQShmem<Policy, Pair>(heapArr[i], &shDumpKV[0], rowId, numOfNN);
               updateSortedWarpQ<Pair, heapArr[i]->kNumWarpQRegisters>(
                 heapArr[i], &allWarpTopKs[0], rowId, finalNumVals);
             }
@@ -450,12 +464,10 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN(
 #pragma unroll
         for (int i = 0; i < Policy::AccRowsPerTh; ++i) {
           if (needScanSort[i]) {
-            const auto rowId =
-              (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows;
+            const auto rowId     = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows;
             const auto gmemRowId = starty + i * Policy::AccThRows;
             if (gmemRowId < m) {
-              storeWarpQShmem<Policy, Pair>(heapArr[i], shDumpKV, rowId,
-                                            numOfNN);
+              storeWarpQShmem<Policy, Pair>(heapArr[i], shDumpKV, rowId, numOfNN);
             }
           }
         }
@@ -463,28 +475,24 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN(
     } else {
 #pragma unroll
       for (int i = 0; i < Policy::AccRowsPerTh; ++i) {
-        const auto gmemRowId = starty + i * Policy::AccThRows;
-        const auto shMemRowId =
-          (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows;
+        const auto gmemRowId  = starty + i * Policy::AccThRows;
+        const auto shMemRowId = (threadIdx.x / Policy::AccThCols) + i * Policy::AccThRows;
         if (gmemRowId < m) {
 #pragma unroll
           for (int j = 0; j < Policy::AccColsPerTh; ++j) {
             const auto colId = startx + j * Policy::AccThCols;
-            Pair otherKV = {keyMax, identity};
+            Pair otherKV     = {keyMax, identity};
             if (colId < ldd) {
               otherKV.value = acc[i][j];
-              otherKV.key = colId;
+              otherKV.key   = colId;
             }
             heapArr[i]->add(otherKV.value, otherKV.key);
           }
 
           bool needSort = (heapArr[i]->numVals > 0);
-          needSort = __any_sync(mask, needSort);
-          if (needSort) {
-            heapArr[i]->reduce();
-          }
-          storeWarpQShmem<Policy, Pair>(heapArr[i], shDumpKV, shMemRowId,
-                                        numOfNN);
+          needSort      = __any_sync(mask, needSort);
+          if (needSort) { heapArr[i]->reduce(); }
+          storeWarpQShmem<Policy, Pair>(heapArr[i], shDumpKV, shMemRowId, numOfNN);
         }
       }
     }
@@ -492,27 +500,64 @@ __global__ __launch_bounds__(Policy::Nthreads, 2) void fusedL2kNN(
     if (((gridStrideX + Policy::Nblk * gridDim.x) > n) && gridDim.x == 1) {
       // This is last iteration of grid stride X
       loadAllWarpQShmem<Policy, Pair>(heapArr, &shDumpKV[0], m, numOfNN);
-      storeWarpQGmem<Policy, Pair>(heapArr, out_dists, out_inds, m, numOfNN,
-                                   starty);
+      storeWarpQGmem<Policy, Pair>(heapArr, out_dists, out_inds, m, numOfNN, starty);
     }
   };
 
-  raft::distance::detail::PairwiseDistances<
-    useNorms, DataT, AccT, OutT, IdxT, Policy, CoreLambda,
-    decltype(epilog_lambda), FinalLambda, decltype(rowEpilog_lambda),
-    isRowMajor, false>
-    obj(x, y, m, n, k, lda, ldb, ldd, _xn, _yn, nullptr, smem, core_op,
-        epilog_lambda, fin_op, rowEpilog_lambda);
+  raft::distance::detail::PairwiseDistances<useNorms,
+                                            DataT,
+                                            AccT,
+                                            OutT,
+                                            IdxT,
+                                            Policy,
+                                            CoreLambda,
+                                            decltype(epilog_lambda),
+                                            FinalLambda,
+                                            decltype(rowEpilog_lambda),
+                                            isRowMajor,
+                                            false>
+    obj(x,
+        y,
+        m,
+        n,
+        k,
+        lda,
+        ldb,
+        ldd,
+        _xn,
+        _yn,
+        nullptr,
+        smem,
+        core_op,
+        epilog_lambda,
+        fin_op,
+        rowEpilog_lambda);
   obj.run();
 }
 
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          int VecLen, bool usePrevTopKs, bool isRowMajor>
-void fusedL2UnexpKnnImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
-                         IdxT lda, IdxT ldb, IdxT ldd, bool sqrt,
-                         OutT *out_dists, IdxT *out_inds, IdxT numOfNN,
-                         cudaStream_t stream, void *workspace,
-                         size_t &worksize) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          int VecLen,
+          bool usePrevTopKs,
+          bool isRowMajor>
+void fusedL2UnexpKnnImpl(const DataT* x,
+                         const DataT* y,
+                         IdxT m,
+                         IdxT n,
+                         IdxT k,
+                         IdxT lda,
+                         IdxT ldb,
+                         IdxT ldd,
+                         bool sqrt,
+                         OutT* out_dists,
+                         IdxT* out_inds,
+                         IdxT numOfNN,
+                         cudaStream_t stream,
+                         void* workspace,
+                         size_t& worksize)
+{
   typedef typename raft::linalg::Policy2x8<DataT, 1>::Policy RowPolicy;
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
 
@@ -532,12 +577,30 @@ void fusedL2UnexpKnnImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
   typedef cub::KeyValuePair<uint32_t, AccT> Pair;
 
   if (isRowMajor) {
-    constexpr auto fusedL2UnexpKnn32RowMajor =
-      fusedL2kNN<false, DataT, AccT, OutT, IdxT, KPolicy, decltype(core_lambda),
-                 decltype(fin_op), 32, 2, usePrevTopKs, true>;
-    constexpr auto fusedL2UnexpKnn64RowMajor =
-      fusedL2kNN<false, DataT, AccT, OutT, IdxT, KPolicy, decltype(core_lambda),
-                 decltype(fin_op), 64, 3, usePrevTopKs, true>;
+    constexpr auto fusedL2UnexpKnn32RowMajor = fusedL2kNN<false,
+                                                          DataT,
+                                                          AccT,
+                                                          OutT,
+                                                          IdxT,
+                                                          KPolicy,
+                                                          decltype(core_lambda),
+                                                          decltype(fin_op),
+                                                          32,
+                                                          2,
+                                                          usePrevTopKs,
+                                                          true>;
+    constexpr auto fusedL2UnexpKnn64RowMajor = fusedL2kNN<false,
+                                                          DataT,
+                                                          AccT,
+                                                          OutT,
+                                                          IdxT,
+                                                          KPolicy,
+                                                          decltype(core_lambda),
+                                                          decltype(fin_op),
+                                                          64,
+                                                          3,
+                                                          usePrevTopKs,
+                                                          true>;
 
     auto fusedL2UnexpKnnRowMajor = fusedL2UnexpKnn32RowMajor;
     if (numOfNN <= 32) {
@@ -545,13 +608,11 @@ void fusedL2UnexpKnnImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
     } else if (numOfNN <= 64) {
       fusedL2UnexpKnnRowMajor = fusedL2UnexpKnn64RowMajor;
     } else {
-      ASSERT(numOfNN <= 64,
-             "fusedL2kNN: num of nearest neighbors must be <= 64");
+      ASSERT(numOfNN <= 64, "fusedL2kNN: num of nearest neighbors must be <= 64");
     }
 
-    const auto sharedMemSize =
-      KPolicy::SmemSize + (KPolicy::Mblk * numOfNN * sizeof(Pair));
-    dim3 grid = raft::distance::detail::launchConfigGenerator<KPolicy>(
+    const auto sharedMemSize = KPolicy::SmemSize + (KPolicy::Mblk * numOfNN * sizeof(Pair));
+    dim3 grid                = raft::distance::detail::launchConfigGenerator<KPolicy>(
       m, n, sharedMemSize, fusedL2UnexpKnnRowMajor);
 
     if (grid.x > 1) {
@@ -560,51 +621,133 @@ void fusedL2UnexpKnnImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
         worksize = sizeof(int32_t) * numMutexes;
         return;
       } else {
-        CUDA_CHECK(
-          cudaMemsetAsync(workspace, 0, sizeof(int32_t) * numMutexes, stream));
+        CUDA_CHECK(cudaMemsetAsync(workspace, 0, sizeof(int32_t) * numMutexes, stream));
       }
     }
 
-    fusedL2UnexpKnnRowMajor<<<grid, blk, sharedMemSize, stream>>>(
-      x, y, nullptr, nullptr, m, n, k, lda, ldb, ldd, core_lambda, fin_op, sqrt,
-      (uint32_t)numOfNN, (int *)workspace, out_dists, out_inds);
+    fusedL2UnexpKnnRowMajor<<<grid, blk, sharedMemSize, stream>>>(x,
+                                                                  y,
+                                                                  nullptr,
+                                                                  nullptr,
+                                                                  m,
+                                                                  n,
+                                                                  k,
+                                                                  lda,
+                                                                  ldb,
+                                                                  ldd,
+                                                                  core_lambda,
+                                                                  fin_op,
+                                                                  sqrt,
+                                                                  (uint32_t)numOfNN,
+                                                                  (int*)workspace,
+                                                                  out_dists,
+                                                                  out_inds);
   } else {
   }
 
   CUDA_CHECK(cudaGetLastError());
 }
 
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          bool usePrevTopKs, bool isRowMajor>
-void fusedL2UnexpKnn(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
-                     const DataT *x, const DataT *y, bool sqrt, OutT *out_dists,
-                     IdxT *out_inds, IdxT numOfNN, cudaStream_t stream,
-                     void *workspace, size_t &worksize) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          bool usePrevTopKs,
+          bool isRowMajor>
+void fusedL2UnexpKnn(IdxT m,
+                     IdxT n,
+                     IdxT k,
+                     IdxT lda,
+                     IdxT ldb,
+                     IdxT ldd,
+                     const DataT* x,
+                     const DataT* y,
+                     bool sqrt,
+                     OutT* out_dists,
+                     IdxT* out_inds,
+                     IdxT numOfNN,
+                     cudaStream_t stream,
+                     void* workspace,
+                     size_t& worksize)
+{
   size_t bytesA = sizeof(DataT) * lda;
   size_t bytesB = sizeof(DataT) * ldb;
   if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    fusedL2UnexpKnnImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT),
-                        usePrevTopKs, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, sqrt, out_dists, out_inds, numOfNN, stream,
-      workspace, worksize);
+    fusedL2UnexpKnnImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), usePrevTopKs, isRowMajor>(
+      x,
+      y,
+      m,
+      n,
+      k,
+      lda,
+      ldb,
+      ldd,
+      sqrt,
+      out_dists,
+      out_inds,
+      numOfNN,
+      stream,
+      workspace,
+      worksize);
   } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    fusedL2UnexpKnnImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT),
-                        usePrevTopKs, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, sqrt, out_dists, out_inds, numOfNN, stream,
-      workspace, worksize);
+    fusedL2UnexpKnnImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), usePrevTopKs, isRowMajor>(
+      x,
+      y,
+      m,
+      n,
+      k,
+      lda,
+      ldb,
+      ldd,
+      sqrt,
+      out_dists,
+      out_inds,
+      numOfNN,
+      stream,
+      workspace,
+      worksize);
   } else {
-    fusedL2UnexpKnnImpl<DataT, AccT, OutT, IdxT, 1, usePrevTopKs, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, sqrt, out_dists, out_inds, numOfNN, stream,
-      workspace, worksize);
+    fusedL2UnexpKnnImpl<DataT, AccT, OutT, IdxT, 1, usePrevTopKs, isRowMajor>(x,
+                                                                              y,
+                                                                              m,
+                                                                              n,
+                                                                              k,
+                                                                              lda,
+                                                                              ldb,
+                                                                              ldd,
+                                                                              sqrt,
+                                                                              out_dists,
+                                                                              out_inds,
+                                                                              numOfNN,
+                                                                              stream,
+                                                                              workspace,
+                                                                              worksize);
   }
 }
 
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          int VecLen, bool usePrevTopKs, bool isRowMajor>
-void fusedL2ExpKnnImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
-                       IdxT lda, IdxT ldb, IdxT ldd, bool sqrt, OutT *out_dists,
-                       IdxT *out_inds, IdxT numOfNN, cudaStream_t stream,
-                       void *workspace, size_t &worksize) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          int VecLen,
+          bool usePrevTopKs,
+          bool isRowMajor>
+void fusedL2ExpKnnImpl(const DataT* x,
+                       const DataT* y,
+                       IdxT m,
+                       IdxT n,
+                       IdxT k,
+                       IdxT lda,
+                       IdxT ldb,
+                       IdxT ldd,
+                       bool sqrt,
+                       OutT* out_dists,
+                       IdxT* out_inds,
+                       IdxT numOfNN,
+                       cudaStream_t stream,
+                       void* workspace,
+                       size_t& worksize)
+{
   typedef typename raft::linalg::Policy2x8<DataT, 1>::Policy RowPolicy;
   typedef typename raft::linalg::Policy4x4<DataT, VecLen>::ColPolicy ColPolicy;
 
@@ -612,28 +755,43 @@ void fusedL2ExpKnnImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
 
   ASSERT(isRowMajor, "Only Row major inputs are allowed");
 
-  ASSERT(!(((x != y) && (worksize < (m + n) * sizeof(AccT))) ||
-           (worksize < m * sizeof(AccT))),
+  ASSERT(!(((x != y) && (worksize < (m + n) * sizeof(AccT))) || (worksize < m * sizeof(AccT))),
          "workspace size error");
   ASSERT(workspace != nullptr, "workspace is null");
 
   dim3 blk(KPolicy::Nthreads);
   // Accumulation operation lambda
-  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) {
-    acc += x * y;
-  };
+  auto core_lambda = [] __device__(AccT & acc, DataT & x, DataT & y) { acc += x * y; };
 
   auto fin_op = [] __device__(AccT d_val, int g_d_idx) { return d_val; };
 
   typedef cub::KeyValuePair<uint32_t, AccT> Pair;
 
   if (isRowMajor) {
-    constexpr auto fusedL2ExpKnn32RowMajor =
-      fusedL2kNN<true, DataT, AccT, OutT, IdxT, KPolicy, decltype(core_lambda),
-                 decltype(fin_op), 32, 2, usePrevTopKs, true>;
-    constexpr auto fusedL2ExpKnn64RowMajor =
-      fusedL2kNN<true, DataT, AccT, OutT, IdxT, KPolicy, decltype(core_lambda),
-                 decltype(fin_op), 64, 3, usePrevTopKs, true>;
+    constexpr auto fusedL2ExpKnn32RowMajor = fusedL2kNN<true,
+                                                        DataT,
+                                                        AccT,
+                                                        OutT,
+                                                        IdxT,
+                                                        KPolicy,
+                                                        decltype(core_lambda),
+                                                        decltype(fin_op),
+                                                        32,
+                                                        2,
+                                                        usePrevTopKs,
+                                                        true>;
+    constexpr auto fusedL2ExpKnn64RowMajor = fusedL2kNN<true,
+                                                        DataT,
+                                                        AccT,
+                                                        OutT,
+                                                        IdxT,
+                                                        KPolicy,
+                                                        decltype(core_lambda),
+                                                        decltype(fin_op),
+                                                        64,
+                                                        3,
+                                                        usePrevTopKs,
+                                                        true>;
 
     auto fusedL2ExpKnnRowMajor = fusedL2ExpKnn32RowMajor;
     if (numOfNN <= 32) {
@@ -641,77 +799,137 @@ void fusedL2ExpKnnImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
     } else if (numOfNN <= 64) {
       fusedL2ExpKnnRowMajor = fusedL2ExpKnn64RowMajor;
     } else {
-      ASSERT(numOfNN <= 64,
-             "fusedL2kNN: num of nearest neighbors must be <= 64");
+      ASSERT(numOfNN <= 64, "fusedL2kNN: num of nearest neighbors must be <= 64");
     }
 
-    const auto sharedMemSize =
-      KPolicy::SmemSize + ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)) +
-      (KPolicy::Mblk * numOfNN * sizeof(Pair));
+    const auto sharedMemSize = KPolicy::SmemSize +
+                               ((KPolicy::Mblk + KPolicy::Nblk) * sizeof(DataT)) +
+                               (KPolicy::Mblk * numOfNN * sizeof(Pair));
     dim3 grid = raft::distance::detail::launchConfigGenerator<KPolicy>(
       m, n, sharedMemSize, fusedL2ExpKnnRowMajor);
-    int32_t *mutexes = nullptr;
+    int32_t* mutexes = nullptr;
     if (grid.x > 1) {
-      const auto numMutexes = raft::ceildiv<int>(m, KPolicy::Mblk);
-      const auto normsSize =
-        (x != y) ? (m + n) * sizeof(DataT) : n * sizeof(DataT);
+      const auto numMutexes   = raft::ceildiv<int>(m, KPolicy::Mblk);
+      const auto normsSize    = (x != y) ? (m + n) * sizeof(DataT) : n * sizeof(DataT);
       const auto requiredSize = sizeof(int32_t) * numMutexes + normsSize;
       if (worksize < requiredSize) {
         worksize = requiredSize;
         return;
       } else {
-        mutexes = (int32_t *)((char *)workspace + normsSize);
-        CUDA_CHECK(
-          cudaMemsetAsync(mutexes, 0, sizeof(int32_t) * numMutexes, stream));
+        mutexes = (int32_t*)((char*)workspace + normsSize);
+        CUDA_CHECK(cudaMemsetAsync(mutexes, 0, sizeof(int32_t) * numMutexes, stream));
       }
     }
 
-    DataT *xn = (DataT *)workspace;
-    DataT *yn = (DataT *)workspace;
+    DataT* xn = (DataT*)workspace;
+    DataT* yn = (DataT*)workspace;
 
     auto norm_op = [] __device__(DataT in) { return in; };
 
     if (x != y) {
       yn += m;
-      raft::linalg::rowNorm(xn, x, k, m, raft::linalg::L2Norm, isRowMajor,
-                            stream, norm_op);
-      raft::linalg::rowNorm(yn, y, k, n, raft::linalg::L2Norm, isRowMajor,
-                            stream, norm_op);
+      raft::linalg::rowNorm(xn, x, k, m, raft::linalg::L2Norm, isRowMajor, stream, norm_op);
+      raft::linalg::rowNorm(yn, y, k, n, raft::linalg::L2Norm, isRowMajor, stream, norm_op);
     } else {
-      raft::linalg::rowNorm(xn, x, k, n, raft::linalg::L2Norm, isRowMajor,
-                            stream, norm_op);
+      raft::linalg::rowNorm(xn, x, k, n, raft::linalg::L2Norm, isRowMajor, stream, norm_op);
     }
-    fusedL2ExpKnnRowMajor<<<grid, blk, sharedMemSize, stream>>>(
-      x, y, xn, yn, m, n, k, lda, ldb, ldd, core_lambda, fin_op, sqrt,
-      (uint32_t)numOfNN, mutexes, out_dists, out_inds);
+    fusedL2ExpKnnRowMajor<<<grid, blk, sharedMemSize, stream>>>(x,
+                                                                y,
+                                                                xn,
+                                                                yn,
+                                                                m,
+                                                                n,
+                                                                k,
+                                                                lda,
+                                                                ldb,
+                                                                ldd,
+                                                                core_lambda,
+                                                                fin_op,
+                                                                sqrt,
+                                                                (uint32_t)numOfNN,
+                                                                mutexes,
+                                                                out_dists,
+                                                                out_inds);
   } else {
   }
 
   CUDA_CHECK(cudaGetLastError());
 }
 
-template <typename DataT, typename AccT, typename OutT, typename IdxT,
-          bool usePrevTopKs, bool isRowMajor>
-void fusedL2ExpKnn(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
-                   const DataT *x, const DataT *y, bool sqrt, OutT *out_dists,
-                   IdxT *out_inds, IdxT numOfNN, cudaStream_t stream,
-                   void *workspace, size_t &worksize) {
+template <typename DataT,
+          typename AccT,
+          typename OutT,
+          typename IdxT,
+          bool usePrevTopKs,
+          bool isRowMajor>
+void fusedL2ExpKnn(IdxT m,
+                   IdxT n,
+                   IdxT k,
+                   IdxT lda,
+                   IdxT ldb,
+                   IdxT ldd,
+                   const DataT* x,
+                   const DataT* y,
+                   bool sqrt,
+                   OutT* out_dists,
+                   IdxT* out_inds,
+                   IdxT numOfNN,
+                   cudaStream_t stream,
+                   void* workspace,
+                   size_t& worksize)
+{
   size_t bytesA = sizeof(DataT) * lda;
   size_t bytesB = sizeof(DataT) * ldb;
   if (16 % sizeof(DataT) == 0 && bytesA % 16 == 0 && bytesB % 16 == 0) {
-    fusedL2ExpKnnImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), usePrevTopKs,
-                      isRowMajor>(x, y, m, n, k, lda, ldb, ldd, sqrt, out_dists,
-                                  out_inds, numOfNN, stream, workspace,
-                                  worksize);
+    fusedL2ExpKnnImpl<DataT, AccT, OutT, IdxT, 16 / sizeof(DataT), usePrevTopKs, isRowMajor>(
+      x,
+      y,
+      m,
+      n,
+      k,
+      lda,
+      ldb,
+      ldd,
+      sqrt,
+      out_dists,
+      out_inds,
+      numOfNN,
+      stream,
+      workspace,
+      worksize);
   } else if (8 % sizeof(DataT) == 0 && bytesA % 8 == 0 && bytesB % 8 == 0) {
-    fusedL2ExpKnnImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), usePrevTopKs,
-                      isRowMajor>(x, y, m, n, k, lda, ldb, ldd, sqrt, out_dists,
-                                  out_inds, numOfNN, stream, workspace,
-                                  worksize);
+    fusedL2ExpKnnImpl<DataT, AccT, OutT, IdxT, 8 / sizeof(DataT), usePrevTopKs, isRowMajor>(
+      x,
+      y,
+      m,
+      n,
+      k,
+      lda,
+      ldb,
+      ldd,
+      sqrt,
+      out_dists,
+      out_inds,
+      numOfNN,
+      stream,
+      workspace,
+      worksize);
   } else {
-    fusedL2ExpKnnImpl<DataT, AccT, OutT, IdxT, 1, usePrevTopKs, isRowMajor>(
-      x, y, m, n, k, lda, ldb, ldd, sqrt, out_dists, out_inds, numOfNN, stream,
-      workspace, worksize);
+    fusedL2ExpKnnImpl<DataT, AccT, OutT, IdxT, 1, usePrevTopKs, isRowMajor>(x,
+                                                                            y,
+                                                                            m,
+                                                                            n,
+                                                                            k,
+                                                                            lda,
+                                                                            ldb,
+                                                                            ldd,
+                                                                            sqrt,
+                                                                            out_dists,
+                                                                            out_inds,
+                                                                            numOfNN,
+                                                                            stream,
+                                                                            workspace,
+                                                                            worksize);
   }
 }
 
@@ -732,11 +950,19 @@ void fusedL2ExpKnn(IdxT m, IdxT n, IdxT k, IdxT lda, IdxT ldb, IdxT ldd,
  * @param[in] stream stream to order kernel launch
  */
 template <typename value_idx, typename value_t, bool usePrevTopKs = false>
-void fusedL2Knn(size_t D, value_idx *out_inds, value_t *out_dists,
-                const value_t *index, const value_t *query, size_t n_index_rows,
-                size_t n_query_rows, int k, bool rowMajorIndex,
-                bool rowMajorQuery, cudaStream_t stream,
-                raft::distance::DistanceType metric) {
+void fusedL2Knn(size_t D,
+                value_idx* out_inds,
+                value_t* out_dists,
+                const value_t* index,
+                const value_t* query,
+                size_t n_index_rows,
+                size_t n_query_rows,
+                int k,
+                bool rowMajorIndex,
+                bool rowMajorQuery,
+                cudaStream_t stream,
+                raft::distance::DistanceType metric)
+{
   // Validate the input data
   ASSERT(k > 0, "l2Knn: k must be > 0");
   ASSERT(D > 0, "l2Knn: D must be > 0");
@@ -750,8 +976,7 @@ void fusedL2Knn(size_t D, value_idx *out_inds, value_t *out_dists,
   ASSERT(rowMajorIndex == rowMajorQuery,
          "l2Knn: rowMajorIndex and rowMajorQuery should have same layout");
   // TODO: Add support for column major layout
-  ASSERT(rowMajorIndex == true,
-         "l2Knn: only rowMajor inputs are supported for now.");
+  ASSERT(rowMajorIndex == true, "l2Knn: only rowMajor inputs are supported for now.");
 
   // Even for L2 Sqrt distance case we use non-sqrt version as FAISS bfKNN only support
   // non-sqrt metric & some tests in RAFT/cuML (like Linkage) fails if we use L2 sqrt.
@@ -764,37 +989,82 @@ void fusedL2Knn(size_t D, value_idx *out_inds, value_t *out_dists,
   switch (metric) {
     case raft::distance::DistanceType::L2SqrtExpanded:
     case raft::distance::DistanceType::L2Expanded:
-      tempWorksize = raft::distance::detail::getWorkspaceSize<
-        raft::distance::DistanceType::L2Expanded, float, float, float,
-        value_idx>(query, index, n_query_rows, n_index_rows, D);
+      tempWorksize = raft::distance::detail::
+        getWorkspaceSize<raft::distance::DistanceType::L2Expanded, float, float, float, value_idx>(
+          query, index, n_query_rows, n_index_rows, D);
       worksize = tempWorksize;
       workspace.resize(worksize, stream);
-      fusedL2ExpKnn<value_t, value_t, value_t, value_idx, usePrevTopKs, true>(
-        n_query_rows, n_index_rows, D, lda, ldb, ldd, query, index, sqrt,
-        out_dists, out_inds, k, stream, workspace.data(), worksize);
+      fusedL2ExpKnn<value_t, value_t, value_t, value_idx, usePrevTopKs, true>(n_query_rows,
+                                                                              n_index_rows,
+                                                                              D,
+                                                                              lda,
+                                                                              ldb,
+                                                                              ldd,
+                                                                              query,
+                                                                              index,
+                                                                              sqrt,
+                                                                              out_dists,
+                                                                              out_inds,
+                                                                              k,
+                                                                              stream,
+                                                                              workspace.data(),
+                                                                              worksize);
       if (worksize > tempWorksize) {
         workspace.resize(worksize, stream);
-        fusedL2ExpKnn<value_t, value_t, value_t, value_idx, usePrevTopKs, true>(
-          n_query_rows, n_index_rows, D, lda, ldb, ldd, query, index, sqrt,
-          out_dists, out_inds, k, stream, workspace.data(), worksize);
+        fusedL2ExpKnn<value_t, value_t, value_t, value_idx, usePrevTopKs, true>(n_query_rows,
+                                                                                n_index_rows,
+                                                                                D,
+                                                                                lda,
+                                                                                ldb,
+                                                                                ldd,
+                                                                                query,
+                                                                                index,
+                                                                                sqrt,
+                                                                                out_dists,
+                                                                                out_inds,
+                                                                                k,
+                                                                                stream,
+                                                                                workspace.data(),
+                                                                                worksize);
       }
       break;
     case raft::distance::DistanceType::L2Unexpanded:
     case raft::distance::DistanceType::L2SqrtUnexpanded:
-      fusedL2UnexpKnn<value_t, value_t, value_t, value_idx, usePrevTopKs, true>(
-        n_query_rows, n_index_rows, D, lda, ldb, ldd, query, index, sqrt,
-        out_dists, out_inds, k, stream, workspace.data(), worksize);
+      fusedL2UnexpKnn<value_t, value_t, value_t, value_idx, usePrevTopKs, true>(n_query_rows,
+                                                                                n_index_rows,
+                                                                                D,
+                                                                                lda,
+                                                                                ldb,
+                                                                                ldd,
+                                                                                query,
+                                                                                index,
+                                                                                sqrt,
+                                                                                out_dists,
+                                                                                out_inds,
+                                                                                k,
+                                                                                stream,
+                                                                                workspace.data(),
+                                                                                worksize);
       if (worksize) {
         workspace.resize(worksize, stream);
-        fusedL2UnexpKnn<value_t, value_t, value_t, value_idx, usePrevTopKs,
-                        true>(n_query_rows, n_index_rows, D, lda, ldb, ldd,
-                              query, index, sqrt, out_dists, out_inds, k,
-                              stream, workspace.data(), worksize);
+        fusedL2UnexpKnn<value_t, value_t, value_t, value_idx, usePrevTopKs, true>(n_query_rows,
+                                                                                  n_index_rows,
+                                                                                  D,
+                                                                                  lda,
+                                                                                  ldb,
+                                                                                  ldd,
+                                                                                  query,
+                                                                                  index,
+                                                                                  sqrt,
+                                                                                  out_dists,
+                                                                                  out_inds,
+                                                                                  k,
+                                                                                  stream,
+                                                                                  workspace.data(),
+                                                                                  worksize);
       }
       break;
-    default:
-      printf("only L2 distance metric is supported\n");
-      break;
+    default: printf("only L2 distance metric is supported\n"); break;
   };
 }
 
diff --git a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh
index 7d87254cb6..049c11514c 100644
--- a/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh
+++ b/cpp/include/raft/spatial/knn/detail/haversine_distance.cuh
@@ -35,7 +35,8 @@ namespace knn {
 namespace detail {
 
 template <typename value_t>
-DI value_t compute_haversine(value_t x1, value_t y1, value_t x2, value_t y2) {
+DI value_t compute_haversine(value_t x1, value_t y1, value_t x2, value_t y2)
+{
   value_t sin_0 = sin(0.5 * (x1 - y1));
   value_t sin_1 = sin(0.5 * (x2 - y2));
   value_t rdist = sin_0 * sin_0 + cos(x1) * cos(y1) * sin_1 * sin_1;
@@ -56,34 +57,36 @@ DI value_t compute_haversine(value_t x1, value_t y1, value_t x2, value_t y2) {
  * @param[in] n_index_rows number of rows in index array
  * @param[in] k number of closest neighbors to return
  */
-template <typename value_idx, typename value_t, int warp_q = 1024,
-          int thread_q = 8, int tpb = 128>
-__global__ void haversine_knn_kernel(value_idx *out_inds, value_t *out_dists,
-                                     const value_t *index, const value_t *query,
-                                     size_t n_index_rows, int k) {
+template <typename value_idx, typename value_t, int warp_q = 1024, int thread_q = 8, int tpb = 128>
+__global__ void haversine_knn_kernel(value_idx* out_inds,
+                                     value_t* out_dists,
+                                     const value_t* index,
+                                     const value_t* query,
+                                     size_t n_index_rows,
+                                     int k)
+{
   constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize;
 
   __shared__ value_t smemK[kNumWarps * warp_q];
   __shared__ value_idx smemV[kNumWarps * warp_q];
 
-  faiss::gpu::BlockSelect<value_t, value_idx, false,
-                          faiss::gpu::Comparator<value_t>, warp_q, thread_q,
-                          tpb>
-    heap(faiss::gpu::Limits<value_t>::getMax(), -1, smemK, smemV, k);
+  faiss::gpu::
+    BlockSelect<value_t, value_idx, false, faiss::gpu::Comparator<value_t>, warp_q, thread_q, tpb>
+      heap(faiss::gpu::Limits<value_t>::getMax(), -1, smemK, smemV, k);
 
   // Grid is exactly sized to rows available
   int limit = faiss::gpu::utils::roundDown(n_index_rows, faiss::gpu::kWarpSize);
 
-  const value_t *query_ptr = query + (blockIdx.x * 2);
-  value_t x1 = query_ptr[0];
-  value_t x2 = query_ptr[1];
+  const value_t* query_ptr = query + (blockIdx.x * 2);
+  value_t x1               = query_ptr[0];
+  value_t x2               = query_ptr[1];
 
   int i = threadIdx.x;
 
   for (; i < limit; i += tpb) {
-    const value_t *idx_ptr = index + (i * 2);
-    value_t y1 = idx_ptr[0];
-    value_t y2 = idx_ptr[1];
+    const value_t* idx_ptr = index + (i * 2);
+    value_t y1             = idx_ptr[0];
+    value_t y2             = idx_ptr[1];
 
     value_t dist = compute_haversine(x1, y1, x2, y2);
 
@@ -92,9 +95,9 @@ __global__ void haversine_knn_kernel(value_idx *out_inds, value_t *out_dists,
 
   // Handle last remainder fraction of a warp of elements
   if (i < n_index_rows) {
-    const value_t *idx_ptr = index + (i * 2);
-    value_t y1 = idx_ptr[0];
-    value_t y2 = idx_ptr[1];
+    const value_t* idx_ptr = index + (i * 2);
+    value_t y1             = idx_ptr[0];
+    value_t y2             = idx_ptr[1];
 
     value_t dist = compute_haversine(x1, y1, x2, y2);
 
@@ -105,7 +108,7 @@ __global__ void haversine_knn_kernel(value_idx *out_inds, value_t *out_dists,
 
   for (int i = threadIdx.x; i < k; i += tpb) {
     out_dists[blockIdx.x * k + i] = smemK[i];
-    out_inds[blockIdx.x * k + i] = smemV[i];
+    out_inds[blockIdx.x * k + i]  = smemV[i];
   }
 }
 
@@ -126,10 +129,15 @@ __global__ void haversine_knn_kernel(value_idx *out_inds, value_t *out_dists,
  * @param[in] stream stream to order kernel launch
  */
 template <typename value_idx, typename value_t>
-void haversine_knn(value_idx *out_inds, value_t *out_dists,
-                   const value_t *index, const value_t *query,
-                   size_t n_index_rows, size_t n_query_rows, int k,
-                   cudaStream_t stream) {
+void haversine_knn(value_idx* out_inds,
+                   value_t* out_dists,
+                   const value_t* index,
+                   const value_t* query,
+                   size_t n_index_rows,
+                   size_t n_query_rows,
+                   int k,
+                   cudaStream_t stream)
+{
   haversine_knn_kernel<<<n_query_rows, 128, 0, stream>>>(
     out_inds, out_dists, index, query, n_index_rows, k);
 }
diff --git a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
index da1217e3cf..2866049188 100644
--- a/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/knn_brute_force_faiss.cuh
@@ -46,13 +46,22 @@ namespace spatial {
 namespace knn {
 namespace detail {
 
-template <typename value_idx = std::int64_t, typename value_t = float,
-          int warp_q, int thread_q, int tpb>
-__global__ void knn_merge_parts_kernel(value_t *inK, value_idx *inV,
-                                       value_t *outK, value_idx *outV,
-                                       size_t n_samples, int n_parts,
-                                       value_t initK, value_idx initV, int k,
-                                       value_idx *translations) {
+template <typename value_idx = std::int64_t,
+          typename value_t   = float,
+          int warp_q,
+          int thread_q,
+          int tpb>
+__global__ void knn_merge_parts_kernel(value_t* inK,
+                                       value_idx* inV,
+                                       value_t* outK,
+                                       value_idx* outV,
+                                       size_t n_samples,
+                                       int n_parts,
+                                       value_t initK,
+                                       value_idx initV,
+                                       int k,
+                                       value_idx* translations)
+{
   constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize;
 
   __shared__ value_t smemK[kNumWarps * warp_q];
@@ -61,34 +70,33 @@ __global__ void knn_merge_parts_kernel(value_t *inK, value_idx *inV,
   /**
    * Uses shared memory
    */
-  faiss::gpu::BlockSelect<value_t, value_idx, false,
-                          faiss::gpu::Comparator<value_t>, warp_q, thread_q,
-                          tpb>
-    heap(initK, initV, smemK, smemV, k);
+  faiss::gpu::
+    BlockSelect<value_t, value_idx, false, faiss::gpu::Comparator<value_t>, warp_q, thread_q, tpb>
+      heap(initK, initV, smemK, smemV, k);
 
   // Grid is exactly sized to rows available
-  int row = blockIdx.x;
+  int row     = blockIdx.x;
   int total_k = k * n_parts;
 
   int i = threadIdx.x;
 
   // Get starting pointers for cols in current thread
-  int part = i / k;
+  int part       = i / k;
   size_t row_idx = (row * k) + (part * n_samples * k);
 
   int col = i % k;
 
-  value_t *inKStart = inK + (row_idx + col);
-  value_idx *inVStart = inV + (row_idx + col);
+  value_t* inKStart   = inK + (row_idx + col);
+  value_idx* inVStart = inV + (row_idx + col);
 
-  int limit = faiss::gpu::utils::roundDown(total_k, faiss::gpu::kWarpSize);
+  int limit             = faiss::gpu::utils::roundDown(total_k, faiss::gpu::kWarpSize);
   value_idx translation = 0;
 
   for (; i < limit; i += tpb) {
     translation = translations[part];
     heap.add(*inKStart, (*inVStart) + translation);
 
-    part = (i + tpb) / k;
+    part    = (i + tpb) / k;
     row_idx = (row * k) + (part * n_samples * k);
 
     col = (i + tpb) % k;
@@ -111,22 +119,27 @@ __global__ void knn_merge_parts_kernel(value_t *inK, value_idx *inV,
   }
 }
 
-template <typename value_idx = std::int64_t, typename value_t = float,
-          int warp_q, int thread_q>
-inline void knn_merge_parts_impl(value_t *inK, value_idx *inV, value_t *outK,
-                                 value_idx *outV, size_t n_samples, int n_parts,
-                                 int k, cudaStream_t stream,
-                                 value_idx *translations) {
+template <typename value_idx = std::int64_t, typename value_t = float, int warp_q, int thread_q>
+inline void knn_merge_parts_impl(value_t* inK,
+                                 value_idx* inV,
+                                 value_t* outK,
+                                 value_idx* outV,
+                                 size_t n_samples,
+                                 int n_parts,
+                                 int k,
+                                 cudaStream_t stream,
+                                 value_idx* translations)
+{
   auto grid = dim3(n_samples);
 
   constexpr int n_threads = (warp_q <= 1024) ? 128 : 64;
-  auto block = dim3(n_threads);
+  auto block              = dim3(n_threads);
 
   auto kInit = faiss::gpu::Limits<value_t>::getMax();
   auto vInit = -1;
   knn_merge_parts_kernel<value_idx, value_t, warp_q, thread_q, n_threads>
-    <<<grid, block, 0, stream>>>(inK, inV, outK, outV, n_samples, n_parts,
-                                 kInit, vInit, k, translations);
+    <<<grid, block, 0, stream>>>(
+      inK, inV, outK, outV, n_samples, n_parts, kInit, vInit, k, translations);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -145,10 +158,16 @@ inline void knn_merge_parts_impl(value_t *inK, value_idx *inV, value_t *outK,
  * @param translations mapping of index offsets for each partition
  */
 template <typename value_idx = std::int64_t, typename value_t = float>
-inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK,
-                            value_idx *outV, size_t n_samples, int n_parts,
-                            int k, cudaStream_t stream,
-                            value_idx *translations) {
+inline void knn_merge_parts(value_t* inK,
+                            value_idx* inV,
+                            value_t* outK,
+                            value_idx* outV,
+                            size_t n_samples,
+                            int n_parts,
+                            int k,
+                            cudaStream_t stream,
+                            value_idx* translations)
+{
   if (k == 1)
     knn_merge_parts_impl<value_idx, value_t, 1, 1>(
       inK, inV, outK, outV, n_samples, n_parts, k, stream, translations);
@@ -197,26 +216,32 @@ inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK,
  * @param[in] metricArg metric argument to use. Corresponds to the p arg for lp norm
  */
 template <typename IntType = int, typename IdxType = std::int64_t>
-void brute_force_knn_impl(std::vector<float *> &input,
-                          std::vector<IntType> &sizes, IntType D,
-                          float *search_items, IntType n, IdxType *res_I,
-                          float *res_D, IntType k, cudaStream_t userStream,
-                          cudaStream_t *internalStreams = nullptr,
-                          int n_int_streams = 0, bool rowMajorIndex = true,
-                          bool rowMajorQuery = true,
-                          std::vector<IdxType> *translations = nullptr,
-                          raft::distance::DistanceType metric =
-                            raft::distance::DistanceType::L2Expanded,
-                          float metricArg = 0) {
-  ASSERT(input.size() == sizes.size(),
-         "input and sizes vectors should be the same size");
-
-  std::vector<IdxType> *id_ranges;
+void brute_force_knn_impl(
+  std::vector<float*>& input,
+  std::vector<IntType>& sizes,
+  IntType D,
+  float* search_items,
+  IntType n,
+  IdxType* res_I,
+  float* res_D,
+  IntType k,
+  cudaStream_t userStream,
+  cudaStream_t* internalStreams       = nullptr,
+  int n_int_streams                   = 0,
+  bool rowMajorIndex                  = true,
+  bool rowMajorQuery                  = true,
+  std::vector<IdxType>* translations  = nullptr,
+  raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded,
+  float metricArg                     = 0)
+{
+  ASSERT(input.size() == sizes.size(), "input and sizes vectors should be the same size");
+
+  std::vector<IdxType>* id_ranges;
   if (translations == nullptr) {
     // If we don't have explicit translations
     // for offsets of the indices, build them
     // from the local partitions
-    id_ranges = new std::vector<IdxType>();
+    id_ranges       = new std::vector<IdxType>();
     IdxType total_n = 0;
     for (size_t i = 0; i < input.size(); i++) {
       id_ranges->push_back(total_n);
@@ -232,11 +257,10 @@ void brute_force_knn_impl(std::vector<float *> &input,
     create_processor<float>(metric, n, D, k, rowMajorQuery, userStream);
   query_metric_processor->preprocess(search_items);
 
-  std::vector<std::unique_ptr<MetricProcessor<float>>> metric_processors(
-    input.size());
+  std::vector<std::unique_ptr<MetricProcessor<float>>> metric_processors(input.size());
   for (size_t i = 0; i < input.size(); i++) {
-    metric_processors[i] = create_processor<float>(metric, sizes[i], D, k,
-                                                   rowMajorQuery, userStream);
+    metric_processors[i] =
+      create_processor<float>(metric, sizes[i], D, k, rowMajorQuery, userStream);
     metric_processors[i]->preprocess(input[i]);
   }
 
@@ -244,14 +268,13 @@ void brute_force_knn_impl(std::vector<float *> &input,
   CUDA_CHECK(cudaGetDevice(&device));
 
   rmm::device_uvector<std::int64_t> trans(id_ranges->size(), userStream);
-  raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(),
-                      userStream);
+  raft::update_device(trans.data(), id_ranges->data(), id_ranges->size(), userStream);
 
   rmm::device_uvector<float> all_D(0, userStream);
   rmm::device_uvector<std::int64_t> all_I(0, userStream);
 
-  float *out_D = res_D;
-  IdxType *out_I = res_I;
+  float* out_D   = res_D;
+  IdxType* out_I = res_I;
 
   if (input.size() > 1) {
     all_D.resize(input.size() * k * n, userStream);
@@ -265,19 +288,28 @@ void brute_force_knn_impl(std::vector<float *> &input,
   if (n_int_streams > 0) CUDA_CHECK(cudaStreamSynchronize(userStream));
 
   for (size_t i = 0; i < input.size(); i++) {
-    float *out_d_ptr = out_D + (i * k * n);
-    IdxType *out_i_ptr = out_I + (i * k * n);
+    float* out_d_ptr   = out_D + (i * k * n);
+    IdxType* out_i_ptr = out_I + (i * k * n);
 
-    cudaStream_t stream =
-      raft::select_stream(userStream, internalStreams, n_int_streams, i);
+    cudaStream_t stream = raft::select_stream(userStream, internalStreams, n_int_streams, i);
 
     if (k <= 64 && rowMajorQuery == rowMajorIndex && rowMajorQuery == true &&
         (metric == raft::distance::DistanceType::L2Unexpanded ||
          metric == raft::distance::DistanceType::L2SqrtUnexpanded ||
          metric == raft::distance::DistanceType::L2Expanded ||
          metric == raft::distance::DistanceType::L2SqrtExpanded)) {
-      fusedL2Knn(D, out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], n,
-                 k, rowMajorIndex, rowMajorQuery, stream, metric);
+      fusedL2Knn(D,
+                 out_i_ptr,
+                 out_d_ptr,
+                 input[i],
+                 search_items,
+                 sizes[i],
+                 n,
+                 k,
+                 rowMajorIndex,
+                 rowMajorQuery,
+                 stream,
+                 metric);
     } else {
       switch (metric) {
         case raft::distance::DistanceType::Haversine:
@@ -286,8 +318,7 @@ void brute_force_knn_impl(std::vector<float *> &input,
                  "Haversine distance requires 2 dimensions "
                  "(latitude / longitude).");
 
-          haversine_knn(out_i_ptr, out_d_ptr, input[i], search_items, sizes[i],
-                        n, k, stream);
+          haversine_knn(out_i_ptr, out_d_ptr, input[i], search_items, sizes[i], n, k, stream);
           break;
         default:
           faiss::MetricType m = build_faiss_metric(metric);
@@ -298,18 +329,18 @@ void brute_force_knn_impl(std::vector<float *> &input,
           gpu_res.setDefaultStream(device, stream);
 
           faiss::gpu::GpuDistanceParams args;
-          args.metric = m;
-          args.metricArg = metricArg;
-          args.k = k;
-          args.dims = D;
-          args.vectors = input[i];
+          args.metric          = m;
+          args.metricArg       = metricArg;
+          args.k               = k;
+          args.dims            = D;
+          args.vectors         = input[i];
           args.vectorsRowMajor = rowMajorIndex;
-          args.numVectors = sizes[i];
-          args.queries = search_items;
+          args.numVectors      = sizes[i];
+          args.queries         = search_items;
           args.queriesRowMajor = rowMajorQuery;
-          args.numQueries = n;
-          args.outDistances = out_d_ptr;
-          args.outIndices = out_i_ptr;
+          args.numQueries      = n;
+          args.outDistances    = out_d_ptr;
+          args.outIndices      = out_i_ptr;
 
           /**
            * @todo: Until FAISS supports pluggable allocation strategies,
@@ -333,8 +364,7 @@ void brute_force_knn_impl(std::vector<float *> &input,
   if (input.size() > 1 || translations != nullptr) {
     // This is necessary for proper index translations. If there are
     // no translations or partitions to combine, it can be skipped.
-    knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream,
-                    trans.data());
+    knn_merge_parts(out_D, out_I, res_D, res_I, n, input.size(), k, userStream, trans.data());
   }
 
   // Perform necessary post-processing
@@ -342,14 +372,12 @@ void brute_force_knn_impl(std::vector<float *> &input,
       metric == raft::distance::DistanceType::L2SqrtUnexpanded ||
       metric == raft::distance::DistanceType::LpUnexpanded) {
     /**
-	* post-processing
-	*/
+     * post-processing
+     */
     float p = 0.5;  // standard l2
-    if (metric == raft::distance::DistanceType::LpUnexpanded)
-      p = 1.0 / metricArg;
+    if (metric == raft::distance::DistanceType::LpUnexpanded) p = 1.0 / metricArg;
     raft::linalg::unaryOp<float>(
-      res_D, res_D, n * k,
-      [p] __device__(float input) { return powf(input, p); }, userStream);
+      res_D, res_D, n * k, [p] __device__(float input) { return powf(input, p); }, userStream);
   }
 
   query_metric_processor->revert(search_items);
diff --git a/cpp/include/raft/spatial/knn/detail/processing.hpp b/cpp/include/raft/spatial/knn/detail/processing.hpp
index b66ea025a2..f87fffc6cf 100644
--- a/cpp/include/raft/spatial/knn/detail/processing.hpp
+++ b/cpp/include/raft/spatial/knn/detail/processing.hpp
@@ -37,11 +37,11 @@ namespace knn {
 template <typename math_t>
 class MetricProcessor {
  public:
-  virtual void preprocess(math_t *data) {}
+  virtual void preprocess(math_t* data) {}
 
-  virtual void revert(math_t *data) {}
+  virtual void revert(math_t* data) {}
 
-  virtual void postprocess(math_t *data) {}
+  virtual void postprocess(math_t* data) {}
 
   virtual ~MetricProcessor() = default;
 };
@@ -57,37 +57,57 @@ class CosineMetricProcessor : public MetricProcessor<math_t> {
   rmm::device_uvector<math_t> colsums_;
 
  public:
-  CosineMetricProcessor(size_t n_rows, size_t n_cols, int k, bool row_major,
-                        cudaStream_t stream)
+  CosineMetricProcessor(size_t n_rows, size_t n_cols, int k, bool row_major, cudaStream_t stream)
     : stream_(stream),
       colsums_(n_rows, stream),
       n_cols_(n_cols),
       n_rows_(n_rows),
       row_major_(row_major),
-      k_(k) {}
+      k_(k)
+  {
+  }
 
-  void preprocess(math_t *data) {
-    raft::linalg::rowNorm(colsums_.data(), data, n_cols_, n_rows_,
-                          raft::linalg::NormType::L2Norm, row_major_, stream_,
+  void preprocess(math_t* data)
+  {
+    raft::linalg::rowNorm(colsums_.data(),
+                          data,
+                          n_cols_,
+                          n_rows_,
+                          raft::linalg::NormType::L2Norm,
+                          row_major_,
+                          stream_,
                           [] __device__(math_t in) { return sqrtf(in); });
 
     raft::linalg::matrixVectorOp(
-      data, data, colsums_.data(), n_cols_, n_rows_, row_major_, false,
+      data,
+      data,
+      colsums_.data(),
+      n_cols_,
+      n_rows_,
+      row_major_,
+      false,
       [] __device__(math_t mat_in, math_t vec_in) { return mat_in / vec_in; },
       stream_);
   }
 
-  void revert(math_t *data) {
+  void revert(math_t* data)
+  {
     raft::linalg::matrixVectorOp(
-      data, data, colsums_.data(), n_cols_, n_rows_, row_major_, false,
+      data,
+      data,
+      colsums_.data(),
+      n_cols_,
+      n_rows_,
+      row_major_,
+      false,
       [] __device__(math_t mat_in, math_t vec_in) { return mat_in * vec_in; },
       stream_);
   }
 
-  void postprocess(math_t *data) {
+  void postprocess(math_t* data)
+  {
     raft::linalg::unaryOp(
-      data, data, k_ * n_rows_, [] __device__(math_t in) { return 1 - in; },
-      stream_);
+      data, data, k_ * n_rows_, [] __device__(math_t in) { return 1 - in; }, stream_);
   }
 
   ~CosineMetricProcessor() = default;
@@ -98,41 +118,59 @@ class CorrelationMetricProcessor : public CosineMetricProcessor<math_t> {
   using cosine = CosineMetricProcessor<math_t>;
 
  public:
-  CorrelationMetricProcessor(size_t n_rows, size_t n_cols, int k,
-                             bool row_major, cudaStream_t stream)
-    : CosineMetricProcessor<math_t>(n_rows, n_cols, k, row_major, stream),
-      means_(n_rows, stream) {}
+  CorrelationMetricProcessor(
+    size_t n_rows, size_t n_cols, int k, bool row_major, cudaStream_t stream)
+    : CosineMetricProcessor<math_t>(n_rows, n_cols, k, row_major, stream), means_(n_rows, stream)
+  {
+  }
 
-  void preprocess(math_t *data) {
+  void preprocess(math_t* data)
+  {
     math_t normalizer_const = 1.0 / (math_t)cosine::n_cols_;
 
-    raft::linalg::reduce(means_.data(), data, cosine::n_cols_, cosine::n_rows_,
-                         (math_t)0.0, cosine::row_major_, true,
+    raft::linalg::reduce(means_.data(),
+                         data,
+                         cosine::n_cols_,
+                         cosine::n_rows_,
+                         (math_t)0.0,
+                         cosine::row_major_,
+                         true,
                          cosine::stream_);
 
     raft::linalg::unaryOp(
-      means_.data(), means_.data(), cosine::n_rows_,
+      means_.data(),
+      means_.data(),
+      cosine::n_rows_,
       [=] __device__(math_t in) { return in * normalizer_const; },
       cosine::stream_);
 
-    raft::stats::meanCenter(data, data, means_.data(), cosine::n_cols_,
-                            cosine::n_rows_, cosine::row_major_, false,
+    raft::stats::meanCenter(data,
+                            data,
+                            means_.data(),
+                            cosine::n_cols_,
+                            cosine::n_rows_,
+                            cosine::row_major_,
+                            false,
                             cosine::stream_);
 
     CosineMetricProcessor<math_t>::preprocess(data);
   }
 
-  void revert(math_t *data) {
+  void revert(math_t* data)
+  {
     CosineMetricProcessor<math_t>::revert(data);
 
-    raft::stats::meanAdd(data, data, means_.data(), cosine::n_cols_,
-                         cosine::n_rows_, cosine::row_major_, false,
+    raft::stats::meanAdd(data,
+                         data,
+                         means_.data(),
+                         cosine::n_cols_,
+                         cosine::n_rows_,
+                         cosine::row_major_,
+                         false,
                          cosine::stream_);
   }
 
-  void postprocess(math_t *data) {
-    CosineMetricProcessor<math_t>::postprocess(data);
-  }
+  void postprocess(math_t* data) { CosineMetricProcessor<math_t>::postprocess(data); }
 
   ~CorrelationMetricProcessor() = default;
 
@@ -142,33 +180,30 @@ class CorrelationMetricProcessor : public CosineMetricProcessor<math_t> {
 template <typename math_t>
 class DefaultMetricProcessor : public MetricProcessor<math_t> {
  public:
-  void preprocess(math_t *data) {}
+  void preprocess(math_t* data) {}
 
-  void revert(math_t *data) {}
+  void revert(math_t* data) {}
 
-  void postprocess(math_t *data) {}
+  void postprocess(math_t* data) {}
 
   ~DefaultMetricProcessor() = default;
 };
 
 template <typename math_t>
 inline std::unique_ptr<MetricProcessor<math_t>> create_processor(
-  distance::DistanceType metric, int n, int D, int k, bool rowMajorQuery,
-  cudaStream_t userStream) {
-  MetricProcessor<math_t> *mp = nullptr;
+  distance::DistanceType metric, int n, int D, int k, bool rowMajorQuery, cudaStream_t userStream)
+{
+  MetricProcessor<math_t>* mp = nullptr;
 
   switch (metric) {
     case distance::DistanceType::CosineExpanded:
-      mp =
-        new CosineMetricProcessor<math_t>(n, D, k, rowMajorQuery, userStream);
+      mp = new CosineMetricProcessor<math_t>(n, D, k, rowMajorQuery, userStream);
       break;
 
     case distance::DistanceType::CorrelationExpanded:
-      mp = new CorrelationMetricProcessor<math_t>(n, D, k, rowMajorQuery,
-                                                  userStream);
+      mp = new CorrelationMetricProcessor<math_t>(n, D, k, rowMajorQuery, userStream);
       break;
-    default:
-      mp = new DefaultMetricProcessor<math_t>();
+    default: mp = new DefaultMetricProcessor<math_t>();
   }
 
   return std::unique_ptr<MetricProcessor<math_t>>(mp);
diff --git a/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh b/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh
index 045edad0e6..88fa58a4d7 100644
--- a/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/selection_faiss.cuh
@@ -31,27 +31,33 @@ namespace spatial {
 namespace knn {
 namespace detail {
 
-template <typename K, typename IndexType, bool select_min, int warp_q,
-          int thread_q, int tpb>
-__global__ void select_k_kernel(K *inK, IndexType *inV, size_t n_rows,
-                                size_t n_cols, K *outK, IndexType *outV,
-                                K initK, IndexType initV, int k) {
+template <typename K, typename IndexType, bool select_min, int warp_q, int thread_q, int tpb>
+__global__ void select_k_kernel(K* inK,
+                                IndexType* inV,
+                                size_t n_rows,
+                                size_t n_cols,
+                                K* outK,
+                                IndexType* outV,
+                                K initK,
+                                IndexType initV,
+                                int k)
+{
   constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize;
 
   __shared__ K smemK[kNumWarps * warp_q];
   __shared__ IndexType smemV[kNumWarps * warp_q];
 
-  faiss::gpu::BlockSelect<K, IndexType, select_min, faiss::gpu::Comparator<K>,
-                          warp_q, thread_q, tpb>
-    heap(initK, initV, smemK, smemV, k);
+  faiss::gpu::
+    BlockSelect<K, IndexType, select_min, faiss::gpu::Comparator<K>, warp_q, thread_q, tpb>
+      heap(initK, initV, smemK, smemV, k);
 
   // Grid is exactly sized to rows available
   int row = blockIdx.x;
-  int i = threadIdx.x;
+  int i   = threadIdx.x;
 
-  int idx = row * n_cols;
-  K *inKStart = inK + idx + i;
-  IndexType *inVStart = inV + idx + i;
+  int idx             = row * n_cols;
+  K* inKStart         = inK + idx + i;
+  IndexType* inVStart = inV + idx + i;
 
   // Whole warps must participate in the selection
   int limit = faiss::gpu::utils::roundDown(n_cols, faiss::gpu::kWarpSize);
@@ -78,27 +84,31 @@ __global__ void select_k_kernel(K *inK, IndexType *inV, size_t n_rows,
   }
 }
 
-template <typename value_idx = int, typename value_t = float, int warp_q,
-          int thread_q>
-inline void select_k_impl(value_t *inK, value_idx *inV, size_t n_rows,
-                          size_t n_cols, value_t *outK, value_idx *outV,
-                          bool select_min, int k, cudaStream_t stream) {
+template <typename value_idx = int, typename value_t = float, int warp_q, int thread_q>
+inline void select_k_impl(value_t* inK,
+                          value_idx* inV,
+                          size_t n_rows,
+                          size_t n_cols,
+                          value_t* outK,
+                          value_idx* outV,
+                          bool select_min,
+                          int k,
+                          cudaStream_t stream)
+{
   auto grid = dim3(n_rows);
 
   constexpr int n_threads = (warp_q <= 1024) ? 128 : 64;
-  auto block = dim3(n_threads);
+  auto block              = dim3(n_threads);
 
-  auto kInit = select_min ? faiss::gpu::Limits<value_t>::getMax()
-                          : faiss::gpu::Limits<value_t>::getMin();
+  auto kInit =
+    select_min ? faiss::gpu::Limits<value_t>::getMax() : faiss::gpu::Limits<value_t>::getMin();
   auto vInit = -1;
   if (select_min) {
     select_k_kernel<value_t, value_idx, false, warp_q, thread_q, n_threads>
-      <<<grid, block, 0, stream>>>(inK, inV, n_rows, n_cols, outK, outV, kInit,
-                                   vInit, k);
+      <<<grid, block, 0, stream>>>(inK, inV, n_rows, n_cols, outK, outV, kInit, vInit, k);
   } else {
     select_k_kernel<value_t, value_idx, true, warp_q, thread_q, n_threads>
-      <<<grid, block, 0, stream>>>(inK, inV, n_rows, n_cols, outK, outV, kInit,
-                                   vInit, k);
+      <<<grid, block, 0, stream>>>(inK, inV, n_rows, n_cols, outK, outV, kInit, vInit, k);
   }
   CUDA_CHECK(cudaGetLastError());
 }
@@ -118,30 +128,37 @@ inline void select_k_impl(value_t *inK, value_idx *inV, size_t n_rows,
  * @param[in] stream CUDA stream to use
  */
 template <typename value_idx = int, typename value_t = float>
-inline void select_k(value_t *inK, value_idx *inV, size_t n_rows, size_t n_cols,
-                     value_t *outK, value_idx *outV, bool select_min, int k,
-                     cudaStream_t stream) {
+inline void select_k(value_t* inK,
+                     value_idx* inV,
+                     size_t n_rows,
+                     size_t n_cols,
+                     value_t* outK,
+                     value_idx* outV,
+                     bool select_min,
+                     int k,
+                     cudaStream_t stream)
+{
   if (k == 1)
-    select_k_impl<value_idx, value_t, 1, 1>(inK, inV, n_rows, n_cols, outK,
-                                            outV, select_min, k, stream);
+    select_k_impl<value_idx, value_t, 1, 1>(
+      inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream);
   else if (k <= 32)
-    select_k_impl<value_idx, value_t, 32, 2>(inK, inV, n_rows, n_cols, outK,
-                                             outV, select_min, k, stream);
+    select_k_impl<value_idx, value_t, 32, 2>(
+      inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream);
   else if (k <= 64)
-    select_k_impl<value_idx, value_t, 64, 3>(inK, inV, n_rows, n_cols, outK,
-                                             outV, select_min, k, stream);
+    select_k_impl<value_idx, value_t, 64, 3>(
+      inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream);
   else if (k <= 128)
-    select_k_impl<value_idx, value_t, 128, 3>(inK, inV, n_rows, n_cols, outK,
-                                              outV, select_min, k, stream);
+    select_k_impl<value_idx, value_t, 128, 3>(
+      inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream);
   else if (k <= 256)
-    select_k_impl<value_idx, value_t, 256, 4>(inK, inV, n_rows, n_cols, outK,
-                                              outV, select_min, k, stream);
+    select_k_impl<value_idx, value_t, 256, 4>(
+      inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream);
   else if (k <= 512)
-    select_k_impl<value_idx, value_t, 512, 8>(inK, inV, n_rows, n_cols, outK,
-                                              outV, select_min, k, stream);
+    select_k_impl<value_idx, value_t, 512, 8>(
+      inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream);
   else if (k <= 1024)
-    select_k_impl<value_idx, value_t, 1024, 8>(inK, inV, n_rows, n_cols, outK,
-                                               outV, select_min, k, stream);
+    select_k_impl<value_idx, value_t, 1024, 8>(
+      inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream);
 }
 
 };  // namespace detail
diff --git a/cpp/include/raft/spatial/knn/detail/warp_select_faiss.cuh b/cpp/include/raft/spatial/knn/detail/warp_select_faiss.cuh
index 84719a0e4b..abc4cdf545 100644
--- a/cpp/include/raft/spatial/knn/detail/warp_select_faiss.cuh
+++ b/cpp/include/raft/spatial/knn/detail/warp_select_faiss.cuh
@@ -30,21 +30,25 @@ struct KeyValuePair {
   __host__ __device__ __forceinline__ KeyValuePair() {}
 
   /// Copy Constructors
-  __host__ __device__ __forceinline__
-  KeyValuePair(cub::KeyValuePair<_Key, _Value>& kvp)
-    : key(kvp.key), value(kvp.value) {}
+  __host__ __device__ __forceinline__ KeyValuePair(cub::KeyValuePair<_Key, _Value>& kvp)
+    : key(kvp.key), value(kvp.value)
+  {
+  }
 
-  __host__ __device__ __forceinline__
-  KeyValuePair(faiss::gpu::KeyValuePair<_Key, _Value>& kvp)
-    : key(kvp.key), value(kvp.value) {}
+  __host__ __device__ __forceinline__ KeyValuePair(faiss::gpu::KeyValuePair<_Key, _Value>& kvp)
+    : key(kvp.key), value(kvp.value)
+  {
+  }
 
   /// Constructor
-  __host__ __device__ __forceinline__ KeyValuePair(Key const& key,
-                                                   Value const& value)
-    : key(key), value(value) {}
+  __host__ __device__ __forceinline__ KeyValuePair(Key const& key, Value const& value)
+    : key(key), value(value)
+  {
+  }
 
   /// Inequality operator
-  __host__ __device__ __forceinline__ bool operator!=(const KeyValuePair& b) {
+  __host__ __device__ __forceinline__ bool operator!=(const KeyValuePair& b)
+  {
     return (value != b.value) || (key != b.key);
   }
 };
@@ -117,9 +121,9 @@ struct KeyValuePair {
 //
 // If IsBitonic is false, the first stage is reversed, so we don't
 // need to sort directionally. It's still technically a bitonic sort.
-template <typename K, typename V, int L, bool Dir, typename Comp,
-          bool IsBitonic>
-inline __device__ void warpBitonicMergeLE16KVP(K& k, KeyValuePair<K, V>& v) {
+template <typename K, typename V, int L, bool Dir, typename Comp, bool IsBitonic>
+inline __device__ void warpBitonicMergeLE16KVP(K& k, KeyValuePair<K, V>& v)
+{
   static_assert(utils::isPowerOf2(L), "L must be a power-of-2");
   static_assert(L <= kWarpSize / 2, "merge list size must be <= 16");
 
@@ -129,7 +133,7 @@ inline __device__ void warpBitonicMergeLE16KVP(K& k, KeyValuePair<K, V>& v) {
     // Reverse the first comparison stage.
     // For example, merging a list of size 8 has the exchanges:
     // 0 <-> 15, 1 <-> 14, ...
-    K otherK = shfl_xor(k, 2 * L - 1);
+    K otherK  = shfl_xor(k, 2 * L - 1);
     K otherVk = shfl_xor(v.key, 2 * L - 1);
     V otherVv = shfl_xor(v.value, 2 * L - 1);
 
@@ -157,7 +161,7 @@ inline __device__ void warpBitonicMergeLE16KVP(K& k, KeyValuePair<K, V>& v) {
 
 #pragma unroll
   for (int stride = IsBitonic ? L : L / 2; stride > 0; stride /= 2) {
-    K otherK = shfl_xor(k, stride);
+    K otherK  = shfl_xor(k, stride);
     K otherVk = shfl_xor(v.key, stride);
     V otherVv = shfl_xor(v.value, stride);
 
@@ -183,9 +187,9 @@ inline __device__ void warpBitonicMergeLE16KVP(K& k, KeyValuePair<K, V>& v) {
 
 // Template for performing a bitonic merge of an arbitrary set of
 // registers
-template <typename K, typename V, int N, bool Dir, typename Comp, bool Low,
-          bool Pow2>
-struct BitonicMergeStepKVP {};
+template <typename K, typename V, int N, bool Dir, typename Comp, bool Low, bool Pow2>
+struct BitonicMergeStepKVP {
+};
 
 //
 // Power-of-2 merge specialization
@@ -194,7 +198,8 @@ struct BitonicMergeStepKVP {};
 // All merges eventually call this
 template <typename K, typename V, bool Dir, typename Comp, bool Low>
 struct BitonicMergeStepKVP<K, V, 1, Dir, Comp, Low, true> {
-  static inline __device__ void merge(K k[1], KeyValuePair<K, V> v[1]) {
+  static inline __device__ void merge(K k[1], KeyValuePair<K, V> v[1])
+  {
     // Use warp shuffles
     warpBitonicMergeLE16KVP<K, V, 16, Dir, Comp, true>(k[0], v[0]);
   }
@@ -202,16 +207,17 @@ struct BitonicMergeStepKVP<K, V, 1, Dir, Comp, Low, true> {
 
 template <typename K, typename V, int N, bool Dir, typename Comp, bool Low>
 struct BitonicMergeStepKVP<K, V, N, Dir, Comp, Low, true> {
-  static inline __device__ void merge(K k[N], KeyValuePair<K, V> v[N]) {
+  static inline __device__ void merge(K k[N], KeyValuePair<K, V> v[N])
+  {
     static_assert(utils::isPowerOf2(N), "must be power of 2");
     static_assert(N > 1, "must be N > 1");
 
 #pragma unroll
     for (int i = 0; i < N / 2; ++i) {
-      K& ka = k[i];
+      K& ka                  = k[i];
       KeyValuePair<K, V>& va = v[i];
 
-      K& kb = k[i + N / 2];
+      K& kb                  = k[i + N / 2];
       KeyValuePair<K, V>& vb = v[i + N / 2];
 
       bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb);
@@ -226,18 +232,17 @@ struct BitonicMergeStepKVP<K, V, N, Dir, Comp, Low, true> {
 
 #pragma unroll
       for (int i = 0; i < N / 2; ++i) {
-        newK[i] = k[i];
-        newV[i].key = v[i].key;
+        newK[i]       = k[i];
+        newV[i].key   = v[i].key;
         newV[i].value = v[i].value;
       }
 
-      BitonicMergeStepKVP<K, V, N / 2, Dir, Comp, true, true>::merge(newK,
-                                                                     newV);
+      BitonicMergeStepKVP<K, V, N / 2, Dir, Comp, true, true>::merge(newK, newV);
 
 #pragma unroll
       for (int i = 0; i < N / 2; ++i) {
-        k[i] = newK[i];
-        v[i].key = newV[i].key;
+        k[i]       = newK[i];
+        v[i].key   = newV[i].key;
         v[i].value = newV[i].value;
       }
     }
@@ -248,18 +253,17 @@ struct BitonicMergeStepKVP<K, V, N, Dir, Comp, Low, true> {
 
 #pragma unroll
       for (int i = 0; i < N / 2; ++i) {
-        newK[i] = k[i + N / 2];
-        newV[i].key = v[i + N / 2].key;
+        newK[i]       = k[i + N / 2];
+        newV[i].key   = v[i + N / 2].key;
         newV[i].value = v[i + N / 2].value;
       }
 
-      BitonicMergeStepKVP<K, V, N / 2, Dir, Comp, false, true>::merge(newK,
-                                                                      newV);
+      BitonicMergeStepKVP<K, V, N / 2, Dir, Comp, false, true>::merge(newK, newV);
 
 #pragma unroll
       for (int i = 0; i < N / 2; ++i) {
-        k[i + N / 2] = newK[i];
-        v[i + N / 2].key = newV[i].key;
+        k[i + N / 2]       = newK[i];
+        v[i + N / 2].key   = newV[i].key;
         v[i + N / 2].value = newV[i].value;
       }
     }
@@ -273,7 +277,8 @@ struct BitonicMergeStepKVP<K, V, N, Dir, Comp, Low, true> {
 // Low recursion
 template <typename K, typename V, int N, bool Dir, typename Comp>
 struct BitonicMergeStepKVP<K, V, N, Dir, Comp, true, false> {
-  static inline __device__ void merge(K k[N], KeyValuePair<K, V> v[N]) {
+  static inline __device__ void merge(K k[N], KeyValuePair<K, V> v[N])
+  {
     static_assert(!utils::isPowerOf2(N), "must be non-power-of-2");
     static_assert(N >= 3, "must be N >= 3");
 
@@ -281,10 +286,10 @@ struct BitonicMergeStepKVP<K, V, N, Dir, Comp, true, false> {
 
 #pragma unroll
     for (int i = 0; i < N - kNextHighestPowerOf2 / 2; ++i) {
-      K& ka = k[i];
+      K& ka                  = k[i];
       KeyValuePair<K, V>& va = v[i];
 
-      K& kb = k[i + kNextHighestPowerOf2 / 2];
+      K& kb                  = k[i + kNextHighestPowerOf2 / 2];
       KeyValuePair<K, V>& vb = v[i + kNextHighestPowerOf2 / 2];
 
       bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb);
@@ -293,7 +298,7 @@ struct BitonicMergeStepKVP<K, V, N, Dir, Comp, true, false> {
       swap(s, va.value, vb.value);
     }
 
-    constexpr int kLowSize = N - kNextHighestPowerOf2 / 2;
+    constexpr int kLowSize  = N - kNextHighestPowerOf2 / 2;
     constexpr int kHighSize = kNextHighestPowerOf2 / 2;
     {
       K newK[kLowSize];
@@ -301,23 +306,26 @@ struct BitonicMergeStepKVP<K, V, N, Dir, Comp, true, false> {
 
 #pragma unroll
       for (int i = 0; i < kLowSize; ++i) {
-        newK[i] = k[i];
-        newV[i].key = v[i].key;
+        newK[i]       = k[i];
+        newV[i].key   = v[i].key;
         newV[i].value = v[i].value;
       }
 
-      constexpr bool kLowIsPowerOf2 =
-        utils::isPowerOf2(N - kNextHighestPowerOf2 / 2);
+      constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(N - kNextHighestPowerOf2 / 2);
       // FIXME: compiler doesn't like this expression? compiler bug?
       //      constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(kLowSize);
-      BitonicMergeStepKVP<K, V, kLowSize, Dir, Comp,
+      BitonicMergeStepKVP<K,
+                          V,
+                          kLowSize,
+                          Dir,
+                          Comp,
                           true,  // low
                           kLowIsPowerOf2>::merge(newK, newV);
 
 #pragma unroll
       for (int i = 0; i < kLowSize; ++i) {
-        k[i] = newK[i];
-        v[i].key = newV[i].key;
+        k[i]       = newK[i];
+        v[i].key   = newV[i].key;
         v[i].value = newV[i].value;
       }
     }
@@ -328,23 +336,26 @@ struct BitonicMergeStepKVP<K, V, N, Dir, Comp, true, false> {
 
 #pragma unroll
       for (int i = 0; i < kHighSize; ++i) {
-        newK[i] = k[i + kLowSize];
-        newV[i].key = v[i + kLowSize].key;
+        newK[i]       = k[i + kLowSize];
+        newV[i].key   = v[i + kLowSize].key;
         newV[i].value = v[i + kLowSize].value;
       }
 
-      constexpr bool kHighIsPowerOf2 =
-        utils::isPowerOf2(kNextHighestPowerOf2 / 2);
+      constexpr bool kHighIsPowerOf2 = utils::isPowerOf2(kNextHighestPowerOf2 / 2);
       // FIXME: compiler doesn't like this expression? compiler bug?
       //      constexpr bool kHighIsPowerOf2 = utils::isPowerOf2(kHighSize);
-      BitonicMergeStepKVP<K, V, kHighSize, Dir, Comp,
+      BitonicMergeStepKVP<K,
+                          V,
+                          kHighSize,
+                          Dir,
+                          Comp,
                           false,  // high
                           kHighIsPowerOf2>::merge(newK, newV);
 
 #pragma unroll
       for (int i = 0; i < kHighSize; ++i) {
-        k[i + kLowSize] = newK[i];
-        v[i + kLowSize].key = newV[i].key;
+        k[i + kLowSize]       = newK[i];
+        v[i + kLowSize].key   = newV[i].key;
         v[i + kLowSize].value = newV[i].value;
       }
     }
@@ -354,7 +365,8 @@ struct BitonicMergeStepKVP<K, V, N, Dir, Comp, true, false> {
 // High recursion
 template <typename K, typename V, int N, bool Dir, typename Comp>
 struct BitonicMergeStepKVP<K, V, N, Dir, Comp, false, false> {
-  static inline __device__ void merge(K k[N], KeyValuePair<K, V> v[N]) {
+  static inline __device__ void merge(K k[N], KeyValuePair<K, V> v[N])
+  {
     static_assert(!utils::isPowerOf2(N), "must be non-power-of-2");
     static_assert(N >= 3, "must be N >= 3");
 
@@ -362,10 +374,10 @@ struct BitonicMergeStepKVP<K, V, N, Dir, Comp, false, false> {
 
 #pragma unroll
     for (int i = 0; i < N - kNextHighestPowerOf2 / 2; ++i) {
-      K& ka = k[i];
+      K& ka                  = k[i];
       KeyValuePair<K, V>& va = v[i];
 
-      K& kb = k[i + kNextHighestPowerOf2 / 2];
+      K& kb                  = k[i + kNextHighestPowerOf2 / 2];
       KeyValuePair<K, V>& vb = v[i + kNextHighestPowerOf2 / 2];
 
       bool s = Dir ? Comp::gt(ka, kb) : Comp::lt(ka, kb);
@@ -374,7 +386,7 @@ struct BitonicMergeStepKVP<K, V, N, Dir, Comp, false, false> {
       swap(s, va.value, vb.value);
     }
 
-    constexpr int kLowSize = kNextHighestPowerOf2 / 2;
+    constexpr int kLowSize  = kNextHighestPowerOf2 / 2;
     constexpr int kHighSize = N - kNextHighestPowerOf2 / 2;
     {
       K newK[kLowSize];
@@ -382,23 +394,26 @@ struct BitonicMergeStepKVP<K, V, N, Dir, Comp, false, false> {
 
 #pragma unroll
       for (int i = 0; i < kLowSize; ++i) {
-        newK[i] = k[i];
-        newV[i].key = v[i].key;
+        newK[i]       = k[i];
+        newV[i].key   = v[i].key;
         newV[i].value = v[i].value;
       }
 
-      constexpr bool kLowIsPowerOf2 =
-        utils::isPowerOf2(kNextHighestPowerOf2 / 2);
+      constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(kNextHighestPowerOf2 / 2);
       // FIXME: compiler doesn't like this expression? compiler bug?
       //      constexpr bool kLowIsPowerOf2 = utils::isPowerOf2(kLowSize);
-      BitonicMergeStepKVP<K, V, kLowSize, Dir, Comp,
+      BitonicMergeStepKVP<K,
+                          V,
+                          kLowSize,
+                          Dir,
+                          Comp,
                           true,  // low
                           kLowIsPowerOf2>::merge(newK, newV);
 
 #pragma unroll
       for (int i = 0; i < kLowSize; ++i) {
-        k[i] = newK[i];
-        v[i].key = newV[i].key;
+        k[i]       = newK[i];
+        v[i].key   = newV[i].key;
         v[i].value = newV[i].value;
       }
     }
@@ -409,23 +424,26 @@ struct BitonicMergeStepKVP<K, V, N, Dir, Comp, false, false> {
 
 #pragma unroll
       for (int i = 0; i < kHighSize; ++i) {
-        newK[i] = k[i + kLowSize];
-        newV[i].key = v[i + kLowSize].key;
+        newK[i]       = k[i + kLowSize];
+        newV[i].key   = v[i + kLowSize].key;
         newV[i].value = v[i + kLowSize].value;
       }
 
-      constexpr bool kHighIsPowerOf2 =
-        utils::isPowerOf2(N - kNextHighestPowerOf2 / 2);
+      constexpr bool kHighIsPowerOf2 = utils::isPowerOf2(N - kNextHighestPowerOf2 / 2);
       // FIXME: compiler doesn't like this expression? compiler bug?
       //      constexpr bool kHighIsPowerOf2 = utils::isPowerOf2(kHighSize);
-      BitonicMergeStepKVP<K, V, kHighSize, Dir, Comp,
+      BitonicMergeStepKVP<K,
+                          V,
+                          kHighSize,
+                          Dir,
+                          Comp,
                           false,  // high
                           kHighIsPowerOf2>::merge(newK, newV);
 
 #pragma unroll
       for (int i = 0; i < kHighSize; ++i) {
-        k[i + kLowSize] = newK[i];
-        v[i + kLowSize].key = newV[i].key;
+        k[i + kLowSize]       = newK[i];
+        v[i + kLowSize].key   = newV[i].key;
         v[i + kLowSize].value = newV[i].value;
       }
     }
@@ -436,20 +454,20 @@ struct BitonicMergeStepKVP<K, V, N, Dir, Comp, false, false> {
 /// i.e., merges a sorted k/v list of size kWarpSize * N1 with a
 /// sorted k/v list of size kWarpSize * N2, where N1 and N2 are any
 /// value >= 1
-template <typename K, typename V, int N1, int N2, bool Dir, typename Comp,
-          bool FullMerge = true>
+template <typename K, typename V, int N1, int N2, bool Dir, typename Comp, bool FullMerge = true>
 inline __device__ void warpMergeAnyRegistersKVP(K k1[N1],
                                                 KeyValuePair<K, V> v1[N1],
                                                 K k2[N2],
-                                                KeyValuePair<K, V> v2[N2]) {
+                                                KeyValuePair<K, V> v2[N2])
+{
   constexpr int kSmallestN = N1 < N2 ? N1 : N2;
 
 #pragma unroll
   for (int i = 0; i < kSmallestN; ++i) {
-    K& ka = k1[N1 - 1 - i];
+    K& ka                  = k1[N1 - 1 - i];
     KeyValuePair<K, V>& va = v1[N1 - 1 - i];
 
-    K& kb = k2[i];
+    K& kb                  = k2[i];
     KeyValuePair<K, V>& vb = v2[i];
 
     K otherKa;
@@ -457,13 +475,13 @@ inline __device__ void warpMergeAnyRegistersKVP(K k1[N1],
 
     if (FullMerge) {
       // We need the other values
-      otherKa = shfl_xor(ka, kWarpSize - 1);
+      otherKa    = shfl_xor(ka, kWarpSize - 1);
       K otherVak = shfl_xor(va.key, kWarpSize - 1);
       V otherVav = shfl_xor(va.value, kWarpSize - 1);
-      otherVa = KeyValuePair(otherVak, otherVav);
+      otherVa    = KeyValuePair(otherVak, otherVav);
     }
 
-    K otherKb = shfl_xor(kb, kWarpSize - 1);
+    K otherKb  = shfl_xor(kb, kWarpSize - 1);
     K otherVbk = shfl_xor(vb.key, kWarpSize - 1);
     V otherVbv = shfl_xor(vb.value, kWarpSize - 1);
 
@@ -487,12 +505,10 @@ inline __device__ void warpMergeAnyRegistersKVP(K k1[N1],
     }
   }
 
-  BitonicMergeStepKVP<K, V, N1, Dir, Comp, true, utils::isPowerOf2(N1)>::merge(
-    k1, v1);
+  BitonicMergeStepKVP<K, V, N1, Dir, Comp, true, utils::isPowerOf2(N1)>::merge(k1, v1);
   if (FullMerge) {
     // Only if we care about N2 do we need to bother merging it fully
-    BitonicMergeStepKVP<K, V, N2, Dir, Comp, false,
-                        utils::isPowerOf2(N2)>::merge(k2, v2);
+    BitonicMergeStepKVP<K, V, N2, Dir, Comp, false, utils::isPowerOf2(N2)>::merge(k2, v2);
   }
 }
 
@@ -500,7 +516,8 @@ inline __device__ void warpMergeAnyRegistersKVP(K k1[N1],
 // bitonic sort
 template <typename K, typename V, int N, bool Dir, typename Comp>
 struct BitonicSortStepKVP {
-  static inline __device__ void sort(K k[N], KeyValuePair<K, V> v[N]) {
+  static inline __device__ void sort(K k[N], KeyValuePair<K, V> v[N])
+  {
     static_assert(N > 1, "did not hit specialized case");
 
     // Sort recursively
@@ -512,8 +529,8 @@ struct BitonicSortStepKVP {
 
 #pragma unroll
     for (int i = 0; i < kSizeA; ++i) {
-      aK[i] = k[i];
-      aV[i].key = v[i].key;
+      aK[i]       = k[i];
+      aV[i].key   = v[i].key;
       aV[i].value = v[i].value;
     }
 
@@ -524,8 +541,8 @@ struct BitonicSortStepKVP {
 
 #pragma unroll
     for (int i = 0; i < kSizeB; ++i) {
-      bK[i] = k[i + kSizeA];
-      bV[i].key = v[i + kSizeA].key;
+      bK[i]       = k[i + kSizeA];
+      bV[i].key   = v[i + kSizeA].key;
       bV[i].value = v[i + kSizeA].value;
     }
 
@@ -536,15 +553,15 @@ struct BitonicSortStepKVP {
 
 #pragma unroll
     for (int i = 0; i < kSizeA; ++i) {
-      k[i] = aK[i];
-      v[i].key = aV[i].key;
+      k[i]       = aK[i];
+      v[i].key   = aV[i].key;
       v[i].value = aV[i].value;
     }
 
 #pragma unroll
     for (int i = 0; i < kSizeB; ++i) {
-      k[i + kSizeA] = bK[i];
-      v[i + kSizeA].key = bV[i].key;
+      k[i + kSizeA]       = bK[i];
+      v[i + kSizeA].key   = bV[i].key;
       v[i + kSizeA].value = bV[i].value;
     }
   }
@@ -553,7 +570,8 @@ struct BitonicSortStepKVP {
 // Single warp (N == 1) sorting specialization
 template <typename K, typename V, bool Dir, typename Comp>
 struct BitonicSortStepKVP<K, V, 1, Dir, Comp> {
-  static inline __device__ void sort(K k[1], KeyValuePair<K, V> v[1]) {
+  static inline __device__ void sort(K k[1], KeyValuePair<K, V> v[1])
+  {
     // Update this code if this changes
     // should go from 1 -> kWarpSize in multiples of 2
     static_assert(kWarpSize == 32, "unexpected warp size");
@@ -569,61 +587,64 @@ struct BitonicSortStepKVP<K, V, 1, Dir, Comp> {
 /// Sort a list of kWarpSize * N elements in registers, where N is an
 /// arbitrary >= 1
 template <typename K, typename V, int N, bool Dir, typename Comp>
-inline __device__ void warpSortAnyRegistersKVP(K k[N],
-                                               KeyValuePair<K, V> v[N]) {
+inline __device__ void warpSortAnyRegistersKVP(K k[N], KeyValuePair<K, V> v[N])
+{
   BitonicSortStepKVP<K, V, N, Dir, Comp>::sort(k, v);
 }
 
 // `Dir` true, produce largest values.
 // `Dir` false, produce smallest values.
-template <typename K, typename V, bool Dir, typename Comp, int NumWarpQ,
-          int NumThreadQ, int ThreadsPerBlock>
+template <typename K,
+          typename V,
+          bool Dir,
+          typename Comp,
+          int NumWarpQ,
+          int NumThreadQ,
+          int ThreadsPerBlock>
 struct KeyValueWarpSelect {
   static constexpr int kNumWarpQRegisters = NumWarpQ / faiss::gpu::kWarpSize;
 
-  __device__ inline KeyValueWarpSelect(K initKVal,
-                                       faiss::gpu::KeyValuePair<K, V> initVVal,
-                                       int k)
+  __device__ inline KeyValueWarpSelect(K initKVal, faiss::gpu::KeyValuePair<K, V> initVVal, int k)
     : initK(initKVal),
       initV(initVVal),
       numVals(0),
       warpKTop(initKVal),
       warpKTopRDist(initKVal),
-      kLane((k - 1) % faiss::gpu::kWarpSize) {
-    static_assert(faiss::gpu::utils::isPowerOf2(ThreadsPerBlock),
-                  "threads must be a power-of-2");
-    static_assert(faiss::gpu::utils::isPowerOf2(NumWarpQ),
-                  "warp queue must be power-of-2");
+      kLane((k - 1) % faiss::gpu::kWarpSize)
+  {
+    static_assert(faiss::gpu::utils::isPowerOf2(ThreadsPerBlock), "threads must be a power-of-2");
+    static_assert(faiss::gpu::utils::isPowerOf2(NumWarpQ), "warp queue must be power-of-2");
 
     // Fill the per-thread queue keys with the default value
 #pragma unroll
     for (int i = 0; i < NumThreadQ; ++i) {
-      threadK[i] = initK;
-      threadV[i].key = initV.key;
+      threadK[i]       = initK;
+      threadV[i].key   = initV.key;
       threadV[i].value = initV.value;
     }
 
     // Fill the warp queue with the default value
 #pragma unroll
     for (int i = 0; i < kNumWarpQRegisters; ++i) {
-      warpK[i] = initK;
-      warpV[i].key = initV.key;
+      warpK[i]       = initK;
+      warpV[i].key   = initV.key;
       warpV[i].value = initV.value;
     }
   }
 
-  __device__ inline void addThreadQ(K k, faiss::gpu::KeyValuePair<K, V>& v) {
+  __device__ inline void addThreadQ(K k, faiss::gpu::KeyValuePair<K, V>& v)
+  {
     if (Dir ? Comp::gt(k, warpKTop) : Comp::lt(k, warpKTop)) {
       // Rotate right
 #pragma unroll
       for (int i = NumThreadQ - 1; i > 0; --i) {
-        threadK[i] = threadK[i - 1];
-        threadV[i].key = threadV[i - 1].key;
+        threadK[i]       = threadK[i - 1];
+        threadV[i].key   = threadV[i - 1].key;
         threadV[i].value = threadV[i - 1].value;
       }
 
-      threadK[0] = k;
-      threadV[0].key = v.key;
+      threadK[0]       = k;
+      threadV[0].key   = v.key;
       threadV[0].value = v.value;
       ++numVals;
     }
@@ -633,33 +654,35 @@ struct KeyValueWarpSelect {
   /// list across both
 
   // TODO
-  __device__ inline void mergeWarpQ() {
+  __device__ inline void mergeWarpQ()
+  {
     // Sort all of the per-thread queues
-    faiss::gpu::warpSortAnyRegistersKVP<K, V, NumThreadQ, !Dir, Comp>(threadK,
-                                                                      threadV);
+    faiss::gpu::warpSortAnyRegistersKVP<K, V, NumThreadQ, !Dir, Comp>(threadK, threadV);
 
     // The warp queue is already sorted, and now that we've sorted the
     // per-thread queue, merge both sorted lists together, producing
     // one sorted list
-    faiss::gpu::warpMergeAnyRegistersKVP<K, V, kNumWarpQRegisters, NumThreadQ,
-                                         !Dir, Comp, false>(warpK, warpV,
-                                                            threadK, threadV);
+    faiss::gpu::warpMergeAnyRegistersKVP<K, V, kNumWarpQRegisters, NumThreadQ, !Dir, Comp, false>(
+      warpK, warpV, threadK, threadV);
   }
 
   /// WARNING: all threads in a warp must participate in this.
   /// Otherwise, you must call the constituent parts separately.
-  __device__ inline void add(K k, faiss::gpu::KeyValuePair<K, V>& v) {
+  __device__ inline void add(K k, faiss::gpu::KeyValuePair<K, V>& v)
+  {
     addThreadQ(k, v);
     checkThreadQ();
   }
 
-  __device__ inline void reduce() {
+  __device__ inline void reduce()
+  {
     // Have all warps dump and merge their queues; this will produce
     // the final per-warp results
     mergeWarpQ();
   }
 
-  __device__ inline void checkThreadQ() {
+  __device__ inline void checkThreadQ()
+  {
     bool needSort = (numVals == NumThreadQ);
 
 #if CUDA_VERSION >= 9000
@@ -681,18 +704,19 @@ struct KeyValueWarpSelect {
 
 #pragma unroll
     for (int i = 0; i < NumThreadQ; ++i) {
-      threadK[i] = initK;
-      threadV[i].key = initV.key;
+      threadK[i]       = initK;
+      threadV[i].key   = initV.key;
       threadV[i].value = initV.value;
     }
 
     // We have to beat at least this element
     warpKTopRDist = shfl(warpV[kNumWarpQRegisters - 1].key, kLane);
-    warpKTop = shfl(warpK[kNumWarpQRegisters - 1], kLane);
+    warpKTop      = shfl(warpK[kNumWarpQRegisters - 1], kLane);
   }
 
   /// Dump final k selected values for this warp out
-  __device__ inline void writeOut(K* outK, V* outV, int k) {
+  __device__ inline void writeOut(K* outK, V* outV, int k)
+  {
     int laneId = faiss::gpu::getLaneId();
 
 #pragma unroll
diff --git a/cpp/include/raft/spatial/knn/knn.hpp b/cpp/include/raft/spatial/knn/knn.hpp
index a2e9151dbc..eb9a8f1436 100644
--- a/cpp/include/raft/spatial/knn/knn.hpp
+++ b/cpp/include/raft/spatial/knn/knn.hpp
@@ -52,12 +52,17 @@ using deviceAllocator = raft::mr::device::allocator;
  * @param translations
  */
 template <typename value_idx = int64_t, typename value_t = float>
-inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK,
-                            value_idx *outV, size_t n_samples, int n_parts,
-                            int k, cudaStream_t stream,
-                            value_idx *translations) {
-  detail::knn_merge_parts(inK, inV, outK, outV, n_samples, n_parts, k, stream,
-                          translations);
+inline void knn_merge_parts(value_t* inK,
+                            value_idx* inV,
+                            value_t* outK,
+                            value_idx* outV,
+                            size_t n_samples,
+                            int n_parts,
+                            int k,
+                            cudaStream_t stream,
+                            value_idx* translations)
+{
+  detail::knn_merge_parts(inK, inV, outK, outV, n_samples, n_parts, k, stream, translations);
 }
 
 /**
@@ -82,9 +87,16 @@ inline void knn_merge_parts(value_t *inK, value_idx *inV, value_t *outK,
  * @param stream
  */
 template <typename value_idx = int, typename value_t = float>
-inline void select_k(value_t *inK, value_idx *inV, size_t n_rows, size_t n_cols,
-                     value_t *outK, value_idx *outV, bool select_min, int k,
-                     cudaStream_t stream) {
+inline void select_k(value_t* inK,
+                     value_idx* inV,
+                     size_t n_rows,
+                     size_t n_cols,
+                     value_t* outK,
+                     value_idx* outV,
+                     bool select_min,
+                     int k,
+                     cudaStream_t stream)
+{
   detail::select_k(inK, inV, n_rows, n_cols, outK, outV, select_min, k, stream);
 }
 
@@ -111,22 +123,41 @@ inline void select_k(value_t *inK, value_idx *inV, size_t n_rows, size_t n_cols,
  * @param[in] translations starting offsets for partitions. should be the same size
  *            as input vector.
  */
-inline void brute_force_knn(
-  raft::handle_t const &handle, std::vector<float *> &input,
-  std::vector<int> &sizes, int D, float *search_items, int n, int64_t *res_I,
-  float *res_D, int k, bool rowMajorIndex = true, bool rowMajorQuery = true,
-  std::vector<int64_t> *translations = nullptr,
-  distance::DistanceType metric = distance::DistanceType::L2Expanded,
-  float metric_arg = 2.0f) {
-  ASSERT(input.size() == sizes.size(),
-         "input and sizes vectors must be the same size");
+inline void brute_force_knn(raft::handle_t const& handle,
+                            std::vector<float*>& input,
+                            std::vector<int>& sizes,
+                            int D,
+                            float* search_items,
+                            int n,
+                            int64_t* res_I,
+                            float* res_D,
+                            int k,
+                            bool rowMajorIndex                 = true,
+                            bool rowMajorQuery                 = true,
+                            std::vector<int64_t>* translations = nullptr,
+                            distance::DistanceType metric      = distance::DistanceType::L2Expanded,
+                            float metric_arg                   = 2.0f)
+{
+  ASSERT(input.size() == sizes.size(), "input and sizes vectors must be the same size");
 
   std::vector<cudaStream_t> int_streams = handle.get_internal_streams();
 
-  detail::brute_force_knn_impl(input, sizes, D, search_items, n, res_I, res_D,
-                               k, handle.get_stream(), int_streams.data(),
-                               handle.get_num_internal_streams(), rowMajorIndex,
-                               rowMajorQuery, translations, metric, metric_arg);
+  detail::brute_force_knn_impl(input,
+                               sizes,
+                               D,
+                               search_items,
+                               n,
+                               res_I,
+                               res_D,
+                               k,
+                               handle.get_stream(),
+                               int_streams.data(),
+                               handle.get_num_internal_streams(),
+                               rowMajorIndex,
+                               rowMajorQuery,
+                               translations,
+                               metric,
+                               metric_arg);
 }
 }  // namespace knn
 }  // namespace spatial
diff --git a/cpp/include/raft/spectral/cluster_solvers.hpp b/cpp/include/raft/spectral/cluster_solvers.hpp
index 6f507331d9..221a9679d4 100644
--- a/cpp/include/raft/spectral/cluster_solvers.hpp
+++ b/cpp/include/raft/spectral/cluster_solvers.hpp
@@ -24,8 +24,7 @@ using namespace matrix;
 
 // aggregate of control params for Eigen Solver:
 //
-template <typename index_type_t, typename value_type_t,
-          typename size_type_t = index_type_t>
+template <typename index_type_t, typename value_type_t, typename size_type_t = index_type_t>
 struct cluster_solver_config_t {
   size_type_t n_clusters;
   size_type_t maxIter;
@@ -35,23 +34,35 @@ struct cluster_solver_config_t {
   unsigned long long seed{123456};
 };
 
-template <typename index_type_t, typename value_type_t,
-          typename size_type_t = index_type_t>
+template <typename index_type_t, typename value_type_t, typename size_type_t = index_type_t>
 struct kmeans_solver_t {
-  explicit kmeans_solver_t(cluster_solver_config_t<index_type_t, value_type_t,
-                                                   size_type_t> const& config)
-    : config_(config) {}
-
-  std::pair<value_type_t, index_type_t> solve(
-    handle_t const& handle, size_type_t n_obs_vecs, size_type_t dim,
-    value_type_t const* __restrict__ obs,
-    index_type_t* __restrict__ codes) const {
+  explicit kmeans_solver_t(
+    cluster_solver_config_t<index_type_t, value_type_t, size_type_t> const& config)
+    : config_(config)
+  {
+  }
+
+  std::pair<value_type_t, index_type_t> solve(handle_t const& handle,
+                                              size_type_t n_obs_vecs,
+                                              size_type_t dim,
+                                              value_type_t const* __restrict__ obs,
+                                              index_type_t* __restrict__ codes) const
+  {
     RAFT_EXPECTS(obs != nullptr, "Null obs buffer.");
     RAFT_EXPECTS(codes != nullptr, "Null codes buffer.");
     value_type_t residual{};
     index_type_t iters{};
-    kmeans(handle, n_obs_vecs, dim, config_.n_clusters, config_.tol,
-           config_.maxIter, obs, codes, residual, iters, config_.seed);
+    kmeans(handle,
+           n_obs_vecs,
+           dim,
+           config_.n_clusters,
+           config_.tol,
+           config_.maxIter,
+           obs,
+           codes,
+           residual,
+           iters,
+           config_.seed);
     return std::make_pair(residual, iters);
   }
 
diff --git a/cpp/include/raft/spectral/eigen_solvers.hpp b/cpp/include/raft/spectral/eigen_solvers.hpp
index e36dca2e0c..156b996586 100644
--- a/cpp/include/raft/spectral/eigen_solvers.hpp
+++ b/cpp/include/raft/spectral/eigen_solvers.hpp
@@ -23,8 +23,7 @@ using namespace matrix;
 
 // aggregate of control params for Eigen Solver:
 //
-template <typename index_type_t, typename value_type_t,
-          typename size_type_t = index_type_t>
+template <typename index_type_t, typename value_type_t, typename size_type_t = index_type_t>
 struct eigen_solver_config_t {
   size_type_t n_eigVecs;
   size_type_t maxIter;
@@ -34,42 +33,59 @@ struct eigen_solver_config_t {
 
   bool reorthogonalize{false};
   unsigned long long seed{
-    1234567};  // CAVEAT: this default value is now common to all instances of using seed in Lanczos; was not the case before: there were places where a default seed = 123456 was used; this may trigger slightly different # solver iterations
+    1234567};  // CAVEAT: this default value is now common to all instances of using seed in
+               // Lanczos; was not the case before: there were places where a default seed = 123456
+               // was used; this may trigger slightly different # solver iterations
 };
 
-template <typename index_type_t, typename value_type_t,
-          typename size_type_t = index_type_t>
+template <typename index_type_t, typename value_type_t, typename size_type_t = index_type_t>
 struct lanczos_solver_t {
-  explicit lanczos_solver_t(eigen_solver_config_t<index_type_t, value_type_t,
-                                                  size_type_t> const& config)
-    : config_(config) {}
+  explicit lanczos_solver_t(
+    eigen_solver_config_t<index_type_t, value_type_t, size_type_t> const& config)
+    : config_(config)
+  {
+  }
 
-  index_type_t solve_smallest_eigenvectors(
-    handle_t const& handle,
-    sparse_matrix_t<index_type_t, value_type_t> const& A,
-    value_type_t* __restrict__ eigVals,
-    value_type_t* __restrict__ eigVecs) const {
+  index_type_t solve_smallest_eigenvectors(handle_t const& handle,
+                                           sparse_matrix_t<index_type_t, value_type_t> const& A,
+                                           value_type_t* __restrict__ eigVals,
+                                           value_type_t* __restrict__ eigVecs) const
+  {
     RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer.");
     RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer.");
     index_type_t iters{};
-    computeSmallestEigenvectors(handle, A, config_.n_eigVecs, config_.maxIter,
-                                config_.restartIter, config_.tol,
-                                config_.reorthogonalize, iters, eigVals,
-                                eigVecs, config_.seed);
+    computeSmallestEigenvectors(handle,
+                                A,
+                                config_.n_eigVecs,
+                                config_.maxIter,
+                                config_.restartIter,
+                                config_.tol,
+                                config_.reorthogonalize,
+                                iters,
+                                eigVals,
+                                eigVecs,
+                                config_.seed);
     return iters;
   }
 
-  index_type_t solve_largest_eigenvectors(
-    handle_t const& handle,
-    sparse_matrix_t<index_type_t, value_type_t> const& A,
-    value_type_t* __restrict__ eigVals,
-    value_type_t* __restrict__ eigVecs) const {
+  index_type_t solve_largest_eigenvectors(handle_t const& handle,
+                                          sparse_matrix_t<index_type_t, value_type_t> const& A,
+                                          value_type_t* __restrict__ eigVals,
+                                          value_type_t* __restrict__ eigVecs) const
+  {
     RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer.");
     RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer.");
     index_type_t iters{};
-    computeLargestEigenvectors(handle, A, config_.n_eigVecs, config_.maxIter,
-                               config_.restartIter, config_.tol,
-                               config_.reorthogonalize, iters, eigVals, eigVecs,
+    computeLargestEigenvectors(handle,
+                               A,
+                               config_.n_eigVecs,
+                               config_.maxIter,
+                               config_.restartIter,
+                               config_.tol,
+                               config_.reorthogonalize,
+                               iters,
+                               eigVals,
+                               eigVecs,
                                config_.seed);
     return iters;
   }
diff --git a/cpp/include/raft/spectral/kmeans.hpp b/cpp/include/raft/spectral/kmeans.hpp
index d089b85518..18b23bea55 100644
--- a/cpp/include/raft/spectral/kmeans.hpp
+++ b/cpp/include/raft/spectral/kmeans.hpp
@@ -43,15 +43,15 @@ using namespace raft::linalg;
 // Useful grid settings
 // =========================================================
 
-constexpr unsigned int BLOCK_SIZE = 1024;
-constexpr unsigned int WARP_SIZE = 32;
+constexpr unsigned int BLOCK_SIZE      = 1024;
+constexpr unsigned int WARP_SIZE       = 32;
 constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE);
 
 // =========================================================
 // CUDA kernels
 // =========================================================
 
-/** 
+/**
  *  @brief Compute distances between observation vectors and centroids
  *    Block dimensions should be (warpSize, 1,
  *    blockSize/warpSize). Ideally, the grid is large enough so there
@@ -75,11 +75,13 @@ constexpr unsigned int BSIZE_DIV_WSIZE = (BLOCK_SIZE / WARP_SIZE);
  *    initialized to zero.
  */
 template <typename index_type_t, typename value_type_t>
-static __global__ void computeDistances(
-  index_type_t n, index_type_t d, index_type_t k,
-  const value_type_t* __restrict__ obs,
-  const value_type_t* __restrict__ centroids,
-  value_type_t* __restrict__ dists) {
+static __global__ void computeDistances(index_type_t n,
+                                        index_type_t d,
+                                        index_type_t k,
+                                        const value_type_t* __restrict__ obs,
+                                        const value_type_t* __restrict__ centroids,
+                                        value_type_t* __restrict__ dists)
+{
   // Loop index
   index_type_t i;
 
@@ -114,12 +116,10 @@ static __global__ void computeDistances(
 
         // Perform reduction on warp
         for (i = WARP_SIZE / 2; i > 0; i /= 2)
-          dist_private +=
-            __shfl_down_sync(warp_full_mask(), dist_private, i, 2 * i);
+          dist_private += __shfl_down_sync(warp_full_mask(), dist_private, i, 2 * i);
 
         // Write result to global memory
-        if (threadIdx.x == 0)
-          atomicAdd(dists + IDX(gidz, gidy, n), dist_private);
+        if (threadIdx.x == 0) atomicAdd(dists + IDX(gidz, gidy, n), dist_private);
 
         // Move to another observation vector
         gidz += blockDim.z * gridDim.z;
@@ -134,8 +134,8 @@ static __global__ void computeDistances(
   }
 }
 
-/** 
- *  @brief Find closest centroid to observation vectors. 
+/**
+ *  @brief Find closest centroid to observation vectors.
  *    Block and grid dimensions should be 1-dimensional. Ideally the
  *    grid is large enough so there are n threads.
  *  @tparam index_type_t the type of data used for indexing.
@@ -156,10 +156,12 @@ static __global__ void computeDistances(
  *    cluster. Entries must be initialized to zero.
  */
 template <typename index_type_t, typename value_type_t>
-static __global__ void minDistances(index_type_t n, index_type_t k,
+static __global__ void minDistances(index_type_t n,
+                                    index_type_t k,
                                     value_type_t* __restrict__ dists,
                                     index_type_t* __restrict__ codes,
-                                    index_type_t* __restrict__ clusterSizes) {
+                                    index_type_t* __restrict__ clusterSizes)
+{
   // Loop index
   index_type_t i, j;
 
@@ -178,8 +180,8 @@ static __global__ void minDistances(index_type_t n, index_type_t k,
     dist_min = dists[IDX(i, 0, n)];
     for (j = 1; j < k; ++j) {
       dist_curr = dists[IDX(i, j, n)];
-      code_min = (dist_curr < dist_min) ? j : code_min;
-      dist_min = (dist_curr < dist_min) ? dist_curr : dist_min;
+      code_min  = (dist_curr < dist_min) ? j : code_min;
+      dist_min  = (dist_curr < dist_min) ? dist_curr : dist_min;
     }
 
     // Transfer result to global memory
@@ -194,8 +196,8 @@ static __global__ void minDistances(index_type_t n, index_type_t k,
   }
 }
 
-/** 
- *  @brief Check if newly computed distances are smaller than old distances. 
+/**
+ *  @brief Check if newly computed distances are smaller than old distances.
  *    Block and grid dimensions should be 1-dimensional. Ideally the
  *    grid is large enough so there are n threads.
  *  @tparam index_type_t the type of data used for indexing.
@@ -218,7 +220,8 @@ static __global__ void minDistances2(index_type_t n,
                                      value_type_t* __restrict__ dists_old,
                                      const value_type_t* __restrict__ dists_new,
                                      index_type_t* __restrict__ codes_old,
-                                     index_type_t code_new) {
+                                     index_type_t code_new)
+{
   // Loop index
   index_type_t i = threadIdx.x + blockIdx.x * blockDim.x;
 
@@ -243,7 +246,7 @@ static __global__ void minDistances2(index_type_t n,
   }
 }
 
-/** 
+/**
  *  @brief Compute size of k-means clusters.
  *    Block and grid dimensions should be 1-dimensional. Ideally the
  *    grid is large enough so there are n threads.
@@ -255,9 +258,10 @@ static __global__ void minDistances2(index_type_t n,
  *    cluster. Entries must be initialized to zero.
  */
 template <typename index_type_t>
-static __global__ void computeClusterSizes(
-  index_type_t n, const index_type_t* __restrict__ codes,
-  index_type_t* __restrict__ clusterSizes) {
+static __global__ void computeClusterSizes(index_type_t n,
+                                           const index_type_t* __restrict__ codes,
+                                           index_type_t* __restrict__ clusterSizes)
+{
   index_type_t i = threadIdx.x + blockIdx.x * blockDim.x;
   while (i < n) {
     atomicAdd(clusterSizes + codes[i], 1);
@@ -265,8 +269,8 @@ static __global__ void computeClusterSizes(
   }
 }
 
-/** 
- *  @brief Divide rows of centroid matrix by cluster sizes. 
+/**
+ *  @brief Divide rows of centroid matrix by cluster sizes.
  *    Divides the ith column of the sum matrix by the size of the ith
  *    cluster. If the sum matrix has been initialized so that the ith
  *    row is the sum of all observation vectors in the ith cluster,
@@ -287,9 +291,11 @@ static __global__ void computeClusterSizes(
  *    column is the mean position of a cluster).
  */
 template <typename index_type_t, typename value_type_t>
-static __global__ void divideCentroids(
-  index_type_t d, index_type_t k, const index_type_t* __restrict__ clusterSizes,
-  value_type_t* __restrict__ centroids) {
+static __global__ void divideCentroids(index_type_t d,
+                                       index_type_t k,
+                                       const index_type_t* __restrict__ clusterSizes,
+                                       value_type_t* __restrict__ centroids)
+{
   // Global indices
   index_type_t gidx, gidy;
 
@@ -340,11 +346,14 @@ static __global__ void divideCentroids(
  *  @return Zero if successful. Otherwise non-zero.
  */
 template <typename index_type_t, typename value_type_t>
-static int chooseNewCentroid(handle_t const& handle, index_type_t n,
-                             index_type_t d, value_type_t rand,
+static int chooseNewCentroid(handle_t const& handle,
+                             index_type_t n,
+                             index_type_t d,
+                             value_type_t rand,
                              const value_type_t* __restrict__ obs,
                              value_type_t* __restrict__ dists,
-                             value_type_t* __restrict__ centroid) {
+                             value_type_t* __restrict__ centroid)
+{
   // Cumulative sum of distances
   value_type_t* distsCumSum = dists + n;
   // Residual sum of squares
@@ -352,44 +361,44 @@ static int chooseNewCentroid(handle_t const& handle, index_type_t n,
   // Observation vector that is chosen as new centroid
   index_type_t obsIndex;
 
-  auto stream = handle.get_stream();
+  auto stream             = handle.get_stream();
   auto thrust_exec_policy = handle.get_thrust_policy();
 
   // Compute cumulative sum of distances
-  thrust::inclusive_scan(thrust_exec_policy, thrust::device_pointer_cast(dists),
+  thrust::inclusive_scan(thrust_exec_policy,
+                         thrust::device_pointer_cast(dists),
                          thrust::device_pointer_cast(dists + n),
                          thrust::device_pointer_cast(distsCumSum));
   CHECK_CUDA(stream);
-  CUDA_TRY(cudaMemcpyAsync(&distsSum, distsCumSum + n - 1, sizeof(value_type_t),
-                           cudaMemcpyDeviceToHost, stream));
+  CUDA_TRY(cudaMemcpyAsync(
+    &distsSum, distsCumSum + n - 1, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream));
 
   // Randomly choose observation vector
   //   Probabilities are proportional to square of distance to closest
   //   centroid (see k-means++ algorithm)
   //
-  //seg-faults due to Thrust bug
-  //on binary-search-like algorithms
-  //when run with stream dependent
-  //execution policies; fixed on Thrust GitHub
-  //hence replace w/ linear interpolation,
-  //until the Thrust issue gets resolved:
+  // seg-faults due to Thrust bug
+  // on binary-search-like algorithms
+  // when run with stream dependent
+  // execution policies; fixed on Thrust GitHub
+  // hence replace w/ linear interpolation,
+  // until the Thrust issue gets resolved:
   //
   // obsIndex = (thrust::lower_bound(
   //               thrust_exec_policy, thrust::device_pointer_cast(distsCumSum),
   //               thrust::device_pointer_cast(distsCumSum + n), distsSum * rand) -
   //             thrust::device_pointer_cast(distsCumSum));
   //
-  //linear interpolation logic:
+  // linear interpolation logic:
   //{
   value_type_t minSum{0};
-  CUDA_TRY(cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t),
-                           cudaMemcpyDeviceToHost, stream));
+  CUDA_TRY(
+    cudaMemcpyAsync(&minSum, distsCumSum, sizeof(value_type_t), cudaMemcpyDeviceToHost, stream));
   CHECK_CUDA(stream);
 
   if (distsSum > minSum) {
     value_type_t vIndex = static_cast<value_type_t>(n - 1);
-    obsIndex = static_cast<index_type_t>(vIndex * (distsSum * rand - minSum) /
-                                         (distsSum - minSum));
+    obsIndex = static_cast<index_type_t>(vIndex * (distsSum * rand - minSum) / (distsSum - minSum));
   } else {
     obsIndex = 0;
   }
@@ -400,15 +409,17 @@ static int chooseNewCentroid(handle_t const& handle, index_type_t n,
   obsIndex = min(obsIndex, n - 1);
 
   // Record new centroid position
-  CUDA_TRY(cudaMemcpyAsync(centroid, obs + IDX(0, obsIndex, d),
-                           d * sizeof(value_type_t), cudaMemcpyDeviceToDevice,
+  CUDA_TRY(cudaMemcpyAsync(centroid,
+                           obs + IDX(0, obsIndex, d),
+                           d * sizeof(value_type_t),
+                           cudaMemcpyDeviceToDevice,
                            stream));
 
   return 0;
 }
 
 /**
- *  @brief Choose initial cluster centroids for k-means algorithm.  
+ *  @brief Choose initial cluster centroids for k-means algorithm.
  *    Centroids are randomly chosen with k-means++ algorithm
  *  @tparam index_type_t the type of data used for indexing.
  *  @tparam value_type_t the type of data used for weights, distances.
@@ -432,11 +443,17 @@ static int chooseNewCentroid(handle_t const& handle, index_type_t n,
  *  @return Zero if successful. Otherwise non-zero.
  */
 template <typename index_type_t, typename value_type_t>
-static int initializeCentroids(
-  handle_t const& handle, index_type_t n, index_type_t d, index_type_t k,
-  const value_type_t* __restrict__ obs, value_type_t* __restrict__ centroids,
-  index_type_t* __restrict__ codes, index_type_t* __restrict__ clusterSizes,
-  value_type_t* __restrict__ dists, unsigned long long seed) {
+static int initializeCentroids(handle_t const& handle,
+                               index_type_t n,
+                               index_type_t d,
+                               index_type_t k,
+                               const value_type_t* __restrict__ obs,
+                               value_type_t* __restrict__ centroids,
+                               index_type_t* __restrict__ codes,
+                               index_type_t* __restrict__ clusterSizes,
+                               value_type_t* __restrict__ dists,
+                               unsigned long long seed)
+{
   // -------------------------------------------------------
   // Variable declarations
   // -------------------------------------------------------
@@ -448,7 +465,7 @@ static int initializeCentroids(
   thrust::default_random_engine rng(seed);
   thrust::uniform_real_distribution<value_type_t> uniformDist(0, 1);
 
-  auto stream = handle.get_stream();
+  auto stream             = handle.get_stream();
   auto thrust_exec_policy = handle.get_thrust_policy();
 
   constexpr index_type_t grid_lower_bound{65535};
@@ -461,35 +478,34 @@ static int initializeCentroids(
   dim3 blockDim_warp{WARP_SIZE, 1, BSIZE_DIV_WSIZE};
 
   // CUDA grid dimensions
-  dim3 gridDim_warp{
-    min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), 1,
-    min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound)};
+  dim3 gridDim_warp{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound),
+                    1,
+                    min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound)};
 
   // CUDA grid dimensions
-  dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound),
-                     1, 1};
+  dim3 gridDim_block{min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound), 1, 1};
 
   // Assign observation vectors to code 0
   CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream));
 
   // Choose first centroid
-  thrust::fill(thrust_exec_policy, thrust::device_pointer_cast(dists),
-               thrust::device_pointer_cast(dists + n), 1);
+  thrust::fill(thrust_exec_policy,
+               thrust::device_pointer_cast(dists),
+               thrust::device_pointer_cast(dists + n),
+               1);
   CHECK_CUDA(stream);
   if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, dists, centroids))
     WARNING("error in k-means++ (could not pick centroid)");
 
   // Compute distances from first centroid
   CUDA_TRY(cudaMemsetAsync(dists, 0, n * sizeof(value_type_t), stream));
-  computeDistances<<<gridDim_warp, blockDim_warp, 0, stream>>>(
-    n, d, 1, obs, centroids, dists);
+  computeDistances<<<gridDim_warp, blockDim_warp, 0, stream>>>(n, d, 1, obs, centroids, dists);
   CHECK_CUDA(stream);
 
   // Choose remaining centroids
   for (i = 1; i < k; ++i) {
     // Choose ith centroid
-    if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, dists,
-                          centroids + IDX(0, i, d)))
+    if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, dists, centroids + IDX(0, i, d)))
       WARNING("error in k-means++ (could not pick centroid)");
 
     // Compute distances from ith centroid
@@ -499,22 +515,20 @@ static int initializeCentroids(
     CHECK_CUDA(stream);
 
     // Recompute minimum distances
-    minDistances2<<<gridDim_block, BLOCK_SIZE, 0, stream>>>(n, dists, dists + n,
-                                                            codes, i);
+    minDistances2<<<gridDim_block, BLOCK_SIZE, 0, stream>>>(n, dists, dists + n, codes, i);
     CHECK_CUDA(stream);
   }
 
   // Compute cluster sizes
   CUDA_TRY(cudaMemsetAsync(clusterSizes, 0, k * sizeof(index_type_t), stream));
-  computeClusterSizes<<<gridDim_block, BLOCK_SIZE, 0, stream>>>(n, codes,
-                                                                clusterSizes);
+  computeClusterSizes<<<gridDim_block, BLOCK_SIZE, 0, stream>>>(n, codes, clusterSizes);
   CHECK_CUDA(stream);
 
   return 0;
 }
 
-/** 
- *  @brief Find cluster centroids closest to observation vectors. 
+/**
+ *  @brief Find cluster centroids closest to observation vectors.
  *    Distance is measured with Euclidean norm.
  *  @tparam index_type_t the type of data used for indexing.
  *  @tparam value_type_t the type of data used for weights, distances.
@@ -540,15 +554,18 @@ static int initializeCentroids(
  *  @return Zero if successful. Otherwise non-zero.
  */
 template <typename index_type_t, typename value_type_t>
-static int assignCentroids(handle_t const& handle, index_type_t n,
-                           index_type_t d, index_type_t k,
+static int assignCentroids(handle_t const& handle,
+                           index_type_t n,
+                           index_type_t d,
+                           index_type_t k,
                            const value_type_t* __restrict__ obs,
                            const value_type_t* __restrict__ centroids,
                            value_type_t* __restrict__ dists,
                            index_type_t* __restrict__ codes,
                            index_type_t* __restrict__ clusterSizes,
-                           value_type_t* residual_host) {
-  auto stream = handle.get_stream();
+                           value_type_t* residual_host)
+{
+  auto stream             = handle.get_stream();
   auto thrust_exec_policy = handle.get_thrust_policy();
 
   // Compute distance between centroids and observation vectors
@@ -561,11 +578,9 @@ static int assignCentroids(handle_t const& handle, index_type_t n,
   constexpr index_type_t grid_lower_bound{65535};
   gridDim.x = min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound);
   gridDim.y = min(k, grid_lower_bound);
-  gridDim.z =
-    min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound);
+  gridDim.z = min((n + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound);
 
-  computeDistances<<<gridDim, blockDim, 0, stream>>>(n, d, k, obs, centroids,
-                                                     dists);
+  computeDistances<<<gridDim, blockDim, 0, stream>>>(n, d, k, obs, centroids, dists);
   CHECK_CUDA(stream);
 
   // Find centroid closest to each observation vector
@@ -573,23 +588,21 @@ static int assignCentroids(handle_t const& handle, index_type_t n,
   blockDim.x = BLOCK_SIZE;
   blockDim.y = 1;
   blockDim.z = 1;
-  gridDim.x = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound);
-  gridDim.y = 1;
-  gridDim.z = 1;
-  minDistances<<<gridDim, blockDim, 0, stream>>>(n, k, dists, codes,
-                                                 clusterSizes);
+  gridDim.x  = min((n + BLOCK_SIZE - 1) / BLOCK_SIZE, grid_lower_bound);
+  gridDim.y  = 1;
+  gridDim.z  = 1;
+  minDistances<<<gridDim, blockDim, 0, stream>>>(n, k, dists, codes, clusterSizes);
   CHECK_CUDA(stream);
 
   // Compute residual sum of squares
-  *residual_host =
-    thrust::reduce(thrust_exec_policy, thrust::device_pointer_cast(dists),
-                   thrust::device_pointer_cast(dists + n));
+  *residual_host = thrust::reduce(
+    thrust_exec_policy, thrust::device_pointer_cast(dists), thrust::device_pointer_cast(dists + n));
 
   return 0;
 }
 
-/** 
- *  @brief Update cluster centroids for k-means algorithm. 
+/**
+ *  @brief Update cluster centroids for k-means algorithm.
  *    All clusters are assumed to be non-empty.
  *  @tparam index_type_t the type of data used for indexing.
  *  @tparam value_type_t the type of data used for weights, distances.
@@ -613,26 +626,29 @@ static int assignCentroids(handle_t const& handle, index_type_t n,
  *  @return Zero if successful. Otherwise non-zero.
  */
 template <typename index_type_t, typename value_type_t>
-static int updateCentroids(handle_t const& handle, index_type_t n,
-                           index_type_t d, index_type_t k,
+static int updateCentroids(handle_t const& handle,
+                           index_type_t n,
+                           index_type_t d,
+                           index_type_t k,
                            const value_type_t* __restrict__ obs,
                            const index_type_t* __restrict__ codes,
                            const index_type_t* __restrict__ clusterSizes,
                            value_type_t* __restrict__ centroids,
                            value_type_t* __restrict__ work,
-                           index_type_t* __restrict__ work_int) {
+                           index_type_t* __restrict__ work_int)
+{
   // -------------------------------------------------------
   // Variable declarations
   // -------------------------------------------------------
 
   // Useful constants
-  const value_type_t one = 1;
+  const value_type_t one  = 1;
   const value_type_t zero = 0;
 
   constexpr index_type_t grid_lower_bound{65535};
 
-  auto stream = handle.get_stream();
-  auto cublas_h = handle.get_cublas_handle();
+  auto stream             = handle.get_stream();
+  auto cublas_h           = handle.get_cublas_handle();
   auto thrust_exec_policy = handle.get_thrust_policy();
 
   // Device memory
@@ -641,34 +657,56 @@ static int updateCentroids(handle_t const& handle, index_type_t n,
   thrust::device_ptr<index_type_t> rows(work_int + d * n);
 
   // Take transpose of observation matrix
-  CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, n, d, &one, obs,
-                          d, &zero, (value_type_t*)NULL, n,
-                          thrust::raw_pointer_cast(obs_copy), n, stream));
+  CUBLAS_CHECK(cublasgeam(cublas_h,
+                          CUBLAS_OP_T,
+                          CUBLAS_OP_N,
+                          n,
+                          d,
+                          &one,
+                          obs,
+                          d,
+                          &zero,
+                          (value_type_t*)NULL,
+                          n,
+                          thrust::raw_pointer_cast(obs_copy),
+                          n,
+                          stream));
 
   // Cluster assigned to each observation matrix entry
   thrust::sequence(thrust_exec_policy, rows, rows + d * n);
   CHECK_CUDA(stream);
-  thrust::transform(thrust_exec_policy, rows, rows + d * n,
-                    thrust::make_constant_iterator<index_type_t>(n), rows,
+  thrust::transform(thrust_exec_policy,
+                    rows,
+                    rows + d * n,
+                    thrust::make_constant_iterator<index_type_t>(n),
+                    rows,
                     thrust::modulus<index_type_t>());
   CHECK_CUDA(stream);
-  thrust::gather(thrust_exec_policy, rows, rows + d * n,
-                 thrust::device_pointer_cast(codes), codes_copy);
+  thrust::gather(
+    thrust_exec_policy, rows, rows + d * n, thrust::device_pointer_cast(codes), codes_copy);
   CHECK_CUDA(stream);
 
   // Row associated with each observation matrix entry
   thrust::sequence(thrust_exec_policy, rows, rows + d * n);
   CHECK_CUDA(stream);
-  thrust::transform(thrust_exec_policy, rows, rows + d * n,
-                    thrust::make_constant_iterator<index_type_t>(n), rows,
+  thrust::transform(thrust_exec_policy,
+                    rows,
+                    rows + d * n,
+                    thrust::make_constant_iterator<index_type_t>(n),
+                    rows,
                     thrust::divides<index_type_t>());
   CHECK_CUDA(stream);
 
   // Sort and reduce to add observation vectors in same cluster
-  thrust::stable_sort_by_key(thrust_exec_policy, codes_copy, codes_copy + d * n,
+  thrust::stable_sort_by_key(thrust_exec_policy,
+                             codes_copy,
+                             codes_copy + d * n,
                              make_zip_iterator(make_tuple(obs_copy, rows)));
   CHECK_CUDA(stream);
-  thrust::reduce_by_key(thrust_exec_policy, rows, rows + d * n, obs_copy,
+  thrust::reduce_by_key(thrust_exec_policy,
+                        rows,
+                        rows + d * n,
+                        obs_copy,
                         codes_copy,  // Output to codes_copy is ignored
                         thrust::device_pointer_cast(centroids));
   CHECK_CUDA(stream);
@@ -679,12 +717,11 @@ static int updateCentroids(handle_t const& handle, index_type_t n,
   dim3 blockDim{WARP_SIZE, BLOCK_SIZE / WARP_SIZE, 1};
 
   // CUDA grid dimensions
-  dim3 gridDim{
-    min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound),
-    min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound), 1};
+  dim3 gridDim{min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound),
+               min((k + BSIZE_DIV_WSIZE - 1) / BSIZE_DIV_WSIZE, grid_lower_bound),
+               1};
 
-  divideCentroids<<<gridDim, blockDim, 0, stream>>>(d, k, clusterSizes,
-                                                    centroids);
+  divideCentroids<<<gridDim, blockDim, 0, stream>>>(d, k, clusterSizes, centroids);
   CHECK_CUDA(stream);
 
   return 0;
@@ -698,8 +735,8 @@ namespace raft {
 // k-means algorithm
 // =========================================================
 
-/** 
- *  @brief Find clusters with k-means algorithm. 
+/**
+ *  @brief Find clusters with k-means algorithm.
  *    Initial centroids are chosen with k-means++ algorithm. Empty
  *    clusters are reinitialized by choosing new centroids with
  *    k-means++ algorithm.
@@ -735,15 +772,22 @@ namespace raft {
  *  @return error flag.
  */
 template <typename index_type_t, typename value_type_t>
-int kmeans(handle_t const& handle, index_type_t n, index_type_t d,
-           index_type_t k, value_type_t tol, index_type_t maxiter,
+int kmeans(handle_t const& handle,
+           index_type_t n,
+           index_type_t d,
+           index_type_t k,
+           value_type_t tol,
+           index_type_t maxiter,
            const value_type_t* __restrict__ obs,
            index_type_t* __restrict__ codes,
            index_type_t* __restrict__ clusterSizes,
            value_type_t* __restrict__ centroids,
-           value_type_t* __restrict__ work, index_type_t* __restrict__ work_int,
-           value_type_t* residual_host, index_type_t* iters_host,
-           unsigned long long seed) {
+           value_type_t* __restrict__ work,
+           index_type_t* __restrict__ work_int,
+           value_type_t* residual_host,
+           index_type_t* iters_host,
+           unsigned long long seed)
+{
   // -------------------------------------------------------
   // Variable declarations
   // -------------------------------------------------------
@@ -764,101 +808,93 @@ int kmeans(handle_t const& handle, index_type_t n, index_type_t d,
   // Initialization
   // -------------------------------------------------------
 
-  auto stream = handle.get_stream();
-  auto cublas_h = handle.get_cublas_handle();
+  auto stream             = handle.get_stream();
+  auto cublas_h           = handle.get_cublas_handle();
   auto thrust_exec_policy = handle.get_thrust_policy();
 
   // Trivial cases
   if (k == 1) {
     CUDA_TRY(cudaMemsetAsync(codes, 0, n * sizeof(index_type_t), stream));
-    CUDA_TRY(cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t),
-                             cudaMemcpyHostToDevice, stream));
-    if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids,
-                        work, work_int))
+    CUDA_TRY(
+      cudaMemcpyAsync(clusterSizes, &n, sizeof(index_type_t), cudaMemcpyHostToDevice, stream));
+    if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int))
       WARNING("could not compute k-means centroids");
 
     dim3 blockDim{WARP_SIZE, 1, BLOCK_SIZE / WARP_SIZE};
 
     dim3 gridDim{
-      min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound), 1,
-      min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE),
-          grid_lower_bound)};
+      min((d + WARP_SIZE - 1) / WARP_SIZE, grid_lower_bound),
+      1,
+      min((n + BLOCK_SIZE / WARP_SIZE - 1) / (BLOCK_SIZE / WARP_SIZE), grid_lower_bound)};
 
     CUDA_TRY(cudaMemsetAsync(work, 0, n * k * sizeof(value_type_t), stream));
-    computeDistances<<<gridDim, blockDim, 0, stream>>>(n, d, 1, obs, centroids,
-                                                       work);
+    computeDistances<<<gridDim, blockDim, 0, stream>>>(n, d, 1, obs, centroids, work);
     CHECK_CUDA(stream);
-    *residual_host =
-      thrust::reduce(thrust_exec_policy, thrust::device_pointer_cast(work),
-                     thrust::device_pointer_cast(work + n));
+    *residual_host = thrust::reduce(
+      thrust_exec_policy, thrust::device_pointer_cast(work), thrust::device_pointer_cast(work + n));
     CHECK_CUDA(stream);
     return 0;
   }
   if (n <= k) {
-    thrust::sequence(thrust_exec_policy, thrust::device_pointer_cast(codes),
+    thrust::sequence(thrust_exec_policy,
+                     thrust::device_pointer_cast(codes),
                      thrust::device_pointer_cast(codes + n));
     CHECK_CUDA(stream);
-    thrust::fill_n(thrust_exec_policy,
-                   thrust::device_pointer_cast(clusterSizes), n, 1);
+    thrust::fill_n(thrust_exec_policy, thrust::device_pointer_cast(clusterSizes), n, 1);
     CHECK_CUDA(stream);
 
     if (n < k)
-      CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0,
-                               (k - n) * sizeof(index_type_t), stream));
-    CUDA_TRY(cudaMemcpyAsync(centroids, obs, d * n * sizeof(value_type_t),
-                             cudaMemcpyDeviceToDevice, stream));
+      CUDA_TRY(cudaMemsetAsync(clusterSizes + n, 0, (k - n) * sizeof(index_type_t), stream));
+    CUDA_TRY(cudaMemcpyAsync(
+      centroids, obs, d * n * sizeof(value_type_t), cudaMemcpyDeviceToDevice, stream));
     *residual_host = 0;
     return 0;
   }
 
   // Initialize cuBLAS
-  CUBLAS_CHECK(
-    linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+  CUBLAS_CHECK(linalg::cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
 
   // -------------------------------------------------------
   // k-means++ algorithm
   // -------------------------------------------------------
 
   // Choose initial cluster centroids
-  if (initializeCentroids(handle, n, d, k, obs, centroids, codes, clusterSizes,
-                          work, seed))
+  if (initializeCentroids(handle, n, d, k, obs, centroids, codes, clusterSizes, work, seed))
     WARNING("could not initialize k-means centroids");
 
   // Apply k-means iteration until convergence
   for (iter = 0; iter < maxiter; ++iter) {
     // Update cluster centroids
-    if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids,
-                        work, work_int))
+    if (updateCentroids(handle, n, d, k, obs, codes, clusterSizes, centroids, work, work_int))
       WARNING("could not update k-means centroids");
 
     // Determine centroid closest to each observation
     residualPrev = *residual_host;
-    if (assignCentroids(handle, n, d, k, obs, centroids, work, codes,
-                        clusterSizes, residual_host))
+    if (assignCentroids(handle, n, d, k, obs, centroids, work, codes, clusterSizes, residual_host))
       WARNING("could not assign observation vectors to k-means clusters");
 
     // Reinitialize empty clusters with new centroids
-    index_type_t emptyCentroid =
-      (thrust::find(thrust_exec_policy,
-                    thrust::device_pointer_cast(clusterSizes),
-                    thrust::device_pointer_cast(clusterSizes + k), 0) -
-       thrust::device_pointer_cast(clusterSizes));
+    index_type_t emptyCentroid = (thrust::find(thrust_exec_policy,
+                                               thrust::device_pointer_cast(clusterSizes),
+                                               thrust::device_pointer_cast(clusterSizes + k),
+                                               0) -
+                                  thrust::device_pointer_cast(clusterSizes));
 
     // FIXME: emptyCentroid never reaches k (infinite loop) under certain
     // conditions, such as if obs is corrupt (as seen as a result of a
     // DataFrame column of NULL edge vals used to create the Graph)
     while (emptyCentroid < k) {
-      if (chooseNewCentroid(handle, n, d, uniformDist(rng), obs, work,
-                            centroids + IDX(0, emptyCentroid, d)))
+      if (chooseNewCentroid(
+            handle, n, d, uniformDist(rng), obs, work, centroids + IDX(0, emptyCentroid, d)))
         WARNING("could not replace empty centroid");
-      if (assignCentroids(handle, n, d, k, obs, centroids, work, codes,
-                          clusterSizes, residual_host))
+      if (assignCentroids(
+            handle, n, d, k, obs, centroids, work, codes, clusterSizes, residual_host))
         WARNING("could not assign observation vectors to k-means clusters");
-      emptyCentroid =
-        (thrust::find(thrust_exec_policy,
-                      thrust::device_pointer_cast(clusterSizes),
-                      thrust::device_pointer_cast(clusterSizes + k), 0) -
-         thrust::device_pointer_cast(clusterSizes));
+      emptyCentroid = (thrust::find(thrust_exec_policy,
+                                    thrust::device_pointer_cast(clusterSizes),
+                                    thrust::device_pointer_cast(clusterSizes + k),
+                                    0) -
+                       thrust::device_pointer_cast(clusterSizes));
       CHECK_CUDA(stream);
     }
 
@@ -870,14 +906,13 @@ int kmeans(handle_t const& handle, index_type_t n, index_type_t d,
   }
 
   // Warning if k-means has failed to converge
-  if (std::fabs(residualPrev - (*residual_host)) / n >= tol)
-    WARNING("k-means failed to converge");
+  if (std::fabs(residualPrev - (*residual_host)) / n >= tol) WARNING("k-means failed to converge");
 
   *iters_host = iter;
   return 0;
 }
 
-/** 
+/**
  *  @brief Find clusters with k-means algorithm.
  *    Initial centroids are chosen with k-means++ algorithm. Empty
  *    clusters are reinitialized by choosing new centroids with
@@ -903,11 +938,18 @@ int kmeans(handle_t const& handle, index_type_t n, index_type_t d,
  *  @return error flag
  */
 template <typename index_type_t, typename value_type_t>
-int kmeans(handle_t const& handle, index_type_t n, index_type_t d,
-           index_type_t k, value_type_t tol, index_type_t maxiter,
+int kmeans(handle_t const& handle,
+           index_type_t n,
+           index_type_t d,
+           index_type_t k,
+           value_type_t tol,
+           index_type_t maxiter,
            const value_type_t* __restrict__ obs,
-           index_type_t* __restrict__ codes, value_type_t& residual,
-           index_type_t& iters, unsigned long long seed = 123456) {
+           index_type_t* __restrict__ codes,
+           value_type_t& residual,
+           index_type_t& iters,
+           unsigned long long seed = 123456)
+{
   using namespace matrix;
 
   // Check that parameters are valid
@@ -924,9 +966,21 @@ int kmeans(handle_t const& handle, index_type_t n, index_type_t d,
   vector_t<index_type_t> work_int(handle, 2 * d * n);
 
   // Perform k-means
-  return kmeans<index_type_t, value_type_t>(
-    handle, n, d, k, tol, maxiter, obs, codes, clusterSizes.raw(),
-    centroids.raw(), work.raw(), work_int.raw(), &residual, &iters, seed);
+  return kmeans<index_type_t, value_type_t>(handle,
+                                            n,
+                                            d,
+                                            k,
+                                            tol,
+                                            maxiter,
+                                            obs,
+                                            codes,
+                                            clusterSizes.raw(),
+                                            centroids.raw(),
+                                            work.raw(),
+                                            work_int.raw(),
+                                            &residual,
+                                            &iters,
+                                            seed);
 }
 
 }  // namespace raft
diff --git a/cpp/include/raft/spectral/lapack.hpp b/cpp/include/raft/spectral/lapack.hpp
index d14bf05f37..35fc22c770 100644
--- a/cpp/include/raft/spectral/lapack.hpp
+++ b/cpp/include/raft/spectral/lapack.hpp
@@ -21,66 +21,125 @@
 #include <raft/linalg/cusolver_wrappers.h>
 #include <raft/error.hpp>
 
-//for now; TODO: check if/where this `define` should be;
+// for now; TODO: check if/where this `define` should be;
 //
 #define USE_LAPACK
 
 namespace raft {
 
-#define lapackCheckError(status)                        \
-  {                                                     \
-    if (status < 0) {                                   \
-      std::stringstream ss;                             \
-      ss << "Lapack error: argument number " << -status \
-         << " had an illegal value.";                   \
-      throw exception(ss.str());                        \
-    } else if (status > 0)                              \
-      RAFT_FAIL("Lapack error: internal error.");       \
+#define lapackCheckError(status)                                                     \
+  {                                                                                  \
+    if (status < 0) {                                                                \
+      std::stringstream ss;                                                          \
+      ss << "Lapack error: argument number " << -status << " had an illegal value."; \
+      throw exception(ss.str());                                                     \
+    } else if (status > 0)                                                           \
+      RAFT_FAIL("Lapack error: internal error.");                                    \
   }
 
-extern "C" void sgeqrf_(int *m, int *n, float *a, int *lda, float *tau,
-                        float *work, int *lwork, int *info);
-extern "C" void dgeqrf_(int *m, int *n, double *a, int *lda, double *tau,
-                        double *work, int *lwork, int *info);
-extern "C" void sormqr_(char *side, char *trans, int *m, int *n, int *k,
-                        float *a, int *lda, const float *tau, float *c,
-                        int *ldc, float *work, int *lwork, int *info);
-extern "C" void dormqr_(char *side, char *trans, int *m, int *n, int *k,
-                        double *a, int *lda, const double *tau, double *c,
-                        int *ldc, double *work, int *lwork, int *info);
-extern "C" int dgeev_(char *jobvl, char *jobvr, int *n, double *a, int *lda,
-                      double *wr, double *wi, double *vl, int *ldvl, double *vr,
-                      int *ldvr, double *work, int *lwork, int *info);
-
-extern "C" int sgeev_(char *jobvl, char *jobvr, int *n, float *a, int *lda,
-                      float *wr, float *wi, float *vl, int *ldvl, float *vr,
-                      int *ldvr, float *work, int *lwork, int *info);
-
-extern "C" cusolverStatus_t cusolverDnSgemmHost(
-  cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
-  const float *alpha, const float *A, int lda, const float *B, int ldb,
-  const float *beta, float *C, int ldc);
-
-extern "C" cusolverStatus_t cusolverDnDgemmHost(
-  cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k,
-  const double *alpha, const double *A, int lda, const double *B, int ldb,
-  const double *beta, double *C, int ldc);
-
-extern "C" cusolverStatus_t cusolverDnSsterfHost(int n, float *d, float *e,
-                                                 int *info);
-
-extern "C" cusolverStatus_t cusolverDnDsterfHost(int n, double *d, double *e,
-                                                 int *info);
-
-extern "C" cusolverStatus_t cusolverDnSsteqrHost(const signed char *compz,
-                                                 int n, float *d, float *e,
-                                                 float *z, int ldz, float *work,
-                                                 int *info);
-
-extern "C" cusolverStatus_t cusolverDnDsteqrHost(const signed char *compz,
-                                                 int n, double *d, double *e,
-                                                 double *z, int ldz,
-                                                 double *work, int *info);
+extern "C" void sgeqrf_(
+  int* m, int* n, float* a, int* lda, float* tau, float* work, int* lwork, int* info);
+extern "C" void dgeqrf_(
+  int* m, int* n, double* a, int* lda, double* tau, double* work, int* lwork, int* info);
+extern "C" void sormqr_(char* side,
+                        char* trans,
+                        int* m,
+                        int* n,
+                        int* k,
+                        float* a,
+                        int* lda,
+                        const float* tau,
+                        float* c,
+                        int* ldc,
+                        float* work,
+                        int* lwork,
+                        int* info);
+extern "C" void dormqr_(char* side,
+                        char* trans,
+                        int* m,
+                        int* n,
+                        int* k,
+                        double* a,
+                        int* lda,
+                        const double* tau,
+                        double* c,
+                        int* ldc,
+                        double* work,
+                        int* lwork,
+                        int* info);
+extern "C" int dgeev_(char* jobvl,
+                      char* jobvr,
+                      int* n,
+                      double* a,
+                      int* lda,
+                      double* wr,
+                      double* wi,
+                      double* vl,
+                      int* ldvl,
+                      double* vr,
+                      int* ldvr,
+                      double* work,
+                      int* lwork,
+                      int* info);
+
+extern "C" int sgeev_(char* jobvl,
+                      char* jobvr,
+                      int* n,
+                      float* a,
+                      int* lda,
+                      float* wr,
+                      float* wi,
+                      float* vl,
+                      int* ldvl,
+                      float* vr,
+                      int* ldvr,
+                      float* work,
+                      int* lwork,
+                      int* info);
+
+extern "C" cusolverStatus_t cusolverDnSgemmHost(cublasOperation_t transa,
+                                                cublasOperation_t transb,
+                                                int m,
+                                                int n,
+                                                int k,
+                                                const float* alpha,
+                                                const float* A,
+                                                int lda,
+                                                const float* B,
+                                                int ldb,
+                                                const float* beta,
+                                                float* C,
+                                                int ldc);
+
+extern "C" cusolverStatus_t cusolverDnDgemmHost(cublasOperation_t transa,
+                                                cublasOperation_t transb,
+                                                int m,
+                                                int n,
+                                                int k,
+                                                const double* alpha,
+                                                const double* A,
+                                                int lda,
+                                                const double* B,
+                                                int ldb,
+                                                const double* beta,
+                                                double* C,
+                                                int ldc);
+
+extern "C" cusolverStatus_t cusolverDnSsterfHost(int n, float* d, float* e, int* info);
+
+extern "C" cusolverStatus_t cusolverDnDsterfHost(int n, double* d, double* e, int* info);
+
+extern "C" cusolverStatus_t cusolverDnSsteqrHost(
+  const signed char* compz, int n, float* d, float* e, float* z, int ldz, float* work, int* info);
+
+extern "C" cusolverStatus_t cusolverDnDsteqrHost(const signed char* compz,
+                                                 int n,
+                                                 double* d,
+                                                 double* e,
+                                                 double* z,
+                                                 int ldz,
+                                                 double* work,
+                                                 int* info);
 
 template <typename T>
 class Lapack {
@@ -91,182 +150,339 @@ class Lapack {
  public:
   static void check_lapack_enabled();
 
-  static void gemm(bool transa, bool transb, int m, int n, int k, T alpha,
-                   const T *A, int lda, const T *B, int ldb, T beta, T *C,
+  static void gemm(bool transa,
+                   bool transb,
+                   int m,
+                   int n,
+                   int k,
+                   T alpha,
+                   const T* A,
+                   int lda,
+                   const T* B,
+                   int ldb,
+                   T beta,
+                   T* C,
                    int ldc);
 
   // special QR for lanczos
-  static void sterf(int n, T *d, T *e);
-  static void steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work);
+  static void sterf(int n, T* d, T* e);
+  static void steqr(char compz, int n, T* d, T* e, T* z, int ldz, T* work);
 
   // QR
   // computes the QR factorization of a general matrix
-  static void geqrf(int m, int n, T *a, int lda, T *tau, T *work, int *lwork);
+  static void geqrf(int m, int n, T* a, int lda, T* tau, T* work, int* lwork);
   // Generates the real orthogonal matrix Q of the QR factorization formed by geqrf.
 
   // multiply C by implicit Q
-  static void ormqr(bool right_side, bool transq, int m, int n, int k, T *a,
-                    int lda, T *tau, T *c, int ldc, T *work, int *lwork);
-
-  static void geev(T *A, T *eigenvalues, int dim, int lda);
-  static void geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda,
+  static void ormqr(bool right_side,
+                    bool transq,
+                    int m,
+                    int n,
+                    int k,
+                    T* a,
+                    int lda,
+                    T* tau,
+                    T* c,
+                    int ldc,
+                    T* work,
+                    int* lwork);
+
+  static void geev(T* A, T* eigenvalues, int dim, int lda);
+  static void geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr);
+  static void geev(T* A,
+                   T* eigenvalues_r,
+                   T* eigenvalues_i,
+                   T* eigenvectors_r,
+                   T* eigenvectors_i,
+                   int dim,
+                   int lda,
                    int ldvr);
-  static void geev(T *A, T *eigenvalues_r, T *eigenvalues_i, T *eigenvectors_r,
-                   T *eigenvectors_i, int dim, int lda, int ldvr);
 
  private:
-  static void lapack_gemm(const char transa, const char transb, int m, int n,
-                          int k, float alpha, const float *a, int lda,
-                          const float *b, int ldb, float beta, float *c,
-                          int ldc) {
-    cublasOperation_t cublas_transa =
-      (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
-    cublasOperation_t cublas_transb =
-      (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
-    cusolverDnSgemmHost(cublas_transa, cublas_transb, m, n, k, &alpha,
-                        (float *)a, lda, (float *)b, ldb, &beta, c, ldc);
+  static void lapack_gemm(const char transa,
+                          const char transb,
+                          int m,
+                          int n,
+                          int k,
+                          float alpha,
+                          const float* a,
+                          int lda,
+                          const float* b,
+                          int ldb,
+                          float beta,
+                          float* c,
+                          int ldc)
+  {
+    cublasOperation_t cublas_transa = (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
+    cublasOperation_t cublas_transb = (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
+    cusolverDnSgemmHost(
+      cublas_transa, cublas_transb, m, n, k, &alpha, (float*)a, lda, (float*)b, ldb, &beta, c, ldc);
   }
 
-  static void lapack_gemm(const signed char transa, const signed char transb,
-                          int m, int n, int k, double alpha, const double *a,
-                          int lda, const double *b, int ldb, double beta,
-                          double *c, int ldc) {
-    cublasOperation_t cublas_transa =
-      (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
-    cublasOperation_t cublas_transb =
-      (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
-    cusolverDnDgemmHost(cublas_transa, cublas_transb, m, n, k, &alpha,
-                        (double *)a, lda, (double *)b, ldb, &beta, c, ldc);
+  static void lapack_gemm(const signed char transa,
+                          const signed char transb,
+                          int m,
+                          int n,
+                          int k,
+                          double alpha,
+                          const double* a,
+                          int lda,
+                          const double* b,
+                          int ldb,
+                          double beta,
+                          double* c,
+                          int ldc)
+  {
+    cublasOperation_t cublas_transa = (transa == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
+    cublasOperation_t cublas_transb = (transb == 'N') ? CUBLAS_OP_N : CUBLAS_OP_T;
+    cusolverDnDgemmHost(cublas_transa,
+                        cublas_transb,
+                        m,
+                        n,
+                        k,
+                        &alpha,
+                        (double*)a,
+                        lda,
+                        (double*)b,
+                        ldb,
+                        &beta,
+                        c,
+                        ldc);
   }
 
-  static void lapack_sterf(int n, float *d, float *e, int *info) {
+  static void lapack_sterf(int n, float* d, float* e, int* info)
+  {
     cusolverDnSsterfHost(n, d, e, info);
   }
 
-  static void lapack_sterf(int n, double *d, double *e, int *info) {
+  static void lapack_sterf(int n, double* d, double* e, int* info)
+  {
     cusolverDnDsterfHost(n, d, e, info);
   }
 
-  static void lapack_steqr(const signed char compz, int n, float *d, float *e,
-                           float *z, int ldz, float *work, int *info) {
+  static void lapack_steqr(
+    const signed char compz, int n, float* d, float* e, float* z, int ldz, float* work, int* info)
+  {
     cusolverDnSsteqrHost(&compz, n, d, e, z, ldz, work, info);
   }
 
-  static void lapack_steqr(const signed char compz, int n, double *d, double *e,
-                           double *z, int ldz, double *work, int *info) {
+  static void lapack_steqr(const signed char compz,
+                           int n,
+                           double* d,
+                           double* e,
+                           double* z,
+                           int ldz,
+                           double* work,
+                           int* info)
+  {
     cusolverDnDsteqrHost(&compz, n, d, e, z, ldz, work, info);
   }
 
-  static void lapack_geqrf(int m, int n, float *a, int lda, float *tau,
-                           float *work, int *lwork, int *info) {
+  static void lapack_geqrf(
+    int m, int n, float* a, int lda, float* tau, float* work, int* lwork, int* info)
+  {
     sgeqrf_(&m, &n, a, &lda, tau, work, lwork, info);
   }
 
-  static void lapack_geqrf(int m, int n, double *a, int lda, double *tau,
-                           double *work, int *lwork, int *info) {
+  static void lapack_geqrf(
+    int m, int n, double* a, int lda, double* tau, double* work, int* lwork, int* info)
+  {
     dgeqrf_(&m, &n, a, &lda, tau, work, lwork, info);
   }
 
-  static void lapack_ormqr(char side, char trans, int m, int n, int k, float *a,
-                           int lda, float *tau, float *c, int ldc, float *work,
-                           int *lwork, int *info) {
-    sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork,
-            info);
+  static void lapack_ormqr(char side,
+                           char trans,
+                           int m,
+                           int n,
+                           int k,
+                           float* a,
+                           int lda,
+                           float* tau,
+                           float* c,
+                           int ldc,
+                           float* work,
+                           int* lwork,
+                           int* info)
+  {
+    sormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info);
   }
 
-  static void lapack_ormqr(char side, char trans, int m, int n, int k,
-                           double *a, int lda, double *tau, double *c, int ldc,
-                           double *work, int *lwork, int *info) {
-    dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork,
-            info);
+  static void lapack_ormqr(char side,
+                           char trans,
+                           int m,
+                           int n,
+                           int k,
+                           double* a,
+                           int lda,
+                           double* tau,
+                           double* c,
+                           int ldc,
+                           double* work,
+                           int* lwork,
+                           int* info)
+  {
+    dormqr_(&side, &trans, &m, &n, &k, a, &lda, tau, c, &ldc, work, lwork, info);
   }
 
-  static int lapack_geev_dispatch(char *jobvl, char *jobvr, int *n, double *a,
-                                  int *lda, double *wr, double *wi, double *vl,
-                                  int *ldvl, double *vr, int *ldvr,
-                                  double *work, int *lwork, int *info) {
-    return dgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work,
-                  lwork, info);
+  static int lapack_geev_dispatch(char* jobvl,
+                                  char* jobvr,
+                                  int* n,
+                                  double* a,
+                                  int* lda,
+                                  double* wr,
+                                  double* wi,
+                                  double* vl,
+                                  int* ldvl,
+                                  double* vr,
+                                  int* ldvr,
+                                  double* work,
+                                  int* lwork,
+                                  int* info)
+  {
+    return dgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, lwork, info);
   }
 
-  static int lapack_geev_dispatch(char *jobvl, char *jobvr, int *n, float *a,
-                                  int *lda, float *wr, float *wi, float *vl,
-                                  int *ldvl, float *vr, int *ldvr, float *work,
-                                  int *lwork, int *info) {
-    return sgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work,
-                  lwork, info);
+  static int lapack_geev_dispatch(char* jobvl,
+                                  char* jobvr,
+                                  int* n,
+                                  float* a,
+                                  int* lda,
+                                  float* wr,
+                                  float* wi,
+                                  float* vl,
+                                  int* ldvl,
+                                  float* vr,
+                                  int* ldvr,
+                                  float* work,
+                                  int* lwork,
+                                  int* info)
+  {
+    return sgeev_(jobvl, jobvr, n, a, lda, wr, wi, vl, ldvl, vr, ldvr, work, lwork, info);
   }
 
   // real eigenvalues
-  static void lapack_geev(T *A, T *eigenvalues, int dim, int lda) {
+  static void lapack_geev(T* A, T* eigenvalues, int dim, int lda)
+  {
     char job = 'N';
     std::vector<T> WI(dim);
-    int ldv = 1;
-    T *vl = 0;
+    int ldv       = 1;
+    T* vl         = 0;
     int work_size = 6 * dim;
     std::vector<T> work(work_size);
     int info;
-    lapack_geev_dispatch(&job, &job, &dim, A, &lda, eigenvalues, WI.data(), vl,
-                         &ldv, vl, &ldv, work.data(), &work_size, &info);
+    lapack_geev_dispatch(&job,
+                         &job,
+                         &dim,
+                         A,
+                         &lda,
+                         eigenvalues,
+                         WI.data(),
+                         vl,
+                         &ldv,
+                         vl,
+                         &ldv,
+                         work.data(),
+                         &work_size,
+                         &info);
     lapackCheckError(info);
   }
 
   // real eigenpairs
-  static void lapack_geev(T *A, T *eigenvalues, T *eigenvectors, int dim,
-                          int lda, int ldvr) {
+  static void lapack_geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr)
+  {
     char jobvl = 'N';
     char jobvr = 'V';
     std::vector<T> WI(dim);
     int work_size = 6 * dim;
-    T *vl = 0;
-    int ldvl = 1;
+    T* vl         = 0;
+    int ldvl      = 1;
     std::vector<T> work(work_size);
     int info;
-    lapack_geev_dispatch(&jobvl, &jobvr, &dim, A, &lda, eigenvalues, WI.data(),
-                         vl, &ldvl, eigenvectors, &ldvr, work.data(),
-                         &work_size, &info);
+    lapack_geev_dispatch(&jobvl,
+                         &jobvr,
+                         &dim,
+                         A,
+                         &lda,
+                         eigenvalues,
+                         WI.data(),
+                         vl,
+                         &ldvl,
+                         eigenvectors,
+                         &ldvr,
+                         work.data(),
+                         &work_size,
+                         &info);
     lapackCheckError(info);
   }
 
   // complex eigenpairs
-  static void lapack_geev(T *A, T *eigenvalues_r, T *eigenvalues_i,
-                          T *eigenvectors_r, T *eigenvectors_i, int dim,
-                          int lda, int ldvr) {
-    char jobvl = 'N';
-    char jobvr = 'V';
+  static void lapack_geev(T* A,
+                          T* eigenvalues_r,
+                          T* eigenvalues_i,
+                          T* eigenvectors_r,
+                          T* eigenvectors_i,
+                          int dim,
+                          int lda,
+                          int ldvr)
+  {
+    char jobvl    = 'N';
+    char jobvr    = 'V';
     int work_size = 8 * dim;
-    int ldvl = 1;
+    int ldvl      = 1;
     std::vector<T> work(work_size);
     int info;
-    lapack_geev_dispatch(&jobvl, &jobvr, &dim, A, &lda, eigenvalues_r,
-                         eigenvalues_i, 0, &ldvl, eigenvectors_r, &ldvr,
-                         work.data(), &work_size, &info);
+    lapack_geev_dispatch(&jobvl,
+                         &jobvr,
+                         &dim,
+                         A,
+                         &lda,
+                         eigenvalues_r,
+                         eigenvalues_i,
+                         0,
+                         &ldvl,
+                         eigenvectors_r,
+                         &ldvr,
+                         work.data(),
+                         &work_size,
+                         &info);
     lapackCheckError(info);
   }
 };
 
 template <typename T>
-void Lapack<T>::check_lapack_enabled() {
+void Lapack<T>::check_lapack_enabled()
+{
 #ifndef USE_LAPACK
   RAFT_FAIL("Error: LAPACK not enabled.");
 #endif
 }
 
 template <typename T>
-void Lapack<T>::gemm(bool transa, bool transb, int m, int n, int k, T alpha,
-                     const T *A, int lda, const T *B, int ldb, T beta, T *C,
-                     int ldc) {
+void Lapack<T>::gemm(bool transa,
+                     bool transb,
+                     int m,
+                     int n,
+                     int k,
+                     T alpha,
+                     const T* A,
+                     int lda,
+                     const T* B,
+                     int ldb,
+                     T beta,
+                     T* C,
+                     int ldc)
+{
   // check_lapack_enabled();
   //#ifdef NVGRAPH_USE_LAPACK
   const char transA_char = transa ? 'T' : 'N';
   const char transB_char = transb ? 'T' : 'N';
-  lapack_gemm(transA_char, transB_char, m, n, k, alpha, A, lda, B, ldb, beta, C,
-              ldc);
+  lapack_gemm(transA_char, transB_char, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
   //#endif
 }
 
 template <typename T>
-void Lapack<T>::sterf(int n, T *d, T *e) {
+void Lapack<T>::sterf(int n, T* d, T* e)
+{
   //    check_lapack_enabled();
   //#ifdef NVGRAPH_USE_LAPACK
   int info;
@@ -276,7 +492,8 @@ void Lapack<T>::sterf(int n, T *d, T *e) {
 }
 
 template <typename T>
-void Lapack<T>::steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work) {
+void Lapack<T>::steqr(char compz, int n, T* d, T* e, T* z, int ldz, T* work)
+{
   //    check_lapack_enabled();
   //#ifdef NVGRAPH_USE_LAPACK
   int info;
@@ -286,8 +503,8 @@ void Lapack<T>::steqr(char compz, int n, T *d, T *e, T *z, int ldz, T *work) {
 }
 
 template <typename T>
-void Lapack<T>::geqrf(int m, int n, T *a, int lda, T *tau, T *work,
-                      int *lwork) {
+void Lapack<T>::geqrf(int m, int n, T* a, int lda, T* tau, T* work, int* lwork)
+{
   check_lapack_enabled();
 #ifdef USE_LAPACK
   int info;
@@ -296,11 +513,22 @@ void Lapack<T>::geqrf(int m, int n, T *a, int lda, T *tau, T *work,
 #endif
 }
 template <typename T>
-void Lapack<T>::ormqr(bool right_side, bool transq, int m, int n, int k, T *a,
-                      int lda, T *tau, T *c, int ldc, T *work, int *lwork) {
+void Lapack<T>::ormqr(bool right_side,
+                      bool transq,
+                      int m,
+                      int n,
+                      int k,
+                      T* a,
+                      int lda,
+                      T* tau,
+                      T* c,
+                      int ldc,
+                      T* work,
+                      int* lwork)
+{
   check_lapack_enabled();
 #ifdef USE_LAPACK
-  char side = right_side ? 'R' : 'L';
+  char side  = right_side ? 'R' : 'L';
   char trans = transq ? 'T' : 'N';
   int info;
   lapack_ormqr(side, trans, m, n, k, a, lda, tau, c, ldc, work, lwork, &info);
@@ -310,7 +538,8 @@ void Lapack<T>::ormqr(bool right_side, bool transq, int m, int n, int k, T *a,
 
 // real eigenvalues
 template <typename T>
-void Lapack<T>::geev(T *A, T *eigenvalues, int dim, int lda) {
+void Lapack<T>::geev(T* A, T* eigenvalues, int dim, int lda)
+{
   check_lapack_enabled();
 #ifdef USE_LAPACK
   lapack_geev(A, eigenvalues, dim, lda);
@@ -318,8 +547,8 @@ void Lapack<T>::geev(T *A, T *eigenvalues, int dim, int lda) {
 }
 // real eigenpairs
 template <typename T>
-void Lapack<T>::geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda,
-                     int ldvr) {
+void Lapack<T>::geev(T* A, T* eigenvalues, T* eigenvectors, int dim, int lda, int ldvr)
+{
   check_lapack_enabled();
 #ifdef USE_LAPACK
   lapack_geev(A, eigenvalues, eigenvectors, dim, lda, ldvr);
@@ -327,13 +556,18 @@ void Lapack<T>::geev(T *A, T *eigenvalues, T *eigenvectors, int dim, int lda,
 }
 // complex eigenpairs
 template <typename T>
-void Lapack<T>::geev(T *A, T *eigenvalues_r, T *eigenvalues_i,
-                     T *eigenvectors_r, T *eigenvectors_i, int dim, int lda,
-                     int ldvr) {
+void Lapack<T>::geev(T* A,
+                     T* eigenvalues_r,
+                     T* eigenvalues_i,
+                     T* eigenvectors_r,
+                     T* eigenvectors_i,
+                     int dim,
+                     int lda,
+                     int ldvr)
+{
   check_lapack_enabled();
 #ifdef USE_LAPACK
-  lapack_geev(A, eigenvalues_r, eigenvalues_i, eigenvectors_r, eigenvectors_i,
-              dim, lda, ldvr);
+  lapack_geev(A, eigenvalues_r, eigenvalues_i, eigenvectors_r, eigenvectors_i, dim, lda, ldvr);
 #endif
 }
 
diff --git a/cpp/include/raft/spectral/matrix_wrappers.hpp b/cpp/include/raft/spectral/matrix_wrappers.hpp
index 42fc621a1a..9d1f899d66 100644
--- a/cpp/include/raft/spectral/matrix_wrappers.hpp
+++ b/cpp/include/raft/spectral/matrix_wrappers.hpp
@@ -41,10 +41,12 @@ using size_type = int;  // for now; TODO: move it in appropriate header
 // Apply diagonal matrix to vector:
 //
 template <typename IndexType_, typename ValueType_>
-static __global__ void diagmv(IndexType_ n, ValueType_ alpha,
+static __global__ void diagmv(IndexType_ n,
+                              ValueType_ alpha,
                               const ValueType_* __restrict__ D,
                               const ValueType_* __restrict__ x,
-                              ValueType_* __restrict__ y) {
+                              ValueType_* __restrict__ y)
+{
   IndexType_ i = threadIdx.x + blockIdx.x * blockDim.x;
   while (i < n) {
     y[i] += alpha * D[i] * x[i];
@@ -59,7 +61,7 @@ enum struct sparse_mv_alg_t : int {
   SPARSE_MV_UNDEFINED = -1,
   SPARSE_MV_ALG_DEFAULT,  // generic, for any sparse matrix
   SPARSE_MV_ALG1,         // typical for CSR
-  SPARSE_MV_ALG2  // may provide better performamce for irregular sparse matrices
+  SPARSE_MV_ALG2          // may provide better performance for irregular sparse matrices
 };
 
 // Vector "view"-like aggregate for linear algebra purposes
@@ -69,15 +71,14 @@ struct vector_view_t {
   value_type* buffer_;
   size_type size_;
 
-  vector_view_t(value_type* buffer, size_type sz)
-    : buffer_(buffer), size_(sz) {}
+  vector_view_t(value_type* buffer, size_type sz) : buffer_(buffer), size_(sz) {}
 
-  vector_view_t(vector_view_t&& other)
-    : buffer_(other.raw()), size_(other.size()) {}
+  vector_view_t(vector_view_t&& other) : buffer_(other.raw()), size_(other.size()) {}
 
-  vector_view_t& operator=(vector_view_t&& other) {
+  vector_view_t& operator=(vector_view_t&& other)
+  {
     buffer_ = other.raw();
-    size_ = other.size();
+    size_   = other.size();
   }
 };
 
@@ -85,8 +86,9 @@ template <typename value_type>
 class vector_t {
  public:
   vector_t(handle_t const& raft_handle, size_type sz)
-    : buffer_(sz, raft_handle.get_stream()),
-      thrust_policy(raft_handle.get_thrust_policy()) {}
+    : buffer_(sz, raft_handle.get_stream()), thrust_policy(raft_handle.get_thrust_policy())
+  {
+  }
 
   size_type size(void) const { return buffer_.size(); }
 
@@ -94,32 +96,40 @@ class vector_t {
 
   value_type const* raw(void) const { return buffer_.data(); }
 
-  value_type nrm1() const {
-    return thrust::reduce(thrust_policy, buffer_.data(),
-                          buffer_.data() + buffer_.size(), value_type{0},
+  value_type nrm1() const
+  {
+    return thrust::reduce(thrust_policy,
+                          buffer_.data(),
+                          buffer_.data() + buffer_.size(),
+                          value_type{0},
                           [] __device__(auto left, auto right) {
-                            auto abs_left = left > 0 ? left : -left;
+                            auto abs_left  = left > 0 ? left : -left;
                             auto abs_right = right > 0 ? right : -right;
                             return abs_left + abs_right;
                           });
   }
 
-  void fill(value_type value) {
+  void fill(value_type value)
+  {
     thrust::fill_n(thrust_policy, buffer_.data(), buffer_.size(), value);
   }
 
  private:
-  using thrust_exec_policy_t = thrust::detail::execute_with_allocator<
-    rmm::mr::thrust_allocator<char>, thrust::cuda_cub::execute_on_stream_base>;
+  using thrust_exec_policy_t =
+    thrust::detail::execute_with_allocator<rmm::mr::thrust_allocator<char>,
+                                           thrust::cuda_cub::execute_on_stream_base>;
   rmm::device_uvector<value_type> buffer_;
   const thrust_exec_policy_t thrust_policy;
 };
 
 template <typename index_type, typename value_type>
 struct sparse_matrix_t {
-  sparse_matrix_t(handle_t const& raft_handle, index_type const* row_offsets,
-                  index_type const* col_indices, value_type const* values,
-                  index_type const nrows, index_type const ncols,
+  sparse_matrix_t(handle_t const& raft_handle,
+                  index_type const* row_offsets,
+                  index_type const* col_indices,
+                  value_type const* values,
+                  index_type const nrows,
+                  index_type const ncols,
                   index_type const nnz)
     : handle_(raft_handle),
       row_offsets_(row_offsets),
@@ -127,18 +137,25 @@ struct sparse_matrix_t {
       values_(values),
       nrows_(nrows),
       ncols_(ncols),
-      nnz_(nnz) {}
+      nnz_(nnz)
+  {
+  }
 
-  sparse_matrix_t(handle_t const& raft_handle, index_type const* row_offsets,
-                  index_type const* col_indices, value_type const* values,
-                  index_type const nrows, index_type const nnz)
+  sparse_matrix_t(handle_t const& raft_handle,
+                  index_type const* row_offsets,
+                  index_type const* col_indices,
+                  value_type const* values,
+                  index_type const nrows,
+                  index_type const nnz)
     : handle_(raft_handle),
       row_offsets_(row_offsets),
       col_indices_(col_indices),
       values_(values),
       nrows_(nrows),
       ncols_(nrows),
-      nnz_(nnz) {}
+      nnz_(nnz)
+  {
+  }
 
   template <typename CSRView>
   sparse_matrix_t(handle_t const& raft_handle, CSRView const& csr_view)
@@ -148,7 +165,9 @@ struct sparse_matrix_t {
       values_(csr_view.edge_data),
       nrows_(csr_view.number_of_vertices),
       ncols_(csr_view.number_of_vertices),
-      nnz_(csr_view.number_of_edges) {}
+      nnz_(csr_view.number_of_edges)
+  {
+  }
 
   virtual ~sparse_matrix_t(void) =
     default;  // virtual because used as base for following matrix types
@@ -158,21 +177,24 @@ struct sparse_matrix_t {
   // descriptor creation works with non-const, and const-casting
   // down is dangerous)
   //
-  virtual void mv(value_type alpha, value_type* __restrict__ x, value_type beta,
+  virtual void mv(value_type alpha,
+                  value_type* __restrict__ x,
+                  value_type beta,
                   value_type* __restrict__ y,
                   sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1,
-                  bool transpose = false, bool symmetric = false) const {
+                  bool transpose      = false,
+                  bool symmetric      = false) const
+  {
     using namespace sparse;
 
     RAFT_EXPECTS(x != nullptr, "Null x buffer.");
     RAFT_EXPECTS(y != nullptr, "Null y buffer.");
 
     auto cusparse_h = handle_.get_cusparse_handle();
-    auto stream = handle_.get_stream();
+    auto stream     = handle_.get_stream();
 
-    cusparseOperation_t trans =
-      transpose ? CUSPARSE_OPERATION_TRANSPOSE :  // transpose
-        CUSPARSE_OPERATION_NON_TRANSPOSE;         //non-transpose
+    cusparseOperation_t trans = transpose ? CUSPARSE_OPERATION_TRANSPOSE :  // transpose
+                                  CUSPARSE_OPERATION_NON_TRANSPOSE;         // non-transpose
 
 #if not defined CUDA_ENFORCE_LOWER and CUDA_VER_10_1_UP
     auto size_x = transpose ? nrows_ : ncols_;
@@ -180,15 +202,19 @@ struct sparse_matrix_t {
 
     cusparseSpMVAlg_t spmv_alg = translate_algorithm(alg);
 
-    //create descriptors:
+    // create descriptors:
     //(below casts are necessary, because
     // cusparseCreateCsr(...) takes non-const
     // void*; the casts should be harmless)
     //
     cusparseSpMatDescr_t matA;
-    CUSPARSE_CHECK(cusparsecreatecsr(
-      &matA, nrows_, ncols_, nnz_, const_cast<index_type*>(row_offsets_),
-      const_cast<index_type*>(col_indices_), const_cast<value_type*>(values_)));
+    CUSPARSE_CHECK(cusparsecreatecsr(&matA,
+                                     nrows_,
+                                     ncols_,
+                                     nnz_,
+                                     const_cast<index_type*>(row_offsets_),
+                                     const_cast<index_type*>(col_indices_),
+                                     const_cast<value_type*>(values_)));
 
     cusparseDnVecDescr_t vecX;
     CUSPARSE_CHECK(cusparsecreatednvec(&vecX, size_x, x));
@@ -196,31 +222,29 @@ struct sparse_matrix_t {
     cusparseDnVecDescr_t vecY;
     CUSPARSE_CHECK(cusparsecreatednvec(&vecY, size_y, y));
 
-    //get (scratch) external device buffer size:
+    // get (scratch) external device buffer size:
     //
     size_t bufferSize;
-    CUSPARSE_CHECK(cusparsespmv_buffersize(cusparse_h, trans, &alpha, matA,
-                                           vecX, &beta, vecY, spmv_alg,
-                                           &bufferSize, stream));
+    CUSPARSE_CHECK(cusparsespmv_buffersize(
+      cusparse_h, trans, &alpha, matA, vecX, &beta, vecY, spmv_alg, &bufferSize, stream));
 
-    //allocate external buffer:
+    // allocate external buffer:
     //
     vector_t<value_type> external_buffer(handle_, bufferSize);
 
-    //finally perform SpMV:
+    // finally perform SpMV:
     //
-    CUSPARSE_CHECK(cusparsespmv(cusparse_h, trans, &alpha, matA, vecX, &beta,
-                                vecY, spmv_alg, external_buffer.raw(), stream));
+    CUSPARSE_CHECK(cusparsespmv(
+      cusparse_h, trans, &alpha, matA, vecX, &beta, vecY, spmv_alg, external_buffer.raw(), stream));
 
-    //free descriptors:
+    // free descriptors:
     //(TODO: maybe wrap them in a RAII struct?)
     //
     CUSPARSE_CHECK(cusparseDestroyDnVec(vecY));
     CUSPARSE_CHECK(cusparseDestroyDnVec(vecX));
     CUSPARSE_CHECK(cusparseDestroySpMat(matA));
 #else
-    CUSPARSE_CHECK(
-      cusparsesetpointermode(cusparse_h, CUSPARSE_POINTER_MODE_HOST, stream));
+    CUSPARSE_CHECK(cusparsesetpointermode(cusparse_h, CUSPARSE_POINTER_MODE_HOST, stream));
     cusparseMatDescr_t descr = 0;
     CUSPARSE_CHECK(cusparseCreateMatDescr(&descr));
     if (symmetric) {
@@ -229,9 +253,20 @@ struct sparse_matrix_t {
       CUSPARSE_CHECK(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
     }
     CUSPARSE_CHECK(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
-    CUSPARSE_CHECK(cusparsecsrmv(cusparse_h, trans, nrows_, ncols_, nnz_,
-                                 &alpha, descr, values_, row_offsets_,
-                                 col_indices_, x, &beta, y, stream));
+    CUSPARSE_CHECK(cusparsecsrmv(cusparse_h,
+                                 trans,
+                                 nrows_,
+                                 ncols_,
+                                 nnz_,
+                                 &alpha,
+                                 descr,
+                                 values_,
+                                 row_offsets_,
+                                 col_indices_,
+                                 x,
+                                 &beta,
+                                 y,
+                                 stream));
     CUSPARSE_CHECK(cusparseDestroyMatDescr(descr));
 #endif
   }
@@ -239,19 +274,18 @@ struct sparse_matrix_t {
   handle_t const& get_handle(void) const { return handle_; }
 
 #if not defined CUDA_ENFORCE_LOWER and CUDA_VER_10_1_UP
-  cusparseSpMVAlg_t translate_algorithm(sparse_mv_alg_t alg) const {
+  cusparseSpMVAlg_t translate_algorithm(sparse_mv_alg_t alg) const
+  {
     switch (alg) {
-      case sparse_mv_alg_t::SPARSE_MV_ALG1:
-        return CUSPARSE_CSRMV_ALG1;
-      case sparse_mv_alg_t::SPARSE_MV_ALG2:
-        return CUSPARSE_CSRMV_ALG2;
-      default:
-        return CUSPARSE_MV_ALG_DEFAULT;
+      case sparse_mv_alg_t::SPARSE_MV_ALG1: return CUSPARSE_CSRMV_ALG1;
+      case sparse_mv_alg_t::SPARSE_MV_ALG2: return CUSPARSE_CSRMV_ALG2;
+      default: return CUSPARSE_MV_ALG_DEFAULT;
     }
   }
 #endif
 
-  //private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence, aggregate
+  // private: // maybe not, keep this ASAPBNS ("as simple as possible, but not simpler"); hence,
+  // aggregate
 
   handle_t const& handle_;
   index_type const* row_offsets_;
@@ -264,43 +298,51 @@ struct sparse_matrix_t {
 
 template <typename index_type, typename value_type>
 struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type> {
-  laplacian_matrix_t(handle_t const& raft_handle, index_type const* row_offsets,
-                     index_type const* col_indices, value_type const* values,
-                     index_type const nrows, index_type const nnz)
-    : sparse_matrix_t<index_type, value_type>(raft_handle, row_offsets,
-                                              col_indices, values, nrows, nnz),
-      diagonal_(raft_handle, nrows) {
+  laplacian_matrix_t(handle_t const& raft_handle,
+                     index_type const* row_offsets,
+                     index_type const* col_indices,
+                     value_type const* values,
+                     index_type const nrows,
+                     index_type const nnz)
+    : sparse_matrix_t<index_type, value_type>(
+        raft_handle, row_offsets, col_indices, values, nrows, nnz),
+      diagonal_(raft_handle, nrows)
+  {
     vector_t<value_type> ones{raft_handle, nrows};
     ones.fill(1.0);
-    sparse_matrix_t<index_type, value_type>::mv(1, ones.raw(), 0,
-                                                diagonal_.raw());
+    sparse_matrix_t<index_type, value_type>::mv(1, ones.raw(), 0, diagonal_.raw());
   }
 
   laplacian_matrix_t(handle_t const& raft_handle,
                      sparse_matrix_t<index_type, value_type> const& csr_m)
-    : sparse_matrix_t<index_type, value_type>(raft_handle, csr_m.row_offsets_,
-                                              csr_m.col_indices_, csr_m.values_,
-                                              csr_m.nrows_, csr_m.nnz_),
-      diagonal_(raft_handle, csr_m.nrows_) {
+    : sparse_matrix_t<index_type, value_type>(raft_handle,
+                                              csr_m.row_offsets_,
+                                              csr_m.col_indices_,
+                                              csr_m.values_,
+                                              csr_m.nrows_,
+                                              csr_m.nnz_),
+      diagonal_(raft_handle, csr_m.nrows_)
+  {
     vector_t<value_type> ones{raft_handle, csr_m.nrows_};
     ones.fill(1.0);
-    sparse_matrix_t<index_type, value_type>::mv(1, ones.raw(), 0,
-                                                diagonal_.raw());
+    sparse_matrix_t<index_type, value_type>::mv(1, ones.raw(), 0, diagonal_.raw());
   }
 
   // y = alpha*A*x + beta*y
   //
-  void mv(value_type alpha, value_type* __restrict__ x, value_type beta,
+  void mv(value_type alpha,
+          value_type* __restrict__ x,
+          value_type beta,
           value_type* __restrict__ y,
           sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1,
-          bool transpose = false, bool symmetric = false) const override {
+          bool transpose      = false,
+          bool symmetric      = false) const override
+  {
     constexpr int BLOCK_SIZE = 1024;
-    auto n = sparse_matrix_t<index_type, value_type>::nrows_;
+    auto n                   = sparse_matrix_t<index_type, value_type>::nrows_;
 
-    auto cublas_h =
-      sparse_matrix_t<index_type, value_type>::get_handle().get_cublas_handle();
-    auto stream =
-      sparse_matrix_t<index_type, value_type>::get_handle().get_stream();
+    auto cublas_h = sparse_matrix_t<index_type, value_type>::get_handle().get_cublas_handle();
+    auto stream   = sparse_matrix_t<index_type, value_type>::get_handle().get_stream();
 
     // scales y by beta:
     //
@@ -312,8 +354,7 @@ struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type> {
 
     // Apply diagonal matrix
     //
-    dim3 gridDim{
-      std::min<unsigned int>((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535), 1, 1};
+    dim3 gridDim{std::min<unsigned int>((n + BLOCK_SIZE - 1) / BLOCK_SIZE, 65535), 1, 1};
 
     dim3 blockDim{BLOCK_SIZE, 1, 1};
     diagmv<<<gridDim, blockDim, 0, stream>>>(n, alpha, diagonal_.raw(), x, y);
@@ -321,8 +362,7 @@ struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type> {
 
     // Apply adjacency matrix
     //
-    sparse_matrix_t<index_type, value_type>::mv(-alpha, x, 1, y, alg, transpose,
-                                                symmetric);
+    sparse_matrix_t<index_type, value_type>::mv(-alpha, x, 1, y, alg, transpose, symmetric);
   }
 
   vector_t<value_type> diagonal_;
@@ -332,52 +372,66 @@ template <typename index_type, typename value_type>
 struct modularity_matrix_t : laplacian_matrix_t<index_type, value_type> {
   modularity_matrix_t(handle_t const& raft_handle,
                       index_type const* row_offsets,
-                      index_type const* col_indices, value_type const* values,
-                      index_type const nrows, index_type const nnz)
+                      index_type const* col_indices,
+                      value_type const* values,
+                      index_type const nrows,
+                      index_type const nnz)
     : laplacian_matrix_t<index_type, value_type>(
-        raft_handle, row_offsets, col_indices, values, nrows, nnz) {
+        raft_handle, row_offsets, col_indices, values, nrows, nnz)
+  {
     edge_sum_ = laplacian_matrix_t<index_type, value_type>::diagonal_.nrm1();
   }
 
   modularity_matrix_t(handle_t const& raft_handle,
                       sparse_matrix_t<index_type, value_type> const& csr_m)
-    : laplacian_matrix_t<index_type, value_type>(raft_handle, csr_m) {
+    : laplacian_matrix_t<index_type, value_type>(raft_handle, csr_m)
+  {
     edge_sum_ = laplacian_matrix_t<index_type, value_type>::diagonal_.nrm1();
   }
 
   // y = alpha*A*x + beta*y
   //
-  void mv(value_type alpha, value_type* __restrict__ x, value_type beta,
+  void mv(value_type alpha,
+          value_type* __restrict__ x,
+          value_type beta,
           value_type* __restrict__ y,
           sparse_mv_alg_t alg = sparse_mv_alg_t::SPARSE_MV_ALG1,
-          bool transpose = false, bool symmetric = false) const override {
+          bool transpose      = false,
+          bool symmetric      = false) const override
+  {
     auto n = sparse_matrix_t<index_type, value_type>::nrows_;
 
-    auto cublas_h =
-      sparse_matrix_t<index_type, value_type>::get_handle().get_cublas_handle();
-    auto stream =
-      sparse_matrix_t<index_type, value_type>::get_handle().get_stream();
+    auto cublas_h = sparse_matrix_t<index_type, value_type>::get_handle().get_cublas_handle();
+    auto stream   = sparse_matrix_t<index_type, value_type>::get_handle().get_stream();
 
     // y = A*x
     //
-    sparse_matrix_t<index_type, value_type>::mv(alpha, x, 0, y, alg, transpose,
-                                                symmetric);
+    sparse_matrix_t<index_type, value_type>::mv(alpha, x, 0, y, alg, transpose, symmetric);
     value_type dot_res;
 
     // gamma = d'*x
     //
     // Cublas::dot(this->n, D.raw(), 1, x, 1, &dot_res);
-    CUBLAS_CHECK(linalg::cublasdot(
-      cublas_h, n, laplacian_matrix_t<index_type, value_type>::diagonal_.raw(),
-      1, x, 1, &dot_res, stream));
+    CUBLAS_CHECK(linalg::cublasdot(cublas_h,
+                                   n,
+                                   laplacian_matrix_t<index_type, value_type>::diagonal_.raw(),
+                                   1,
+                                   x,
+                                   1,
+                                   &dot_res,
+                                   stream));
 
     // y = y -(gamma/edge_sum)*d
     //
     value_type gamma_ = -dot_res / edge_sum_;
-    CUBLAS_CHECK(linalg::cublasaxpy(
-      cublas_h, n, &gamma_,
-      laplacian_matrix_t<index_type, value_type>::diagonal_.raw(), 1, y, 1,
-      stream));
+    CUBLAS_CHECK(linalg::cublasaxpy(cublas_h,
+                                    n,
+                                    &gamma_,
+                                    laplacian_matrix_t<index_type, value_type>::diagonal_.raw(),
+                                    1,
+                                    y,
+                                    1,
+                                    stream));
   }
 
   value_type edge_sum_;
diff --git a/cpp/include/raft/spectral/modularity_maximization.hpp b/cpp/include/raft/spectral/modularity_maximization.hpp
index fededbfcb4..0e0e47ddf3 100644
--- a/cpp/include/raft/spectral/modularity_maximization.hpp
+++ b/cpp/include/raft/spectral/modularity_maximization.hpp
@@ -39,7 +39,8 @@
 #endif
 
 #ifdef COLLECT_TIME_STATISTICS
-static double timer(void) {
+static double timer(void)
+{
   struct timeval tv;
   cudaDeviceSynchronize();
   gettimeofday(&tv, NULL);
@@ -78,17 +79,21 @@ using namespace linalg;
  *    performed.
  *  @return error flag.
  */
-template <typename vertex_t, typename weight_t, typename EigenSolver,
-          typename ClusterSolver>
+template <typename vertex_t, typename weight_t, typename EigenSolver, typename ClusterSolver>
 std::tuple<vertex_t, weight_t, vertex_t> modularity_maximization(
-  handle_t const &handle, sparse_matrix_t<vertex_t, weight_t> const &csr_m,
-  EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver,
-  vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) {
+  handle_t const& handle,
+  sparse_matrix_t<vertex_t, weight_t> const& csr_m,
+  EigenSolver const& eigen_solver,
+  ClusterSolver const& cluster_solver,
+  vertex_t* __restrict__ clusters,
+  weight_t* eigVals,
+  weight_t* eigVecs)
+{
   RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer.");
   RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer.");
   RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer.");
 
-  auto stream = handle.get_stream();
+  auto stream   = handle.get_stream();
   auto cublas_h = handle.get_cublas_handle();
 
   std::tuple<vertex_t, weight_t, vertex_t>
@@ -102,11 +107,10 @@ std::tuple<vertex_t, weight_t, vertex_t> modularity_maximization(
   modularity_matrix_t<vertex_t, weight_t> B{handle, csr_m};
 
   auto eigen_config = eigen_solver.get_config();
-  auto nEigVecs = eigen_config.n_eigVecs;
+  auto nEigVecs     = eigen_config.n_eigVecs;
 
   // Compute eigenvectors corresponding to largest eigenvalues
-  std::get<0>(stats) =
-    eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs);
+  std::get<0>(stats) = eigen_solver.solve_largest_eigenvectors(handle, B, eigVals, eigVecs);
 
   // Whiten eigenvector matrix
   transform_eigen_matrix(handle, n, nEigVecs, eigVecs);
@@ -117,8 +121,7 @@ std::tuple<vertex_t, weight_t, vertex_t> modularity_maximization(
   CHECK_CUDA(stream);
 
   // Find partition clustering
-  auto pair_cluster =
-    cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters);
+  auto pair_cluster = cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters);
 
   std::get<1>(stats) = pair_cluster.first;
   std::get<2>(stats) = pair_cluster.second;
@@ -137,11 +140,12 @@ std::tuple<vertex_t, weight_t, vertex_t> modularity_maximization(
  *  @param modularity On exit, modularity
  */
 template <typename vertex_t, typename weight_t>
-void analyzeModularity(handle_t const &handle,
-                       sparse_matrix_t<vertex_t, weight_t> const &csr_m,
+void analyzeModularity(handle_t const& handle,
+                       sparse_matrix_t<vertex_t, weight_t> const& csr_m,
                        vertex_t nClusters,
-                       vertex_t const *__restrict__ clusters,
-                       weight_t &modularity) {
+                       vertex_t const* __restrict__ clusters,
+                       weight_t& modularity)
+{
   RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer.");
 
   vertex_t i;
@@ -149,15 +153,14 @@ void analyzeModularity(handle_t const &handle,
   weight_t partModularity, clustersize;
 
   auto cublas_h = handle.get_cublas_handle();
-  auto stream = handle.get_stream();
+  auto stream   = handle.get_stream();
 
   // Device memory
   vector_t<weight_t> part_i(handle, n);
   vector_t<weight_t> Bx(handle, n);
 
   // Initialize cuBLAS
-  CUBLAS_CHECK(
-    cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+  CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
 
   // Initialize Modularity
   modularity_matrix_t<vertex_t, weight_t> B{handle, csr_m};
@@ -167,8 +170,7 @@ void analyzeModularity(handle_t const &handle,
 
   // Iterate through partitions
   for (i = 0; i < nClusters; ++i) {
-    if (!construct_indicator(handle, i, n, clustersize, partModularity,
-                             clusters, part_i, Bx, B)) {
+    if (!construct_indicator(handle, i, n, clustersize, partModularity, clusters, part_i, Bx, B)) {
       WARNING("empty partition");
       continue;
     }
diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp
index 2df3812a4a..88cc8aa8f0 100644
--- a/cpp/include/raft/spectral/partition.hpp
+++ b/cpp/include/raft/spectral/partition.hpp
@@ -61,21 +61,25 @@ using namespace linalg;
  *    performed.
  *  @return statistics: number of eigensolver iterations, .
  */
-template <typename vertex_t, typename weight_t, typename EigenSolver,
-          typename ClusterSolver>
-std::tuple<vertex_t, weight_t, vertex_t> partition(
-  handle_t const &handle, sparse_matrix_t<vertex_t, weight_t> const &csr_m,
-  EigenSolver const &eigen_solver, ClusterSolver const &cluster_solver,
-  vertex_t *__restrict__ clusters, weight_t *eigVals, weight_t *eigVecs) {
+template <typename vertex_t, typename weight_t, typename EigenSolver, typename ClusterSolver>
+std::tuple<vertex_t, weight_t, vertex_t> partition(handle_t const& handle,
+                                                   sparse_matrix_t<vertex_t, weight_t> const& csr_m,
+                                                   EigenSolver const& eigen_solver,
+                                                   ClusterSolver const& cluster_solver,
+                                                   vertex_t* __restrict__ clusters,
+                                                   weight_t* eigVals,
+                                                   weight_t* eigVecs)
+{
   RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer.");
   RAFT_EXPECTS(eigVals != nullptr, "Null eigVals buffer.");
   RAFT_EXPECTS(eigVecs != nullptr, "Null eigVecs buffer.");
 
-  auto stream = handle.get_stream();
+  auto stream   = handle.get_stream();
   auto cublas_h = handle.get_cublas_handle();
 
   std::tuple<vertex_t, weight_t, vertex_t>
-    stats;  //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver, cluster solver residual, # iters cluster solver
+    stats;  //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver,
+            //cluster solver residual, # iters cluster solver
 
   vertex_t n = csr_m.nrows_;
 
@@ -86,22 +90,20 @@ std::tuple<vertex_t, weight_t, vertex_t> partition(
   // Compute eigenvectors of Laplacian
 
   // Initialize Laplacian
-  ///sparse_matrix_t<vertex_t, weight_t> A{handle, graph};
+  /// sparse_matrix_t<vertex_t, weight_t> A{handle, graph};
   laplacian_matrix_t<vertex_t, weight_t> L{handle, csr_m};
 
   auto eigen_config = eigen_solver.get_config();
-  auto nEigVecs = eigen_config.n_eigVecs;
+  auto nEigVecs     = eigen_config.n_eigVecs;
 
   // Compute smallest eigenvalues and eigenvectors
-  std::get<0>(stats) =
-    eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs);
+  std::get<0>(stats) = eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs);
 
   // Whiten eigenvector matrix
   transform_eigen_matrix(handle, n, nEigVecs, eigVecs);
 
   // Find partition clustering
-  auto pair_cluster =
-    cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters);
+  auto pair_cluster = cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters);
 
   std::get<1>(stats) = pair_cluster.first;
   std::get<2>(stats) = pair_cluster.second;
@@ -128,16 +130,19 @@ std::tuple<vertex_t, weight_t, vertex_t> partition(
  *  @return error flag.
  */
 template <typename vertex_t, typename weight_t>
-void analyzePartition(handle_t const &handle,
-                      sparse_matrix_t<vertex_t, weight_t> const &csr_m,
-                      vertex_t nClusters, const vertex_t *__restrict__ clusters,
-                      weight_t &edgeCut, weight_t &cost) {
+void analyzePartition(handle_t const& handle,
+                      sparse_matrix_t<vertex_t, weight_t> const& csr_m,
+                      vertex_t nClusters,
+                      const vertex_t* __restrict__ clusters,
+                      weight_t& edgeCut,
+                      weight_t& cost)
+{
   RAFT_EXPECTS(clusters != nullptr, "Null clusters buffer.");
 
   vertex_t i;
   vertex_t n = csr_m.nrows_;
 
-  auto stream = handle.get_stream();
+  auto stream   = handle.get_stream();
   auto cublas_h = handle.get_cublas_handle();
 
   weight_t partEdgesCut, clustersize;
@@ -147,22 +152,20 @@ void analyzePartition(handle_t const &handle,
   vector_t<weight_t> Lx(handle, n);
 
   // Initialize cuBLAS
-  CUBLAS_CHECK(
-    cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+  CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
 
   // Initialize Laplacian
-  ///sparse_matrix_t<vertex_t, weight_t> A{handle, graph};
+  /// sparse_matrix_t<vertex_t, weight_t> A{handle, graph};
   laplacian_matrix_t<vertex_t, weight_t> L{handle, csr_m};
 
   // Initialize output
-  cost = 0;
+  cost    = 0;
   edgeCut = 0;
 
   // Iterate through partitions
   for (i = 0; i < nClusters; ++i) {
     // Construct indicator vector for ith partition
-    if (!construct_indicator(handle, i, n, clustersize, partEdgesCut, clusters,
-                             part_i, Lx, L)) {
+    if (!construct_indicator(handle, i, n, clustersize, partEdgesCut, clusters, part_i, Lx, L)) {
       WARNING("empty partition");
       continue;
     }
diff --git a/cpp/include/raft/spectral/spectral_util.hpp b/cpp/include/raft/spectral/spectral_util.hpp
index c148350c0f..44b4af4bdc 100644
--- a/cpp/include/raft/spectral/spectral_util.hpp
+++ b/cpp/include/raft/spectral/spectral_util.hpp
@@ -27,20 +27,18 @@ namespace raft {
 namespace spectral {
 
 template <typename index_type_t, typename value_type_t>
-static __global__ void scale_obs_kernel(index_type_t m, index_type_t n,
-                                        value_type_t* obs) {
+static __global__ void scale_obs_kernel(index_type_t m, index_type_t n, value_type_t* obs)
+{
   index_type_t i, j, k, index, mm;
   value_type_t alpha, v, last;
   bool valid;
   // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension
 
   // compute alpha
-  mm = (((m + blockDim.x - 1) / blockDim.x) *
-        blockDim.x);  // m in multiple of blockDim.x
+  mm    = (((m + blockDim.x - 1) / blockDim.x) * blockDim.x);  // m in multiple of blockDim.x
   alpha = 0.0;
 
-  for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n;
-       j += blockDim.y * gridDim.y) {
+  for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) {
     for (i = threadIdx.x; i < mm; i += blockDim.x) {
       // check if the thread is valid
       valid = i < m;
@@ -65,17 +63,17 @@ static __global__ void scale_obs_kernel(index_type_t m, index_type_t n,
   // scale by alpha
   alpha = __shfl_sync(warp_full_mask(), alpha, blockDim.x - 1, blockDim.x);
   alpha = std::sqrt(alpha);
-  for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n;
-       j += blockDim.y * gridDim.y) {
+  for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) {
     for (i = threadIdx.x; i < m; i += blockDim.x) {  // blockDim.x=32
-      index = i + j * m;
+      index      = i + j * m;
       obs[index] = obs[index] / alpha;
     }
   }
 }
 
 template <typename index_type_t>
-index_type_t next_pow2(index_type_t n) {
+index_type_t next_pow2(index_type_t n)
+{
   index_type_t v;
   // Reference:
   // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float
@@ -89,7 +87,8 @@ index_type_t next_pow2(index_type_t n) {
 }
 
 template <typename index_type_t, typename value_type_t>
-cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs) {
+cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs)
+{
   index_type_t p2m;
 
   // find next power of 2
@@ -101,17 +100,16 @@ cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs) {
   dim3 nblocks{1, (n + nthreads.y - 1) / nthreads.y, 1};
 
   // launch scaling kernel (scale each column of obs by its norm)
-  scale_obs_kernel<index_type_t, value_type_t>
-    <<<nblocks, nthreads>>>(m, n, obs);
+  scale_obs_kernel<index_type_t, value_type_t><<<nblocks, nthreads>>>(m, n, obs);
 
   return cudaSuccess;
 }
 
 template <typename vertex_t, typename edge_t, typename weight_t>
-void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs,
-                            weight_t* eigVecs) {
-  auto stream = handle.get_stream();
-  auto cublas_h = handle.get_cublas_handle();
+void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs, weight_t* eigVecs)
+{
+  auto stream             = handle.get_stream();
+  auto cublas_h           = handle.get_cublas_handle();
   auto thrust_exec_policy = handle.get_thrust_policy();
 
   const weight_t zero{0.0};
@@ -121,9 +119,9 @@ void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs,
   for (auto i = 0; i < nEigVecs; ++i) {
     weight_t mean, std;
 
-    mean = thrust::reduce(
-      thrust_exec_policy, thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
-      thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)));
+    mean = thrust::reduce(thrust_exec_policy,
+                          thrust::device_pointer_cast(eigVecs + IDX(0, i, n)),
+                          thrust::device_pointer_cast(eigVecs + IDX(0, i + 1, n)));
     CHECK_CUDA(stream);
     mean /= n;
     thrust::transform(thrust_exec_policy,
@@ -134,8 +132,7 @@ void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs,
                       thrust::minus<weight_t>());
     CHECK_CUDA(stream);
 
-    CUBLAS_CHECK(
-      cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream));
+    CUBLAS_CHECK(cublasnrm2(cublas_h, n, eigVecs + IDX(0, i, n), 1, &std, stream));
 
     std /= std::sqrt(static_cast<weight_t>(n));
 
@@ -152,16 +149,25 @@ void transform_eigen_matrix(handle_t const& handle, edge_t n, vertex_t nEigVecs,
   //   TODO: in-place transpose
   {
     vector_t<weight_t> work(handle, nEigVecs * n);
-    CUBLAS_CHECK(
-      cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
-
-    CUBLAS_CHECK(cublasgeam(cublas_h, CUBLAS_OP_T, CUBLAS_OP_N, nEigVecs, n,
-                            &one, eigVecs, n, &zero, (weight_t*)NULL, nEigVecs,
-                            work.raw(), nEigVecs, stream));
-
-    CUDA_TRY(cudaMemcpyAsync(eigVecs, work.raw(),
-                             nEigVecs * n * sizeof(weight_t),
-                             cudaMemcpyDeviceToDevice, stream));
+    CUBLAS_CHECK(cublassetpointermode(cublas_h, CUBLAS_POINTER_MODE_HOST, stream));
+
+    CUBLAS_CHECK(cublasgeam(cublas_h,
+                            CUBLAS_OP_T,
+                            CUBLAS_OP_N,
+                            nEigVecs,
+                            n,
+                            &one,
+                            eigVecs,
+                            n,
+                            &zero,
+                            (weight_t*)NULL,
+                            nEigVecs,
+                            work.raw(),
+                            nEigVecs,
+                            stream));
+
+    CUDA_TRY(cudaMemcpyAsync(
+      eigVecs, work.raw(), nEigVecs * n * sizeof(weight_t), cudaMemcpyDeviceToDevice, stream));
   }
 }
 
@@ -176,9 +182,9 @@ struct equal_to_i_op {
  public:
   equal_to_i_op(index_type_t _i) : i(_i) {}
   template <typename Tuple_>
-  __host__ __device__ void operator()(Tuple_ t) {
-    thrust::get<1>(t) =
-      (thrust::get<0>(t) == i) ? (value_type_t)1.0 : (value_type_t)0.0;
+  __host__ __device__ void operator()(Tuple_ t)
+  {
+    thrust::get<1>(t) = (thrust::get<0>(t) == i) ? (value_type_t)1.0 : (value_type_t)0.0;
   }
 };
 }  // namespace
@@ -186,38 +192,38 @@ struct equal_to_i_op {
 // Construct indicator vector for ith partition
 //
 template <typename vertex_t, typename edge_t, typename weight_t>
-bool construct_indicator(handle_t const& handle, edge_t index, edge_t n,
-                         weight_t& clustersize, weight_t& partStats,
+bool construct_indicator(handle_t const& handle,
+                         edge_t index,
+                         edge_t n,
+                         weight_t& clustersize,
+                         weight_t& partStats,
                          vertex_t const* __restrict__ clusters,
-                         vector_t<weight_t>& part_i, vector_t<weight_t>& Bx,
-                         laplacian_matrix_t<vertex_t, weight_t> const& B) {
-  auto stream = handle.get_stream();
-  auto cublas_h = handle.get_cublas_handle();
+                         vector_t<weight_t>& part_i,
+                         vector_t<weight_t>& Bx,
+                         laplacian_matrix_t<vertex_t, weight_t> const& B)
+{
+  auto stream             = handle.get_stream();
+  auto cublas_h           = handle.get_cublas_handle();
   auto thrust_exec_policy = handle.get_thrust_policy();
 
-  thrust::for_each(thrust_exec_policy,
-                   thrust::make_zip_iterator(thrust::make_tuple(
-                     thrust::device_pointer_cast(clusters),
-                     thrust::device_pointer_cast(part_i.raw()))),
-                   thrust::make_zip_iterator(thrust::make_tuple(
-                     thrust::device_pointer_cast(clusters + n),
-                     thrust::device_pointer_cast(part_i.raw() + n))),
-                   equal_to_i_op<vertex_t, weight_t>(index));
+  thrust::for_each(
+    thrust_exec_policy,
+    thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(clusters),
+                                                 thrust::device_pointer_cast(part_i.raw()))),
+    thrust::make_zip_iterator(thrust::make_tuple(thrust::device_pointer_cast(clusters + n),
+                                                 thrust::device_pointer_cast(part_i.raw() + n))),
+    equal_to_i_op<vertex_t, weight_t>(index));
   CHECK_CUDA(stream);
 
   // Compute size of ith partition
-  CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1,
-                         &clustersize, stream));
+  CUBLAS_CHECK(cublasdot(cublas_h, n, part_i.raw(), 1, part_i.raw(), 1, &clustersize, stream));
 
   clustersize = round(clustersize);
-  if (clustersize < 0.5) {
-    return false;
-  }
+  if (clustersize < 0.5) { return false; }
 
   // Compute part stats
   B.mv(1, part_i.raw(), 0, Bx.raw());
-  CUBLAS_CHECK(
-    cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream));
+  CUBLAS_CHECK(cublasdot(cublas_h, n, Bx.raw(), 1, part_i.raw(), 1, &partStats, stream));
 
   return true;
 }
diff --git a/cpp/include/raft/spectral/warn_dbg.hpp b/cpp/include/raft/spectral/warn_dbg.hpp
index 406f1b7c7e..08a4e6efb5 100644
--- a/cpp/include/raft/spectral/warn_dbg.hpp
+++ b/cpp/include/raft/spectral/warn_dbg.hpp
@@ -4,13 +4,13 @@
 #include <string>
 
 #define STRINGIFY_DETAIL(x) #x
-#define RAFT_STRINGIFY(x) STRINGIFY_DETAIL(x)
+#define RAFT_STRINGIFY(x)   STRINGIFY_DETAIL(x)
 
 #ifdef DEBUG
 #define COUT() (std::cout)
 #define CERR() (std::cerr)
 
-//nope:
+// nope:
 //
 #define WARNING(message)                                                  \
   do {                                                                    \
diff --git a/cpp/include/raft/stats/detail/mean.cuh b/cpp/include/raft/stats/detail/mean.cuh
index 1b338a035a..e8e6bea4dd 100644
--- a/cpp/include/raft/stats/detail/mean.cuh
+++ b/cpp/include/raft/stats/detail/mean.cuh
@@ -27,15 +27,15 @@ namespace detail {
 
 ///@todo: ColsPerBlk has been tested only for 32!
 template <typename Type, typename IdxType, int TPB, int ColsPerBlk = 32>
-__global__ void meanKernelRowMajor(Type *mu, const Type *data, IdxType D,
-                                   IdxType N) {
+__global__ void meanKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N)
+{
   const int RowsPerBlkPerIter = TPB / ColsPerBlk;
-  IdxType thisColId = threadIdx.x % ColsPerBlk;
-  IdxType thisRowId = threadIdx.x / ColsPerBlk;
-  IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk);
-  IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter);
-  Type thread_data = Type(0);
-  const IdxType stride = RowsPerBlkPerIter * gridDim.x;
+  IdxType thisColId           = threadIdx.x % ColsPerBlk;
+  IdxType thisRowId           = threadIdx.x / ColsPerBlk;
+  IdxType colId               = thisColId + ((IdxType)blockIdx.y * ColsPerBlk);
+  IdxType rowId               = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter);
+  Type thread_data            = Type(0);
+  const IdxType stride        = RowsPerBlkPerIter * gridDim.x;
   for (IdxType i = rowId; i < N; i += stride)
     thread_data += (colId < D) ? data[i * D + colId] : Type(0);
   __shared__ Type smu[ColsPerBlk];
@@ -47,8 +47,8 @@ __global__ void meanKernelRowMajor(Type *mu, const Type *data, IdxType D,
 }
 
 template <typename Type, typename IdxType, int TPB>
-__global__ void meanKernelColMajor(Type *mu, const Type *data, IdxType D,
-                                   IdxType N) {
+__global__ void meanKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType N)
+{
   typedef cub::BlockReduce<Type, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   Type thread_data = Type(0);
@@ -58,30 +58,26 @@ __global__ void meanKernelColMajor(Type *mu, const Type *data, IdxType D,
     thread_data += data[idx];
   }
   Type acc = BlockReduce(temp_storage).Sum(thread_data);
-  if (threadIdx.x == 0) {
-    mu[blockIdx.x] = acc / N;
-  }
+  if (threadIdx.x == 0) { mu[blockIdx.x] = acc / N; }
 }
 
 template <typename Type, typename IdxType = int>
-void mean(Type *mu, const Type *data, IdxType D, IdxType N, bool sample,
-          bool rowMajor, cudaStream_t stream) {
+void mean(
+  Type* mu, const Type* data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream)
+{
   static const int TPB = 256;
   if (rowMajor) {
     static const int RowsPerThread = 4;
-    static const int ColsPerBlk = 32;
-    static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread;
-    dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk),
-              raft::ceildiv(D, (IdxType)ColsPerBlk));
+    static const int ColsPerBlk    = 32;
+    static const int RowsPerBlk    = (TPB / ColsPerBlk) * RowsPerThread;
+    dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk));
     CUDA_CHECK(cudaMemsetAsync(mu, 0, sizeof(Type) * D, stream));
-    meanKernelRowMajor<Type, IdxType, TPB, ColsPerBlk>
-      <<<grid, TPB, 0, stream>>>(mu, data, D, N);
+    meanKernelRowMajor<Type, IdxType, TPB, ColsPerBlk><<<grid, TPB, 0, stream>>>(mu, data, D, N);
     CUDA_CHECK(cudaPeekAtLastError());
     Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N));
     raft::linalg::scalarMultiply(mu, mu, ratio, D, stream);
   } else {
-    meanKernelColMajor<Type, IdxType, TPB>
-      <<<D, TPB, 0, stream>>>(mu, data, D, N);
+    meanKernelColMajor<Type, IdxType, TPB><<<D, TPB, 0, stream>>>(mu, data, D, N);
   }
   CUDA_CHECK(cudaPeekAtLastError());
 }
diff --git a/cpp/include/raft/stats/detail/stddev.cuh b/cpp/include/raft/stats/detail/stddev.cuh
index e8917a60b3..42351269ea 100644
--- a/cpp/include/raft/stats/detail/stddev.cuh
+++ b/cpp/include/raft/stats/detail/stddev.cuh
@@ -27,15 +27,15 @@ namespace detail {
 
 ///@todo: ColPerBlk has been tested only for 32!
 template <typename Type, typename IdxType, int TPB, int ColsPerBlk = 32>
-__global__ void stddevKernelRowMajor(Type *std, const Type *data, IdxType D,
-                                     IdxType N) {
+__global__ void stddevKernelRowMajor(Type* std, const Type* data, IdxType D, IdxType N)
+{
   const int RowsPerBlkPerIter = TPB / ColsPerBlk;
-  IdxType thisColId = threadIdx.x % ColsPerBlk;
-  IdxType thisRowId = threadIdx.x / ColsPerBlk;
-  IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk);
-  IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter);
-  Type thread_data = Type(0);
-  const IdxType stride = RowsPerBlkPerIter * gridDim.x;
+  IdxType thisColId           = threadIdx.x % ColsPerBlk;
+  IdxType thisRowId           = threadIdx.x / ColsPerBlk;
+  IdxType colId               = thisColId + ((IdxType)blockIdx.y * ColsPerBlk);
+  IdxType rowId               = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter);
+  Type thread_data            = Type(0);
+  const IdxType stride        = RowsPerBlkPerIter * gridDim.x;
   for (IdxType i = rowId; i < N; i += stride) {
     Type val = (colId < D) ? data[i * D + colId] : Type(0);
     thread_data += val * val;
@@ -49,41 +49,39 @@ __global__ void stddevKernelRowMajor(Type *std, const Type *data, IdxType D,
 }
 
 template <typename Type, typename IdxType, int TPB>
-__global__ void stddevKernelColMajor(Type *std, const Type *data,
-                                     const Type *mu, IdxType D, IdxType N) {
+__global__ void stddevKernelColMajor(
+  Type* std, const Type* data, const Type* mu, IdxType D, IdxType N)
+{
   typedef cub::BlockReduce<Type, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   Type thread_data = Type(0);
   IdxType colStart = N * blockIdx.x;
-  Type m = mu[blockIdx.x];
+  Type m           = mu[blockIdx.x];
   for (IdxType i = threadIdx.x; i < N; i += TPB) {
     IdxType idx = colStart + i;
-    Type diff = data[idx] - m;
+    Type diff   = data[idx] - m;
     thread_data += diff * diff;
   }
   Type acc = BlockReduce(temp_storage).Sum(thread_data);
-  if (threadIdx.x == 0) {
-    std[blockIdx.x] = raft::mySqrt(acc / N);
-  }
+  if (threadIdx.x == 0) { std[blockIdx.x] = raft::mySqrt(acc / N); }
 }
 
 template <typename Type, typename IdxType, int TPB>
-__global__ void varsKernelColMajor(Type *var, const Type *data, const Type *mu,
-                                   IdxType D, IdxType N) {
+__global__ void varsKernelColMajor(
+  Type* var, const Type* data, const Type* mu, IdxType D, IdxType N)
+{
   typedef cub::BlockReduce<Type, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   Type thread_data = Type(0);
   IdxType colStart = N * blockIdx.x;
-  Type m = mu[blockIdx.x];
+  Type m           = mu[blockIdx.x];
   for (IdxType i = threadIdx.x; i < N; i += TPB) {
     IdxType idx = colStart + i;
-    Type diff = data[idx] - m;
+    Type diff   = data[idx] - m;
     thread_data += diff * diff;
   }
   Type acc = BlockReduce(temp_storage).Sum(thread_data);
-  if (threadIdx.x == 0) {
-    var[blockIdx.x] = acc / N;
-  }
+  if (threadIdx.x == 0) { var[blockIdx.x] = acc / N; }
 }
 
 /**
@@ -105,70 +103,78 @@ __global__ void varsKernelColMajor(Type *var, const Type *data, const Type *mu,
  * @param stream cuda stream where to launch work
  */
 template <typename Type, typename IdxType = int>
-void stddev(Type *std, const Type *data, const Type *mu, IdxType D, IdxType N,
-            bool sample, bool rowMajor, cudaStream_t stream) {
+void stddev(Type* std,
+            const Type* data,
+            const Type* mu,
+            IdxType D,
+            IdxType N,
+            bool sample,
+            bool rowMajor,
+            cudaStream_t stream)
+{
   static const int TPB = 256;
   if (rowMajor) {
     static const int RowsPerThread = 4;
-    static const int ColsPerBlk = 32;
-    static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread;
-    dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk),
-              raft::ceildiv(D, (IdxType)ColsPerBlk));
+    static const int ColsPerBlk    = 32;
+    static const int RowsPerBlk    = (TPB / ColsPerBlk) * RowsPerThread;
+    dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk));
     CUDA_CHECK(cudaMemset(std, 0, sizeof(Type) * D));
-    stddevKernelRowMajor<Type, IdxType, TPB, ColsPerBlk>
-      <<<grid, TPB, 0, stream>>>(std, data, D, N);
+    stddevKernelRowMajor<Type, IdxType, TPB, ColsPerBlk><<<grid, TPB, 0, stream>>>(std, data, D, N);
     Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N));
     raft::linalg::binaryOp(
-      std, std, mu, D,
-      [ratio] __device__(Type a, Type b) {
-        return raft::mySqrt(a * ratio - b * b);
-      },
+      std,
+      std,
+      mu,
+      D,
+      [ratio] __device__(Type a, Type b) { return raft::mySqrt(a * ratio - b * b); },
       stream);
   } else {
-    stddevKernelColMajor<Type, IdxType, TPB>
-      <<<D, TPB, 0, stream>>>(std, data, mu, D, N);
+    stddevKernelColMajor<Type, IdxType, TPB><<<D, TPB, 0, stream>>>(std, data, mu, D, N);
   }
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
 /**
-  * @brief Compute variance of the input matrix
-  *
-  * Variance operation is assumed to be performed on a given column.
-  *
-  * @tparam Type the data type
-  * @tparam IdxType Integer type used to for addressing
-  * @param var the output stddev vector
-  * @param data the input matrix
-  * @param mu the mean vector
-  * @param D number of columns of data
-  * @param N number of rows of data
-  * @param sample whether to evaluate sample stddev or not. In other words,
-  * whether
-  *  to normalize the output using N-1 or N, for true or false, respectively
-  * @param rowMajor whether the input data is row or col major
-  * @param stream cuda stream where to launch work
-  */
+ * @brief Compute variance of the input matrix
+ *
+ * Variance operation is assumed to be performed on a given column.
+ *
+ * @tparam Type the data type
+ * @tparam IdxType Integer type used for addressing
+ * @param var the output variance vector
+ * @param data the input matrix
+ * @param mu the mean vector
+ * @param D number of columns of data
+ * @param N number of rows of data
+ * @param sample whether to evaluate the sample variance or not. In other
+ *  words, whether to normalize the output using N-1 or N, for true or false,
+ *  respectively
+ * @param rowMajor whether the input data is row or col major
+ * @param stream cuda stream where to launch work
+ */
 template <typename Type, typename IdxType = int>
-void vars(Type *var, const Type *data, const Type *mu, IdxType D, IdxType N,
-          bool sample, bool rowMajor, cudaStream_t stream) {
+void vars(Type* var,
+          const Type* data,
+          const Type* mu,
+          IdxType D,
+          IdxType N,
+          bool sample,
+          bool rowMajor,
+          cudaStream_t stream)
+{
   static const int TPB = 256;
   if (rowMajor) {
     static const int RowsPerThread = 4;
-    static const int ColsPerBlk = 32;
-    static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread;
-    dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk),
-              raft::ceildiv(D, (IdxType)ColsPerBlk));
+    static const int ColsPerBlk    = 32;
+    static const int RowsPerBlk    = (TPB / ColsPerBlk) * RowsPerThread;
+    dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk));
     CUDA_CHECK(cudaMemset(var, 0, sizeof(Type) * D));
-    stddevKernelRowMajor<Type, IdxType, TPB, ColsPerBlk>
-      <<<grid, TPB, 0, stream>>>(var, data, D, N);
+    stddevKernelRowMajor<Type, IdxType, TPB, ColsPerBlk><<<grid, TPB, 0, stream>>>(var, data, D, N);
     Type ratio = Type(1) / (sample ? Type(N - 1) : Type(N));
     raft::linalg::binaryOp(
-      var, var, mu, D,
-      [ratio] __device__(Type a, Type b) { return a * ratio - b * b; }, stream);
+      var, var, mu, D, [ratio] __device__(Type a, Type b) { return a * ratio - b * b; }, stream);
   } else {
-    varsKernelColMajor<Type, IdxType, TPB>
-      <<<D, TPB, 0, stream>>>(var, data, mu, D, N);
+    varsKernelColMajor<Type, IdxType, TPB><<<D, TPB, 0, stream>>>(var, data, mu, D, N);
   }
   CUDA_CHECK(cudaPeekAtLastError());
 }
diff --git a/cpp/include/raft/stats/detail/sum.cuh b/cpp/include/raft/stats/detail/sum.cuh
index 37a3313ed1..b7f5cc8ff7 100644
--- a/cpp/include/raft/stats/detail/sum.cuh
+++ b/cpp/include/raft/stats/detail/sum.cuh
@@ -27,15 +27,15 @@ namespace detail {
 
 ///@todo: ColsPerBlk has been tested only for 32!
 template <typename Type, typename IdxType, int TPB, int ColsPerBlk = 32>
-__global__ void sumKernelRowMajor(Type *mu, const Type *data, IdxType D,
-                                  IdxType N) {
+__global__ void sumKernelRowMajor(Type* mu, const Type* data, IdxType D, IdxType N)
+{
   const int RowsPerBlkPerIter = TPB / ColsPerBlk;
-  IdxType thisColId = threadIdx.x % ColsPerBlk;
-  IdxType thisRowId = threadIdx.x / ColsPerBlk;
-  IdxType colId = thisColId + ((IdxType)blockIdx.y * ColsPerBlk);
-  IdxType rowId = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter);
-  Type thread_data = Type(0);
-  const IdxType stride = RowsPerBlkPerIter * gridDim.x;
+  IdxType thisColId           = threadIdx.x % ColsPerBlk;
+  IdxType thisRowId           = threadIdx.x / ColsPerBlk;
+  IdxType colId               = thisColId + ((IdxType)blockIdx.y * ColsPerBlk);
+  IdxType rowId               = thisRowId + ((IdxType)blockIdx.x * RowsPerBlkPerIter);
+  Type thread_data            = Type(0);
+  const IdxType stride        = RowsPerBlkPerIter * gridDim.x;
   for (IdxType i = rowId; i < N; i += stride)
     thread_data += (colId < D) ? data[i * D + colId] : Type(0);
   __shared__ Type smu[ColsPerBlk];
@@ -47,8 +47,8 @@ __global__ void sumKernelRowMajor(Type *mu, const Type *data, IdxType D,
 }
 
 template <typename Type, typename IdxType, int TPB>
-__global__ void sumKernelColMajor(Type *mu, const Type *data, IdxType D,
-                                  IdxType N) {
+__global__ void sumKernelColMajor(Type* mu, const Type* data, IdxType D, IdxType N)
+{
   typedef cub::BlockReduce<Type, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   Type thread_data = Type(0);
@@ -58,27 +58,23 @@ __global__ void sumKernelColMajor(Type *mu, const Type *data, IdxType D,
     thread_data += data[idx];
   }
   Type acc = BlockReduce(temp_storage).Sum(thread_data);
-  if (threadIdx.x == 0) {
-    mu[blockIdx.x] = acc;
-  }
+  if (threadIdx.x == 0) { mu[blockIdx.x] = acc; }
 }
 
 template <typename Type, typename IdxType = int>
-void sum(Type *output, const Type *input, IdxType D, IdxType N, bool rowMajor,
-         cudaStream_t stream) {
+void sum(Type* output, const Type* input, IdxType D, IdxType N, bool rowMajor, cudaStream_t stream)
+{
   static const int TPB = 256;
   if (rowMajor) {
     static const int RowsPerThread = 4;
-    static const int ColsPerBlk = 32;
-    static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread;
-    dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk),
-              raft::ceildiv(D, (IdxType)ColsPerBlk));
+    static const int ColsPerBlk    = 32;
+    static const int RowsPerBlk    = (TPB / ColsPerBlk) * RowsPerThread;
+    dim3 grid(raft::ceildiv(N, (IdxType)RowsPerBlk), raft::ceildiv(D, (IdxType)ColsPerBlk));
     CUDA_CHECK(cudaMemset(output, 0, sizeof(Type) * D));
     sumKernelRowMajor<Type, IdxType, TPB, ColsPerBlk>
       <<<grid, TPB, 0, stream>>>(output, input, D, N);
   } else {
-    sumKernelColMajor<Type, IdxType, TPB>
-      <<<D, TPB, 0, stream>>>(output, input, D, N);
+    sumKernelColMajor<Type, IdxType, TPB><<<D, TPB, 0, stream>>>(output, input, D, N);
   }
   CUDA_CHECK(cudaPeekAtLastError());
 }
diff --git a/cpp/include/raft/stats/mean.hpp b/cpp/include/raft/stats/mean.hpp
index 6e4cf39850..ba1eb55e71 100644
--- a/cpp/include/raft/stats/mean.hpp
+++ b/cpp/include/raft/stats/mean.hpp
@@ -41,8 +41,9 @@ namespace stats {
  * @param stream: cuda stream
  */
 template <typename Type, typename IdxType = int>
-void mean(Type *mu, const Type *data, IdxType D, IdxType N, bool sample,
-          bool rowMajor, cudaStream_t stream) {
+void mean(
+  Type* mu, const Type* data, IdxType D, IdxType N, bool sample, bool rowMajor, cudaStream_t stream)
+{
   detail::mean(mu, data, D, N, sample, rowMajor, stream);
 }
 
diff --git a/cpp/include/raft/stats/mean_center.hpp b/cpp/include/raft/stats/mean_center.hpp
index 04934d4388..c0ba24312b 100644
--- a/cpp/include/raft/stats/mean_center.hpp
+++ b/cpp/include/raft/stats/mean_center.hpp
@@ -38,12 +38,25 @@ namespace stats {
  * @param stream cuda stream where to launch work
  */
 template <typename Type, typename IdxType = int, int TPB = 256>
-void meanCenter(Type *out, const Type *data, const Type *mu, IdxType D,
-                IdxType N, bool rowMajor, bool bcastAlongRows,
-                cudaStream_t stream) {
+void meanCenter(Type* out,
+                const Type* data,
+                const Type* mu,
+                IdxType D,
+                IdxType N,
+                bool rowMajor,
+                bool bcastAlongRows,
+                cudaStream_t stream)
+{
   raft::linalg::matrixVectorOp(
-    out, data, mu, D, N, rowMajor, bcastAlongRows,
-    [] __device__(Type a, Type b) { return a - b; }, stream);
+    out,
+    data,
+    mu,
+    D,
+    N,
+    rowMajor,
+    bcastAlongRows,
+    [] __device__(Type a, Type b) { return a - b; },
+    stream);
 }
 
 /**
@@ -61,11 +74,25 @@ void meanCenter(Type *out, const Type *data, const Type *mu, IdxType D,
  * @param stream cuda stream where to launch work
  */
 template <typename Type, typename IdxType = int, int TPB = 256>
-void meanAdd(Type *out, const Type *data, const Type *mu, IdxType D, IdxType N,
-             bool rowMajor, bool bcastAlongRows, cudaStream_t stream) {
+void meanAdd(Type* out,
+             const Type* data,
+             const Type* mu,
+             IdxType D,
+             IdxType N,
+             bool rowMajor,
+             bool bcastAlongRows,
+             cudaStream_t stream)
+{
   raft::linalg::matrixVectorOp(
-    out, data, mu, D, N, rowMajor, bcastAlongRows,
-    [] __device__(Type a, Type b) { return a + b; }, stream);
+    out,
+    data,
+    mu,
+    D,
+    N,
+    rowMajor,
+    bcastAlongRows,
+    [] __device__(Type a, Type b) { return a + b; },
+    stream);
 }
 
 };  // end namespace stats
diff --git a/cpp/include/raft/stats/stddev.hpp b/cpp/include/raft/stats/stddev.hpp
index 17c5ae457d..9393dec8bc 100644
--- a/cpp/include/raft/stats/stddev.hpp
+++ b/cpp/include/raft/stats/stddev.hpp
@@ -42,8 +42,15 @@ namespace stats {
  * @param stream cuda stream where to launch work
  */
 template <typename Type, typename IdxType = int>
-void stddev(Type *std, const Type *data, const Type *mu, IdxType D, IdxType N,
-            bool sample, bool rowMajor, cudaStream_t stream) {
+void stddev(Type* std,
+            const Type* data,
+            const Type* mu,
+            IdxType D,
+            IdxType N,
+            bool sample,
+            bool rowMajor,
+            cudaStream_t stream)
+{
   detail::stddev(std, data, mu, D, N, sample, rowMajor, stream);
 }
 
@@ -66,8 +73,15 @@ void stddev(Type *std, const Type *data, const Type *mu, IdxType D, IdxType N,
  * @param stream cuda stream where to launch work
  */
 template <typename Type, typename IdxType = int>
-void vars(Type *var, const Type *data, const Type *mu, IdxType D, IdxType N,
-          bool sample, bool rowMajor, cudaStream_t stream) {
+void vars(Type* var,
+          const Type* data,
+          const Type* mu,
+          IdxType D,
+          IdxType N,
+          bool sample,
+          bool rowMajor,
+          cudaStream_t stream)
+{
   detail::vars(var, data, mu, D, N, sample, rowMajor, stream);
 }
 
diff --git a/cpp/include/raft/stats/sum.hpp b/cpp/include/raft/stats/sum.hpp
index 4f67acdf36..cfb5142a14 100644
--- a/cpp/include/raft/stats/sum.hpp
+++ b/cpp/include/raft/stats/sum.hpp
@@ -38,8 +38,8 @@ namespace stats {
  * @param stream cuda stream where to launch work
  */
 template <typename Type, typename IdxType = int>
-void sum(Type *output, const Type *input, IdxType D, IdxType N, bool rowMajor,
-         cudaStream_t stream) {
+void sum(Type* output, const Type* input, IdxType D, IdxType N, bool rowMajor, cudaStream_t stream)
+{
   detail::sum(output, input, D, N, rowMajor, stream);
 }
 
diff --git a/cpp/include/raft/vectorized.cuh b/cpp/include/raft/vectorized.cuh
index ceffbcca78..b44d8bb4ad 100644
--- a/cpp/include/raft/vectorized.cuh
+++ b/cpp/include/raft/vectorized.cuh
@@ -22,11 +22,11 @@
 namespace raft {
 
 template <typename math_, int VecLen>
-struct IOType {};
+struct IOType {
+};
 template <>
 struct IOType<bool, 1> {
-  static_assert(sizeof(bool) == sizeof(int8_t),
-                "IOType bool size assumption failed");
+  static_assert(sizeof(bool) == sizeof(int8_t), "IOType bool size assumption failed");
   typedef int8_t Type;
 };
 template <>
@@ -215,50 +215,50 @@ struct IOType<double, 2> {
 };
 
 /**
-     * @struct TxN_t
-     *
-     * @brief Internal data structure that is used to define a facade for vectorized
-     * loads/stores across the most common POD types. The goal of his file is to
-     * provide with CUDA programmers, an easy way to have compiler issue vectorized
-     * load or store instructions to memory (either global or shared). Vectorized
-     * accesses to memory are important as they'll utilize its resources
-     * efficiently,
-     * when compared to their non-vectorized counterparts. Obviously, for whatever
-     * reasons if one is unable to issue such vectorized operations, one can always
-     * fallback to using POD types.
-     *
-     * Concept of vectorized accesses : Threads process multiple elements
-     * to speed up processing. These are loaded in a single read thanks
-     * to type promotion. It is then reinterpreted as a vector elements
-     * to perform the kernel's work.
-     *
-     * Caution : vectorized accesses requires input adresses to be memory aligned
-     * according not to the input type but to the promoted type used for reading.
-     *
-     * Example demonstrating the use of load operations, performing math on such
-     * loaded data and finally storing it back.
-     * @code{.cu}
-     * TxN_t<uint8_t,8> mydata1, mydata2;
-     * int idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * mydata1.Ratio;
-     * mydata1.load(ptr1, idx);
-     * mydata2.load(ptr2, idx);
-     * #pragma unroll
-     * for(int i=0;i<mydata1.Ratio;++i) {
-     *     mydata1.val.data[i] += mydata2.val.data[i];
-     * }
-     * mydata1.store(ptr1, idx);
-     * @endcode
-     *
-     * By doing as above, the interesting thing is that the code effectively remains
-     * almost the same, in case one wants to upgrade to TxN_t<uint16_t,16> type.
-     * Only change required is to replace variable declaration appropriately.
-     *
-     * Obviously, it's caller's responsibility to take care of pointer alignment!
-     *
-     * @tparam math_ the data-type in which the compute/math needs to happen
-     * @tparam veclen_ the number of 'math_' types to be loaded/stored per
-     * instruction
-     */
+ * @struct TxN_t
+ *
+ * @brief Internal data structure that is used to define a facade for vectorized
+ * loads/stores across the most common POD types. The goal of this file is to
+ * provide CUDA programmers with an easy way to have the compiler issue vectorized
+ * load or store instructions to memory (either global or shared). Vectorized
+ * accesses to memory are important as they'll utilize its resources
+ * efficiently,
+ * when compared to their non-vectorized counterparts. Obviously, for whatever
+ * reasons if one is unable to issue such vectorized operations, one can always
+ * fall back to using POD types.
+ *
+ * Concept of vectorized accesses : Threads process multiple elements
+ * to speed up processing. These are loaded in a single read thanks
+ * to type promotion. The data is then reinterpreted as a vector of elements
+ * to perform the kernel's work.
+ *
+ * Caution : vectorized accesses require input addresses to be memory aligned
+ * according not to the input type but to the promoted type used for reading.
+ *
+ * Example demonstrating the use of load operations, performing math on such
+ * loaded data and finally storing it back.
+ * @code{.cu}
+ * TxN_t<uint8_t,8> mydata1, mydata2;
+ * int idx = (threadIdx.x + (blockIdx.x * blockDim.x)) * mydata1.Ratio;
+ * mydata1.load(ptr1, idx);
+ * mydata2.load(ptr2, idx);
+ * #pragma unroll
+ * for(int i=0;i<mydata1.Ratio;++i) {
+ *     mydata1.val.data[i] += mydata2.val.data[i];
+ * }
+ * mydata1.store(ptr1, idx);
+ * @endcode
+ *
+ * By doing as above, the interesting thing is that the code effectively remains
+ * almost the same, in case one wants to upgrade to TxN_t<uint16_t,16> type.
+ * The only change required is to replace the variable declaration appropriately.
+ *
+ * Obviously, it's caller's responsibility to take care of pointer alignment!
+ *
+ * @tparam math_ the data-type in which the compute/math needs to happen
+ * @tparam veclen_ the number of 'math_' types to be loaded/stored per
+ * instruction
+ */
 template <typename math_, int veclen_>
 struct TxN_t {
   /** underlying math data type */
@@ -282,7 +282,8 @@ struct TxN_t {
    * @brief Fill the contents of this structure with a constant value
    * @param _val the constant to be filled
    */
-  DI void fill(math_t _val) {
+  DI void fill(math_t _val)
+  {
 #pragma unroll
     for (int i = 0; i < Ratio; ++i) {
       val.data[i] = _val;
@@ -307,21 +308,24 @@ struct TxN_t {
    * @{
    */
   template <typename idx_t = int>
-  DI void load(const math_t *ptr, idx_t idx) {
-    const io_t *bptr = reinterpret_cast<const io_t *>(&ptr[idx]);
-    val.internal = __ldg(bptr);
+  DI void load(const math_t* ptr, idx_t idx)
+  {
+    const io_t* bptr = reinterpret_cast<const io_t*>(&ptr[idx]);
+    val.internal     = __ldg(bptr);
   }
 
   template <typename idx_t = int>
-  DI void load(math_t *ptr, idx_t idx) {
-    io_t *bptr = reinterpret_cast<io_t *>(&ptr[idx]);
+  DI void load(math_t* ptr, idx_t idx)
+  {
+    io_t* bptr   = reinterpret_cast<io_t*>(&ptr[idx]);
     val.internal = *bptr;
   }
 
   template <typename idx_t = int>
-  DI void store(math_t *ptr, idx_t idx) {
-    io_t *bptr = reinterpret_cast<io_t *>(&ptr[idx]);
-    *bptr = val.internal;
+  DI void store(math_t* ptr, idx_t idx)
+  {
+    io_t* bptr = reinterpret_cast<io_t*>(&ptr[idx]);
+    *bptr      = val.internal;
   }
   /** @} */
 };
@@ -338,11 +342,17 @@ struct TxN_t<math_, 0> {
 
   DI void fill(math_t _val) {}
   template <typename idx_t = int>
-  DI void load(const math_t *ptr, idx_t idx) {}
+  DI void load(const math_t* ptr, idx_t idx)
+  {
+  }
   template <typename idx_t = int>
-  DI void load(math_t *ptr, idx_t idx) {}
+  DI void load(math_t* ptr, idx_t idx)
+  {
+  }
   template <typename idx_t = int>
-  DI void store(math_t *ptr, idx_t idx) {}
+  DI void store(math_t* ptr, idx_t idx)
+  {
+  }
 };
 
 }  // namespace raft
diff --git a/cpp/test/cluster_solvers.cu b/cpp/test/cluster_solvers.cu
index 06b246d9a1..2c7996514a 100644
--- a/cpp/test/cluster_solvers.cu
+++ b/cpp/test/cluster_solvers.cu
@@ -23,7 +23,8 @@
 
 namespace raft {
 
-TEST(Raft, ClusterSolvers) {
+TEST(Raft, ClusterSolvers)
+{
   using namespace matrix;
   using index_type = int;
   using value_type = double;
@@ -40,7 +41,7 @@ TEST(Raft, ClusterSolvers) {
   index_type d{10};
   index_type k{5};
 
-  //nullptr expected to trigger exceptions:
+  // nullptr expected to trigger exceptions:
   //
   value_type* eigvecs{nullptr};
   index_type* codes{nullptr};
@@ -52,7 +53,8 @@ TEST(Raft, ClusterSolvers) {
   EXPECT_ANY_THROW(cluster_solver.solve(h, n, d, eigvecs, codes));
 }
 
-TEST(Raft, ModularitySolvers) {
+TEST(Raft, ModularitySolvers)
+{
   using namespace matrix;
   using index_type = int;
   using value_type = double;
@@ -66,7 +68,7 @@ TEST(Raft, ModularitySolvers) {
   value_type tol{1.0e-10};
   bool reorthog{true};
 
-  //nullptr expected to trigger exceptions:
+  // nullptr expected to trigger exceptions:
   //
   index_type* clusters{nullptr};
   value_type* eigvals{nullptr};
@@ -80,13 +82,11 @@ TEST(Raft, ModularitySolvers) {
 
   index_type k{5};
 
-  cluster_solver_config_t<index_type, value_type> clust_cfg{k, maxiter, tol,
-                                                            seed};
+  cluster_solver_config_t<index_type, value_type> clust_cfg{k, maxiter, tol, seed};
   kmeans_solver_t<index_type, value_type> cluster_solver{clust_cfg};
 
   auto stream = h.get_stream();
-  sparse_matrix_t<index_type, value_type> sm{h,       nullptr, nullptr,
-                                             nullptr, 0,       0};
+  sparse_matrix_t<index_type, value_type> sm{h, nullptr, nullptr, nullptr, 0, 0};
 
   EXPECT_ANY_THROW(spectral::modularity_maximization(
     h, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs));
diff --git a/cpp/test/cudart_utils.cpp b/cpp/test/cudart_utils.cpp
index c14d880efd..150767992f 100644
--- a/cpp/test/cudart_utils.cpp
+++ b/cpp/test/cudart_utils.cpp
@@ -20,7 +20,8 @@
 
 namespace raft {
 
-TEST(Raft, Utils) {
+TEST(Raft, Utils)
+{
   ASSERT_NO_THROW(ASSERT(1 == 1, "Should not assert!"));
   ASSERT_THROW(ASSERT(1 != 1, "Should assert!"), exception);
   ASSERT_THROW(THROW("Should throw!"), exception);
diff --git a/cpp/test/distance/dist_adj.cu b/cpp/test/distance/dist_adj.cu
index efa1e2cd41..21d7e9d753 100644
--- a/cpp/test/distance/dist_adj.cu
+++ b/cpp/test/distance/dist_adj.cu
@@ -26,30 +26,42 @@ namespace raft {
 namespace distance {
 
 template <typename DataType>
-__global__ void naiveDistanceAdjKernel(bool *dist, const DataType *x,
-                                       const DataType *y, int m, int n, int k,
-                                       DataType eps, bool isRowMajor) {
+__global__ void naiveDistanceAdjKernel(bool* dist,
+                                       const DataType* x,
+                                       const DataType* y,
+                                       int m,
+                                       int n,
+                                       int k,
+                                       DataType eps,
+                                       bool isRowMajor)
+{
   int midx = threadIdx.x + blockIdx.x * blockDim.x;
   int nidx = threadIdx.y + blockIdx.y * blockDim.y;
   if (midx >= m || nidx >= n) return;
   DataType acc = DataType(0);
   for (int i = 0; i < k; ++i) {
-    int xidx = isRowMajor ? i + midx * k : i * m + midx;
-    int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
+    int xidx  = isRowMajor ? i + midx * k : i * m + midx;
+    int yidx  = isRowMajor ? i + nidx * k : i * n + nidx;
     auto diff = x[xidx] - y[yidx];
     acc += diff * diff;
   }
-  int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
+  int outidx   = isRowMajor ? midx * n + nidx : midx + m * nidx;
   dist[outidx] = acc <= eps;
 }
 
 template <typename DataType>
-void naiveDistanceAdj(bool *dist, const DataType *x, const DataType *y, int m,
-                      int n, int k, DataType eps, bool isRowMajor) {
+void naiveDistanceAdj(bool* dist,
+                      const DataType* x,
+                      const DataType* y,
+                      int m,
+                      int n,
+                      int k,
+                      DataType eps,
+                      bool isRowMajor)
+{
   static const dim3 TPB(16, 32, 1);
   dim3 nblks(raft::ceildiv(m, (int)TPB.x), raft::ceildiv(n, (int)TPB.y), 1);
-  naiveDistanceAdjKernel<DataType>
-    <<<nblks, TPB>>>(dist, x, y, m, n, k, eps, isRowMajor);
+  naiveDistanceAdjKernel<DataType><<<nblks, TPB>>>(dist, x, y, m, n, k, eps, isRowMajor);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -62,26 +74,28 @@ struct DistanceAdjInputs {
 };
 
 template <typename DataType>
-::std::ostream &operator<<(::std::ostream &os,
-                           const DistanceAdjInputs<DataType> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const DistanceAdjInputs<DataType>& dims)
+{
   return os;
 }
 
 template <typename DataType>
-class DistanceAdjTest
-  : public ::testing::TestWithParam<DistanceAdjInputs<DataType>> {
+class DistanceAdjTest : public ::testing::TestWithParam<DistanceAdjInputs<DataType>> {
  public:
   DistanceAdjTest()
     : params(::testing::TestWithParam<DistanceAdjInputs<DataType>>::GetParam()),
       stream(handle.get_stream()),
       dist(params.m * params.n, stream),
-      dist_ref(params.m * params.n, stream) {}
+      dist_ref(params.m * params.n, stream)
+  {
+  }
 
-  void SetUp() override {
+  void SetUp() override
+  {
     raft::random::Rng r(params.seed);
-    int m = params.m;
-    int n = params.n;
-    int k = params.k;
+    int m           = params.m;
+    int n           = params.n;
+    int k           = params.k;
     bool isRowMajor = params.isRowMajor;
 
     rmm::device_uvector<DataType> x(m * k, stream);
@@ -92,21 +106,27 @@ class DistanceAdjTest
 
     DataType threshold = params.eps;
 
-    naiveDistanceAdj(dist_ref.data(), x.data(), y.data(), m, n, k, threshold,
-                     isRowMajor);
-    size_t worksize =
-      raft::distance::getWorkspaceSize<raft::distance::DistanceType::L2Expanded,
-                                       DataType, DataType, bool>(
+    naiveDistanceAdj(dist_ref.data(), x.data(), y.data(), m, n, k, threshold, isRowMajor);
+    size_t worksize = raft::distance::
+      getWorkspaceSize<raft::distance::DistanceType::L2Expanded, DataType, DataType, bool>(
         x.data(), y.data(), m, n, k);
     rmm::device_uvector<char> workspace(worksize, stream);
 
     auto fin_op = [threshold] __device__(DataType d_val, int g_d_idx) {
       return d_val <= threshold;
     };
-    raft::distance::distance<raft::distance::DistanceType::L2Expanded, DataType,
-                             DataType, bool>(
-      x.data(), y.data(), dist.data(), m, n, k, workspace.data(),
-      workspace.size(), fin_op, stream, isRowMajor);
+    raft::distance::distance<raft::distance::DistanceType::L2Expanded, DataType, DataType, bool>(
+      x.data(),
+      y.data(),
+      dist.data(),
+      m,
+      n,
+      k,
+      workspace.data(),
+      workspace.size(),
+      fin_op,
+      stream,
+      isRowMajor);
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
@@ -131,14 +151,13 @@ const std::vector<DistanceAdjInputs<float>> inputsf = {
   {10.0f, 1024, 1024, 32, false, 1234ULL},
 };
 typedef DistanceAdjTest<float> DistanceAdjTestF;
-TEST_P(DistanceAdjTestF, Result) {
+TEST_P(DistanceAdjTestF, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(
-    devArrMatch(dist_ref.data(), dist.data(), m, n, raft::Compare<bool>()));
+  ASSERT_TRUE(devArrMatch(dist_ref.data(), dist.data(), m, n, raft::Compare<bool>()));
 }
-INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestF,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestF, ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceAdjInputs<double>> inputsd = {
   {0.01, 1024, 1024, 32, true, 1234ULL},
@@ -151,14 +170,13 @@ const std::vector<DistanceAdjInputs<double>> inputsd = {
   {10.0, 1024, 1024, 32, false, 1234ULL},
 };
 typedef DistanceAdjTest<double> DistanceAdjTestD;
-TEST_P(DistanceAdjTestD, Result) {
+TEST_P(DistanceAdjTestD, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(
-    devArrMatch(dist_ref.data(), dist.data(), m, n, raft::Compare<bool>()));
+  ASSERT_TRUE(devArrMatch(dist_ref.data(), dist.data(), m, n, raft::Compare<bool>()));
 }
-INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestD,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceAdjTests, DistanceAdjTestD, ::testing::ValuesIn(inputsd));
 
 }  // namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/dist_canberra.cu b/cpp/test/distance/dist_canberra.cu
index bddfdff3b6..db318605b4 100644
--- a/cpp/test/distance/dist_canberra.cu
+++ b/cpp/test/distance/dist_canberra.cu
@@ -21,8 +21,8 @@ namespace raft {
 namespace distance {
 
 template <typename DataType>
-class DistanceCanberra
-  : public DistanceTest<raft::distance::DistanceType::Canberra, DataType> {};
+class DistanceCanberra : public DistanceTest<raft::distance::DistanceType::Canberra, DataType> {
+};
 
 const std::vector<DistanceInputs<float>> inputsf = {
   {0.001f, 1024, 1024, 32, true, 1234ULL},
@@ -35,14 +35,14 @@ const std::vector<DistanceInputs<float>> inputsf = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceCanberra<float> DistanceCanberraF;
-TEST_P(DistanceCanberraF, Result) {
+TEST_P(DistanceCanberraF, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n,
-                                raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(
+    dist_ref.data(), dist.data(), m, n, raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraF,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraF, ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceInputs<double>> inputsd = {
   {0.001, 1024, 1024, 32, true, 1234ULL},
@@ -55,14 +55,14 @@ const std::vector<DistanceInputs<double>> inputsd = {
   {0.003, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceCanberra<double> DistanceCanberraD;
-TEST_P(DistanceCanberraD, Result) {
+TEST_P(DistanceCanberraD, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n,
-                                raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(
+    dist_ref.data(), dist.data(), m, n, raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraD,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCanberraD, ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/dist_chebyshev.cu b/cpp/test/distance/dist_chebyshev.cu
index 0dc6edfaad..c7dccfe712 100644
--- a/cpp/test/distance/dist_chebyshev.cu
+++ b/cpp/test/distance/dist_chebyshev.cu
@@ -21,8 +21,8 @@ namespace raft {
 namespace distance {
 
 template <typename DataType>
-class DistanceLinf
-  : public DistanceTest<raft::distance::DistanceType::Linf, DataType> {};
+class DistanceLinf : public DistanceTest<raft::distance::DistanceType::Linf, DataType> {
+};
 
 const std::vector<DistanceInputs<float>> inputsf = {
   {0.001f, 1024, 1024, 32, true, 1234ULL},
@@ -35,14 +35,14 @@ const std::vector<DistanceInputs<float>> inputsf = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceLinf<float> DistanceLinfF;
-TEST_P(DistanceLinfF, Result) {
+TEST_P(DistanceLinfF, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n,
-                                raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(
+    dist_ref.data(), dist.data(), m, n, raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfF,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfF, ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceInputs<double>> inputsd = {
   {0.001, 1024, 1024, 32, true, 1234ULL},
@@ -55,14 +55,14 @@ const std::vector<DistanceInputs<double>> inputsd = {
   {0.003, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceLinf<double> DistanceLinfD;
-TEST_P(DistanceLinfD, Result) {
+TEST_P(DistanceLinfD, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n,
-                                raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(
+    dist_ref.data(), dist.data(), m, n, raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfD,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLinfD, ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/dist_correlation.cu b/cpp/test/distance/dist_correlation.cu
index f6dc015738..0648ed96ca 100644
--- a/cpp/test/distance/dist_correlation.cu
+++ b/cpp/test/distance/dist_correlation.cu
@@ -22,8 +22,8 @@ namespace distance {
 
 template <typename DataType>
 class DistanceCorrelation
-  : public DistanceTest<raft::distance::DistanceType::CorrelationExpanded,
-                        DataType> {};
+  : public DistanceTest<raft::distance::DistanceType::CorrelationExpanded, DataType> {
+};
 
 const std::vector<DistanceInputs<float>> inputsf = {
   {0.001f, 1024, 1024, 32, true, 1234ULL},
@@ -36,14 +36,14 @@ const std::vector<DistanceInputs<float>> inputsf = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceCorrelation<float> DistanceCorrelationF;
-TEST_P(DistanceCorrelationF, Result) {
+TEST_P(DistanceCorrelationF, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n,
-                                raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(
+    dist_ref.data(), dist.data(), m, n, raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCorrelationF,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCorrelationF, ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceInputs<double>> inputsd = {
   {0.001, 1024, 1024, 32, true, 1234ULL},
@@ -56,14 +56,14 @@ const std::vector<DistanceInputs<double>> inputsd = {
   {0.003, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceCorrelation<double> DistanceCorrelationD;
-TEST_P(DistanceCorrelationD, Result) {
+TEST_P(DistanceCorrelationD, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n,
-                                raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(
+    dist_ref.data(), dist.data(), m, n, raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCorrelationD,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceCorrelationD, ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/dist_cos.cu b/cpp/test/distance/dist_cos.cu
index 2487bcbd95..b3e6a4c97f 100644
--- a/cpp/test/distance/dist_cos.cu
+++ b/cpp/test/distance/dist_cos.cu
@@ -21,9 +21,8 @@ namespace raft {
 namespace distance {
 
 template <typename DataType>
-class DistanceExpCos
-  : public DistanceTest<raft::distance::DistanceType::CosineExpanded,
-                        DataType> {};
+class DistanceExpCos : public DistanceTest<raft::distance::DistanceType::CosineExpanded, DataType> {
+};
 
 const std::vector<DistanceInputs<float>> inputsf = {
   {0.001f, 1024, 1024, 32, true, 1234ULL},
@@ -36,14 +35,14 @@ const std::vector<DistanceInputs<float>> inputsf = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceExpCos<float> DistanceExpCosF;
-TEST_P(DistanceExpCosF, Result) {
+TEST_P(DistanceExpCosF, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(devArrMatch(dist_ref.data(), dist.data(), m, n,
-                          raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(
+    devArrMatch(dist_ref.data(), dist.data(), m, n, raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosF,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosF, ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceInputs<double>> inputsd = {
   {0.001, 1024, 1024, 32, true, 1234ULL},
@@ -56,14 +55,14 @@ const std::vector<DistanceInputs<double>> inputsd = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceExpCos<double> DistanceExpCosD;
-TEST_P(DistanceExpCosD, Result) {
+TEST_P(DistanceExpCosD, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(devArrMatch(dist_ref.data(), dist.data(), m, n,
-                          raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(
+    devArrMatch(dist_ref.data(), dist.data(), m, n, raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosD,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpCosD, ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/dist_euc_exp.cu b/cpp/test/distance/dist_euc_exp.cu
index a6ef01aa45..75ff7e682a 100644
--- a/cpp/test/distance/dist_euc_exp.cu
+++ b/cpp/test/distance/dist_euc_exp.cu
@@ -21,8 +21,8 @@ namespace raft {
 namespace distance {
 
 template <typename DataType>
-class DistanceEucExpTest
-  : public DistanceTest<raft::distance::DistanceType::L2Expanded, DataType> {};
+class DistanceEucExpTest : public DistanceTest<raft::distance::DistanceType::L2Expanded, DataType> {
+};
 
 const std::vector<DistanceInputs<float>> inputsf = {
   {0.001f, 1024, 1024, 32, true, 1234ULL},
@@ -35,14 +35,14 @@ const std::vector<DistanceInputs<float>> inputsf = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceEucExpTest<float> DistanceEucExpTestF;
-TEST_P(DistanceEucExpTestF, Result) {
+TEST_P(DistanceEucExpTestF, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(devArrMatch(dist_ref.data(), dist.data(), m, n,
-                          raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(
+    devArrMatch(dist_ref.data(), dist.data(), m, n, raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestF,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestF, ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceInputs<double>> inputsd = {
   {0.001, 1024, 1024, 32, true, 1234ULL},
@@ -55,14 +55,14 @@ const std::vector<DistanceInputs<double>> inputsd = {
   {0.003, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceEucExpTest<double> DistanceEucExpTestD;
-TEST_P(DistanceEucExpTestD, Result) {
+TEST_P(DistanceEucExpTestD, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(devArrMatch(dist_ref.data(), dist.data(), m, n,
-                          raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(
+    devArrMatch(dist_ref.data(), dist.data(), m, n, raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestD,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucExpTestD, ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/dist_euc_unexp.cu b/cpp/test/distance/dist_euc_unexp.cu
index 290abda352..88affa16d5 100644
--- a/cpp/test/distance/dist_euc_unexp.cu
+++ b/cpp/test/distance/dist_euc_unexp.cu
@@ -36,14 +36,14 @@ const std::vector<DistanceInputs<float>> inputsf = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceEucUnexpTest<float> DistanceEucUnexpTestF;
-TEST_P(DistanceEucUnexpTestF, Result) {
+TEST_P(DistanceEucUnexpTestF, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(devArrMatch(dist_ref.data(), dist.data(), m, n,
-                          raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(
+    devArrMatch(dist_ref.data(), dist.data(), m, n, raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestF,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestF, ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceInputs<double>> inputsd = {
   {0.001, 1024, 1024, 32, true, 1234ULL},
@@ -56,14 +56,14 @@ const std::vector<DistanceInputs<double>> inputsd = {
   {0.003, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceEucUnexpTest<double> DistanceEucUnexpTestD;
-TEST_P(DistanceEucUnexpTestD, Result) {
+TEST_P(DistanceEucUnexpTestD, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(devArrMatch(dist_ref.data(), dist.data(), m, n,
-                          raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(
+    devArrMatch(dist_ref.data(), dist.data(), m, n, raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestD,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceEucUnexpTestD, ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/dist_hamming.cu b/cpp/test/distance/dist_hamming.cu
index 0123c8bada..631adc751c 100644
--- a/cpp/test/distance/dist_hamming.cu
+++ b/cpp/test/distance/dist_hamming.cu
@@ -22,8 +22,8 @@ namespace distance {
 
 template <typename DataType>
 class DistanceHamming
-  : public DistanceTest<raft::distance::DistanceType::HammingUnexpanded,
-                        DataType> {};
+  : public DistanceTest<raft::distance::DistanceType::HammingUnexpanded, DataType> {
+};
 
 const std::vector<DistanceInputs<float>> inputsf = {
   {0.001f, 1024, 1024, 32, true, 1234ULL},
@@ -36,14 +36,14 @@ const std::vector<DistanceInputs<float>> inputsf = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceHamming<float> DistanceHammingF;
-TEST_P(DistanceHammingF, Result) {
+TEST_P(DistanceHammingF, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n,
-                                raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(
+    dist_ref.data(), dist.data(), m, n, raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHammingF,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHammingF, ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceInputs<double>> inputsd = {
   {0.001, 1024, 1024, 32, true, 1234ULL},
@@ -56,14 +56,14 @@ const std::vector<DistanceInputs<double>> inputsd = {
   {0.003, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceHamming<double> DistanceHammingD;
-TEST_P(DistanceHammingD, Result) {
+TEST_P(DistanceHammingD, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n,
-                                raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(
+    dist_ref.data(), dist.data(), m, n, raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHammingD,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHammingD, ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/dist_hellinger.cu b/cpp/test/distance/dist_hellinger.cu
index 39d197f786..8a07c8836f 100644
--- a/cpp/test/distance/dist_hellinger.cu
+++ b/cpp/test/distance/dist_hellinger.cu
@@ -22,8 +22,8 @@ namespace distance {
 
 template <typename DataType>
 class DistanceHellingerExp
-  : public DistanceTest<raft::distance::DistanceType::HellingerExpanded,
-                        DataType> {};
+  : public DistanceTest<raft::distance::DistanceType::HellingerExpanded, DataType> {
+};
 
 const std::vector<DistanceInputs<float>> inputsf = {
   {0.001f, 1024, 1024, 32, true, 1234ULL},
@@ -36,14 +36,14 @@ const std::vector<DistanceInputs<float>> inputsf = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceHellingerExp<float> DistanceHellingerExpF;
-TEST_P(DistanceHellingerExpF, Result) {
+TEST_P(DistanceHellingerExpF, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n,
-                                raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(
+    dist_ref.data(), dist.data(), m, n, raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpF,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpF, ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceInputs<double>> inputsd = {
   {0.001, 1024, 1024, 32, true, 1234ULL},
@@ -56,14 +56,14 @@ const std::vector<DistanceInputs<double>> inputsd = {
   {0.003, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceHellingerExp<double> DistanceHellingerExpD;
-TEST_P(DistanceHellingerExpD, Result) {
+TEST_P(DistanceHellingerExpD, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n,
-                                raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(
+    dist_ref.data(), dist.data(), m, n, raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpD,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceHellingerExpD, ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/dist_jensen_shannon.cu b/cpp/test/distance/dist_jensen_shannon.cu
index 9070ce92c1..3cda31a852 100644
--- a/cpp/test/distance/dist_jensen_shannon.cu
+++ b/cpp/test/distance/dist_jensen_shannon.cu
@@ -36,14 +36,14 @@ const std::vector<DistanceInputs<float>> inputsf = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceJensenShannon<float> DistanceJensenShannonF;
-TEST_P(DistanceJensenShannonF, Result) {
+TEST_P(DistanceJensenShannonF, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n,
-                                raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(
+    dist_ref.data(), dist.data(), m, n, raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceJensenShannonF,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceJensenShannonF, ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceInputs<double>> inputsd = {
   {0.001, 1024, 1024, 32, true, 1234ULL},
@@ -56,14 +56,14 @@ const std::vector<DistanceInputs<double>> inputsd = {
   {0.003, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceJensenShannon<double> DistanceJensenShannonD;
-TEST_P(DistanceJensenShannonD, Result) {
+TEST_P(DistanceJensenShannonD, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n,
-                                raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(
+    dist_ref.data(), dist.data(), m, n, raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceJensenShannonD,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceJensenShannonD, ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/dist_kl_divergence.cu b/cpp/test/distance/dist_kl_divergence.cu
index 7c32596527..4303b8cc8f 100644
--- a/cpp/test/distance/dist_kl_divergence.cu
+++ b/cpp/test/distance/dist_kl_divergence.cu
@@ -36,14 +36,14 @@ const std::vector<DistanceInputs<float>> inputsf = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceKLDivergence<float> DistanceKLDivergenceF;
-TEST_P(DistanceKLDivergenceF, Result) {
+TEST_P(DistanceKLDivergenceF, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n,
-                                raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(
+    dist_ref.data(), dist.data(), m, n, raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceKLDivergenceF,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceKLDivergenceF, ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceInputs<double>> inputsd = {
   {0.001, 1024, 1024, 32, true, 1234ULL},
@@ -56,14 +56,14 @@ const std::vector<DistanceInputs<double>> inputsd = {
   {0.003, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceKLDivergence<double> DistanceKLDivergenceD;
-TEST_P(DistanceKLDivergenceD, Result) {
+TEST_P(DistanceKLDivergenceD, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n,
-                                raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(
+    dist_ref.data(), dist.data(), m, n, raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceKLDivergenceD,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceKLDivergenceD, ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/dist_l1.cu b/cpp/test/distance/dist_l1.cu
index ff7705d195..dad160ca41 100644
--- a/cpp/test/distance/dist_l1.cu
+++ b/cpp/test/distance/dist_l1.cu
@@ -21,8 +21,8 @@ namespace raft {
 namespace distance {
 
 template <typename DataType>
-class DistanceUnexpL1
-  : public DistanceTest<raft::distance::DistanceType::L1, DataType> {};
+class DistanceUnexpL1 : public DistanceTest<raft::distance::DistanceType::L1, DataType> {
+};
 
 const std::vector<DistanceInputs<float>> inputsf = {
   {0.001f, 1024, 1024, 32, true, 1234ULL},
@@ -35,14 +35,14 @@ const std::vector<DistanceInputs<float>> inputsf = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceUnexpL1<float> DistanceUnexpL1F;
-TEST_P(DistanceUnexpL1F, Result) {
+TEST_P(DistanceUnexpL1F, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n,
-                                raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(
+    dist_ref.data(), dist.data(), m, n, raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1F,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1F, ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceInputs<double>> inputsd = {
   {0.001, 1024, 1024, 32, true, 1234ULL},
@@ -55,14 +55,14 @@ const std::vector<DistanceInputs<double>> inputsd = {
   {0.003, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceUnexpL1<double> DistanceUnexpL1D;
-TEST_P(DistanceUnexpL1D, Result) {
+TEST_P(DistanceUnexpL1D, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n,
-                                raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(
+    dist_ref.data(), dist.data(), m, n, raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1D,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceUnexpL1D, ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/dist_minkowski.cu b/cpp/test/distance/dist_minkowski.cu
index 7d87bbc2c7..34f6d2825e 100644
--- a/cpp/test/distance/dist_minkowski.cu
+++ b/cpp/test/distance/dist_minkowski.cu
@@ -21,8 +21,7 @@ namespace raft {
 namespace distance {
 
 template <typename DataType>
-class DistanceLpUnexp
-  : public DistanceTest<raft::distance::DistanceType::LpUnexpanded, DataType> {
+class DistanceLpUnexp : public DistanceTest<raft::distance::DistanceType::LpUnexpanded, DataType> {
 };
 
 const std::vector<DistanceInputs<float>> inputsf = {
@@ -36,14 +35,14 @@ const std::vector<DistanceInputs<float>> inputsf = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL, 3.0f},
 };
 typedef DistanceLpUnexp<float> DistanceLpUnexpF;
-TEST_P(DistanceLpUnexpF, Result) {
+TEST_P(DistanceLpUnexpF, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n,
-                                raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(
+    dist_ref.data(), dist.data(), m, n, raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpF,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpF, ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceInputs<double>> inputsd = {
   {0.001, 1024, 1024, 32, true, 1234ULL, 4.0},
@@ -56,14 +55,14 @@ const std::vector<DistanceInputs<double>> inputsd = {
   {0.003, 1024, 1024, 1024, false, 1234ULL, 3.0},
 };
 typedef DistanceLpUnexp<double> DistanceLpUnexpD;
-TEST_P(DistanceLpUnexpD, Result) {
+TEST_P(DistanceLpUnexpD, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n,
-                                raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(
+    dist_ref.data(), dist.data(), m, n, raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpD,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceLpUnexpD, ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/dist_russell_rao.cu b/cpp/test/distance/dist_russell_rao.cu
index ae735951a8..e0bfcd7eb3 100644
--- a/cpp/test/distance/dist_russell_rao.cu
+++ b/cpp/test/distance/dist_russell_rao.cu
@@ -22,8 +22,8 @@ namespace distance {
 
 template <typename DataType>
 class DistanceRussellRao
-  : public DistanceTest<raft::distance::DistanceType::RusselRaoExpanded,
-                        DataType> {};
+  : public DistanceTest<raft::distance::DistanceType::RusselRaoExpanded, DataType> {
+};
 
 const std::vector<DistanceInputs<float>> inputsf = {
   {0.001f, 1024, 1024, 32, true, 1234ULL},
@@ -36,14 +36,14 @@ const std::vector<DistanceInputs<float>> inputsf = {
   {0.003f, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceRussellRao<float> DistanceRussellRaoF;
-TEST_P(DistanceRussellRaoF, Result) {
+TEST_P(DistanceRussellRaoF, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n,
-                                raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(
+    dist_ref.data(), dist.data(), m, n, raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceRussellRaoF,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceRussellRaoF, ::testing::ValuesIn(inputsf));
 
 const std::vector<DistanceInputs<double>> inputsd = {
   {0.001, 1024, 1024, 32, true, 1234ULL},
@@ -56,14 +56,14 @@ const std::vector<DistanceInputs<double>> inputsd = {
   {0.003, 1024, 1024, 1024, false, 1234ULL},
 };
 typedef DistanceRussellRao<double> DistanceRussellRaoD;
-TEST_P(DistanceRussellRaoD, Result) {
+TEST_P(DistanceRussellRaoD, Result)
+{
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
-  ASSERT_TRUE(raft::devArrMatch(dist_ref.data(), dist.data(), m, n,
-                                raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(
+    dist_ref.data(), dist.data(), m, n, raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceRussellRaoD,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceRussellRaoD, ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/distance/distance_base.cuh b/cpp/test/distance/distance_base.cuh
index f31fbc9165..f445e3b578 100644
--- a/cpp/test/distance/distance_base.cuh
+++ b/cpp/test/distance/distance_base.cuh
@@ -25,43 +25,52 @@ namespace raft {
 namespace distance {
 
 template <typename DataType>
-__global__ void naiveDistanceKernel(DataType *dist, const DataType *x,
-                                    const DataType *y, int m, int n, int k,
+__global__ void naiveDistanceKernel(DataType* dist,
+                                    const DataType* x,
+                                    const DataType* y,
+                                    int m,
+                                    int n,
+                                    int k,
                                     raft::distance::DistanceType type,
-                                    bool isRowMajor) {
+                                    bool isRowMajor)
+{
   int midx = threadIdx.x + blockIdx.x * blockDim.x;
   int nidx = threadIdx.y + blockIdx.y * blockDim.y;
   if (midx >= m || nidx >= n) return;
   DataType acc = DataType(0);
   for (int i = 0; i < k; ++i) {
-    int xidx = isRowMajor ? i + midx * k : i * m + midx;
-    int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
+    int xidx  = isRowMajor ? i + midx * k : i * m + midx;
+    int yidx  = isRowMajor ? i + nidx * k : i * n + nidx;
     auto diff = x[xidx] - y[yidx];
     acc += diff * diff;
   }
   if (type == raft::distance::DistanceType::L2SqrtExpanded ||
       type == raft::distance::DistanceType::L2SqrtUnexpanded)
     acc = raft::mySqrt(acc);
-  int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
+  int outidx   = isRowMajor ? midx * n + nidx : midx + m * nidx;
   dist[outidx] = acc;
 }
 
 template <typename DataType>
-__global__ void naiveL1_Linf_CanberraDistanceKernel(
-  DataType *dist, const DataType *x, const DataType *y, int m, int n, int k,
-  raft::distance::DistanceType type, bool isRowMajor) {
+__global__ void naiveL1_Linf_CanberraDistanceKernel(DataType* dist,
+                                                    const DataType* x,
+                                                    const DataType* y,
+                                                    int m,
+                                                    int n,
+                                                    int k,
+                                                    raft::distance::DistanceType type,
+                                                    bool isRowMajor)
+{
   int midx = threadIdx.x + blockIdx.x * blockDim.x;
   int nidx = threadIdx.y + blockIdx.y * blockDim.y;
-  if (midx >= m || nidx >= n) {
-    return;
-  }
+  if (midx >= m || nidx >= n) { return; }
 
   DataType acc = DataType(0);
   for (int i = 0; i < k; ++i) {
-    int xidx = isRowMajor ? i + midx * k : i * m + midx;
-    int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
-    auto a = x[xidx];
-    auto b = y[yidx];
+    int xidx  = isRowMajor ? i + midx * k : i * m + midx;
+    int yidx  = isRowMajor ? i + nidx * k : i * n + nidx;
+    auto a    = x[xidx];
+    auto b    = y[yidx];
     auto diff = (a > b) ? (a - b) : (b - a);
     if (type == raft::distance::DistanceType::Linf) {
       acc = raft::myMax(acc, diff);
@@ -75,29 +84,27 @@ __global__ void naiveL1_Linf_CanberraDistanceKernel(
     }
   }
 
-  int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
+  int outidx   = isRowMajor ? midx * n + nidx : midx + m * nidx;
   dist[outidx] = acc;
 }
 
 template <typename DataType>
-__global__ void naiveCosineDistanceKernel(DataType *dist, const DataType *x,
-                                          const DataType *y, int m, int n,
-                                          int k, bool isRowMajor) {
+__global__ void naiveCosineDistanceKernel(
+  DataType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor)
+{
   int midx = threadIdx.x + blockIdx.x * blockDim.x;
   int nidx = threadIdx.y + blockIdx.y * blockDim.y;
-  if (midx >= m || nidx >= n) {
-    return;
-  }
+  if (midx >= m || nidx >= n) { return; }
 
-  DataType acc_a = DataType(0);
-  DataType acc_b = DataType(0);
+  DataType acc_a  = DataType(0);
+  DataType acc_b  = DataType(0);
   DataType acc_ab = DataType(0);
 
   for (int i = 0; i < k; ++i) {
     int xidx = isRowMajor ? i + midx * k : i * m + midx;
     int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
-    auto a = x[xidx];
-    auto b = y[yidx];
+    auto a   = x[xidx];
+    auto b   = y[yidx];
     acc_a += a * a;
     acc_b += b * b;
     acc_ab += a * b;
@@ -106,64 +113,67 @@ __global__ void naiveCosineDistanceKernel(DataType *dist, const DataType *x,
   int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
 
   // Use 1.0 - (cosine similarity) to calc the distance
-  dist[outidx] =
-    (DataType)1.0 - acc_ab / (raft::mySqrt(acc_a) * raft::mySqrt(acc_b));
+  dist[outidx] = (DataType)1.0 - acc_ab / (raft::mySqrt(acc_a) * raft::mySqrt(acc_b));
 }
 
 template <typename DataType>
-__global__ void naiveHellingerDistanceKernel(DataType *dist, const DataType *x,
-                                             const DataType *y, int m, int n,
-                                             int k, bool isRowMajor) {
+__global__ void naiveHellingerDistanceKernel(
+  DataType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor)
+{
   int midx = threadIdx.x + blockIdx.x * blockDim.x;
   int nidx = threadIdx.y + blockIdx.y * blockDim.y;
-  if (midx >= m || nidx >= n) {
-    return;
-  }
+  if (midx >= m || nidx >= n) { return; }
 
   DataType acc_ab = DataType(0);
 
   for (int i = 0; i < k; ++i) {
     int xidx = isRowMajor ? i + midx * k : i * m + midx;
     int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
-    auto a = x[xidx];
-    auto b = y[yidx];
+    auto a   = x[xidx];
+    auto b   = y[yidx];
     acc_ab += raft::mySqrt(a) * raft::mySqrt(b);
   }
 
   int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
 
   // Adjust to replace NaN in sqrt with 0 if input to sqrt is negative
-  acc_ab = 1 - acc_ab;
+  acc_ab         = 1 - acc_ab;
   auto rectifier = (!signbit(acc_ab));
-  dist[outidx] = raft::mySqrt(rectifier * acc_ab);
+  dist[outidx]   = raft::mySqrt(rectifier * acc_ab);
 }
 
 template <typename DataType>
-__global__ void naiveLpUnexpDistanceKernel(DataType *dist, const DataType *x,
-                                           const DataType *y, int m, int n,
-                                           int k, bool isRowMajor, DataType p) {
+__global__ void naiveLpUnexpDistanceKernel(DataType* dist,
+                                           const DataType* x,
+                                           const DataType* y,
+                                           int m,
+                                           int n,
+                                           int k,
+                                           bool isRowMajor,
+                                           DataType p)
+{
   int midx = threadIdx.x + blockIdx.x * blockDim.x;
   int nidx = threadIdx.y + blockIdx.y * blockDim.y;
   if (midx >= m || nidx >= n) return;
   DataType acc = DataType(0);
   for (int i = 0; i < k; ++i) {
-    int xidx = isRowMajor ? i + midx * k : i * m + midx;
-    int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
-    auto a = x[xidx];
-    auto b = y[yidx];
+    int xidx  = isRowMajor ? i + midx * k : i * m + midx;
+    int yidx  = isRowMajor ? i + nidx * k : i * n + nidx;
+    auto a    = x[xidx];
+    auto b    = y[yidx];
     auto diff = raft::L1Op<DataType>()(a - b);
     acc += raft::myPow(diff, p);
   }
   auto one_over_p = 1 / p;
-  acc = raft::myPow(acc, one_over_p);
-  int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
-  dist[outidx] = acc;
+  acc             = raft::myPow(acc, one_over_p);
+  int outidx      = isRowMajor ? midx * n + nidx : midx + m * nidx;
+  dist[outidx]    = acc;
 }
 
 template <typename DataType>
-__global__ void naiveHammingDistanceKernel(DataType *dist, const DataType *x,
-                                           const DataType *y, int m, int n,
-                                           int k, bool isRowMajor) {
+__global__ void naiveHammingDistanceKernel(
+  DataType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor)
+{
   int midx = threadIdx.x + blockIdx.x * blockDim.x;
   int nidx = threadIdx.y + blockIdx.y * blockDim.y;
   if (midx >= m || nidx >= n) return;
@@ -171,21 +181,19 @@ __global__ void naiveHammingDistanceKernel(DataType *dist, const DataType *x,
   for (int i = 0; i < k; ++i) {
     int xidx = isRowMajor ? i + midx * k : i * m + midx;
     int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
-    auto a = x[xidx];
-    auto b = y[yidx];
+    auto a   = x[xidx];
+    auto b   = y[yidx];
     acc += (a != b);
   }
-  acc = acc / k;
-  int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
+  acc          = acc / k;
+  int outidx   = isRowMajor ? midx * n + nidx : midx + m * nidx;
   dist[outidx] = acc;
 }
 
 template <typename DataType>
-__global__ void naiveJensenShannonDistanceKernel(DataType *dist,
-                                                 const DataType *x,
-                                                 const DataType *y, int m,
-                                                 int n, int k,
-                                                 bool isRowMajor) {
+__global__ void naiveJensenShannonDistanceKernel(
+  DataType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor)
+{
   int midx = threadIdx.x + blockIdx.x * blockDim.x;
   int nidx = threadIdx.y + blockIdx.y * blockDim.y;
   if (midx >= m || nidx >= n) return;
@@ -193,10 +201,10 @@ __global__ void naiveJensenShannonDistanceKernel(DataType *dist,
   for (int i = 0; i < k; ++i) {
     int xidx = isRowMajor ? i + midx * k : i * m + midx;
     int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
-    auto a = x[xidx];
-    auto b = y[yidx];
+    auto a   = x[xidx];
+    auto b   = y[yidx];
 
-    DataType m = 0.5f * (a + b);
+    DataType m  = 0.5f * (a + b);
     bool a_zero = a == 0;
     bool b_zero = b == 0;
 
@@ -206,18 +214,17 @@ __global__ void naiveJensenShannonDistanceKernel(DataType *dist,
     bool p_zero = p == 0;
     bool q_zero = q == 0;
 
-    acc +=
-      (-a * (!p_zero * log(p + p_zero))) + (-b * (!q_zero * log(q + q_zero)));
+    acc += (-a * (!p_zero * log(p + p_zero))) + (-b * (!q_zero * log(q + q_zero)));
   }
-  acc = raft::mySqrt(0.5f * acc);
-  int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
+  acc          = raft::mySqrt(0.5f * acc);
+  int outidx   = isRowMajor ? midx * n + nidx : midx + m * nidx;
   dist[outidx] = acc;
 }
 
 template <typename DataType, typename OutType>
-__global__ void naiveRussellRaoDistanceKernel(OutType *dist, const DataType *x,
-                                              const DataType *y, int m, int n,
-                                              int k, bool isRowMajor) {
+__global__ void naiveRussellRaoDistanceKernel(
+  OutType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor)
+{
   int midx = threadIdx.x + blockIdx.x * blockDim.x;
   int nidx = threadIdx.y + blockIdx.y * blockDim.y;
   if (midx >= m || nidx >= n) return;
@@ -225,56 +232,55 @@ __global__ void naiveRussellRaoDistanceKernel(OutType *dist, const DataType *x,
   for (int i = 0; i < k; ++i) {
     int xidx = isRowMajor ? i + midx * k : i * m + midx;
     int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
-    auto a = x[xidx];
-    auto b = y[yidx];
+    auto a   = x[xidx];
+    auto b   = y[yidx];
     acc += (a * b);
   }
-  acc = (k - acc) / k;
-  int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
+  acc          = (k - acc) / k;
+  int outidx   = isRowMajor ? midx * n + nidx : midx + m * nidx;
   dist[outidx] = acc;
 }
 
 template <typename DataType, typename OutType>
-__global__ void naiveKLDivergenceDistanceKernel(OutType *dist,
-                                                const DataType *x,
-                                                const DataType *y, int m, int n,
-                                                int k, bool isRowMajor) {
+__global__ void naiveKLDivergenceDistanceKernel(
+  OutType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor)
+{
   int midx = threadIdx.x + blockIdx.x * blockDim.x;
   int nidx = threadIdx.y + blockIdx.y * blockDim.y;
   if (midx >= m || nidx >= n) return;
   OutType acc = OutType(0);
   for (int i = 0; i < k; ++i) {
-    int xidx = isRowMajor ? i + midx * k : i * m + midx;
-    int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
-    auto a = x[xidx];
-    auto b = y[yidx];
-    bool b_zero = (b == 0);
-    const auto m = (!b_zero) * (a / b);
+    int xidx          = isRowMajor ? i + midx * k : i * m + midx;
+    int yidx          = isRowMajor ? i + nidx * k : i * n + nidx;
+    auto a            = x[xidx];
+    auto b            = y[yidx];
+    bool b_zero       = (b == 0);
+    const auto m      = (!b_zero) * (a / b);
     const bool m_zero = (m == 0);
     acc += (a * (!m_zero) * log(m + m_zero));
   }
-  acc = 0.5f * acc;
-  int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
+  acc          = 0.5f * acc;
+  int outidx   = isRowMajor ? midx * n + nidx : midx + m * nidx;
   dist[outidx] = acc;
 }
 
 template <typename DataType, typename OutType>
-__global__ void naiveCorrelationDistanceKernel(OutType *dist, const DataType *x,
-                                               const DataType *y, int m, int n,
-                                               int k, bool isRowMajor) {
+__global__ void naiveCorrelationDistanceKernel(
+  OutType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor)
+{
   int midx = threadIdx.x + blockIdx.x * blockDim.x;
   int nidx = threadIdx.y + blockIdx.y * blockDim.y;
   if (midx >= m || nidx >= n) return;
-  OutType acc = OutType(0);
-  auto a_norm = DataType(0);
-  auto b_norm = DataType(0);
+  OutType acc    = OutType(0);
+  auto a_norm    = DataType(0);
+  auto b_norm    = DataType(0);
   auto a_sq_norm = DataType(0);
   auto b_sq_norm = DataType(0);
   for (int i = 0; i < k; ++i) {
     int xidx = isRowMajor ? i + midx * k : i * m + midx;
     int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
-    auto a = x[xidx];
-    auto b = y[yidx];
+    auto a   = x[xidx];
+    auto b   = y[yidx];
     a_norm += a;
     b_norm += b;
     a_sq_norm += (a * a);
@@ -282,20 +288,27 @@ __global__ void naiveCorrelationDistanceKernel(OutType *dist, const DataType *x,
     acc += (a * b);
   }
 
-  auto numer = k * acc - (a_norm * b_norm);
+  auto numer   = k * acc - (a_norm * b_norm);
   auto Q_denom = k * a_sq_norm - (a_norm * a_norm);
   auto R_denom = k * b_sq_norm - (b_norm * b_norm);
 
   acc = 1 - (numer / raft::mySqrt(Q_denom * R_denom));
 
-  int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
+  int outidx   = isRowMajor ? midx * n + nidx : midx + m * nidx;
   dist[outidx] = acc;
 }
 
 template <typename DataType>
-void naiveDistance(DataType *dist, const DataType *x, const DataType *y, int m,
-                   int n, int k, raft::distance::DistanceType type,
-                   bool isRowMajor, DataType metric_arg = 2.0f) {
+void naiveDistance(DataType* dist,
+                   const DataType* x,
+                   const DataType* y,
+                   int m,
+                   int n,
+                   int k,
+                   raft::distance::DistanceType type,
+                   bool isRowMajor,
+                   DataType metric_arg = 2.0f)
+{
   static const dim3 TPB(16, 32, 1);
   dim3 nblks(raft::ceildiv(m, (int)TPB.x), raft::ceildiv(n, (int)TPB.y), 1);
 
@@ -310,43 +323,34 @@ void naiveDistance(DataType *dist, const DataType *x, const DataType *y, int m,
     case raft::distance::DistanceType::L2Unexpanded:
     case raft::distance::DistanceType::L2SqrtExpanded:
     case raft::distance::DistanceType::L2Expanded:
-      naiveDistanceKernel<DataType>
-        <<<nblks, TPB>>>(dist, x, y, m, n, k, type, isRowMajor);
+      naiveDistanceKernel<DataType><<<nblks, TPB>>>(dist, x, y, m, n, k, type, isRowMajor);
       break;
     case raft::distance::DistanceType::CosineExpanded:
-      naiveCosineDistanceKernel<DataType>
-        <<<nblks, TPB>>>(dist, x, y, m, n, k, isRowMajor);
+      naiveCosineDistanceKernel<DataType><<<nblks, TPB>>>(dist, x, y, m, n, k, isRowMajor);
       break;
     case raft::distance::DistanceType::HellingerExpanded:
-      naiveHellingerDistanceKernel<DataType>
-        <<<nblks, TPB>>>(dist, x, y, m, n, k, isRowMajor);
+      naiveHellingerDistanceKernel<DataType><<<nblks, TPB>>>(dist, x, y, m, n, k, isRowMajor);
       break;
     case raft::distance::DistanceType::LpUnexpanded:
       naiveLpUnexpDistanceKernel<DataType>
         <<<nblks, TPB>>>(dist, x, y, m, n, k, isRowMajor, metric_arg);
       break;
     case raft::distance::DistanceType::HammingUnexpanded:
-      naiveHammingDistanceKernel<DataType>
-        <<<nblks, TPB>>>(dist, x, y, m, n, k, isRowMajor);
+      naiveHammingDistanceKernel<DataType><<<nblks, TPB>>>(dist, x, y, m, n, k, isRowMajor);
       break;
     case raft::distance::DistanceType::JensenShannon:
-      naiveJensenShannonDistanceKernel<DataType>
-        <<<nblks, TPB>>>(dist, x, y, m, n, k, isRowMajor);
+      naiveJensenShannonDistanceKernel<DataType><<<nblks, TPB>>>(dist, x, y, m, n, k, isRowMajor);
       break;
     case raft::distance::DistanceType::RusselRaoExpanded:
-      naiveRussellRaoDistanceKernel<DataType>
-        <<<nblks, TPB>>>(dist, x, y, m, n, k, isRowMajor);
+      naiveRussellRaoDistanceKernel<DataType><<<nblks, TPB>>>(dist, x, y, m, n, k, isRowMajor);
       break;
     case raft::distance::DistanceType::KLDivergence:
-      naiveKLDivergenceDistanceKernel<DataType>
-        <<<nblks, TPB>>>(dist, x, y, m, n, k, isRowMajor);
+      naiveKLDivergenceDistanceKernel<DataType><<<nblks, TPB>>>(dist, x, y, m, n, k, isRowMajor);
       break;
     case raft::distance::DistanceType::CorrelationExpanded:
-      naiveCorrelationDistanceKernel<DataType>
-        <<<nblks, TPB>>>(dist, x, y, m, n, k, isRowMajor);
+      naiveCorrelationDistanceKernel<DataType><<<nblks, TPB>>>(dist, x, y, m, n, k, isRowMajor);
       break;
-    default:
-      FAIL() << "should be here\n";
+    default: FAIL() << "should be here\n";
   }
   CUDA_CHECK(cudaPeekAtLastError());
 }
@@ -361,24 +365,33 @@ struct DistanceInputs {
 };
 
 template <typename DataType>
-::std::ostream &operator<<(::std::ostream &os,
-                           const DistanceInputs<DataType> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const DistanceInputs<DataType>& dims)
+{
   return os;
 }
 
 template <raft::distance::DistanceType distanceType, typename DataType>
-void distanceLauncher(DataType *x, DataType *y, DataType *dist, DataType *dist2,
-                      int m, int n, int k, DistanceInputs<DataType> &params,
-                      DataType threshold, char *workspace, size_t worksize,
-                      cudaStream_t stream, bool isRowMajor,
-                      DataType metric_arg = 2.0f) {
+void distanceLauncher(DataType* x,
+                      DataType* y,
+                      DataType* dist,
+                      DataType* dist2,
+                      int m,
+                      int n,
+                      int k,
+                      DistanceInputs<DataType>& params,
+                      DataType threshold,
+                      char* workspace,
+                      size_t worksize,
+                      cudaStream_t stream,
+                      bool isRowMajor,
+                      DataType metric_arg = 2.0f)
+{
   auto fin_op = [dist2, threshold] __device__(DataType d_val, int g_d_idx) {
     dist2[g_d_idx] = (d_val < threshold) ? 0.f : d_val;
     return d_val;
   };
   raft::distance::distance<distanceType, DataType, DataType, DataType>(
-    x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor,
-    metric_arg);
+    x, y, dist, m, n, k, workspace, worksize, fin_op, stream, isRowMajor, metric_arg);
 }
 
 template <raft::distance::DistanceType distanceType, typename DataType>
@@ -391,23 +404,25 @@ class DistanceTest : public ::testing::TestWithParam<DistanceInputs<DataType>> {
       y(params.n * params.k, stream),
       dist_ref(params.m * params.n, stream),
       dist(params.m * params.n, stream),
-      dist2(params.m * params.n, stream) {}
+      dist2(params.m * params.n, stream)
+  {
+  }
 
-  void SetUp() override {
+  void SetUp() override
+  {
     raft::random::Rng r(params.seed);
-    int m = params.m;
-    int n = params.n;
-    int k = params.k;
+    int m               = params.m;
+    int n               = params.n;
+    int k               = params.k;
     DataType metric_arg = params.metric_arg;
-    bool isRowMajor = params.isRowMajor;
+    bool isRowMajor     = params.isRowMajor;
     if (distanceType == raft::distance::DistanceType::HellingerExpanded ||
         distanceType == raft::distance::DistanceType::JensenShannon ||
         distanceType == raft::distance::DistanceType::KLDivergence) {
       // Hellinger works only on positive numbers
       r.uniform(x.data(), m * k, DataType(0.0), DataType(1.0), stream);
       r.uniform(y.data(), n * k, DataType(0.0), DataType(1.0), stream);
-    } else if (distanceType ==
-               raft::distance::DistanceType::RusselRaoExpanded) {
+    } else if (distanceType == raft::distance::DistanceType::RusselRaoExpanded) {
       r.uniform(x.data(), m * k, DataType(0.0), DataType(1.0), stream);
       r.uniform(y.data(), n * k, DataType(0.0), DataType(1.0), stream);
       // Russel rao works on boolean values.
@@ -417,17 +432,27 @@ class DistanceTest : public ::testing::TestWithParam<DistanceInputs<DataType>> {
       r.uniform(x.data(), m * k, DataType(-1.0), DataType(1.0), stream);
       r.uniform(y.data(), n * k, DataType(-1.0), DataType(1.0), stream);
     }
-    naiveDistance(dist_ref.data(), x.data(), y.data(), m, n, k, distanceType,
-                  isRowMajor, metric_arg);
-    size_t worksize =
-      raft::distance::getWorkspaceSize<distanceType, DataType, DataType,
-                                       DataType>(x.data(), y.data(), m, n, k);
+    naiveDistance(
+      dist_ref.data(), x.data(), y.data(), m, n, k, distanceType, isRowMajor, metric_arg);
+    size_t worksize = raft::distance::getWorkspaceSize<distanceType, DataType, DataType, DataType>(
+      x.data(), y.data(), m, n, k);
     rmm::device_uvector<char> workspace(worksize, stream);
 
     DataType threshold = -10000.f;
-    distanceLauncher<distanceType, DataType>(
-      x.data(), y.data(), dist.data(), dist2.data(), m, n, k, params, threshold,
-      workspace.data(), workspace.size(), stream, isRowMajor, metric_arg);
+    distanceLauncher<distanceType, DataType>(x.data(),
+                                             y.data(),
+                                             dist.data(),
+                                             dist2.data(),
+                                             m,
+                                             n,
+                                             k,
+                                             params,
+                                             threshold,
+                                             workspace.data(),
+                                             workspace.size(),
+                                             stream,
+                                             isRowMajor,
+                                             metric_arg);
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
diff --git a/cpp/test/distance/fused_l2_nn.cu b/cpp/test/distance/fused_l2_nn.cu
index 33782baf8d..932857c536 100644
--- a/cpp/test/distance/fused_l2_nn.cu
+++ b/cpp/test/distance/fused_l2_nn.cu
@@ -30,40 +30,40 @@ template <typename LabelT, typename DataT>
 struct CubKVPMinReduce {
   typedef cub::KeyValuePair<LabelT, DataT> KVP;
 
-  DI KVP operator()(LabelT rit, const KVP &a, const KVP &b) {
-    return b.value < a.value ? b : a;
-  }
+  DI KVP operator()(LabelT rit, const KVP& a, const KVP& b) { return b.value < a.value ? b : a; }
 
-  DI KVP operator()(const KVP &a, const KVP &b) {
-    return b.value < a.value ? b : a;
-  }
+  DI KVP operator()(const KVP& a, const KVP& b) { return b.value < a.value ? b : a; }
 
 };  // KVPMinReduce
 
 template <typename DataT, bool Sqrt, typename ReduceOpT, int NWARPS>
-__global__ void naiveKernel(cub::KeyValuePair<int, DataT> *min, DataT *x,
-                            DataT *y, int m, int n, int k, int *workspace,
-                            DataT maxVal) {
-  int midx = threadIdx.y + blockIdx.y * blockDim.y;
-  int nidx = threadIdx.x + blockIdx.x * blockDim.x;
+__global__ void naiveKernel(cub::KeyValuePair<int, DataT>* min,
+                            DataT* x,
+                            DataT* y,
+                            int m,
+                            int n,
+                            int k,
+                            int* workspace,
+                            DataT maxVal)
+{
+  int midx  = threadIdx.y + blockIdx.y * blockDim.y;
+  int nidx  = threadIdx.x + blockIdx.x * blockDim.x;
   DataT acc = DataT(0);
   for (int i = 0; i < k; ++i) {
-    int xidx = i + midx * k;
-    int yidx = i + nidx * k;
+    int xidx  = i + midx * k;
+    int yidx  = i + nidx * k;
     auto diff = midx >= m || nidx >= n ? DataT(0) : x[xidx] - y[yidx];
     acc += diff * diff;
   }
-  if (Sqrt) {
-    acc = raft::mySqrt(acc);
-  }
+  if (Sqrt) { acc = raft::mySqrt(acc); }
   ReduceOpT redOp;
   typedef cub::WarpReduce<cub::KeyValuePair<int, DataT>> WarpReduce;
   __shared__ typename WarpReduce::TempStorage temp[NWARPS];
   int warpId = threadIdx.x / raft::WarpSize;
   cub::KeyValuePair<int, DataT> tmp;
-  tmp.key = nidx;
+  tmp.key   = nidx;
   tmp.value = midx >= m || nidx >= n ? maxVal : acc;
-  tmp = WarpReduce(temp[warpId]).Reduce(tmp, CubKVPMinReduce<int, DataT>());
+  tmp       = WarpReduce(temp[warpId]).Reduce(tmp, CubKVPMinReduce<int, DataT>());
   if (threadIdx.x % raft::WarpSize == 0 && midx < m) {
     while (atomicCAS(workspace + midx, 0, 1) == 1)
       ;
@@ -75,8 +75,15 @@ __global__ void naiveKernel(cub::KeyValuePair<int, DataT> *min, DataT *x,
 }
 
 template <typename DataT, bool Sqrt>
-void naive(cub::KeyValuePair<int, DataT> *min, DataT *x, DataT *y, int m, int n,
-           int k, int *workspace, cudaStream_t stream) {
+void naive(cub::KeyValuePair<int, DataT>* min,
+           DataT* x,
+           DataT* y,
+           int m,
+           int n,
+           int k,
+           int* workspace,
+           cudaStream_t stream)
+{
   static const dim3 TPB(32, 16, 1);
   dim3 nblks(raft::ceildiv(n, (int)TPB.x), raft::ceildiv(m, (int)TPB.y), 1);
   CUDA_CHECK(cudaMemsetAsync(workspace, 0, sizeof(int) * m, stream));
@@ -86,8 +93,7 @@ void naive(cub::KeyValuePair<int, DataT> *min, DataT *x, DataT *y, int m, int n,
     <<<blks, 256, 0, stream>>>(min, m, std::numeric_limits<DataT>::max(), op);
   CUDA_CHECK(cudaGetLastError());
   naiveKernel<DataT, Sqrt, MinAndDistanceReduceOp<int, DataT>, 16>
-    <<<nblks, TPB, 0, stream>>>(min, x, y, m, n, k, workspace,
-                                std::numeric_limits<DataT>::max());
+    <<<nblks, TPB, 0, stream>>>(min, x, y, m, n, k, workspace, std::numeric_limits<DataT>::max());
   CUDA_CHECK(cudaGetLastError());
 }
 
@@ -110,10 +116,13 @@ class FusedL2NNTest : public ::testing::TestWithParam<Inputs<DataT>> {
       yn(params.n, stream),
       min(params.m, stream),
       min_ref(params.m, stream),
-      workspace(params.m * sizeof(int), stream) {}
+      workspace(params.m * sizeof(int), stream)
+  {
+  }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     raft::random::Rng r(params.seed);
     int m = params.m;
     int n = params.n;
@@ -121,10 +130,8 @@ class FusedL2NNTest : public ::testing::TestWithParam<Inputs<DataT>> {
     r.uniform(x.data(), m * k, DataT(-1.0), DataT(1.0), stream);
     r.uniform(y.data(), n * k, DataT(-1.0), DataT(1.0), stream);
     generateGoldenResult();
-    raft::linalg::rowNorm(xn.data(), x.data(), k, m, raft::linalg::L2Norm, true,
-                          stream);
-    raft::linalg::rowNorm(yn.data(), y.data(), k, n, raft::linalg::L2Norm, true,
-                          stream);
+    raft::linalg::rowNorm(xn.data(), x.data(), k, m, raft::linalg::L2Norm, true, stream);
+    raft::linalg::rowNorm(yn.data(), y.data(), k, n, raft::linalg::L2Norm, true, stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
@@ -140,23 +147,34 @@ class FusedL2NNTest : public ::testing::TestWithParam<Inputs<DataT>> {
   raft::handle_t handle;
   cudaStream_t stream;
 
-  virtual void generateGoldenResult() {
+  virtual void generateGoldenResult()
+  {
     int m = params.m;
     int n = params.n;
     int k = params.k;
-    naive<DataT, Sqrt>(min_ref.data(), x.data(), y.data(), m, n, k,
-                       (int *)workspace.data(), stream);
+    naive<DataT, Sqrt>(min_ref.data(), x.data(), y.data(), m, n, k, (int*)workspace.data(), stream);
   }
 
-  void runTest(cub::KeyValuePair<int, DataT> *out) {
+  void runTest(cub::KeyValuePair<int, DataT>* out)
+  {
     int m = params.m;
     int n = params.n;
     int k = params.k;
     MinAndDistanceReduceOp<int, DataT> redOp;
-    fusedL2NN<DataT, cub::KeyValuePair<int, DataT>, int>(
-      out, x.data(), y.data(), xn.data(), yn.data(), m, n, k,
-      (void *)workspace.data(), redOp,
-      raft::distance::KVPMinReduce<int, DataT>(), Sqrt, true, stream);
+    fusedL2NN<DataT, cub::KeyValuePair<int, DataT>, int>(out,
+                                                         x.data(),
+                                                         y.data(),
+                                                         xn.data(),
+                                                         yn.data(),
+                                                         m,
+                                                         n,
+                                                         k,
+                                                         (void*)workspace.data(),
+                                                         redOp,
+                                                         raft::distance::KVPMinReduce<int, DataT>(),
+                                                         Sqrt,
+                                                         true,
+                                                         stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 };
@@ -165,9 +183,10 @@ template <typename T>
 struct CompareApproxAbsKVP {
   typedef typename cub::KeyValuePair<int, T> KVP;
   CompareApproxAbsKVP(T eps_) : eps(eps_) {}
-  bool operator()(const KVP &a, const KVP &b) const {
-    T diff = raft::abs(raft::abs(a.value) - raft::abs(b.value));
-    T m = std::max(raft::abs(a.value), raft::abs(b.value));
+  bool operator()(const KVP& a, const KVP& b) const
+  {
+    T diff  = raft::abs(raft::abs(a.value) - raft::abs(b.value));
+    T m     = std::max(raft::abs(a.value), raft::abs(b.value));
     T ratio = m >= eps ? diff / m : diff;
     return (ratio <= eps);
   }
@@ -179,17 +198,20 @@ struct CompareApproxAbsKVP {
 template <typename T>
 struct CompareExactKVP {
   typedef typename cub::KeyValuePair<int, T> KVP;
-  bool operator()(const KVP &a, const KVP &b) const {
+  bool operator()(const KVP& a, const KVP& b) const
+  {
     if (a.value != b.value) return false;
     return true;
   }
 };
 
 template <typename K, typename V, typename L>
-::testing::AssertionResult devArrMatch(const cub::KeyValuePair<K, V> *expected,
-                                       const cub::KeyValuePair<K, V> *actual,
-                                       size_t size, L eq_compare,
-                                       cudaStream_t stream = 0) {
+::testing::AssertionResult devArrMatch(const cub::KeyValuePair<K, V>* expected,
+                                       const cub::KeyValuePair<K, V>* actual,
+                                       size_t size,
+                                       L eq_compare,
+                                       cudaStream_t stream = 0)
+{
   typedef typename cub::KeyValuePair<K, V> KVP;
   std::shared_ptr<KVP> exp_h(new KVP[size]);
   std::shared_ptr<KVP> act_h(new KVP[size]);
@@ -201,47 +223,44 @@ template <typename K, typename V, typename L>
     auto act = act_h.get()[i];
     if (!eq_compare(exp, act)) {
       return ::testing::AssertionFailure()
-             << "actual=" << act.key << "," << act.value
-             << " != expected=" << exp.key << "," << exp.value << " @" << i;
+             << "actual=" << act.key << "," << act.value << " != expected=" << exp.key << ","
+             << exp.value << " @" << i;
     }
   }
   return ::testing::AssertionSuccess();
 }
 
 const std::vector<Inputs<float>> inputsf = {
-  {0.001f, 32, 32, 32, 1234ULL},   {0.001f, 32, 64, 32, 1234ULL},
-  {0.001f, 64, 32, 32, 1234ULL},   {0.001f, 64, 64, 32, 1234ULL},
-  {0.001f, 128, 32, 32, 1234ULL},  {0.001f, 128, 64, 32, 1234ULL},
+  {0.001f, 32, 32, 32, 1234ULL},   {0.001f, 32, 64, 32, 1234ULL},   {0.001f, 64, 32, 32, 1234ULL},
+  {0.001f, 64, 64, 32, 1234ULL},   {0.001f, 128, 32, 32, 1234ULL},  {0.001f, 128, 64, 32, 1234ULL},
   {0.001f, 128, 128, 64, 1234ULL}, {0.001f, 64, 128, 128, 1234ULL},
 
-  {0.001f, 32, 32, 34, 1234ULL},   {0.001f, 32, 64, 34, 1234ULL},
-  {0.001f, 64, 32, 34, 1234ULL},   {0.001f, 64, 64, 34, 1234ULL},
-  {0.001f, 128, 32, 34, 1234ULL},  {0.001f, 128, 64, 34, 1234ULL},
+  {0.001f, 32, 32, 34, 1234ULL},   {0.001f, 32, 64, 34, 1234ULL},   {0.001f, 64, 32, 34, 1234ULL},
+  {0.001f, 64, 64, 34, 1234ULL},   {0.001f, 128, 32, 34, 1234ULL},  {0.001f, 128, 64, 34, 1234ULL},
   {0.001f, 128, 128, 66, 1234ULL}, {0.001f, 64, 128, 130, 1234ULL},
 
-  {0.001f, 32, 32, 33, 1234ULL},   {0.001f, 32, 64, 33, 1234ULL},
-  {0.001f, 64, 32, 33, 1234ULL},   {0.001f, 64, 64, 33, 1234ULL},
-  {0.001f, 128, 32, 33, 1234ULL},  {0.001f, 128, 64, 33, 1234ULL},
+  {0.001f, 32, 32, 33, 1234ULL},   {0.001f, 32, 64, 33, 1234ULL},   {0.001f, 64, 32, 33, 1234ULL},
+  {0.001f, 64, 64, 33, 1234ULL},   {0.001f, 128, 32, 33, 1234ULL},  {0.001f, 128, 64, 33, 1234ULL},
   {0.001f, 128, 128, 65, 1234ULL}, {0.001f, 64, 128, 129, 1234ULL},
 
   {0.006f, 1805, 134, 2, 1234ULL},
 };
 typedef FusedL2NNTest<float, false> FusedL2NNTestF_Sq;
-TEST_P(FusedL2NNTestF_Sq, Result) {
+TEST_P(FusedL2NNTestF_Sq, Result)
+{
   runTest(min.data());
-  ASSERT_TRUE(devArrMatch(min_ref.data(), min.data(), params.m,
-                          CompareApproxAbsKVP<float>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(
+    min_ref.data(), min.data(), params.m, CompareApproxAbsKVP<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sq,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sq, ::testing::ValuesIn(inputsf));
 typedef FusedL2NNTest<float, true> FusedL2NNTestF_Sqrt;
-TEST_P(FusedL2NNTestF_Sqrt, Result) {
+TEST_P(FusedL2NNTestF_Sqrt, Result)
+{
   runTest(min.data());
-  ASSERT_TRUE(devArrMatch(min_ref.data(), min.data(), params.m,
-                          CompareApproxAbsKVP<float>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(
+    min_ref.data(), min.data(), params.m, CompareApproxAbsKVP<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sqrt,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestF_Sqrt, ::testing::ValuesIn(inputsf));
 
 const std::vector<Inputs<double>> inputsd = {
   {0.00001, 32, 32, 32, 1234ULL},   {0.00001, 32, 64, 32, 1234ULL},
@@ -262,21 +281,21 @@ const std::vector<Inputs<double>> inputsd = {
   {0.00001, 1805, 134, 2, 1234ULL},
 };
 typedef FusedL2NNTest<double, false> FusedL2NNTestD_Sq;
-TEST_P(FusedL2NNTestD_Sq, Result) {
+TEST_P(FusedL2NNTestD_Sq, Result)
+{
   runTest(min.data());
-  ASSERT_TRUE(devArrMatch(min_ref.data(), min.data(), params.m,
-                          CompareApproxAbsKVP<double>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(
+    min_ref.data(), min.data(), params.m, CompareApproxAbsKVP<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sq,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sq, ::testing::ValuesIn(inputsd));
 typedef FusedL2NNTest<double, true> FusedL2NNTestD_Sqrt;
-TEST_P(FusedL2NNTestD_Sqrt, Result) {
+TEST_P(FusedL2NNTestD_Sqrt, Result)
+{
   runTest(min.data());
-  ASSERT_TRUE(devArrMatch(min_ref.data(), min.data(), params.m,
-                          CompareApproxAbsKVP<double>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(
+    min_ref.data(), min.data(), params.m, CompareApproxAbsKVP<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sqrt,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(FusedL2NNTests, FusedL2NNTestD_Sqrt, ::testing::ValuesIn(inputsd));
 
 /// This is to test output determinism of the prim
 template <typename DataT, bool Sqrt>
@@ -284,7 +303,8 @@ class FusedL2NNDetTest : public FusedL2NNTest<DataT, Sqrt> {
  public:
   FusedL2NNDetTest() : stream(handle.get_stream()), min1(0, stream) {}
 
-  void SetUp() override {
+  void SetUp() override
+  {
     FusedL2NNTest<DataT, Sqrt>::SetUp();
     int m = this->params.m;
     min1.resize(m, stream);
@@ -305,50 +325,46 @@ class FusedL2NNDetTest : public FusedL2NNTest<DataT, Sqrt> {
 };
 
 typedef FusedL2NNDetTest<float, false> FusedL2NNDetTestF_Sq;
-TEST_P(FusedL2NNDetTestF_Sq, Result) {
+TEST_P(FusedL2NNDetTestF_Sq, Result)
+{
   runTest(min.data());  // assumed to be golden
   for (int i = 0; i < NumRepeats; ++i) {
     runTest(min1.data());
-    ASSERT_TRUE(
-      devArrMatch(min.data(), min1.data(), params.m, CompareExactKVP<float>()));
+    ASSERT_TRUE(devArrMatch(min.data(), min1.data(), params.m, CompareExactKVP<float>()));
   }
 }
-INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sq,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sq, ::testing::ValuesIn(inputsf));
 typedef FusedL2NNDetTest<float, true> FusedL2NNDetTestF_Sqrt;
-TEST_P(FusedL2NNDetTestF_Sqrt, Result) {
+TEST_P(FusedL2NNDetTestF_Sqrt, Result)
+{
   runTest(min.data());  // assumed to be golden
   for (int i = 0; i < NumRepeats; ++i) {
     runTest(min1.data());
-    ASSERT_TRUE(
-      devArrMatch(min.data(), min1.data(), params.m, CompareExactKVP<float>()));
+    ASSERT_TRUE(devArrMatch(min.data(), min1.data(), params.m, CompareExactKVP<float>()));
   }
 }
-INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sqrt,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestF_Sqrt, ::testing::ValuesIn(inputsf));
 
 typedef FusedL2NNDetTest<double, false> FusedL2NNDetTestD_Sq;
-TEST_P(FusedL2NNDetTestD_Sq, Result) {
+TEST_P(FusedL2NNDetTestD_Sq, Result)
+{
   runTest(min.data());  // assumed to be golden
   for (int i = 0; i < NumRepeats; ++i) {
     runTest(min1.data());
-    ASSERT_TRUE(devArrMatch(min.data(), min1.data(), params.m,
-                            CompareExactKVP<double>()));
+    ASSERT_TRUE(devArrMatch(min.data(), min1.data(), params.m, CompareExactKVP<double>()));
   }
 }
-INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sq,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sq, ::testing::ValuesIn(inputsd));
 typedef FusedL2NNDetTest<double, true> FusedL2NNDetTestD_Sqrt;
-TEST_P(FusedL2NNDetTestD_Sqrt, Result) {
+TEST_P(FusedL2NNDetTestD_Sqrt, Result)
+{
   runTest(min.data());  // assumed to be golden
   for (int i = 0; i < NumRepeats; ++i) {
     runTest(min1.data());
-    ASSERT_TRUE(devArrMatch(min.data(), min1.data(), params.m,
-                            CompareExactKVP<double>()));
+    ASSERT_TRUE(devArrMatch(min.data(), min1.data(), params.m, CompareExactKVP<double>()));
   }
 }
-INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sqrt,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(FusedL2NNDetTests, FusedL2NNDetTestD_Sqrt, ::testing::ValuesIn(inputsd));
 
 }  // end namespace distance
 }  // end namespace raft
diff --git a/cpp/test/eigen_solvers.cu b/cpp/test/eigen_solvers.cu
index ede790b38c..dc7de92eb8 100644
--- a/cpp/test/eigen_solvers.cu
+++ b/cpp/test/eigen_solvers.cu
@@ -25,7 +25,8 @@
 
 namespace raft {
 
-TEST(Raft, EigenSolvers) {
+TEST(Raft, EigenSolvers)
+{
   using namespace matrix;
   using index_type = int;
   using value_type = double;
@@ -36,7 +37,7 @@ TEST(Raft, EigenSolvers) {
   index_type* ro{nullptr};
   index_type* ci{nullptr};
   value_type* vs{nullptr};
-  index_type nnz = 0;
+  index_type nnz   = 0;
   index_type nrows = 0;
 
   sparse_matrix_t<index_type, value_type> sm1{h, ro, ci, vs, nrows, nnz};
@@ -48,7 +49,7 @@ TEST(Raft, EigenSolvers) {
   value_type tol{1.0e-10};
   bool reorthog{true};
 
-  //nullptr expected to trigger exceptions:
+  // nullptr expected to trigger exceptions:
   //
   value_type* eigvals{nullptr};
   value_type* eigvecs{nullptr};
@@ -59,14 +60,13 @@ TEST(Raft, EigenSolvers) {
 
   lanczos_solver_t<index_type, value_type> eig_solver{cfg};
 
-  EXPECT_ANY_THROW(
-    eig_solver.solve_smallest_eigenvectors(h, sm1, eigvals, eigvecs));
+  EXPECT_ANY_THROW(eig_solver.solve_smallest_eigenvectors(h, sm1, eigvals, eigvecs));
 
-  EXPECT_ANY_THROW(
-    eig_solver.solve_largest_eigenvectors(h, sm1, eigvals, eigvecs));
+  EXPECT_ANY_THROW(eig_solver.solve_largest_eigenvectors(h, sm1, eigvals, eigvecs));
 }
 
-TEST(Raft, SpectralSolvers) {
+TEST(Raft, SpectralSolvers)
+{
   using namespace matrix;
   using index_type = int;
   using value_type = double;
@@ -80,7 +80,7 @@ TEST(Raft, SpectralSolvers) {
   value_type tol{1.0e-10};
   bool reorthog{true};
 
-  //nullptr expected to trigger exceptions:
+  // nullptr expected to trigger exceptions:
   //
   index_type* clusters{nullptr};
   value_type* eigvals{nullptr};
@@ -94,19 +94,16 @@ TEST(Raft, SpectralSolvers) {
 
   index_type k{5};
 
-  cluster_solver_config_t<index_type, value_type> clust_cfg{k, maxiter, tol,
-                                                            seed};
+  cluster_solver_config_t<index_type, value_type> clust_cfg{k, maxiter, tol, seed};
   kmeans_solver_t<index_type, value_type> cluster_solver{clust_cfg};
 
-  sparse_matrix_t<index_type, value_type> sm{h,       nullptr, nullptr,
-                                             nullptr, 0,       0};
-  EXPECT_ANY_THROW(spectral::partition(h, sm, eig_solver, cluster_solver,
-                                       clusters, eigvals, eigvecs));
+  sparse_matrix_t<index_type, value_type> sm{h, nullptr, nullptr, nullptr, 0, 0};
+  EXPECT_ANY_THROW(
+    spectral::partition(h, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs));
 
   value_type edgeCut{0};
   value_type cost{0};
-  EXPECT_ANY_THROW(
-    spectral::analyzePartition(h, sm, k, clusters, edgeCut, cost));
+  EXPECT_ANY_THROW(spectral::analyzePartition(h, sm, k, clusters, edgeCut, cost));
 }
 
 }  // namespace raft
diff --git a/cpp/test/handle.cpp b/cpp/test/handle.cpp
index 3e27789078..698a601e85 100644
--- a/cpp/test/handle.cpp
+++ b/cpp/test/handle.cpp
@@ -22,7 +22,8 @@
 
 namespace raft {
 
-TEST(Raft, HandleDefault) {
+TEST(Raft, HandleDefault)
+{
   handle_t h;
   ASSERT_EQ(0, h.get_device());
   ASSERT_EQ(nullptr, h.get_stream());
@@ -32,7 +33,8 @@ TEST(Raft, HandleDefault) {
   ASSERT_NE(nullptr, h.get_cusparse_handle());
 }
 
-TEST(Raft, Handle) {
+TEST(Raft, Handle)
+{
   handle_t h(4);
   ASSERT_EQ(4, h.get_num_internal_streams());
   cudaStream_t stream;
@@ -43,13 +45,15 @@ TEST(Raft, Handle) {
   CUDA_CHECK(cudaStreamDestroy(stream));
 }
 
-TEST(Raft, GetInternalStreams) {
+TEST(Raft, GetInternalStreams)
+{
   handle_t h(4);
   auto streams = h.get_internal_streams();
   ASSERT_EQ(4U, streams.size());
 }
 
-TEST(Raft, GetHandleFromPool) {
+TEST(Raft, GetHandleFromPool)
+{
   handle_t parent(4);
 
   handle_t child(parent, 2);
@@ -62,13 +66,13 @@ TEST(Raft, GetHandleFromPool) {
   ASSERT_EQ(parent.get_device(), child.get_device());
 }
 
-TEST(Raft, GetHandleStreamViews) {
+TEST(Raft, GetHandleStreamViews)
+{
   handle_t parent(4);
 
   handle_t child(parent, 2);
   ASSERT_EQ(parent.get_internal_stream_view(2), child.get_stream_view());
-  ASSERT_EQ(parent.get_internal_stream_view(2).value(),
-            child.get_stream_view().value());
+  ASSERT_EQ(parent.get_internal_stream_view(2).value(), child.get_stream_view().value());
   EXPECT_FALSE(child.get_stream_view().is_default());
 }
 }  // namespace raft
diff --git a/cpp/test/integer_utils.cpp b/cpp/test/integer_utils.cpp
index 830d085a40..d883de59fe 100644
--- a/cpp/test/integer_utils.cpp
+++ b/cpp/test/integer_utils.cpp
@@ -20,7 +20,8 @@
 
 namespace raft {
 
-TEST(Raft, rounding_up) {
+TEST(Raft, rounding_up)
+{
   ASSERT_EQ(raft::div_rounding_up_safe(5, 3), 2);
   ASSERT_EQ(raft::div_rounding_up_safe(0, 3), 0);
   ASSERT_EQ(raft::div_rounding_up_safe(7, 8), 1);
@@ -29,7 +30,8 @@ TEST(Raft, rounding_up) {
   ASSERT_EQ(raft::div_rounding_up_unsafe(7, 8), 1);
 }
 
-TEST(Raft, is_a_power_of_two) {
+TEST(Raft, is_a_power_of_two)
+{
   ASSERT_EQ(raft::is_a_power_of_two(1 << 5), true);
   ASSERT_EQ(raft::is_a_power_of_two((1 << 5) + 1), false);
 }
diff --git a/cpp/test/label/label.cu b/cpp/test/label/label.cu
index f79d8f10c8..d983ec1162 100644
--- a/cpp/test/label/label.cu
+++ b/cpp/test/label/label.cu
@@ -35,7 +35,8 @@ class labelTest : public ::testing::Test {
 };
 
 typedef labelTest MakeMonotonicTest;
-TEST_F(MakeMonotonicTest, Result) {
+TEST_F(MakeMonotonicTest, Result)
+{
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
 
@@ -45,11 +46,9 @@ TEST_F(MakeMonotonicTest, Result) {
   rmm::device_uvector<float> actual(m, stream);
   rmm::device_uvector<float> expected(m, stream);
 
-  float *data_h =
-    new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 8.0, 7.0, 8.0, 8.0, 25.0, 80.0};
+  float* data_h = new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 8.0, 7.0, 8.0, 8.0, 25.0, 80.0};
 
-  float *expected_h =
-    new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 5.0, 4.0, 5.0, 5.0, 6.0, 7.0};
+  float* expected_h = new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 5.0, 4.0, 5.0, 5.0, 6.0, 7.0};
 
   raft::update_device(data.data(), data_h, m, stream);
   raft::update_device(expected.data(), expected_h, m, stream);
@@ -58,14 +57,14 @@ TEST_F(MakeMonotonicTest, Result) {
 
   CUDA_CHECK(cudaStreamSynchronize(stream));
 
-  ASSERT_TRUE(devArrMatch(actual.data(), expected.data(), m,
-                          raft::Compare<bool>(), stream));
+  ASSERT_TRUE(devArrMatch(actual.data(), expected.data(), m, raft::Compare<bool>(), stream));
 
   delete data_h;
   delete expected_h;
 }
 
-TEST(labelTest, Classlabels) {
+TEST(labelTest, Classlabels)
+{
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
 
@@ -81,17 +80,16 @@ TEST(labelTest, Classlabels) {
   ASSERT_EQ(n_classes, 3);
 
   float y_unique_exp[] = {-1, 1, 2};
-  EXPECT_TRUE(devArrMatchHost(y_unique_exp, y_unique_d.data(), n_classes,
-                              raft::Compare<float>(), stream));
+  EXPECT_TRUE(
+    devArrMatchHost(y_unique_exp, y_unique_d.data(), n_classes, raft::Compare<float>(), stream));
 
   rmm::device_uvector<float> y_relabeled_d(n_rows, stream);
 
-  getOvrlabels(y_d.data(), n_rows, y_unique_d.data(), n_classes,
-               y_relabeled_d.data(), 2, stream);
+  getOvrlabels(y_d.data(), n_rows, y_unique_d.data(), n_classes, y_relabeled_d.data(), 2, stream);
 
   float y_relabeled_exp[] = {1, -1, -1, 1, -1, -1};
-  EXPECT_TRUE(devArrMatchHost(y_relabeled_exp, y_relabeled_d.data(), n_rows,
-                              raft::Compare<float>(), stream));
+  EXPECT_TRUE(
+    devArrMatchHost(y_relabeled_exp, y_relabeled_d.data(), n_rows, raft::Compare<float>(), stream));
 }
 };  // namespace label
 };  // namespace raft
diff --git a/cpp/test/label/merge_labels.cu b/cpp/test/label/merge_labels.cu
index 76e0a4295e..dd67f0fd89 100644
--- a/cpp/test/label/merge_labels.cu
+++ b/cpp/test/label/merge_labels.cu
@@ -39,8 +39,7 @@ struct MergeLabelsInputs {
 };
 
 template <typename Index_>
-class MergeLabelsTest
-  : public ::testing::TestWithParam<MergeLabelsInputs<Index_>> {
+class MergeLabelsTest : public ::testing::TestWithParam<MergeLabelsInputs<Index_>> {
  protected:
   MergeLabelsTest()
     : params(::testing::TestWithParam<MergeLabelsInputs<Index_>>::GetParam()),
@@ -50,25 +49,23 @@ class MergeLabelsTest
       expected(params.N, stream),
       R(params.N, stream),
       mask(params.N, stream),
-      m(stream) {}
-
-  void Run() {
-    raft::update_device(labels_a.data(), params.labels_a.data(), params.N,
-                        stream);
-    raft::update_device(labels_b.data(), params.labels_b.data(), params.N,
-                        stream);
-    raft::update_device(expected.data(), params.expected.data(), params.N,
-                        stream);
-    raft::update_device(mask.data(),
-                        reinterpret_cast<bool *>(params.mask.data()), params.N,
-                        stream);
-
-    merge_labels(labels_a.data(), labels_b.data(), mask.data(), R.data(),
-                 m.data(), params.N, stream);
+      m(stream)
+  {
+  }
+
+  void Run()
+  {
+    raft::update_device(labels_a.data(), params.labels_a.data(), params.N, stream);
+    raft::update_device(labels_b.data(), params.labels_b.data(), params.N, stream);
+    raft::update_device(expected.data(), params.expected.data(), params.N, stream);
+    raft::update_device(mask.data(), reinterpret_cast<bool*>(params.mask.data()), params.N, stream);
+
+    merge_labels(
+      labels_a.data(), labels_b.data(), mask.data(), R.data(), m.data(), params.N, stream);
 
     cudaStreamSynchronize(stream);
-    ASSERT_TRUE(raft::devArrMatch<Index_>(expected.data(), labels_a.data(),
-                                          params.N, raft::Compare<Index_>()));
+    ASSERT_TRUE(raft::devArrMatch<Index_>(
+      expected.data(), labels_a.data(), params.N, raft::Compare<Index_>()));
   }
 
  protected:
@@ -86,22 +83,14 @@ TEST_P(MergeLabelsTestI, Result) { Run(); }
 using MergeLabelsTestL = MergeLabelsTest<int64_t>;
 TEST_P(MergeLabelsTestL, Result) { Run(); }
 
-constexpr int MAX32 = std::numeric_limits<int>::max();
+constexpr int MAX32     = std::numeric_limits<int>::max();
 constexpr int64_t MAX64 = std::numeric_limits<int64_t>::max();
 
 const std::vector<MergeLabelsInputs<int>> merge_inputs_32 = {
   {4, {1, 1, 3, MAX32}, {1, 3, 3, 1}, {1, 0, 1, 0}, {1, 1, 3, 1}},
   {5, {1, 2, 2, 2, 1}, {4, 2, 4, 4, 4}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
-  {6,
-   {1, 2, 1, 4, 5, MAX32},
-   {1, 2, MAX32, 4, 5, 4},
-   {1, 1, 0, 1, 1, 0},
-   {1, 2, 1, 4, 5, 4}},
-  {6,
-   {1, 2, 2, 2, 2, 6},
-   {1, 1, 1, 5, 5, 5},
-   {1, 1, 1, 1, 1, 1},
-   {1, 1, 1, 1, 1, 1}},
+  {6, {1, 2, 1, 4, 5, MAX32}, {1, 2, MAX32, 4, 5, 4}, {1, 1, 0, 1, 1, 0}, {1, 2, 1, 4, 5, 4}},
+  {6, {1, 2, 2, 2, 2, 6}, {1, 1, 1, 5, 5, 5}, {1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1}},
   {8,
    {1, 1, 3, 3, MAX32, 1, 3, MAX32},
    {1, 2, 3, 2, MAX32, 2, 2, 2},
@@ -117,16 +106,8 @@ const std::vector<MergeLabelsInputs<int>> merge_inputs_32 = {
 const std::vector<MergeLabelsInputs<int64_t>> merge_inputs_64 = {
   {4, {1, 1, 3, MAX64}, {1, 3, 3, 1}, {1, 0, 1, 0}, {1, 1, 3, 1}},
   {5, {1, 2, 2, 2, 1}, {4, 2, 4, 4, 4}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
-  {6,
-   {1, 2, 1, 4, 5, MAX64},
-   {1, 2, MAX64, 4, 5, 4},
-   {1, 1, 0, 1, 1, 0},
-   {1, 2, 1, 4, 5, 4}},
-  {6,
-   {1, 2, 2, 2, 2, 6},
-   {1, 1, 1, 5, 5, 5},
-   {1, 1, 1, 1, 1, 1},
-   {1, 1, 1, 1, 1, 1}},
+  {6, {1, 2, 1, 4, 5, MAX64}, {1, 2, MAX64, 4, 5, 4}, {1, 1, 0, 1, 1, 0}, {1, 2, 1, 4, 5, 4}},
+  {6, {1, 2, 2, 2, 2, 6}, {1, 1, 1, 5, 5, 5}, {1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1}},
   {8,
    {1, 1, 3, 3, MAX64, 1, 3, MAX64},
    {1, 2, 3, 2, MAX64, 2, 2, 2},
@@ -139,10 +120,8 @@ const std::vector<MergeLabelsInputs<int64_t>> merge_inputs_64 = {
    {1, 1, 1, 1, 1, 7, 7, 7}},
 };
 
-INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestI,
-                        ::testing::ValuesIn(merge_inputs_32));
-INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestL,
-                        ::testing::ValuesIn(merge_inputs_64));
+INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestI, ::testing::ValuesIn(merge_inputs_32));
+INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestL, ::testing::ValuesIn(merge_inputs_64));
 
 }  // namespace label
 }  // namespace raft
diff --git a/cpp/test/lap/lap.cu b/cpp/test/lap/lap.cu
index 08429e18f2..183c0bd2f3 100644
--- a/cpp/test/lap/lap.cu
+++ b/cpp/test/lap/lap.cu
@@ -31,11 +31,11 @@
 #include <raft/lap/lap.cuh>
 #include <random>
 
-#define PROBLEMSIZE 1000  // Number of rows/columns
-#define BATCHSIZE 10      // Number of problems in the batch
-#define COSTRANGE 1000
+#define PROBLEMSIZE  1000  // Number of rows/columns
+#define BATCHSIZE    10    // Number of problems in the batch
+#define COSTRANGE    1000
 #define PROBLEMCOUNT 1
-#define REPETITIONS 1
+#define REPETITIONS  1
 
 #define SEED 01010001
 
@@ -45,38 +45,41 @@ namespace raft {
 
 // Function for generating problem with uniformly distributed integer costs between [0, COSTRANGE].
 template <typename weight_t>
-void generateProblem(weight_t *cost_matrix, int SP, int N, int costrange) {
+void generateProblem(weight_t* cost_matrix, int SP, int N, int costrange)
+{
   long N2 = SP * N * N;
 
   std::uniform_int_distribution<int> distribution(0, costrange);
 
   for (long i = 0; i < N2; i++) {
-    int val = distribution(generator);
+    int val        = distribution(generator);
     cost_matrix[i] = (weight_t)val;
   }
 }
 
 template <typename vertex_t, typename weight_t>
-void hungarian_test(int problemsize, int costrange, int problemcount,
-                    int repetitions, int batchsize, weight_t epsilon,
-                    bool verbose = false) {
+void hungarian_test(int problemsize,
+                    int costrange,
+                    int problemcount,
+                    int repetitions,
+                    int batchsize,
+                    weight_t epsilon,
+                    bool verbose = false)
+{
   raft::handle_t handle;
 
-  weight_t *h_cost = new weight_t[batchsize * problemsize * problemsize];
+  weight_t* h_cost = new weight_t[batchsize * problemsize * problemsize];
 
   for (int j = 0; j < problemcount; j++) {
     generateProblem(h_cost, batchsize, problemsize, costrange);
 
-    rmm::device_uvector<weight_t> elements_v(
-      batchsize * problemsize * problemsize, handle.get_stream());
-    rmm::device_uvector<vertex_t> row_assignment_v(batchsize * problemsize,
-                                                   handle.get_stream());
-    rmm::device_uvector<vertex_t> col_assignment_v(batchsize * problemsize,
-                                                   handle.get_stream());
+    rmm::device_uvector<weight_t> elements_v(batchsize * problemsize * problemsize,
+                                             handle.get_stream());
+    rmm::device_uvector<vertex_t> row_assignment_v(batchsize * problemsize, handle.get_stream());
+    rmm::device_uvector<vertex_t> col_assignment_v(batchsize * problemsize, handle.get_stream());
 
-    raft::update_device(elements_v.data(), h_cost,
-                        batchsize * problemsize * problemsize,
-                        handle.get_stream());
+    raft::update_device(
+      elements_v.data(), h_cost, batchsize * problemsize * problemsize, handle.get_stream());
 
     for (int i = 0; i < repetitions; i++) {
       float start = omp_get_wtime();
@@ -86,20 +89,18 @@ void hungarian_test(int problemsize, int costrange, int problemcount,
         handle, problemsize, batchsize, epsilon);
 
       // Solve LAP(s) for given cost matrix
-      lpx.solve(elements_v.data(), row_assignment_v.data(),
-                col_assignment_v.data());
+      lpx.solve(elements_v.data(), row_assignment_v.data(), col_assignment_v.data());
 
       float end = omp_get_wtime();
 
       float total_time = (end - start);
 
       if (verbose) {
-        // Use getPrimalObjectiveValue and getDualObjectiveValue APIs to get primal and dual objectives. At optimality both values should match.
+        // Use getPrimalObjectiveValue and getDualObjectiveValue APIs to get primal and dual
+        // objectives. At optimality both values should match.
         for (int k = 0; k < batchsize; k++) {
-          std::cout << j << ":" << i << ":" << k << ":"
-                    << lpx.getPrimalObjectiveValue(k) << ":"
-                    << lpx.getDualObjectiveValue(k) << ":" << total_time
-                    << std::endl;
+          std::cout << j << ":" << i << ":" << k << ":" << lpx.getPrimalObjectiveValue(k) << ":"
+                    << lpx.getDualObjectiveValue(k) << ":" << total_time << std::endl;
         }
       }
     }
@@ -108,34 +109,38 @@ void hungarian_test(int problemsize, int costrange, int problemcount,
   delete[] h_cost;
 }
 
-TEST(Raft, HungarianIntFloat) {
-  hungarian_test<int, float>(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS,
-                             BATCHSIZE, float{1e-6});
+TEST(Raft, HungarianIntFloat)
+{
+  hungarian_test<int, float>(
+    PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, float{1e-6});
 }
 
-TEST(Raft, HungarianIntDouble) {
-  hungarian_test<int, double>(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS,
-                              BATCHSIZE, double{1e-6});
+TEST(Raft, HungarianIntDouble)
+{
+  hungarian_test<int, double>(
+    PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, double{1e-6});
 }
 
-TEST(Raft, HungarianIntLong) {
-  hungarian_test<int, long>(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS,
-                            BATCHSIZE, long{0});
+TEST(Raft, HungarianIntLong)
+{
+  hungarian_test<int, long>(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, long{0});
 }
 
-TEST(Raft, HungarianLongFloat) {
-  hungarian_test<long, float>(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS,
-                              BATCHSIZE, float{1e-6});
+TEST(Raft, HungarianLongFloat)
+{
+  hungarian_test<long, float>(
+    PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, float{1e-6});
 }
 
-TEST(Raft, HungarianLongDouble) {
-  hungarian_test<long, double>(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT,
-                               REPETITIONS, BATCHSIZE, double{1e-6});
+TEST(Raft, HungarianLongDouble)
+{
+  hungarian_test<long, double>(
+    PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, double{1e-6});
 }
 
-TEST(Raft, HungarianLongLong) {
-  hungarian_test<long, long>(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS,
-                             BATCHSIZE, long{0});
+TEST(Raft, HungarianLongLong)
+{
+  hungarian_test<long, long>(PROBLEMSIZE, COSTRANGE, PROBLEMCOUNT, REPETITIONS, BATCHSIZE, long{0});
 }
 
 }  // namespace raft
diff --git a/cpp/test/linalg/add.cu b/cpp/test/linalg/add.cu
index 48ad83dfd2..17b000044e 100644
--- a/cpp/test/linalg/add.cu
+++ b/cpp/test/linalg/add.cu
@@ -33,10 +33,13 @@ class AddTest : public ::testing::TestWithParam<AddInputs<InT, OutT>> {
       in1(params.len, stream),
       in2(params.len, stream),
       out_ref(params.len, stream),
-      out(params.len, stream) {}
+      out(params.len, stream)
+  {
+  }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     params = ::testing::TestWithParam<AddInputs<InT, OutT>>::GetParam();
     raft::random::Rng r(params.seed);
     int len = params.len;
@@ -47,9 +50,10 @@ class AddTest : public ::testing::TestWithParam<AddInputs<InT, OutT>> {
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
-  void compare() {
-    ASSERT_TRUE(raft::devArrMatch(out_ref.data(), out.data(), params.len,
-                                  raft::CompareApprox<OutT>(params.tolerance)));
+  void compare()
+  {
+    ASSERT_TRUE(raft::devArrMatch(
+      out_ref.data(), out.data(), params.len, raft::CompareApprox<OutT>(params.tolerance)));
   }
 
  protected:
diff --git a/cpp/test/linalg/add.cuh b/cpp/test/linalg/add.cuh
index 137419758f..1d9352bfc1 100644
--- a/cpp/test/linalg/add.cuh
+++ b/cpp/test/linalg/add.cuh
@@ -23,18 +23,17 @@ namespace raft {
 namespace linalg {
 
 template <typename InT, typename OutT = InT>
-__global__ void naiveAddElemKernel(OutT *out, const InT *in1, const InT *in2,
-                                   int len) {
+__global__ void naiveAddElemKernel(OutT* out, const InT* in1, const InT* in2, int len)
+{
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < len) {
-    out[idx] = OutT(in1[idx] + in2[idx]);
-  }
+  if (idx < len) { out[idx] = OutT(in1[idx] + in2[idx]); }
 }
 
 template <typename InT, typename OutT = InT>
-void naiveAddElem(OutT *out, const InT *in1, const InT *in2, int len) {
+void naiveAddElem(OutT* out, const InT* in1, const InT* in2, int len)
+{
   static const int TPB = 64;
-  int nblks = raft::ceildiv(len, TPB);
+  int nblks            = raft::ceildiv(len, TPB);
   naiveAddElemKernel<InT, OutT><<<nblks, TPB>>>(out, in1, in2, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
@@ -47,8 +46,8 @@ struct AddInputs {
 };
 
 template <typename InT, typename OutT = InT>
-::std::ostream &operator<<(::std::ostream &os,
-                           const AddInputs<InT, OutT> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const AddInputs<InT, OutT>& dims)
+{
   return os;
 }
 
diff --git a/cpp/test/linalg/binary_op.cu b/cpp/test/linalg/binary_op.cu
index c8121bfbe4..c833faa0b2 100644
--- a/cpp/test/linalg/binary_op.cu
+++ b/cpp/test/linalg/binary_op.cu
@@ -29,28 +29,29 @@ namespace linalg {
 // for an extended __device__ lambda cannot have private or protected access
 // within its class
 template <typename InType, typename IdxType, typename OutType>
-void binaryOpLaunch(OutType *out, const InType *in1, const InType *in2,
-                    IdxType len, cudaStream_t stream) {
+void binaryOpLaunch(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, cudaStream_t stream)
+{
   binaryOp(
-    out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; },
-    stream);
+    out, in1, in2, len, [] __device__(InType a, InType b) { return a + b; }, stream);
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
-class BinaryOpTest
-  : public ::testing::TestWithParam<BinaryOpInputs<InType, IdxType, OutType>> {
+class BinaryOpTest : public ::testing::TestWithParam<BinaryOpInputs<InType, IdxType, OutType>> {
  public:
   BinaryOpTest()
-    : params(::testing::TestWithParam<
-             BinaryOpInputs<InType, IdxType, OutType>>::GetParam()),
+    : params(::testing::TestWithParam<BinaryOpInputs<InType, IdxType, OutType>>::GetParam()),
       stream(handle.get_stream()),
       in1(params.len, stream),
       in2(params.len, stream),
       out_ref(params.len, stream),
-      out(params.len, stream) {}
+      out(params.len, stream)
+  {
+  }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     raft::random::Rng r(params.seed);
     IdxType len = params.len;
     r.uniform(in1.data(), len, InType(-1.0), InType(1.0), stream);
@@ -71,67 +72,66 @@ class BinaryOpTest
   rmm::device_uvector<OutType> out;
 };
 
-const std::vector<BinaryOpInputs<float, int>> inputsf_i32 = {
-  {0.000001f, 1024 * 1024, 1234ULL}};
+const std::vector<BinaryOpInputs<float, int>> inputsf_i32 = {{0.000001f, 1024 * 1024, 1234ULL}};
 typedef BinaryOpTest<float, int> BinaryOpTestF_i32;
-TEST_P(BinaryOpTestF_i32, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(BinaryOpTestF_i32, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32,
-                         ::testing::ValuesIn(inputsf_i32));
+INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32, ::testing::ValuesIn(inputsf_i32));
 
-const std::vector<BinaryOpInputs<float, size_t>> inputsf_i64 = {
-  {0.000001f, 1024 * 1024, 1234ULL}};
+const std::vector<BinaryOpInputs<float, size_t>> inputsf_i64 = {{0.000001f, 1024 * 1024, 1234ULL}};
 typedef BinaryOpTest<float, size_t> BinaryOpTestF_i64;
-TEST_P(BinaryOpTestF_i64, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(BinaryOpTestF_i64, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i64,
-                         ::testing::ValuesIn(inputsf_i64));
+INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i64, ::testing::ValuesIn(inputsf_i64));
 
 const std::vector<BinaryOpInputs<float, int, double>> inputsf_i32_d = {
   {0.000001f, 1024 * 1024, 1234ULL}};
 typedef BinaryOpTest<float, int, double> BinaryOpTestF_i32_D;
-TEST_P(BinaryOpTestF_i32_D, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(BinaryOpTestF_i32_D, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32_D,
-                         ::testing::ValuesIn(inputsf_i32_d));
+INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestF_i32_D, ::testing::ValuesIn(inputsf_i32_d));
 
-const std::vector<BinaryOpInputs<double, int>> inputsd_i32 = {
-  {0.00000001, 1024 * 1024, 1234ULL}};
+const std::vector<BinaryOpInputs<double, int>> inputsd_i32 = {{0.00000001, 1024 * 1024, 1234ULL}};
 typedef BinaryOpTest<double, int> BinaryOpTestD_i32;
-TEST_P(BinaryOpTestD_i32, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(BinaryOpTestD_i32, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i32,
-                         ::testing::ValuesIn(inputsd_i32));
+INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i32, ::testing::ValuesIn(inputsd_i32));
 
 const std::vector<BinaryOpInputs<double, size_t>> inputsd_i64 = {
   {0.00000001, 1024 * 1024, 1234ULL}};
 typedef BinaryOpTest<double, size_t> BinaryOpTestD_i64;
-TEST_P(BinaryOpTestD_i64, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(BinaryOpTestD_i64, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i64,
-                         ::testing::ValuesIn(inputsd_i64));
+INSTANTIATE_TEST_SUITE_P(BinaryOpTests, BinaryOpTestD_i64, ::testing::ValuesIn(inputsd_i64));
 
 template <typename math_t>
 class BinaryOpAlignment : public ::testing::Test {
  protected:
-  BinaryOpAlignment() {
+  BinaryOpAlignment()
+  {
     CUDA_CHECK(cudaStreamCreate(&stream));
     handle.set_stream(stream);
   }
   void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); }
 
  public:
-  void Misaligned() {
+  void Misaligned()
+  {
     // Test to trigger cudaErrorMisalignedAddress if veclen is incorrectly
     // chosen.
     int n = 1024;
@@ -141,8 +141,12 @@ class BinaryOpAlignment : public ::testing::Test {
     CUDA_CHECK(cudaMemsetAsync(x.data(), 0, n * sizeof(math_t), stream));
     CUDA_CHECK(cudaMemsetAsync(y.data(), 0, n * sizeof(math_t), stream));
     raft::linalg::binaryOp(
-      z.data() + 9, x.data() + 137, y.data() + 19, 256,
-      [] __device__(math_t x, math_t y) { return x + y; }, stream);
+      z.data() + 9,
+      x.data() + 137,
+      y.data() + 19,
+      256,
+      [] __device__(math_t x, math_t y) { return x + y; },
+      stream);
   }
 
   raft::handle_t handle;
diff --git a/cpp/test/linalg/binary_op.cuh b/cpp/test/linalg/binary_op.cuh
index fd8ed6dd1e..97cb3ecb24 100644
--- a/cpp/test/linalg/binary_op.cuh
+++ b/cpp/test/linalg/binary_op.cuh
@@ -24,18 +24,17 @@ namespace raft {
 namespace linalg {
 
 template <typename InType, typename OutType, typename IdxType>
-__global__ void naiveAddKernel(OutType *out, const InType *in1,
-                               const InType *in2, IdxType len) {
+__global__ void naiveAddKernel(OutType* out, const InType* in1, const InType* in2, IdxType len)
+{
   IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * (IdxType)blockDim.x);
-  if (idx < len) {
-    out[idx] = static_cast<OutType>(in1[idx] + in2[idx]);
-  }
+  if (idx < len) { out[idx] = static_cast<OutType>(in1[idx] + in2[idx]); }
 }
 
 template <typename InType, typename IdxType = int, typename OutType = InType>
-void naiveAdd(OutType *out, const InType *in1, const InType *in2, IdxType len) {
+void naiveAdd(OutType* out, const InType* in1, const InType* in2, IdxType len)
+{
   static const IdxType TPB = 64;
-  IdxType nblks = raft::ceildiv(len, TPB);
+  IdxType nblks            = raft::ceildiv(len, TPB);
   naiveAddKernel<InType, OutType, IdxType><<<nblks, TPB>>>(out, in1, in2, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
@@ -48,8 +47,8 @@ struct BinaryOpInputs {
 };
 
 template <typename InType, typename IdxType = int, typename OutType = InType>
-::std::ostream &operator<<(::std::ostream &os,
-                           const BinaryOpInputs<InType, IdxType, OutType> &d) {
+::std::ostream& operator<<(::std::ostream& os, const BinaryOpInputs<InType, IdxType, OutType>& d)
+{
   return os;
 }
 
diff --git a/cpp/test/linalg/cholesky_r1.cu b/cpp/test/linalg/cholesky_r1.cu
index 262a1ad26c..6c7bbd1232 100644
--- a/cpp/test/linalg/cholesky_r1.cu
+++ b/cpp/test/linalg/cholesky_r1.cu
@@ -36,7 +36,8 @@ class CholeskyR1Test : public ::testing::Test {
       L(n_rows * n_rows, handle.get_stream()),
       L_exp(n_rows * n_rows, handle.get_stream()),
       devInfo(handle.get_stream()),
-      workspace(0, handle.get_stream()) {
+      workspace(0, handle.get_stream())
+  {
     CUDA_CHECK(cudaStreamCreate(&stream));
     handle.set_stream(stream);
     raft::update_device(G.data(), G_host, n_rows * n_rows, stream);
@@ -48,55 +49,58 @@ class CholeskyR1Test : public ::testing::Test {
     int n_bytes = 0;
     // Initializing in CUBLAS_FILL_MODE_LOWER, because that has larger workspace
     // requirements.
-    raft::linalg::choleskyRank1Update(handle, L.data(), n_rows, n_rows, nullptr,
-                                      &n_bytes, CUBLAS_FILL_MODE_LOWER, stream);
+    raft::linalg::choleskyRank1Update(
+      handle, L.data(), n_rows, n_rows, nullptr, &n_bytes, CUBLAS_FILL_MODE_LOWER, stream);
     Lwork = std::max(Lwork * sizeof(math_t), (size_t)n_bytes);
     workspace.resize(Lwork, stream);
   }
 
   void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); }
 
-  void testR1Update() {
+  void testR1Update()
+  {
     int n = n_rows * n_rows;
-    std::vector<cublasFillMode_t> fillmode{CUBLAS_FILL_MODE_LOWER,
-                                           CUBLAS_FILL_MODE_UPPER};
+    std::vector<cublasFillMode_t> fillmode{CUBLAS_FILL_MODE_LOWER, CUBLAS_FILL_MODE_UPPER};
     for (auto uplo : fillmode) {
       raft::copy(L.data(), G.data(), n, stream);
       for (int rank = 1; rank <= n_rows; rank++) {
         std::stringstream ss;
-        ss << "Rank " << rank
-           << ((uplo == CUBLAS_FILL_MODE_LOWER) ? ", lower" : ", upper");
+        ss << "Rank " << rank << ((uplo == CUBLAS_FILL_MODE_LOWER) ? ", lower" : ", upper");
         SCOPED_TRACE(ss.str());
 
         // Expected solution using Cholesky factorization from scratch
         raft::copy(L_exp.data(), G.data(), n, stream);
-        CUSOLVER_CHECK(raft::linalg::cusolverDnpotrf(
-          solver_handle, uplo, rank, L_exp.data(), n_rows,
-          (math_t*)workspace.data(), Lwork, devInfo.data(), stream));
+        CUSOLVER_CHECK(raft::linalg::cusolverDnpotrf(solver_handle,
+                                                     uplo,
+                                                     rank,
+                                                     L_exp.data(),
+                                                     n_rows,
+                                                     (math_t*)workspace.data(),
+                                                     Lwork,
+                                                     devInfo.data(),
+                                                     stream));
 
         // Incremental Cholesky factorization using rank one updates.
-        raft::linalg::choleskyRank1Update(handle, L.data(), rank, n_rows,
-                                          workspace.data(), &Lwork, uplo,
-                                          stream);
+        raft::linalg::choleskyRank1Update(
+          handle, L.data(), rank, n_rows, workspace.data(), &Lwork, uplo, stream);
 
-        ASSERT_TRUE(raft::devArrMatch(L_exp.data(), L.data(), n_rows * rank,
-                                      raft::CompareApprox<math_t>(3e-3)));
+        ASSERT_TRUE(raft::devArrMatch(
+          L_exp.data(), L.data(), n_rows * rank, raft::CompareApprox<math_t>(3e-3)));
       }
     }
   }
 
-  void testR1Error() {
+  void testR1Error()
+  {
     raft::update_device(G.data(), G2_host, 4, stream);
-    std::vector<cublasFillMode_t> fillmode{CUBLAS_FILL_MODE_LOWER,
-                                           CUBLAS_FILL_MODE_UPPER};
+    std::vector<cublasFillMode_t> fillmode{CUBLAS_FILL_MODE_LOWER, CUBLAS_FILL_MODE_UPPER};
     for (auto uplo : fillmode) {
       raft::copy(L.data(), G.data(), 4, stream);
       ASSERT_NO_THROW(raft::linalg::choleskyRank1Update(
         handle, L.data(), 1, 2, workspace.data(), &Lwork, uplo, stream));
-      ASSERT_THROW(
-        raft::linalg::choleskyRank1Update(
-          handle, L.data(), 2, 2, workspace.data(), &Lwork, uplo, stream),
-        raft::exception);
+      ASSERT_THROW(raft::linalg::choleskyRank1Update(
+                     handle, L.data(), 2, 2, workspace.data(), &Lwork, uplo, stream),
+                   raft::exception);
 
       math_t eps = std::numeric_limits<math_t>::epsilon();
       ASSERT_NO_THROW(raft::linalg::choleskyRank1Update(
diff --git a/cpp/test/linalg/coalesced_reduction.cu b/cpp/test/linalg/coalesced_reduction.cu
index fdfc3052b7..9bb84e1eb7 100644
--- a/cpp/test/linalg/coalesced_reduction.cu
+++ b/cpp/test/linalg/coalesced_reduction.cu
@@ -33,8 +33,8 @@ struct coalescedReductionInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os,
-                           const coalescedReductionInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const coalescedReductionInputs<T>& dims)
+{
   return os;
 }
 
@@ -42,25 +42,28 @@ template <typename T>
 // for an extended __device__ lambda cannot have private or protected access
 // within its class
 template <typename T>
-void coalescedReductionLaunch(T *dots, const T *data, int cols, int rows,
-                              cudaStream_t stream, bool inplace = false) {
-  coalescedReduction(dots, data, cols, rows, (T)0, stream, inplace,
-                     [] __device__(T in, int i) { return in * in; });
+void coalescedReductionLaunch(
+  T* dots, const T* data, int cols, int rows, cudaStream_t stream, bool inplace = false)
+{
+  coalescedReduction(
+    dots, data, cols, rows, (T)0, stream, inplace, [] __device__(T in, int i) { return in * in; });
 }
 
 template <typename T>
-class coalescedReductionTest
-  : public ::testing::TestWithParam<coalescedReductionInputs<T>> {
+class coalescedReductionTest : public ::testing::TestWithParam<coalescedReductionInputs<T>> {
  public:
   coalescedReductionTest()
     : params(::testing::TestWithParam<coalescedReductionInputs<T>>::GetParam()),
       stream(handle.get_stream()),
       data(params.rows * params.cols, stream),
       dots_exp(params.rows * params.cols, stream),
-      dots_act(params.rows * params.cols, stream) {}
+      dots_act(params.rows * params.cols, stream)
+  {
+  }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     raft::random::Rng r(params.seed);
     int rows = params.rows, cols = params.cols;
     int len = rows * cols;
@@ -70,8 +73,7 @@ class coalescedReductionTest
     // Perform reduction with default inplace = false first
     coalescedReductionLaunch(dots_act.data(), data.data(), cols, rows, stream);
     // Add to result with inplace = true next
-    coalescedReductionLaunch(dots_act.data(), data.data(), cols, rows, stream,
-                             true);
+    coalescedReductionLaunch(dots_act.data(), data.data(), cols, rows, stream, true);
 
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
@@ -86,34 +88,36 @@ class coalescedReductionTest
   rmm::device_uvector<T> dots_act;
 };
 
-const std::vector<coalescedReductionInputs<float>> inputsf = {
-  {0.000002f, 1024, 32, 1234ULL},
-  {0.000002f, 1024, 64, 1234ULL},
-  {0.000002f, 1024, 128, 1234ULL},
-  {0.000002f, 1024, 256, 1234ULL}};
+const std::vector<coalescedReductionInputs<float>> inputsf = {{0.000002f, 1024, 32, 1234ULL},
+                                                              {0.000002f, 1024, 64, 1234ULL},
+                                                              {0.000002f, 1024, 128, 1234ULL},
+                                                              {0.000002f, 1024, 256, 1234ULL}};
 
-const std::vector<coalescedReductionInputs<double>> inputsd = {
-  {0.000000001, 1024, 32, 1234ULL},
-  {0.000000001, 1024, 64, 1234ULL},
-  {0.000000001, 1024, 128, 1234ULL},
-  {0.000000001, 1024, 256, 1234ULL}};
+const std::vector<coalescedReductionInputs<double>> inputsd = {{0.000000001, 1024, 32, 1234ULL},
+                                                               {0.000000001, 1024, 64, 1234ULL},
+                                                               {0.000000001, 1024, 128, 1234ULL},
+                                                               {0.000000001, 1024, 256, 1234ULL}};
 
 typedef coalescedReductionTest<float> coalescedReductionTestF;
-TEST_P(coalescedReductionTestF, Result) {
-  ASSERT_TRUE(raft::devArrMatch(dots_exp.data(), dots_act.data(), params.rows,
-                                raft::CompareApprox<float>(params.tolerance)));
+TEST_P(coalescedReductionTestF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    dots_exp.data(), dots_act.data(), params.rows, raft::CompareApprox<float>(params.tolerance)));
 }
 
 typedef coalescedReductionTest<double> coalescedReductionTestD;
-TEST_P(coalescedReductionTestD, Result) {
-  ASSERT_TRUE(raft::devArrMatch(dots_exp.data(), dots_act.data(), params.rows,
-                                raft::CompareApprox<double>(params.tolerance)));
+TEST_P(coalescedReductionTestD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    dots_exp.data(), dots_act.data(), params.rows, raft::CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_CASE_P(coalescedReductionTests, coalescedReductionTestF,
+INSTANTIATE_TEST_CASE_P(coalescedReductionTests,
+                        coalescedReductionTestF,
                         ::testing::ValuesIn(inputsf));
 
-INSTANTIATE_TEST_CASE_P(coalescedReductionTests, coalescedReductionTestD,
+INSTANTIATE_TEST_CASE_P(coalescedReductionTests,
+                        coalescedReductionTestD,
                         ::testing::ValuesIn(inputsd));
 
 }  // end namespace linalg
diff --git a/cpp/test/linalg/divide.cu b/cpp/test/linalg/divide.cu
index d90955147c..130a22abf0 100644
--- a/cpp/test/linalg/divide.cu
+++ b/cpp/test/linalg/divide.cu
@@ -25,37 +25,36 @@ namespace raft {
 namespace linalg {
 
 template <typename Type>
-__global__ void naiveDivideKernel(Type *out, const Type *in, Type scalar,
-                                  int len) {
+__global__ void naiveDivideKernel(Type* out, const Type* in, Type scalar, int len)
+{
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < len) {
-    out[idx] = in[idx] / scalar;
-  }
+  if (idx < len) { out[idx] = in[idx] / scalar; }
 }
 
 template <typename Type>
-void naiveDivide(Type *out, const Type *in, Type scalar, int len,
-                 cudaStream_t stream) {
+void naiveDivide(Type* out, const Type* in, Type scalar, int len, cudaStream_t stream)
+{
   static const int TPB = 64;
-  int nblks = raft::ceildiv(len, TPB);
+  int nblks            = raft::ceildiv(len, TPB);
   naiveDivideKernel<Type><<<nblks, TPB, 0, stream>>>(out, in, scalar, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
 template <typename T>
-class DivideTest
-  : public ::testing::TestWithParam<raft::linalg::UnaryOpInputs<T>> {
+class DivideTest : public ::testing::TestWithParam<raft::linalg::UnaryOpInputs<T>> {
  public:
   DivideTest()
-    : params(
-        ::testing::TestWithParam<raft::linalg::UnaryOpInputs<T>>::GetParam()),
+    : params(::testing::TestWithParam<raft::linalg::UnaryOpInputs<T>>::GetParam()),
       stream(handle.get_stream()),
       in(params.len, stream),
       out_ref(params.len, stream),
-      out(params.len, stream) {}
+      out(params.len, stream)
+  {
+  }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     raft::random::Rng r(params.seed);
     int len = params.len;
     CUDA_CHECK(cudaStreamCreate(&stream));
@@ -75,25 +74,23 @@ class DivideTest
   rmm::device_uvector<T> out;
 };
 
-const std::vector<UnaryOpInputs<float>> inputsf = {
-  {0.000001f, 1024 * 1024, 2.f, 1234ULL}};
+const std::vector<UnaryOpInputs<float>> inputsf = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}};
 typedef DivideTest<float> DivideTestF;
-TEST_P(DivideTestF, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len,
-                          raft::CompareApprox<float>(params.tolerance)));
+TEST_P(DivideTestF, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    out_ref.data(), out.data(), params.len, raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestF,
-                         ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestF, ::testing::ValuesIn(inputsf));
 
 typedef DivideTest<double> DivideTestD;
-const std::vector<UnaryOpInputs<double>> inputsd = {
-  {0.000001f, 1024 * 1024, 2.f, 1234ULL}};
-TEST_P(DivideTestD, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len,
-                          raft::CompareApprox<double>(params.tolerance)));
+const std::vector<UnaryOpInputs<double>> inputsd = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}};
+TEST_P(DivideTestD, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    out_ref.data(), out.data(), params.len, raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestD,
-                         ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(DivideTests, DivideTestD, ::testing::ValuesIn(inputsd));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/eig.cu b/cpp/test/linalg/eig.cu
index 2ac9118506..3df3abd2af 100644
--- a/cpp/test/linalg/eig.cu
+++ b/cpp/test/linalg/eig.cu
@@ -35,7 +35,8 @@ struct EigInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os, const EigInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const EigInputs<T>& dims)
+{
   return os;
 }
 
@@ -56,34 +57,60 @@ class EigTest : public ::testing::TestWithParam<EigInputs<T>> {
       eig_vectors_large(params.n * params.n, stream),
       eig_vectors_jacobi_large(params.n * params.n, stream),
       eig_vals_large(params.n, stream),
-      eig_vals_jacobi_large(params.n, stream) {}
+      eig_vals_jacobi_large(params.n, stream)
+  {
+  }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     raft::random::Rng r(params.seed);
     int len = params.len;
 
-    T cov_matrix_h[] = {1.0,  0.9, 0.81, 0.729, 0.9,   1.0,  0.9, 0.81,
-                        0.81, 0.9, 1.0,  0.9,   0.729, 0.81, 0.9, 1.0};
+    T cov_matrix_h[] = {
+      1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0};
     ASSERT(len == 16, "This test only works with 4x4 matrices!");
     raft::update_device(cov_matrix.data(), cov_matrix_h, len, stream);
 
-    T eig_vectors_ref_h[] = {0.2790, -0.6498, 0.6498, -0.2789, -0.5123, 0.4874,
-                             0.4874, -0.5123, 0.6498, 0.2789,  -0.2789, -0.6498,
-                             0.4874, 0.5123,  0.5123, 0.4874};
-    T eig_vals_ref_h[] = {0.0614, 0.1024, 0.3096, 3.5266};
+    T eig_vectors_ref_h[] = {0.2790,
+                             -0.6498,
+                             0.6498,
+                             -0.2789,
+                             -0.5123,
+                             0.4874,
+                             0.4874,
+                             -0.5123,
+                             0.6498,
+                             0.2789,
+                             -0.2789,
+                             -0.6498,
+                             0.4874,
+                             0.5123,
+                             0.5123,
+                             0.4874};
+    T eig_vals_ref_h[]    = {0.0614, 0.1024, 0.3096, 3.5266};
 
     raft::update_device(eig_vectors_ref.data(), eig_vectors_ref_h, len, stream);
-    raft::update_device(eig_vals_ref.data(), eig_vals_ref_h, params.n_col,
-                        stream);
+    raft::update_device(eig_vals_ref.data(), eig_vals_ref_h, params.n_col, stream);
 
-    eigDC(handle, cov_matrix.data(), params.n_row, params.n_col,
-          eig_vectors.data(), eig_vals.data(), stream);
+    eigDC(handle,
+          cov_matrix.data(),
+          params.n_row,
+          params.n_col,
+          eig_vectors.data(),
+          eig_vals.data(),
+          stream);
 
-    T tol = 1.e-7;
+    T tol      = 1.e-7;
     int sweeps = 15;
-    eigJacobi(handle, cov_matrix.data(), params.n_row, params.n_col,
-              eig_vectors_jacobi.data(), eig_vals_jacobi.data(), stream, tol,
+    eigJacobi(handle,
+              cov_matrix.data(),
+              params.n_row,
+              params.n_col,
+              eig_vectors_jacobi.data(),
+              eig_vals_jacobi.data(),
+              stream,
+              tol,
               sweeps);
 
     // test code for comparing two methods
@@ -91,11 +118,22 @@ class EigTest : public ::testing::TestWithParam<EigInputs<T>> {
 
     r.uniform(cov_matrix_large.data(), len, T(-1.0), T(1.0), stream);
 
-    eigDC(handle, cov_matrix_large.data(), params.n, params.n,
-          eig_vectors_large.data(), eig_vals_large.data(), stream);
-    eigJacobi(handle, cov_matrix_large.data(), params.n, params.n,
-              eig_vectors_jacobi_large.data(), eig_vals_jacobi_large.data(),
-              stream, tol, sweeps);
+    eigDC(handle,
+          cov_matrix_large.data(),
+          params.n,
+          params.n,
+          eig_vectors_large.data(),
+          eig_vals_large.data(),
+          stream);
+    eigJacobi(handle,
+              cov_matrix_large.data(),
+              params.n,
+              params.n,
+              eig_vectors_jacobi_large.data(),
+              eig_vals_jacobi_large.data(),
+              stream,
+              tol,
+              sweeps);
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
@@ -105,87 +143,105 @@ class EigTest : public ::testing::TestWithParam<EigInputs<T>> {
 
   EigInputs<T> params;
 
-  rmm::device_uvector<T> cov_matrix, eig_vectors, eig_vectors_jacobi,
-    eig_vectors_ref, eig_vals, eig_vals_jacobi, eig_vals_ref;
+  rmm::device_uvector<T> cov_matrix, eig_vectors, eig_vectors_jacobi, eig_vectors_ref, eig_vals,
+    eig_vals_jacobi, eig_vals_ref;
 
-  rmm::device_uvector<T> cov_matrix_large, eig_vectors_large,
-    eig_vectors_jacobi_large, eig_vals_large, eig_vals_jacobi_large;
+  rmm::device_uvector<T> cov_matrix_large, eig_vectors_large, eig_vectors_jacobi_large,
+    eig_vals_large, eig_vals_jacobi_large;
 };
 
-const std::vector<EigInputs<float>> inputsf2 = {
-  {0.001f, 4 * 4, 4, 4, 1234ULL, 256}};
+const std::vector<EigInputs<float>> inputsf2 = {{0.001f, 4 * 4, 4, 4, 1234ULL, 256}};
 
-const std::vector<EigInputs<double>> inputsd2 = {
-  {0.001, 4 * 4, 4, 4, 1234ULL, 256}};
+const std::vector<EigInputs<double>> inputsd2 = {{0.001, 4 * 4, 4, 4, 1234ULL, 256}};
 
 typedef EigTest<float> EigTestValF;
-TEST_P(EigTestValF, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(eig_vals_ref.data(), eig_vals.data(), params.n_col,
-                      raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(EigTestValF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(eig_vals_ref.data(),
+                                eig_vals.data(),
+                                params.n_col,
+                                raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef EigTest<double> EigTestValD;
-TEST_P(EigTestValD, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(eig_vals_ref.data(), eig_vals.data(), params.n_col,
-                      raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(EigTestValD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(eig_vals_ref.data(),
+                                eig_vals.data(),
+                                params.n_col,
+                                raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
 typedef EigTest<float> EigTestVecF;
-TEST_P(EigTestVecF, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(eig_vectors_ref.data(), eig_vectors.data(), params.len,
-                      raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(EigTestVecF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(eig_vectors_ref.data(),
+                                eig_vectors.data(),
+                                params.len,
+                                raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef EigTest<double> EigTestVecD;
-TEST_P(EigTestVecD, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(eig_vectors_ref.data(), eig_vectors.data(), params.len,
-                      raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(EigTestVecD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(eig_vectors_ref.data(),
+                                eig_vectors.data(),
+                                params.len,
+                                raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
 typedef EigTest<float> EigTestValJacobiF;
-TEST_P(EigTestValJacobiF, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(eig_vals_ref.data(), eig_vals_jacobi.data(), params.n_col,
-                      raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(EigTestValJacobiF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(eig_vals_ref.data(),
+                                eig_vals_jacobi.data(),
+                                params.n_col,
+                                raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef EigTest<double> EigTestValJacobiD;
-TEST_P(EigTestValJacobiD, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(eig_vals_ref.data(), eig_vals_jacobi.data(), params.n_col,
-                      raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(EigTestValJacobiD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(eig_vals_ref.data(),
+                                eig_vals_jacobi.data(),
+                                params.n_col,
+                                raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
 typedef EigTest<float> EigTestVecJacobiF;
-TEST_P(EigTestVecJacobiF, Result) {
-  ASSERT_TRUE(raft::devArrMatch(
-    eig_vectors_ref.data(), eig_vectors_jacobi.data(), params.len,
-    raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(EigTestVecJacobiF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(eig_vectors_ref.data(),
+                                eig_vectors_jacobi.data(),
+                                params.len,
+                                raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef EigTest<double> EigTestVecJacobiD;
-TEST_P(EigTestVecJacobiD, Result) {
-  ASSERT_TRUE(raft::devArrMatch(
-    eig_vectors_ref.data(), eig_vectors_jacobi.data(), params.len,
-    raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(EigTestVecJacobiD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(eig_vectors_ref.data(),
+                                eig_vectors_jacobi.data(),
+                                params.len,
+                                raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
 typedef EigTest<float> EigTestVecCompareF;
-TEST_P(EigTestVecCompareF, Result) {
-  ASSERT_TRUE(raft::devArrMatch(
-    eig_vectors_large.data(), eig_vectors_jacobi_large.data(),
-    (params.n * params.n), raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(EigTestVecCompareF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(eig_vectors_large.data(),
+                                eig_vectors_jacobi_large.data(),
+                                (params.n * params.n),
+                                raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef EigTest<double> EigTestVecCompareD;
-TEST_P(EigTestVecCompareD, Result) {
-  ASSERT_TRUE(raft::devArrMatch(
-    eig_vectors_large.data(), eig_vectors_jacobi_large.data(),
-    (params.n * params.n), raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(EigTestVecCompareD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(eig_vectors_large.data(),
+                                eig_vectors_jacobi_large.data(),
+                                (params.n * params.n),
+                                raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
 INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValF, ::testing::ValuesIn(inputsf2));
@@ -196,17 +252,13 @@ INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecF, ::testing::ValuesIn(inputsf2));
 
 INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecD, ::testing::ValuesIn(inputsd2));
 
-INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiF,
-                         ::testing::ValuesIn(inputsf2));
+INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiF, ::testing::ValuesIn(inputsf2));
 
-INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiD,
-                         ::testing::ValuesIn(inputsd2));
+INSTANTIATE_TEST_SUITE_P(EigTests, EigTestValJacobiD, ::testing::ValuesIn(inputsd2));
 
-INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiF,
-                         ::testing::ValuesIn(inputsf2));
+INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiF, ::testing::ValuesIn(inputsf2));
 
-INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiD,
-                         ::testing::ValuesIn(inputsd2));
+INSTANTIATE_TEST_SUITE_P(EigTests, EigTestVecJacobiD, ::testing::ValuesIn(inputsd2));
 
 }  // namespace linalg
 }  // namespace raft
diff --git a/cpp/test/linalg/eig_sel.cu b/cpp/test/linalg/eig_sel.cu
index 9eb1c10313..b1e88c91dd 100644
--- a/cpp/test/linalg/eig_sel.cu
+++ b/cpp/test/linalg/eig_sel.cu
@@ -37,7 +37,8 @@ struct EigSelInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os, const EigSelInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const EigSelInputs<T>& dims)
+{
   return os;
 }
 
@@ -51,27 +52,46 @@ class EigSelTest : public ::testing::TestWithParam<EigSelInputs<T>> {
       eig_vectors(12, stream),
       eig_vectors_ref(12, stream),
       eig_vals(params.n_col, stream),
-      eig_vals_ref(params.n_col, stream) {}
+      eig_vals_ref(params.n_col, stream)
+  {
+  }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     int len = params.len;
 
-    T cov_matrix_h[] = {1.0,  0.9, 0.81, 0.729, 0.9,   1.0,  0.9, 0.81,
-                        0.81, 0.9, 1.0,  0.9,   0.729, 0.81, 0.9, 1.0};
+    T cov_matrix_h[] = {
+      1.0, 0.9, 0.81, 0.729, 0.9, 1.0, 0.9, 0.81, 0.81, 0.9, 1.0, 0.9, 0.729, 0.81, 0.9, 1.0};
     ASSERT(len == 16, "This test only works with 4x4 matrices!");
     raft::update_device(cov_matrix.data(), cov_matrix_h, len, stream);
 
-    T eig_vectors_ref_h[] = {-0.5123, 0.4874,  0.4874, -0.5123, 0.6498, 0.2789,
-                             -0.2789, -0.6498, 0.4874, 0.5123,  0.5123, 0.4874};
-    T eig_vals_ref_h[] = {0.1024, 0.3096, 3.5266, 3.5266};
+    T eig_vectors_ref_h[] = {-0.5123,
+                             0.4874,
+                             0.4874,
+                             -0.5123,
+                             0.6498,
+                             0.2789,
+                             -0.2789,
+                             -0.6498,
+                             0.4874,
+                             0.5123,
+                             0.5123,
+                             0.4874};
+    T eig_vals_ref_h[]    = {0.1024, 0.3096, 3.5266, 3.5266};
 
     raft::update_device(eig_vectors_ref.data(), eig_vectors_ref_h, 12, stream);
     raft::update_device(eig_vals_ref.data(), eig_vals_ref_h, 4, stream);
 
-    eigSelDC(handle, cov_matrix.data(), params.n_row, params.n_col, 3,
-             eig_vectors.data(), eig_vals.data(),
-             EigVecMemUsage::OVERWRITE_INPUT, stream);
+    eigSelDC(handle,
+             cov_matrix.data(),
+             params.n_row,
+             params.n_col,
+             3,
+             eig_vectors.data(),
+             eig_vals.data(),
+             EigVecMemUsage::OVERWRITE_INPUT,
+             stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
@@ -87,51 +107,53 @@ class EigSelTest : public ::testing::TestWithParam<EigSelInputs<T>> {
   rmm::device_uvector<T> eig_vals_ref;
 };
 
-const std::vector<EigSelInputs<float>> inputsf2 = {
-  {0.001f, 4 * 4, 4, 4, 1234ULL, 256}};
+const std::vector<EigSelInputs<float>> inputsf2 = {{0.001f, 4 * 4, 4, 4, 1234ULL, 256}};
 
-const std::vector<EigSelInputs<double>> inputsd2 = {
-  {0.001, 4 * 4, 4, 4, 1234ULL, 256}};
+const std::vector<EigSelInputs<double>> inputsd2 = {{0.001, 4 * 4, 4, 4, 1234ULL, 256}};
 
 typedef EigSelTest<float> EigSelTestValF;
-TEST_P(EigSelTestValF, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(eig_vals_ref.data(), eig_vals.data(), params.n_col,
-                      raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(EigSelTestValF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(eig_vals_ref.data(),
+                                eig_vals.data(),
+                                params.n_col,
+                                raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef EigSelTest<double> EigSelTestValD;
-TEST_P(EigSelTestValD, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(eig_vals_ref.data(), eig_vals.data(), params.n_col,
-                      raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(EigSelTestValD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(eig_vals_ref.data(),
+                                eig_vals.data(),
+                                params.n_col,
+                                raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
 typedef EigSelTest<float> EigSelTestVecF;
-TEST_P(EigSelTestVecF, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(eig_vectors_ref.data(), eig_vectors.data(), 12,
-                      raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(EigSelTestVecF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(eig_vectors_ref.data(),
+                                eig_vectors.data(),
+                                12,
+                                raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef EigSelTest<double> EigSelTestVecD;
-TEST_P(EigSelTestVecD, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(eig_vectors_ref.data(), eig_vectors.data(), 12,
-                      raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(EigSelTestVecD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(eig_vectors_ref.data(),
+                                eig_vectors.data(),
+                                12,
+                                raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValF,
-                         ::testing::ValuesIn(inputsf2));
+INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValF, ::testing::ValuesIn(inputsf2));
 
-INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValD,
-                         ::testing::ValuesIn(inputsd2));
+INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestValD, ::testing::ValuesIn(inputsd2));
 
-INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecF,
-                         ::testing::ValuesIn(inputsf2));
+INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecF, ::testing::ValuesIn(inputsf2));
 
-INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecD,
-                         ::testing::ValuesIn(inputsd2));
+INSTANTIATE_TEST_SUITE_P(EigSelTest, EigSelTestVecD, ::testing::ValuesIn(inputsd2));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/eltwise.cu b/cpp/test/linalg/eltwise.cu
index c3b26f5423..5ecca16be6 100644
--- a/cpp/test/linalg/eltwise.cu
+++ b/cpp/test/linalg/eltwise.cu
@@ -26,19 +26,17 @@ namespace linalg {
 //// Testing unary ops
 
 template <typename Type>
-__global__ void naiveScaleKernel(Type *out, const Type *in, Type scalar,
-                                 int len) {
+__global__ void naiveScaleKernel(Type* out, const Type* in, Type scalar, int len)
+{
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < len) {
-    out[idx] = scalar * in[idx];
-  }
+  if (idx < len) { out[idx] = scalar * in[idx]; }
 }
 
 template <typename Type>
-void naiveScale(Type *out, const Type *in, Type scalar, int len,
-                cudaStream_t stream) {
+void naiveScale(Type* out, const Type* in, Type scalar, int len, cudaStream_t stream)
+{
   static const int TPB = 64;
-  int nblks = raft::ceildiv(len, TPB);
+  int nblks            = raft::ceildiv(len, TPB);
   naiveScaleKernel<Type><<<nblks, TPB, 0, stream>>>(out, in, scalar, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
@@ -52,26 +50,28 @@ struct ScalarMultiplyInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os,
-                           const ScalarMultiplyInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const ScalarMultiplyInputs<T>& dims)
+{
   return os;
 }
 
 template <typename T>
-class ScalarMultiplyTest
-  : public ::testing::TestWithParam<ScalarMultiplyInputs<T>> {
+class ScalarMultiplyTest : public ::testing::TestWithParam<ScalarMultiplyInputs<T>> {
  public:
   ScalarMultiplyTest()
     : params(::testing::TestWithParam<ScalarMultiplyInputs<T>>::GetParam()),
       stream(handle.get_stream()),
       in(len, stream),
       out_ref(len, stream),
-      out(len, stream) {}
+      out(len, stream)
+  {
+  }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     raft::random::Rng r(params.seed);
-    int len = params.len;
+    int len  = params.len;
     T scalar = params.scalar;
     r.uniform(in, len, T(-1.0), T(1.0), stream);
     naiveScale(out_ref, in, scalar, len, stream);
@@ -87,46 +87,43 @@ class ScalarMultiplyTest
   rmm::device_uvector<T> in, out_ref, out;
 };
 
-const std::vector<ScalarMultiplyInputs<float>> inputsf1 = {
-  {0.000001f, 1024 * 1024, 2.f, 1234ULL}};
+const std::vector<ScalarMultiplyInputs<float>> inputsf1 = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}};
 
 const std::vector<ScalarMultiplyInputs<double>> inputsd1 = {
   {0.00000001, 1024 * 1024, 2.0, 1234ULL}};
 
 typedef ScalarMultiplyTest<float> ScalarMultiplyTestF;
-TEST_P(ScalarMultiplyTestF, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(ScalarMultiplyTestF, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox<float>(params.tolerance)));
 }
 
 typedef ScalarMultiplyTest<double> ScalarMultiplyTestD;
-TEST_P(ScalarMultiplyTestD, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(ScalarMultiplyTestD, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestF,
-                         ::testing::ValuesIn(inputsf1));
+INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestF, ::testing::ValuesIn(inputsf1));
 
-INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestD,
-                         ::testing::ValuesIn(inputsd1));
+INSTANTIATE_TEST_SUITE_P(ScalarMultiplyTests, ScalarMultiplyTestD, ::testing::ValuesIn(inputsd1));
 
 //// Testing binary ops
 
 template <typename Type>
-__global__ void naiveAddKernel(Type *out, const Type *in1, const Type *in2,
-                               int len) {
+__global__ void naiveAddKernel(Type* out, const Type* in1, const Type* in2, int len)
+{
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < len) {
-    out[idx] = in1[idx] + in2[idx];
-  }
+  if (idx < len) { out[idx] = in1[idx] + in2[idx]; }
 }
 
 template <typename Type>
-void naiveAdd(Type *out, const Type *in1, const Type *in2, int len,
-              cudaStream_t stream) {
+void naiveAdd(Type* out, const Type* in1, const Type* in2, int len, cudaStream_t stream)
+{
   static const int TPB = 64;
-  int nblks = raft::ceildiv(len, TPB);
+  int nblks            = raft::ceildiv(len, TPB);
   naiveAddKernel<Type><<<nblks, TPB, 0, stream>>>(out, in1, in2, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
@@ -139,8 +136,8 @@ struct EltwiseAddInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os,
-                           const EltwiseAddInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const EltwiseAddInputs<T>& dims)
+{
   return os;
 }
 
@@ -153,10 +150,13 @@ class EltwiseAddTest : public ::testing::TestWithParam<EltwiseAddInputs<T>> {
       in1(params.len, stream),
       in2(params.len, stream),
       out_ref(params.len, stream),
-      out(params.len, stream) {}
+      out(params.len, stream)
+  {
+  }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     params = ::testing::TestWithParam<EltwiseAddInputs<T>>::GetParam();
     raft::random::Rng r(params.seed);
     int len = params.len;
@@ -175,29 +175,27 @@ class EltwiseAddTest : public ::testing::TestWithParam<EltwiseAddInputs<T>> {
   rmm::device_uvector<T> in1, in2, out_ref, out;
 };
 
-const std::vector<EltwiseAddInputs<float>> inputsf2 = {
-  {0.000001f, 1024 * 1024, 1234ULL}};
+const std::vector<EltwiseAddInputs<float>> inputsf2 = {{0.000001f, 1024 * 1024, 1234ULL}};
 
-const std::vector<EltwiseAddInputs<double>> inputsd2 = {
-  {0.00000001, 1024 * 1024, 1234ULL}};
+const std::vector<EltwiseAddInputs<double>> inputsd2 = {{0.00000001, 1024 * 1024, 1234ULL}};
 
 typedef EltwiseAddTest<float> EltwiseAddTestF;
-TEST_P(EltwiseAddTestF, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(EltwiseAddTestF, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox<float>(params.tolerance)));
 }
 
 typedef EltwiseAddTest<double> EltwiseAddTestD;
-TEST_P(EltwiseAddTestD, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(EltwiseAddTestD, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestF,
-                         ::testing::ValuesIn(inputsf2));
+INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestF, ::testing::ValuesIn(inputsf2));
 
-INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestD,
-                         ::testing::ValuesIn(inputsd2));
+INSTANTIATE_TEST_SUITE_P(EltwiseAddTests, EltwiseAddTestD, ::testing::ValuesIn(inputsd2));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/gemm_layout.cu b/cpp/test/linalg/gemm_layout.cu
index 699d40d55e..6231715c8a 100644
--- a/cpp/test/linalg/gemm_layout.cu
+++ b/cpp/test/linalg/gemm_layout.cu
@@ -36,9 +36,9 @@ struct GemmLayoutInputs {
 
 // Reference GEMM implementation.
 template <typename T>
-__global__ void naiveGemm(T *Z, T *X, T *Y, int M, int N, int K,
-                          bool isZColMajor, bool isXColMajor,
-                          bool isYColMajor) {
+__global__ void naiveGemm(
+  T* Z, T* X, T* Y, int M, int N, int K, bool isZColMajor, bool isXColMajor, bool isYColMajor)
+{
   int tidx = blockIdx.x * blockDim.x + threadIdx.x;
   int tidy = blockIdx.y * blockDim.y + threadIdx.y;
 
@@ -51,7 +51,7 @@ __global__ void naiveGemm(T *Z, T *X, T *Y, int M, int N, int K,
         temp += X[xIndex] * Y[yIndex];
       }
       int zIndex = isZColMajor ? m + n * M : m * N + n;
-      Z[zIndex] = temp;
+      Z[zIndex]  = temp;
     }
   }
 }
@@ -59,7 +59,8 @@ __global__ void naiveGemm(T *Z, T *X, T *Y, int M, int N, int K,
 template <typename T>
 class GemmLayoutTest : public ::testing::TestWithParam<GemmLayoutInputs<T>> {
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     params = ::testing::TestWithParam<GemmLayoutInputs<T>>::GetParam();
 
     raft::handle_t handle;
@@ -72,8 +73,8 @@ class GemmLayoutTest : public ::testing::TestWithParam<GemmLayoutInputs<T>> {
     // Dimensions of Y : K x N
     // Dimensions of Z : M x N
 
-    T *X = NULL;  // Argument X
-    T *Y = NULL;  // Argument Y
+    T* X = NULL;  // Argument X
+    T* Y = NULL;  // Argument Y
 
     size_t xElems = params.M * params.K;
     size_t yElems = params.K * params.N;
@@ -87,27 +88,35 @@ class GemmLayoutTest : public ::testing::TestWithParam<GemmLayoutInputs<T>> {
     r.uniform(X, xElems, T(-10.0), T(10.0), stream);
     r.uniform(Y, yElems, T(-10.0), T(10.0), stream);
 
-    dim3 blocks(raft::ceildiv<int>(params.M, 128),
-                raft::ceildiv<int>(params.N, 4), 1);
+    dim3 blocks(raft::ceildiv<int>(params.M, 128), raft::ceildiv<int>(params.N, 4), 1);
     dim3 threads(128, 4, 1);
 
-    naiveGemm<<<blocks, threads>>>(refZ, X, Y, params.M, params.N, params.K,
-                                   params.zLayout, params.xLayout,
-                                   params.yLayout);
-
-    gemm(handle, Z, X, Y, params.M, params.N, params.K, params.zLayout,
-         params.xLayout, params.yLayout, stream);
+    naiveGemm<<<blocks, threads>>>(
+      refZ, X, Y, params.M, params.N, params.K, params.zLayout, params.xLayout, params.yLayout);
+
+    gemm(handle,
+         Z,
+         X,
+         Y,
+         params.M,
+         params.N,
+         params.K,
+         params.zLayout,
+         params.xLayout,
+         params.yLayout,
+         stream);
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     CUDA_CHECK(cudaFree(refZ));
     CUDA_CHECK(cudaFree(Z));
   }
 
  protected:
   GemmLayoutInputs<T> params;
-  T *refZ = NULL;  // Reference result for comparison
-  T *Z = NULL;     // Computed result
+  T* refZ = NULL;  // Reference result for comparison
+  T* Z    = NULL;  // Computed result
 };
 
 const std::vector<GemmLayoutInputs<float>> inputsf = {
@@ -131,22 +140,20 @@ const std::vector<GemmLayoutInputs<double>> inputsd = {
   {50, 80, 60, false, false, false, 893038ULL}};
 
 typedef GemmLayoutTest<float> GemmLayoutTestF;
-TEST_P(GemmLayoutTestF, Result) {
-  ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N,
-                                raft::CompareApprox<float>(1e-4)));
+TEST_P(GemmLayoutTestF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, raft::CompareApprox<float>(1e-4)));
 }
 
 typedef GemmLayoutTest<double> GemmLayoutTestD;
-TEST_P(GemmLayoutTestD, Result) {
-  ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N,
-                                raft::CompareApprox<float>(1e-6)));
+TEST_P(GemmLayoutTestD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(refZ, Z, params.M * params.N, raft::CompareApprox<float>(1e-6)));
 }
 
-INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestF,
-                         ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestF, ::testing::ValuesIn(inputsf));
 
-INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestD,
-                         ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(GemmLayoutTests, GemmLayoutTestD, ::testing::ValuesIn(inputsd));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/gemv.cu b/cpp/test/linalg/gemv.cu
index 92e59ae49b..4d5472f38c 100644
--- a/cpp/test/linalg/gemv.cu
+++ b/cpp/test/linalg/gemv.cu
@@ -34,10 +34,16 @@ struct GemvInputs {
 
 // Reference GEMV implementation.
 template <typename T>
-__global__ void naiveGemv(T *y, const T *A, const T *x, const int n_rows,
-                          const int n_cols, const int lda, const bool trans_a) {
+__global__ void naiveGemv(T* y,
+                          const T* A,
+                          const T* x,
+                          const int n_rows,
+                          const int n_cols,
+                          const int lda,
+                          const bool trans_a)
+{
   int istart = blockIdx.x * blockDim.x + threadIdx.x;
-  int istep = blockDim.x * gridDim.x;
+  int istep  = blockDim.x * gridDim.x;
 
   if (!trans_a) {
     for (int i = istart; i < n_rows; i += istep) {
@@ -69,12 +75,14 @@ class GemvTest : public ::testing::TestWithParam<GemvInputs<T>> {
   GemvTest()
     : testing::TestWithParam<GemvInputs<T>>(),
       refy(0, rmm::cuda_stream_default),
-      y(0, rmm::cuda_stream_default) {
+      y(0, rmm::cuda_stream_default)
+  {
     rmm::cuda_stream_default.synchronize();
   }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     params = ::testing::TestWithParam<GemvInputs<T>>::GetParam();
 
     raft::handle_t handle;
@@ -98,39 +106,55 @@ class GemvTest : public ::testing::TestWithParam<GemvInputs<T>> {
     dim3 blocks(raft::ceildiv<int>(yElems, 256), 1, 1);
     dim3 threads(256, 1, 1);
 
-    naiveGemv<<<blocks, threads>>>(refy.data(), A.data(), x.data(),
-                                   params.n_rows, params.n_cols, params.lda,
-                                   params.trans_a);
-
-    gemv(handle, A.data(), params.n_rows, params.n_cols, params.lda, x.data(),
-         y.data(), params.trans_a, stream);
+    naiveGemv<<<blocks, threads>>>(
+      refy.data(), A.data(), x.data(), params.n_rows, params.n_cols, params.lda, params.trans_a);
+
+    gemv(handle,
+         A.data(),
+         params.n_rows,
+         params.n_cols,
+         params.lda,
+         x.data(),
+         y.data(),
+         params.trans_a,
+         stream);
   }
 
   void TearDown() override {}
 };
 
-const std::vector<GemvInputs<float>> inputsf = {
-  {80, 70, 80, true, 76433ULL},    {80, 100, 80, true, 426646ULL},
-  {20, 100, 20, true, 37703ULL},   {100, 60, 200, true, 538004ULL},
-  {50, 10, 60, false, 73012ULL},   {90, 90, 90, false, 538147ULL},
-  {30, 100, 30, false, 412352ULL}, {40, 80, 100, false, 297941ULL}};
-
-const std::vector<GemvInputs<double>> inputsd = {
-  {10, 70, 10, true, 535648ULL},  {30, 30, 30, true, 956681ULL},
-  {70, 80, 70, true, 875083ULL},  {80, 90, 200, true, 50744ULL},
-  {90, 90, 90, false, 506321ULL}, {40, 100, 70, false, 638418ULL},
-  {80, 50, 80, false, 701529ULL}, {50, 80, 60, false, 893038ULL}};
+const std::vector<GemvInputs<float>> inputsf = {{80, 70, 80, true, 76433ULL},
+                                                {80, 100, 80, true, 426646ULL},
+                                                {20, 100, 20, true, 37703ULL},
+                                                {100, 60, 200, true, 538004ULL},
+                                                {50, 10, 60, false, 73012ULL},
+                                                {90, 90, 90, false, 538147ULL},
+                                                {30, 100, 30, false, 412352ULL},
+                                                {40, 80, 100, false, 297941ULL}};
+
+const std::vector<GemvInputs<double>> inputsd = {{10, 70, 10, true, 535648ULL},
+                                                 {30, 30, 30, true, 956681ULL},
+                                                 {70, 80, 70, true, 875083ULL},
+                                                 {80, 90, 200, true, 50744ULL},
+                                                 {90, 90, 90, false, 506321ULL},
+                                                 {40, 100, 70, false, 638418ULL},
+                                                 {80, 50, 80, false, 701529ULL},
+                                                 {50, 80, 60, false, 893038ULL}};
 
 typedef GemvTest<float> GemvTestF;
-TEST_P(GemvTestF, Result) {
-  ASSERT_TRUE(raft::devArrMatch(refy.data(), y.data(),
+TEST_P(GemvTestF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(refy.data(),
+                                y.data(),
                                 params.trans_a ? params.n_cols : params.n_rows,
                                 raft::CompareApprox<float>(1e-4)));
 }
 
 typedef GemvTest<double> GemvTestD;
-TEST_P(GemvTestD, Result) {
-  ASSERT_TRUE(raft::devArrMatch(refy.data(), y.data(),
+TEST_P(GemvTestD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(refy.data(),
+                                y.data(),
                                 params.trans_a ? params.n_cols : params.n_rows,
                                 raft::CompareApprox<float>(1e-6)));
 }
diff --git a/cpp/test/linalg/map.cu b/cpp/test/linalg/map.cu
index f04c225aa9..787d9ba415 100644
--- a/cpp/test/linalg/map.cu
+++ b/cpp/test/linalg/map.cu
@@ -25,13 +25,22 @@ namespace raft {
 namespace linalg {
 
 template <typename InType, typename IdxType, typename OutType>
-void mapLaunch(OutType *out, const InType *in1, const InType *in2,
-               const InType *in3, InType scalar, IdxType len,
-               cudaStream_t stream) {
+void mapLaunch(OutType* out,
+               const InType* in1,
+               const InType* in2,
+               const InType* in3,
+               InType scalar,
+               IdxType len,
+               cudaStream_t stream)
+{
   map(
-    out, len,
+    out,
+    len,
     [=] __device__(InType a, InType b, InType c) { return a + b + c + scalar; },
-    stream, in1, in2, in3);
+    stream,
+    in1,
+    in2,
+    in3);
 }
 
 template <typename InType, typename IdxType = int, typename OutType = InType>
@@ -43,9 +52,14 @@ struct MapInputs {
 };
 
 template <typename InType, typename IdxType, typename OutType = InType>
-void create_ref(OutType *out_ref, const InType *in1, const InType *in2,
-                const InType *in3, InType scalar, IdxType len,
-                cudaStream_t stream) {
+void create_ref(OutType* out_ref,
+                const InType* in1,
+                const InType* in2,
+                const InType* in3,
+                InType scalar,
+                IdxType len,
+                cudaStream_t stream)
+{
   rmm::device_uvector<InType> tmp(len, stream);
   eltwiseAdd(tmp.data(), in1, in2, len, stream);
   eltwiseAdd(out_ref, tmp.data(), in3, len, stream);
@@ -54,21 +68,22 @@ void create_ref(OutType *out_ref, const InType *in1, const InType *in2,
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
-class MapTest
-  : public ::testing::TestWithParam<MapInputs<InType, IdxType, OutType>> {
+class MapTest : public ::testing::TestWithParam<MapInputs<InType, IdxType, OutType>> {
  public:
   MapTest()
-    : params(::testing::TestWithParam<
-             MapInputs<InType, IdxType, OutType>>::GetParam()),
+    : params(::testing::TestWithParam<MapInputs<InType, IdxType, OutType>>::GetParam()),
       stream(handle.get_stream()),
       in1(params.len, stream),
       in2(params.len, stream),
       in3(params.len, stream),
       out_ref(params.len, stream),
-      out(params.len, stream) {}
+      out(params.len, stream)
+  {
+  }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     raft::random::Rng r(params.seed);
 
     IdxType len = params.len;
@@ -76,10 +91,8 @@ class MapTest
     r.uniform(in2.data(), len, InType(-1.0), InType(1.0), stream);
     r.uniform(in3.data(), len, InType(-1.0), InType(1.0), stream);
 
-    create_ref(out_ref.data(), in1.data(), in2.data(), in3.data(),
-               params.scalar, len, stream);
-    mapLaunch(out.data(), in1.data(), in2.data(), in3.data(), params.scalar,
-              len, stream);
+    create_ref(out_ref.data(), in1.data(), in2.data(), in3.data(), params.scalar, len, stream);
+    mapLaunch(out.data(), in1.data(), in2.data(), in3.data(), params.scalar, len, stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
@@ -92,55 +105,52 @@ class MapTest
   rmm::device_uvector<OutType> out_ref, out;
 };
 
-const std::vector<MapInputs<float, int>> inputsf_i32 = {
-  {0.000001f, 1024 * 1024, 1234ULL, 3.2}};
+const std::vector<MapInputs<float, int>> inputsf_i32 = {{0.000001f, 1024 * 1024, 1234ULL, 3.2}};
 typedef MapTest<float, int> MapTestF_i32;
-TEST_P(MapTestF_i32, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(MapTestF_i32, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32,
-                         ::testing::ValuesIn(inputsf_i32));
+INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32, ::testing::ValuesIn(inputsf_i32));
 
-const std::vector<MapInputs<float, size_t>> inputsf_i64 = {
-  {0.000001f, 1024 * 1024, 1234ULL, 9.4}};
+const std::vector<MapInputs<float, size_t>> inputsf_i64 = {{0.000001f, 1024 * 1024, 1234ULL, 9.4}};
 typedef MapTest<float, size_t> MapTestF_i64;
-TEST_P(MapTestF_i64, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(MapTestF_i64, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i64,
-                         ::testing::ValuesIn(inputsf_i64));
+INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i64, ::testing::ValuesIn(inputsf_i64));
 
 const std::vector<MapInputs<float, int, double>> inputsf_i32_d = {
   {0.000001f, 1024 * 1024, 1234ULL, 5.9}};
 typedef MapTest<float, int, double> MapTestF_i32_D;
-TEST_P(MapTestF_i32_D, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(MapTestF_i32_D, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32_D,
-                         ::testing::ValuesIn(inputsf_i32_d));
+INSTANTIATE_TEST_SUITE_P(MapTests, MapTestF_i32_D, ::testing::ValuesIn(inputsf_i32_d));
 
-const std::vector<MapInputs<double, int>> inputsd_i32 = {
-  {0.00000001, 1024 * 1024, 1234ULL, 7.5}};
+const std::vector<MapInputs<double, int>> inputsd_i32 = {{0.00000001, 1024 * 1024, 1234ULL, 7.5}};
 typedef MapTest<double, int> MapTestD_i32;
-TEST_P(MapTestD_i32, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(MapTestD_i32, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i32,
-                         ::testing::ValuesIn(inputsd_i32));
+INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i32, ::testing::ValuesIn(inputsd_i32));
 
 const std::vector<MapInputs<double, size_t>> inputsd_i64 = {
   {0.00000001, 1024 * 1024, 1234ULL, 5.2}};
 typedef MapTest<double, size_t> MapTestD_i64;
-TEST_P(MapTestD_i64, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(MapTestD_i64, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i64,
-                         ::testing::ValuesIn(inputsd_i64));
+INSTANTIATE_TEST_SUITE_P(MapTests, MapTestD_i64, ::testing::ValuesIn(inputsd_i64));
 
 }  // namespace linalg
 }  // namespace raft
diff --git a/cpp/test/linalg/map_then_reduce.cu b/cpp/test/linalg/map_then_reduce.cu
index 9d59e49e60..1594cc3544 100644
--- a/cpp/test/linalg/map_then_reduce.cu
+++ b/cpp/test/linalg/map_then_reduce.cu
@@ -27,21 +27,18 @@ namespace raft {
 namespace linalg {
 
 template <typename InType, typename OutType, typename MapOp>
-__global__ void naiveMapReduceKernel(OutType *out, const InType *in, size_t len,
-                                     MapOp map) {
+__global__ void naiveMapReduceKernel(OutType* out, const InType* in, size_t len, MapOp map)
+{
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < len) {
-    raft::myAtomicAdd(out, (OutType)map(in[idx]));
-  }
+  if (idx < len) { raft::myAtomicAdd(out, (OutType)map(in[idx])); }
 }
 
 template <typename InType, typename OutType, typename MapOp>
-void naiveMapReduce(OutType *out, const InType *in, size_t len, MapOp map,
-                    cudaStream_t stream) {
+void naiveMapReduce(OutType* out, const InType* in, size_t len, MapOp map, cudaStream_t stream)
+{
   static const int TPB = 64;
-  int nblks = raft::ceildiv(len, (size_t)TPB);
-  naiveMapReduceKernel<InType, OutType, MapOp>
-    <<<nblks, TPB, 0, stream>>>(out, in, len, map);
+  int nblks            = raft::ceildiv(len, (size_t)TPB);
+  naiveMapReduceKernel<InType, OutType, MapOp><<<nblks, TPB, 0, stream>>>(out, in, len, map);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -53,7 +50,8 @@ struct MapReduceInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os, const MapReduceInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const MapReduceInputs<T>& dims)
+{
   return os;
 }
 
@@ -61,8 +59,9 @@ template <typename T>
 // for an extended __device__ lambda cannot have private or protected access
 // within its class
 template <typename InType, typename OutType>
-void mapReduceLaunch(OutType *out_ref, OutType *out, const InType *in,
-                     size_t len, cudaStream_t stream) {
+void mapReduceLaunch(
+  OutType* out_ref, OutType* out, const InType* in, size_t len, cudaStream_t stream)
+{
   auto op = [] __device__(InType in) { return in; };
   naiveMapReduce(out_ref, in, len, op, stream);
   mapThenSumReduce(out, len, op, 0, in);
@@ -78,10 +77,12 @@ class MapReduceTest : public ::testing::TestWithParam<MapReduceInputs<InType>> {
       out_ref(params.len, stream),
       out(params.len, stream)
 
-  {}
+  {
+  }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     raft::random::Rng r(params.seed);
     auto len = params.len;
     r.uniform(in.data(), len, InType(-1.0), InType(1.0), stream);
@@ -98,42 +99,40 @@ class MapReduceTest : public ::testing::TestWithParam<MapReduceInputs<InType>> {
   rmm::device_uvector<OutType> out_ref, out;
 };
 
-const std::vector<MapReduceInputs<float>> inputsf = {
-  {0.001f, 1024 * 1024, 1234ULL}};
+const std::vector<MapReduceInputs<float>> inputsf = {{0.001f, 1024 * 1024, 1234ULL}};
 typedef MapReduceTest<float, float> MapReduceTestFF;
-TEST_P(MapReduceTestFF, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(MapReduceTestFF, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFF,
-                         ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFF, ::testing::ValuesIn(inputsf));
 
 typedef MapReduceTest<float, double> MapReduceTestFD;
-TEST_P(MapReduceTestFD, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(MapReduceTestFD, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFD,
-                         ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestFD, ::testing::ValuesIn(inputsf));
 
-const std::vector<MapReduceInputs<double>> inputsd = {
-  {0.000001, 1024 * 1024, 1234ULL}};
+const std::vector<MapReduceInputs<double>> inputsd = {{0.000001, 1024 * 1024, 1234ULL}};
 typedef MapReduceTest<double, double> MapReduceTestDD;
-TEST_P(MapReduceTestDD, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(MapReduceTestDD, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(out_ref.data(), out.data(), params.len, CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestDD,
-                         ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(MapReduceTests, MapReduceTestDD, ::testing::ValuesIn(inputsd));
 
 template <typename T>
 class MapGenericReduceTest : public ::testing::Test {
-  using InType = typename T::first_type;
+  using InType  = typename T::first_type;
   using OutType = typename T::second_type;
 
  protected:
-  MapGenericReduceTest()
-    : input(n, handle.get_stream()), output(handle.get_stream()) {
+  MapGenericReduceTest() : input(n, handle.get_stream()), output(handle.get_stream())
+  {
     CUDA_CHECK(cudaStreamCreate(&stream));
     handle.set_stream(stream);
     initInput(input.data(), input.size(), stream);
@@ -142,7 +141,8 @@ class MapGenericReduceTest : public ::testing::Test {
   void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); }
 
  public:
-  void initInput(InType *input, int n, cudaStream_t stream) {
+  void initInput(InType* input, int n, cudaStream_t stream)
+  {
     raft::random::Rng r(137);
     r.uniform(input, n, InType(2), InType(3), stream);
     InType val = 1;
@@ -151,21 +151,19 @@ class MapGenericReduceTest : public ::testing::Test {
     raft::update_device(input + 337, &val, 1, stream);
   }
 
-  void testMin() {
-    auto op = [] __device__(InType in) { return in; };
+  void testMin()
+  {
+    auto op               = [] __device__(InType in) { return in; };
     const OutType neutral = std::numeric_limits<InType>::max();
-    mapThenReduce(output.data(), input.size(), neutral, op, cub::Min(), stream,
-                  input.data());
-    EXPECT_TRUE(raft::devArrMatch(OutType(1), output.data(), 1,
-                                  raft::Compare<OutType>()));
+    mapThenReduce(output.data(), input.size(), neutral, op, cub::Min(), stream, input.data());
+    EXPECT_TRUE(raft::devArrMatch(OutType(1), output.data(), 1, raft::Compare<OutType>()));
   }
-  void testMax() {
-    auto op = [] __device__(InType in) { return in; };
+  void testMax()
+  {
+    auto op               = [] __device__(InType in) { return in; };
     const OutType neutral = std::numeric_limits<InType>::min();
-    mapThenReduce(output.data(), input.size(), neutral, op, cub::Max(), stream,
-                  input.data());
-    EXPECT_TRUE(raft::devArrMatch(OutType(5), output.data(), 1,
-                                  raft::Compare<OutType>()));
+    mapThenReduce(output.data(), input.size(), neutral, op, cub::Max(), stream, input.data());
+    EXPECT_TRUE(raft::devArrMatch(OutType(5), output.data(), 1, raft::Compare<OutType>()));
   }
 
  protected:
@@ -178,8 +176,7 @@ class MapGenericReduceTest : public ::testing::Test {
 };
 
 using IoTypePair =
-  ::testing::Types<std::pair<float, float>, std::pair<float, double>,
-                   std::pair<double, double>>;
+  ::testing::Types<std::pair<float, float>, std::pair<float, double>, std::pair<double, double>>;
 
 TYPED_TEST_CASE(MapGenericReduceTest, IoTypePair);
 TYPED_TEST(MapGenericReduceTest, min) { this->testMin(); }
diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu
index aad1d1e137..3db7c53041 100644
--- a/cpp/test/linalg/matrix_vector_op.cu
+++ b/cpp/test/linalg/matrix_vector_op.cu
@@ -32,8 +32,8 @@ struct MatVecOpInputs {
 };
 
 template <typename T, typename IdxType>
-::std::ostream &operator<<(::std::ostream &os,
-                           const MatVecOpInputs<T, IdxType> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const MatVecOpInputs<T, IdxType>& dims)
+{
   return os;
 }
 
@@ -41,24 +41,45 @@ template <typename T, typename IdxType>
 // for an extended __device__ lambda cannot have private or protected access
 // within its class
 template <typename T, typename IdxType>
-void matrixVectorOpLaunch(T *out, const T *in, const T *vec1, const T *vec2,
-                          IdxType D, IdxType N, bool rowMajor,
-                          bool bcastAlongRows, bool useTwoVectors,
-                          cudaStream_t stream) {
+void matrixVectorOpLaunch(T* out,
+                          const T* in,
+                          const T* vec1,
+                          const T* vec2,
+                          IdxType D,
+                          IdxType N,
+                          bool rowMajor,
+                          bool bcastAlongRows,
+                          bool useTwoVectors,
+                          cudaStream_t stream)
+{
   if (useTwoVectors) {
     matrixVectorOp(
-      out, in, vec1, vec2, D, N, rowMajor, bcastAlongRows,
-      [] __device__(T a, T b, T c) { return a + b + c; }, stream);
+      out,
+      in,
+      vec1,
+      vec2,
+      D,
+      N,
+      rowMajor,
+      bcastAlongRows,
+      [] __device__(T a, T b, T c) { return a + b + c; },
+      stream);
   } else {
     matrixVectorOp(
-      out, in, vec1, D, N, rowMajor, bcastAlongRows,
-      [] __device__(T a, T b) { return a + b; }, stream);
+      out,
+      in,
+      vec1,
+      D,
+      N,
+      rowMajor,
+      bcastAlongRows,
+      [] __device__(T a, T b) { return a + b; },
+      stream);
   }
 }
 
 template <typename T, typename IdxType>
-class MatVecOpTest
-  : public ::testing::TestWithParam<MatVecOpInputs<T, IdxType>> {
+class MatVecOpTest : public ::testing::TestWithParam<MatVecOpInputs<T, IdxType>> {
  public:
   MatVecOpTest()
     : params(::testing::TestWithParam<MatVecOpInputs<T, IdxType>>::GetParam()),
@@ -67,27 +88,50 @@ class MatVecOpTest
       out_ref(params.rows * params.cols, stream),
       out(params.rows * params.cols, stream),
       vec1(params.bcastAlongRows ? params.cols : params.rows, stream),
-      vec2(params.bcastAlongRows ? params.cols : params.rows, stream) {}
+      vec2(params.bcastAlongRows ? params.cols : params.rows, stream)
+  {
+  }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     raft::random::Rng r(params.seed);
     IdxType N = params.rows, D = params.cols;
-    IdxType len = N * D;
+    IdxType len    = N * D;
     IdxType vecLen = params.bcastAlongRows ? D : N;
     r.uniform(in.data(), len, (T)-1.0, (T)1.0, stream);
     r.uniform(vec1.data(), vecLen, (T)-1.0, (T)1.0, stream);
     r.uniform(vec2.data(), vecLen, (T)-1.0, (T)1.0, stream);
     if (params.useTwoVectors) {
-      naiveMatVec(out_ref.data(), in.data(), vec1.data(), vec2.data(), D, N,
-                  params.rowMajor, params.bcastAlongRows, (T)1.0);
+      naiveMatVec(out_ref.data(),
+                  in.data(),
+                  vec1.data(),
+                  vec2.data(),
+                  D,
+                  N,
+                  params.rowMajor,
+                  params.bcastAlongRows,
+                  (T)1.0);
     } else {
-      naiveMatVec(out_ref.data(), in.data(), vec1.data(), D, N, params.rowMajor,
-                  params.bcastAlongRows, (T)1.0);
+      naiveMatVec(out_ref.data(),
+                  in.data(),
+                  vec1.data(),
+                  D,
+                  N,
+                  params.rowMajor,
+                  params.bcastAlongRows,
+                  (T)1.0);
     }
-    matrixVectorOpLaunch(out.data(), in.data(), vec1.data(), vec2.data(), D, N,
-                         params.rowMajor, params.bcastAlongRows,
-                         params.useTwoVectors, stream);
+    matrixVectorOpLaunch(out.data(),
+                         in.data(),
+                         vec1.data(),
+                         vec2.data(),
+                         D,
+                         N,
+                         params.rowMajor,
+                         params.bcastAlongRows,
+                         params.useTwoVectors,
+                         stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
@@ -118,23 +162,23 @@ const std::vector<MatVecOpInputs<float, int>> inputsf_i32 = {
   {0.00001f, 1024, 32, false, false, true, 1234ULL},
   {0.00001f, 1024, 64, false, false, true, 1234ULL}};
 typedef MatVecOpTest<float, int> MatVecOpTestF_i32;
-TEST_P(MatVecOpTestF_i32, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.rows * params.cols,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(MatVecOpTestF_i32, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    out_ref.data(), out.data(), params.rows * params.cols, CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i32,
-                         ::testing::ValuesIn(inputsf_i32));
+INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i32, ::testing::ValuesIn(inputsf_i32));
 
 const std::vector<MatVecOpInputs<float, size_t>> inputsf_i64 = {
   {0.00001f, 2500, 250, false, false, false, 1234ULL},
   {0.00001f, 2500, 250, false, false, true, 1234ULL}};
 typedef MatVecOpTest<float, size_t> MatVecOpTestF_i64;
-TEST_P(MatVecOpTestF_i64, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.rows * params.cols,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(MatVecOpTestF_i64, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    out_ref.data(), out.data(), params.rows * params.cols, CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i64,
-                         ::testing::ValuesIn(inputsf_i64));
+INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i64, ::testing::ValuesIn(inputsf_i64));
 
 const std::vector<MatVecOpInputs<double, int>> inputsd_i32 = {
   {0.0000001, 1024, 32, true, true, false, 1234ULL},
@@ -155,23 +199,27 @@ const std::vector<MatVecOpInputs<double, int>> inputsd_i32 = {
   {0.0000001, 1024, 32, false, false, true, 1234ULL},
   {0.0000001, 1024, 64, false, false, true, 1234ULL}};
 typedef MatVecOpTest<double, int> MatVecOpTestD_i32;
-TEST_P(MatVecOpTestD_i32, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.rows * params.cols,
+TEST_P(MatVecOpTestD_i32, Result)
+{
+  ASSERT_TRUE(devArrMatch(out_ref.data(),
+                          out.data(),
+                          params.rows * params.cols,
                           CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i32,
-                         ::testing::ValuesIn(inputsd_i32));
+INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i32, ::testing::ValuesIn(inputsd_i32));
 
 const std::vector<MatVecOpInputs<double, size_t>> inputsd_i64 = {
   {0.0000001, 2500, 250, false, false, false, 1234ULL},
   {0.0000001, 2500, 250, false, false, true, 1234ULL}};
 typedef MatVecOpTest<double, size_t> MatVecOpTestD_i64;
-TEST_P(MatVecOpTestD_i64, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.rows * params.cols,
+TEST_P(MatVecOpTestD_i64, Result)
+{
+  ASSERT_TRUE(devArrMatch(out_ref.data(),
+                          out.data(),
+                          params.rows * params.cols,
                           CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i64,
-                         ::testing::ValuesIn(inputsd_i64));
+INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i64, ::testing::ValuesIn(inputsd_i64));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/matrix_vector_op.cuh b/cpp/test/linalg/matrix_vector_op.cuh
index 69c45c9866..5f9c6f1ef3 100644
--- a/cpp/test/linalg/matrix_vector_op.cuh
+++ b/cpp/test/linalg/matrix_vector_op.cuh
@@ -22,9 +22,15 @@ namespace raft {
 namespace linalg {
 
 template <typename Type, typename IdxType = int>
-__global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec,
-                                  IdxType D, IdxType N, bool rowMajor,
-                                  bool bcastAlongRows, Type scalar) {
+__global__ void naiveMatVecKernel(Type* out,
+                                  const Type* mat,
+                                  const Type* vec,
+                                  IdxType D,
+                                  IdxType N,
+                                  bool rowMajor,
+                                  bool bcastAlongRows,
+                                  Type scalar)
+{
   IdxType idx = threadIdx.x + blockIdx.x * blockDim.x;
   IdxType len = N * D;
   IdxType col;
@@ -37,27 +43,37 @@ __global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec,
   } else {
     col = idx / N;
   }
-  if (idx < len) {
-    out[idx] = mat[idx] + scalar * vec[col];
-  }
+  if (idx < len) { out[idx] = mat[idx] + scalar * vec[col]; }
 }
 
 template <typename Type, typename IdxType = int>
-void naiveMatVec(Type *out, const Type *mat, const Type *vec, IdxType D,
-                 IdxType N, bool rowMajor, bool bcastAlongRows, Type scalar) {
+void naiveMatVec(Type* out,
+                 const Type* mat,
+                 const Type* vec,
+                 IdxType D,
+                 IdxType N,
+                 bool rowMajor,
+                 bool bcastAlongRows,
+                 Type scalar)
+{
   static const IdxType TPB = 64;
-  IdxType len = N * D;
-  IdxType nblks = raft::ceildiv(len, TPB);
-  naiveMatVecKernel<Type>
-    <<<nblks, TPB>>>(out, mat, vec, D, N, rowMajor, bcastAlongRows, scalar);
+  IdxType len              = N * D;
+  IdxType nblks            = raft::ceildiv(len, TPB);
+  naiveMatVecKernel<Type><<<nblks, TPB>>>(out, mat, vec, D, N, rowMajor, bcastAlongRows, scalar);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
 template <typename Type, typename IdxType = int>
-__global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec1,
-                                  const Type *vec2, IdxType D, IdxType N,
-                                  bool rowMajor, bool bcastAlongRows,
-                                  Type scalar) {
+__global__ void naiveMatVecKernel(Type* out,
+                                  const Type* mat,
+                                  const Type* vec1,
+                                  const Type* vec2,
+                                  IdxType D,
+                                  IdxType N,
+                                  bool rowMajor,
+                                  bool bcastAlongRows,
+                                  Type scalar)
+{
   IdxType idx = threadIdx.x + blockIdx.x * blockDim.x;
   IdxType len = N * D;
   IdxType col;
@@ -70,20 +86,25 @@ __global__ void naiveMatVecKernel(Type *out, const Type *mat, const Type *vec1,
   } else {
     col = idx / N;
   }
-  if (idx < len) {
-    out[idx] = mat[idx] + scalar * vec1[col] + vec2[col];
-  }
+  if (idx < len) { out[idx] = mat[idx] + scalar * vec1[col] + vec2[col]; }
 }
 
 template <typename Type, typename IdxType = int>
-void naiveMatVec(Type *out, const Type *mat, const Type *vec1, const Type *vec2,
-                 IdxType D, IdxType N, bool rowMajor, bool bcastAlongRows,
-                 Type scalar) {
+void naiveMatVec(Type* out,
+                 const Type* mat,
+                 const Type* vec1,
+                 const Type* vec2,
+                 IdxType D,
+                 IdxType N,
+                 bool rowMajor,
+                 bool bcastAlongRows,
+                 Type scalar)
+{
   static const IdxType TPB = 64;
-  IdxType len = N * D;
-  IdxType nblks = raft::ceildiv(len, TPB);
-  naiveMatVecKernel<Type><<<nblks, TPB>>>(out, mat, vec1, vec2, D, N, rowMajor,
-                                          bcastAlongRows, scalar);
+  IdxType len              = N * D;
+  IdxType nblks            = raft::ceildiv(len, TPB);
+  naiveMatVecKernel<Type>
+    <<<nblks, TPB>>>(out, mat, vec1, vec2, D, N, rowMajor, bcastAlongRows, scalar);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
diff --git a/cpp/test/linalg/multiply.cu b/cpp/test/linalg/multiply.cu
index f78ae64f05..2a632d55b2 100644
--- a/cpp/test/linalg/multiply.cu
+++ b/cpp/test/linalg/multiply.cu
@@ -32,10 +32,13 @@ class MultiplyTest : public ::testing::TestWithParam<UnaryOpInputs<T>> {
       stream(handle.get_stream()),
       in(params.len, stream),
       out_ref(params.len, stream),
-      out(params.len, stream) {}
+      out(params.len, stream)
+  {
+  }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     params = ::testing::TestWithParam<UnaryOpInputs<T>>::GetParam();
     raft::random::Rng r(params.seed);
     int len = params.len;
@@ -53,25 +56,23 @@ class MultiplyTest : public ::testing::TestWithParam<UnaryOpInputs<T>> {
   rmm::device_uvector<T> in, out_ref, out;
 };
 
-const std::vector<UnaryOpInputs<float>> inputsf = {
-  {0.000001f, 1024 * 1024, 2.f, 1234ULL}};
+const std::vector<UnaryOpInputs<float>> inputsf = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}};
 typedef MultiplyTest<float> MultiplyTestF;
-TEST_P(MultiplyTestF, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len,
-                          raft::CompareApprox<float>(params.tolerance)));
+TEST_P(MultiplyTestF, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    out_ref.data(), out.data(), params.len, raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestF,
-                         ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestF, ::testing::ValuesIn(inputsf));
 
 typedef MultiplyTest<double> MultiplyTestD;
-const std::vector<UnaryOpInputs<double>> inputsd = {
-  {0.000001f, 1024 * 1024, 2.f, 1234ULL}};
-TEST_P(MultiplyTestD, Result) {
-  ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len,
-                          raft::CompareApprox<double>(params.tolerance)));
+const std::vector<UnaryOpInputs<double>> inputsd = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}};
+TEST_P(MultiplyTestD, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    out_ref.data(), out.data(), params.len, raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestD,
-                         ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(MultiplyTests, MultiplyTestD, ::testing::ValuesIn(inputsd));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/norm.cu b/cpp/test/linalg/norm.cu
index 659956534e..6dae606f18 100644
--- a/cpp/test/linalg/norm.cu
+++ b/cpp/test/linalg/norm.cu
@@ -34,17 +34,19 @@ struct NormInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os, const NormInputs<T> &I) {
-  os << "{ " << I.tolerance << ", " << I.rows << ", " << I.cols << ", "
-     << I.type << ", " << I.do_sqrt << ", " << I.seed << '}' << std::endl;
+::std::ostream& operator<<(::std::ostream& os, const NormInputs<T>& I)
+{
+  os << "{ " << I.tolerance << ", " << I.rows << ", " << I.cols << ", " << I.type << ", "
+     << I.do_sqrt << ", " << I.seed << '}' << std::endl;
   return os;
 }
 
 ///// Row-wise norm test definitions
 template <typename Type>
-__global__ void naiveRowNormKernel(Type *dots, const Type *data, int D, int N,
-                                   NormType type, bool do_sqrt) {
-  Type acc = (Type)0;
+__global__ void naiveRowNormKernel(
+  Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt)
+{
+  Type acc     = (Type)0;
   int rowStart = threadIdx.x + blockIdx.x * blockDim.x;
   if (rowStart < N) {
     for (int i = 0; i < D; ++i) {
@@ -59,12 +61,12 @@ __global__ void naiveRowNormKernel(Type *dots, const Type *data, int D, int N,
 }
 
 template <typename Type>
-void naiveRowNorm(Type *dots, const Type *data, int D, int N, NormType type,
-                  bool do_sqrt, cudaStream_t stream) {
+void naiveRowNorm(
+  Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt, cudaStream_t stream)
+{
   static const int TPB = 64;
-  int nblks = raft::ceildiv(N, TPB);
-  naiveRowNormKernel<Type>
-    <<<nblks, TPB, 0, stream>>>(dots, data, D, N, type, do_sqrt);
+  int nblks            = raft::ceildiv(N, TPB);
+  naiveRowNormKernel<Type><<<nblks, TPB, 0, stream>>>(dots, data, D, N, type, do_sqrt);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -76,21 +78,22 @@ class RowNormTest : public ::testing::TestWithParam<NormInputs<T>> {
       stream(handle.get_stream()),
       data(params.rows * params.cols, stream),
       dots_exp(params.rows, stream),
-      dots_act(params.rows, stream) {}
+      dots_act(params.rows, stream)
+  {
+  }
 
-  void SetUp() override {
+  void SetUp() override
+  {
     raft::random::Rng r(params.seed);
     int rows = params.rows, cols = params.cols, len = rows * cols;
     r.uniform(data.data(), len, T(-1.0), T(1.0), stream);
-    naiveRowNorm(dots_exp.data(), data.data(), cols, rows, params.type,
-                 params.do_sqrt, stream);
+    naiveRowNorm(dots_exp.data(), data.data(), cols, rows, params.type, params.do_sqrt, stream);
     if (params.do_sqrt) {
       auto fin_op = [] __device__(T in) { return raft::mySqrt(in); };
-      rowNorm(dots_act.data(), data.data(), cols, rows, params.type,
-              params.rowMajor, stream, fin_op);
+      rowNorm(
+        dots_act.data(), data.data(), cols, rows, params.type, params.rowMajor, stream, fin_op);
     } else {
-      rowNorm(dots_act.data(), data.data(), cols, rows, params.type,
-              params.rowMajor, stream);
+      rowNorm(dots_act.data(), data.data(), cols, rows, params.type, params.rowMajor, stream);
     }
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
@@ -105,10 +108,11 @@ class RowNormTest : public ::testing::TestWithParam<NormInputs<T>> {
 
 ///// Column-wise norm test definitisons
 template <typename Type>
-__global__ void naiveColNormKernel(Type *dots, const Type *data, int D, int N,
-                                   NormType type, bool do_sqrt) {
+__global__ void naiveColNormKernel(
+  Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt)
+{
   int colID = threadIdx.x + blockIdx.x * blockDim.x;
-  if (colID > D) return;  //avoid out-of-bounds thread
+  if (colID > D) return;  // avoid out-of-bounds thread
 
   Type acc = 0;
   for (int i = 0; i < N; i++) {
@@ -120,12 +124,12 @@ __global__ void naiveColNormKernel(Type *dots, const Type *data, int D, int N,
 }
 
 template <typename Type>
-void naiveColNorm(Type *dots, const Type *data, int D, int N, NormType type,
-                  bool do_sqrt, cudaStream_t stream) {
+void naiveColNorm(
+  Type* dots, const Type* data, int D, int N, NormType type, bool do_sqrt, cudaStream_t stream)
+{
   static const int TPB = 64;
-  int nblks = raft::ceildiv(D, TPB);
-  naiveColNormKernel<Type>
-    <<<nblks, TPB, 0, stream>>>(dots, data, D, N, type, do_sqrt);
+  int nblks            = raft::ceildiv(D, TPB);
+  naiveColNormKernel<Type><<<nblks, TPB, 0, stream>>>(dots, data, D, N, type, do_sqrt);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -137,22 +141,23 @@ class ColNormTest : public ::testing::TestWithParam<NormInputs<T>> {
       stream(handle.get_stream()),
       data(params.rows * params.cols, stream),
       dots_exp(params.cols, stream),
-      dots_act(params.cols, stream) {}
+      dots_act(params.cols, stream)
+  {
+  }
 
-  void SetUp() override {
+  void SetUp() override
+  {
     raft::random::Rng r(params.seed);
     int rows = params.rows, cols = params.cols, len = rows * cols;
     r.uniform(data.data(), len, T(-1.0), T(1.0), stream);
 
-    naiveColNorm(dots_exp.data(), data.data(), cols, rows, params.type,
-                 params.do_sqrt, stream);
+    naiveColNorm(dots_exp.data(), data.data(), cols, rows, params.type, params.do_sqrt, stream);
     if (params.do_sqrt) {
       auto fin_op = [] __device__(T in) { return raft::mySqrt(in); };
-      colNorm(dots_act.data(), data.data(), cols, rows, params.type,
-              params.rowMajor, stream, fin_op);
+      colNorm(
+        dots_act.data(), data.data(), cols, rows, params.type, params.rowMajor, stream, fin_op);
     } else {
-      colNorm(dots_act.data(), data.data(), cols, rows, params.type,
-              params.rowMajor, stream);
+      colNorm(dots_act.data(), data.data(), cols, rows, params.type, params.rowMajor, stream);
     }
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
@@ -166,24 +171,23 @@ class ColNormTest : public ::testing::TestWithParam<NormInputs<T>> {
 };
 
 ///// Row- and column-wise tests
-const std::vector<NormInputs<float>> inputsf = {
-  {0.00001f, 1024, 32, L1Norm, false, true, 1234ULL},
-  {0.00001f, 1024, 64, L1Norm, false, true, 1234ULL},
-  {0.00001f, 1024, 128, L1Norm, false, true, 1234ULL},
-  {0.00001f, 1024, 256, L1Norm, false, true, 1234ULL},
-  {0.00001f, 1024, 32, L2Norm, false, true, 1234ULL},
-  {0.00001f, 1024, 64, L2Norm, false, true, 1234ULL},
-  {0.00001f, 1024, 128, L2Norm, false, true, 1234ULL},
-  {0.00001f, 1024, 256, L2Norm, false, true, 1234ULL},
-
-  {0.00001f, 1024, 32, L1Norm, true, true, 1234ULL},
-  {0.00001f, 1024, 64, L1Norm, true, true, 1234ULL},
-  {0.00001f, 1024, 128, L1Norm, true, true, 1234ULL},
-  {0.00001f, 1024, 256, L1Norm, true, true, 1234ULL},
-  {0.00001f, 1024, 32, L2Norm, true, true, 1234ULL},
-  {0.00001f, 1024, 64, L2Norm, true, true, 1234ULL},
-  {0.00001f, 1024, 128, L2Norm, true, true, 1234ULL},
-  {0.00001f, 1024, 256, L2Norm, true, true, 1234ULL}};
+const std::vector<NormInputs<float>> inputsf = {{0.00001f, 1024, 32, L1Norm, false, true, 1234ULL},
+                                                {0.00001f, 1024, 64, L1Norm, false, true, 1234ULL},
+                                                {0.00001f, 1024, 128, L1Norm, false, true, 1234ULL},
+                                                {0.00001f, 1024, 256, L1Norm, false, true, 1234ULL},
+                                                {0.00001f, 1024, 32, L2Norm, false, true, 1234ULL},
+                                                {0.00001f, 1024, 64, L2Norm, false, true, 1234ULL},
+                                                {0.00001f, 1024, 128, L2Norm, false, true, 1234ULL},
+                                                {0.00001f, 1024, 256, L2Norm, false, true, 1234ULL},
+
+                                                {0.00001f, 1024, 32, L1Norm, true, true, 1234ULL},
+                                                {0.00001f, 1024, 64, L1Norm, true, true, 1234ULL},
+                                                {0.00001f, 1024, 128, L1Norm, true, true, 1234ULL},
+                                                {0.00001f, 1024, 256, L1Norm, true, true, 1234ULL},
+                                                {0.00001f, 1024, 32, L2Norm, true, true, 1234ULL},
+                                                {0.00001f, 1024, 64, L2Norm, true, true, 1234ULL},
+                                                {0.00001f, 1024, 128, L2Norm, true, true, 1234ULL},
+                                                {0.00001f, 1024, 256, L2Norm, true, true, 1234ULL}};
 
 const std::vector<NormInputs<double>> inputsd = {
   {0.00000001, 1024, 32, L1Norm, false, true, 1234ULL},
@@ -205,22 +209,22 @@ const std::vector<NormInputs<double>> inputsd = {
   {0.00000001, 1024, 256, L2Norm, true, true, 1234ULL}};
 
 typedef RowNormTest<float> RowNormTestF;
-TEST_P(RowNormTestF, Result) {
-  ASSERT_TRUE(raft::devArrMatch(dots_exp.data(), dots_act.data(), params.rows,
-                                raft::CompareApprox<float>(params.tolerance)));
+TEST_P(RowNormTestF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    dots_exp.data(), dots_act.data(), params.rows, raft::CompareApprox<float>(params.tolerance)));
 }
 
 typedef RowNormTest<double> RowNormTestD;
-TEST_P(RowNormTestD, Result) {
-  ASSERT_TRUE(raft::devArrMatch(dots_exp.data(), dots_act.data(), params.rows,
-                                raft::CompareApprox<double>(params.tolerance)));
+TEST_P(RowNormTestD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    dots_exp.data(), dots_act.data(), params.rows, raft::CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestF,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestF, ::testing::ValuesIn(inputsf));
 
-INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestD,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(RowNormTests, RowNormTestD, ::testing::ValuesIn(inputsd));
 
 const std::vector<NormInputs<float>> inputscf = {
   {0.00001f, 32, 1024, L1Norm, false, true, 1234ULL},
@@ -261,22 +265,22 @@ const std::vector<NormInputs<double>> inputscd = {
   {0.00000001, 256, 1024, L2Norm, true, true, 1234ULL}};
 
 typedef ColNormTest<float> ColNormTestF;
-TEST_P(ColNormTestF, Result) {
-  ASSERT_TRUE(raft::devArrMatch(dots_exp.data(), dots_act.data(), params.cols,
-                                raft::CompareApprox<float>(params.tolerance)));
+TEST_P(ColNormTestF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    dots_exp.data(), dots_act.data(), params.cols, raft::CompareApprox<float>(params.tolerance)));
 }
 
 typedef ColNormTest<double> ColNormTestD;
-TEST_P(ColNormTestD, Result) {
-  ASSERT_TRUE(raft::devArrMatch(dots_exp.data(), dots_act.data(), params.cols,
-                                raft::CompareApprox<double>(params.tolerance)));
+TEST_P(ColNormTestD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    dots_exp.data(), dots_act.data(), params.cols, raft::CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestF,
-                        ::testing::ValuesIn(inputscf));
+INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestF, ::testing::ValuesIn(inputscf));
 
-INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestD,
-                        ::testing::ValuesIn(inputscd));
+INSTANTIATE_TEST_CASE_P(ColNormTests, ColNormTestD, ::testing::ValuesIn(inputscd));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/reduce.cu b/cpp/test/linalg/reduce.cu
index 9822ca2c60..25ee0a7b77 100644
--- a/cpp/test/linalg/reduce.cu
+++ b/cpp/test/linalg/reduce.cu
@@ -34,8 +34,8 @@ struct ReduceInputs {
 };
 
 template <typename InType, typename OutType>
-::std::ostream &operator<<(::std::ostream &os,
-                           const ReduceInputs<InType, OutType> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const ReduceInputs<InType, OutType>& dims)
+{
   return os;
 }
 
@@ -43,44 +43,58 @@ template <typename InType, typename OutType>
 // for an extended __device__ lambda cannot have private or protected access
 // within its class
 template <typename InType, typename OutType>
-void reduceLaunch(OutType *dots, const InType *data, int cols, int rows,
-                  bool rowMajor, bool alongRows, bool inplace,
-                  cudaStream_t stream) {
-  reduce(
-    dots, data, cols, rows, (OutType)0, rowMajor, alongRows, stream, inplace,
-    [] __device__(InType in, int i) { return static_cast<OutType>(in * in); });
+void reduceLaunch(OutType* dots,
+                  const InType* data,
+                  int cols,
+                  int rows,
+                  bool rowMajor,
+                  bool alongRows,
+                  bool inplace,
+                  cudaStream_t stream)
+{
+  reduce(dots,
+         data,
+         cols,
+         rows,
+         (OutType)0,
+         rowMajor,
+         alongRows,
+         stream,
+         inplace,
+         [] __device__(InType in, int i) { return static_cast<OutType>(in * in); });
 }
 
 template <typename InType, typename OutType>
-class ReduceTest
-  : public ::testing::TestWithParam<ReduceInputs<InType, OutType>> {
+class ReduceTest : public ::testing::TestWithParam<ReduceInputs<InType, OutType>> {
  public:
   ReduceTest()
-    : params(
-        ::testing::TestWithParam<ReduceInputs<InType, OutType>>::GetParam()),
+    : params(::testing::TestWithParam<ReduceInputs<InType, OutType>>::GetParam()),
       stream(handle.get_stream()),
       data(params.rows * params.cols, stream),
       dots_exp(params.alongRows ? params.rows : params.cols, stream),
-      dots_act(params.alongRows ? params.rows : params.cols, stream) {}
+      dots_act(params.alongRows ? params.rows : params.cols, stream)
+  {
+  }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     raft::random::Rng r(params.seed);
     int rows = params.rows, cols = params.cols;
     int len = rows * cols;
-    outlen = params.alongRows ? rows : cols;
+    outlen  = params.alongRows ? rows : cols;
     r.uniform(data.data(), len, InType(-1.0), InType(1.0), stream);
-    naiveReduction(dots_exp.data(), data.data(), cols, rows, params.rowMajor,
-                   params.alongRows, stream);
+    naiveReduction(
+      dots_exp.data(), data.data(), cols, rows, params.rowMajor, params.alongRows, stream);
 
     // Perform reduction with default inplace = false first
-    reduceLaunch(dots_act.data(), data.data(), cols, rows, params.rowMajor,
-                 params.alongRows, false, stream);
+    reduceLaunch(
+      dots_act.data(), data.data(), cols, rows, params.rowMajor, params.alongRows, false, stream);
     // Add to result with inplace = true next, which shouldn't affect
     // in the case of coalescedReduction!
     if (!(params.rowMajor ^ params.alongRows)) {
-      reduceLaunch(dots_act.data(), data.data(), cols, rows, params.rowMajor,
-                   params.alongRows, true, stream);
+      reduceLaunch(
+        dots_act.data(), data.data(), cols, rows, params.rowMajor, params.alongRows, true, stream);
     }
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
@@ -150,31 +164,31 @@ const std::vector<ReduceInputs<float, double>> inputsfd = {
   {0.000002f, 1024, 256, false, false, 1234ULL}};
 
 typedef ReduceTest<float, float> ReduceTestFF;
-TEST_P(ReduceTestFF, Result) {
-  ASSERT_TRUE(devArrMatch(dots_exp.data(), dots_act.data(), outlen,
-                          raft::CompareApprox<float>(params.tolerance)));
+TEST_P(ReduceTestFF, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    dots_exp.data(), dots_act.data(), outlen, raft::CompareApprox<float>(params.tolerance)));
 }
 
 typedef ReduceTest<double, double> ReduceTestDD;
-TEST_P(ReduceTestDD, Result) {
-  ASSERT_TRUE(devArrMatch(dots_exp.data(), dots_act.data(), outlen,
-                          raft::CompareApprox<double>(params.tolerance)));
+TEST_P(ReduceTestDD, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    dots_exp.data(), dots_act.data(), outlen, raft::CompareApprox<double>(params.tolerance)));
 }
 
 typedef ReduceTest<float, double> ReduceTestFD;
-TEST_P(ReduceTestFD, Result) {
-  ASSERT_TRUE(devArrMatch(dots_exp.data(), dots_act.data(), outlen,
-                          raft::CompareApprox<double>(params.tolerance)));
+TEST_P(ReduceTestFD, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    dots_exp.data(), dots_act.data(), outlen, raft::CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFF,
-                        ::testing::ValuesIn(inputsff));
+INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFF, ::testing::ValuesIn(inputsff));
 
-INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestDD,
-                        ::testing::ValuesIn(inputsdd));
+INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestDD, ::testing::ValuesIn(inputsdd));
 
-INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFD,
-                        ::testing::ValuesIn(inputsfd));
+INSTANTIATE_TEST_CASE_P(ReduceTests, ReduceTestFD, ::testing::ValuesIn(inputsfd));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/reduce.cuh b/cpp/test/linalg/reduce.cuh
index 7f8319636b..82ddfd4661 100644
--- a/cpp/test/linalg/reduce.cuh
+++ b/cpp/test/linalg/reduce.cuh
@@ -26,55 +26,60 @@ namespace raft {
 namespace linalg {
 
 template <typename InType, typename OutType>
-__global__ void naiveCoalescedReductionKernel(OutType *dots, const InType *data,
-                                              int D, int N) {
-  OutType acc = (OutType)0;
+__global__ void naiveCoalescedReductionKernel(OutType* dots, const InType* data, int D, int N)
+{
+  OutType acc  = (OutType)0;
   int rowStart = threadIdx.x + blockIdx.x * blockDim.x;
   if (rowStart < N) {
     for (int i = 0; i < D; ++i) {
-      acc +=
-        static_cast<OutType>(data[rowStart * D + i] * data[rowStart * D + i]);
+      acc += static_cast<OutType>(data[rowStart * D + i] * data[rowStart * D + i]);
     }
     dots[rowStart] = 2 * acc;
   }
 }
 
 template <typename InType, typename OutType>
-void naiveCoalescedReduction(OutType *dots, const InType *data, int D, int N,
-                             cudaStream_t stream) {
+void naiveCoalescedReduction(OutType* dots, const InType* data, int D, int N, cudaStream_t stream)
+{
   static const int TPB = 64;
-  int nblks = raft::ceildiv(N, TPB);
-  naiveCoalescedReductionKernel<InType, OutType>
-    <<<nblks, TPB, 0, stream>>>(dots, data, D, N);
+  int nblks            = raft::ceildiv(N, TPB);
+  naiveCoalescedReductionKernel<InType, OutType><<<nblks, TPB, 0, stream>>>(dots, data, D, N);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
 template <typename InType, typename OutType>
-void unaryAndGemv(OutType *dots, const InType *data, int D, int N,
-                  cudaStream_t stream) {
-  //computes a MLCommon unary op on data (squares it), then computes Ax
+void unaryAndGemv(OutType* dots, const InType* data, int D, int N, cudaStream_t stream)
+{
+  // computes a MLCommon unary op on data (squares it), then computes Ax
   //(A input matrix and x column vector) to sum columns
   rmm::device_uvector<OutType> sq(D * N, stream);
   raft::linalg::unaryOp(
-    thrust::raw_pointer_cast(sq.data()), data, D * N,
-    [] __device__(InType v) { return static_cast<OutType>(v * v); }, stream);
+    thrust::raw_pointer_cast(sq.data()),
+    data,
+    D * N,
+    [] __device__(InType v) { return static_cast<OutType>(v * v); },
+    stream);
   cublasHandle_t handle;
   CUBLAS_CHECK(cublasCreate(&handle));
-  rmm::device_uvector<OutType> ones(N, stream);  //column vector [1...1]
+  rmm::device_uvector<OutType> ones(N, stream);  // column vector [1...1]
   raft::linalg::unaryOp<OutType>(
-    ones.data(), ones.data(), ones.size(),
-    [=] __device__(OutType input) { return 1; }, stream);
+    ones.data(), ones.data(), ones.size(), [=] __device__(OutType input) { return 1; }, stream);
   OutType alpha = 1, beta = 0;
-  CUBLAS_CHECK(raft::linalg::cublasgemv(handle, CUBLAS_OP_N, D, N, &alpha,
-                                        sq.data(), D, ones.data(), 1, &beta,
-                                        dots, 1, stream));
+  CUBLAS_CHECK(raft::linalg::cublasgemv(
+    handle, CUBLAS_OP_N, D, N, &alpha, sq.data(), D, ones.data(), 1, &beta, dots, 1, stream));
   CUDA_CHECK(cudaDeviceSynchronize());
   CUBLAS_CHECK(cublasDestroy(handle));
 }
 
 template <typename InType, typename OutType>
-void naiveReduction(OutType *dots, const InType *data, int D, int N,
-                    bool rowMajor, bool alongRows, cudaStream_t stream) {
+void naiveReduction(OutType* dots,
+                    const InType* data,
+                    int D,
+                    int N,
+                    bool rowMajor,
+                    bool alongRows,
+                    cudaStream_t stream)
+{
   if (rowMajor && alongRows) {
     naiveCoalescedReduction(dots, data, D, N, stream);
   } else if (rowMajor && !alongRows) {
diff --git a/cpp/test/linalg/strided_reduction.cu b/cpp/test/linalg/strided_reduction.cu
index 4f761d39f6..ac387c16bb 100644
--- a/cpp/test/linalg/strided_reduction.cu
+++ b/cpp/test/linalg/strided_reduction.cu
@@ -32,15 +32,14 @@ struct stridedReductionInputs {
 };
 
 template <typename T>
-void stridedReductionLaunch(T *dots, const T *data, int cols, int rows,
-                            cudaStream_t stream) {
-  stridedReduction(dots, data, cols, rows, (T)0, stream, false,
-                   [] __device__(T in, int i) { return in * in; });
+void stridedReductionLaunch(T* dots, const T* data, int cols, int rows, cudaStream_t stream)
+{
+  stridedReduction(
+    dots, data, cols, rows, (T)0, stream, false, [] __device__(T in, int i) { return in * in; });
 }
 
 template <typename T>
-class stridedReductionTest
-  : public ::testing::TestWithParam<stridedReductionInputs<T>> {
+class stridedReductionTest : public ::testing::TestWithParam<stridedReductionInputs<T>> {
  public:
   stridedReductionTest()
     : params(::testing::TestWithParam<stridedReductionInputs<T>>::GetParam()),
@@ -48,15 +47,17 @@ class stridedReductionTest
       data(params.rows * params.cols, stream),
       dots_exp(params.cols, stream),  // expected dot products (from test)
       dots_act(params.cols, stream)   // actual dot products (from prim)
-  {}
+  {
+  }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     raft::random::Rng r(params.seed);
     int rows = params.rows, cols = params.cols;
     int len = rows * cols;
     r.uniform(data.data(), len, T(-1.0), T(1.0),
-              stream);  //initialize matrix to random
+              stream);  // initialize matrix to random
 
     unaryAndGemv(dots_exp.data(), data.data(), cols, rows, stream);
     stridedReductionLaunch(dots_act.data(), data.data(), cols, rows, stream);
@@ -71,35 +72,33 @@ class stridedReductionTest
   rmm::device_uvector<T> data, dots_exp, dots_act;
 };
 
-const std::vector<stridedReductionInputs<float>> inputsf = {
-  {0.00001f, 1024, 32, 1234ULL},
-  {0.00001f, 1024, 64, 1234ULL},
-  {0.00001f, 1024, 128, 1234ULL},
-  {0.00001f, 1024, 256, 1234ULL}};
+const std::vector<stridedReductionInputs<float>> inputsf = {{0.00001f, 1024, 32, 1234ULL},
+                                                            {0.00001f, 1024, 64, 1234ULL},
+                                                            {0.00001f, 1024, 128, 1234ULL},
+                                                            {0.00001f, 1024, 256, 1234ULL}};
 
-const std::vector<stridedReductionInputs<double>> inputsd = {
-  {0.000000001, 1024, 32, 1234ULL},
-  {0.000000001, 1024, 64, 1234ULL},
-  {0.000000001, 1024, 128, 1234ULL},
-  {0.000000001, 1024, 256, 1234ULL}};
+const std::vector<stridedReductionInputs<double>> inputsd = {{0.000000001, 1024, 32, 1234ULL},
+                                                             {0.000000001, 1024, 64, 1234ULL},
+                                                             {0.000000001, 1024, 128, 1234ULL},
+                                                             {0.000000001, 1024, 256, 1234ULL}};
 
 typedef stridedReductionTest<float> stridedReductionTestF;
-TEST_P(stridedReductionTestF, Result) {
-  ASSERT_TRUE(devArrMatch(dots_exp.data(), dots_act.data(), params.cols,
-                          raft::CompareApprox<float>(params.tolerance)));
+TEST_P(stridedReductionTestF, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    dots_exp.data(), dots_act.data(), params.cols, raft::CompareApprox<float>(params.tolerance)));
 }
 
 typedef stridedReductionTest<double> stridedReductionTestD;
-TEST_P(stridedReductionTestD, Result) {
-  ASSERT_TRUE(devArrMatch(dots_exp.data(), dots_act.data(), params.cols,
-                          raft::CompareApprox<double>(params.tolerance)));
+TEST_P(stridedReductionTestD, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    dots_exp.data(), dots_act.data(), params.cols, raft::CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestF,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestF, ::testing::ValuesIn(inputsf));
 
-INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestD,
-                        ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_CASE_P(stridedReductionTests, stridedReductionTestD, ::testing::ValuesIn(inputsd));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/subtract.cu b/cpp/test/linalg/subtract.cu
index 0a82da61c9..77c14a8a7b 100644
--- a/cpp/test/linalg/subtract.cu
+++ b/cpp/test/linalg/subtract.cu
@@ -24,39 +24,34 @@ namespace raft {
 namespace linalg {
 
 template <typename Type>
-__global__ void naiveSubtractElemKernel(Type *out, const Type *in1,
-                                        const Type *in2, int len) {
+__global__ void naiveSubtractElemKernel(Type* out, const Type* in1, const Type* in2, int len)
+{
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < len) {
-    out[idx] = in1[idx] - in2[idx];
-  }
+  if (idx < len) { out[idx] = in1[idx] - in2[idx]; }
 }
 
 template <typename Type>
-void naiveSubtractElem(Type *out, const Type *in1, const Type *in2, int len,
-                       cudaStream_t stream) {
+void naiveSubtractElem(Type* out, const Type* in1, const Type* in2, int len, cudaStream_t stream)
+{
   static const int TPB = 64;
-  int nblks = raft::ceildiv(len, TPB);
+  int nblks            = raft::ceildiv(len, TPB);
   naiveSubtractElemKernel<Type><<<nblks, TPB, 0, stream>>>(out, in1, in2, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
 template <typename Type>
-__global__ void naiveSubtractScalarKernel(Type *out, const Type *in1,
-                                          const Type in2, int len) {
+__global__ void naiveSubtractScalarKernel(Type* out, const Type* in1, const Type in2, int len)
+{
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < len) {
-    out[idx] = in1[idx] - in2;
-  }
+  if (idx < len) { out[idx] = in1[idx] - in2; }
 }
 
 template <typename Type>
-void naiveSubtractScalar(Type *out, const Type *in1, const Type in2, int len,
-                         cudaStream_t stream) {
+void naiveSubtractScalar(Type* out, const Type* in1, const Type in2, int len, cudaStream_t stream)
+{
   static const int TPB = 64;
-  int nblks = raft::ceildiv(len, TPB);
-  naiveSubtractScalarKernel<Type>
-    <<<nblks, TPB, 0, stream>>>(out, in1, in2, len);
+  int nblks            = raft::ceildiv(len, TPB);
+  naiveSubtractScalarKernel<Type><<<nblks, TPB, 0, stream>>>(out, in1, in2, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -68,7 +63,8 @@ struct SubtractInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os, const SubtractInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const SubtractInputs<T>& dims)
+{
   return os;
 }
 
@@ -81,10 +77,13 @@ class SubtractTest : public ::testing::TestWithParam<SubtractInputs<T>> {
       in1(params.len, stream),
       in2(params.len, stream),
       out_ref(params.len, stream),
-      out(params.len, stream) {}
+      out(params.len, stream)
+  {
+  }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     raft::random::Rng r(params.seed);
     int len = params.len;
     r.uniform(in1.data(), len, T(-1.0), T(1.0), stream);
@@ -108,35 +107,33 @@ class SubtractTest : public ::testing::TestWithParam<SubtractInputs<T>> {
   rmm::device_uvector<T> in1, in2, out_ref, out;
 };
 
-const std::vector<SubtractInputs<float>> inputsf2 = {
-  {0.000001f, 1024 * 1024, 1234ULL}};
+const std::vector<SubtractInputs<float>> inputsf2 = {{0.000001f, 1024 * 1024, 1234ULL}};
 
-const std::vector<SubtractInputs<double>> inputsd2 = {
-  {0.00000001, 1024 * 1024, 1234ULL}};
+const std::vector<SubtractInputs<double>> inputsd2 = {{0.00000001, 1024 * 1024, 1234ULL}};
 
 typedef SubtractTest<float> SubtractTestF;
-TEST_P(SubtractTestF, Result) {
-  ASSERT_TRUE(raft::devArrMatch(out_ref.data(), out.data(), params.len,
-                                raft::CompareApprox<float>(params.tolerance)));
+TEST_P(SubtractTestF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    out_ref.data(), out.data(), params.len, raft::CompareApprox<float>(params.tolerance)));
 
-  ASSERT_TRUE(raft::devArrMatch(out_ref.data(), in1.data(), params.len,
-                                raft::CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(
+    out_ref.data(), in1.data(), params.len, raft::CompareApprox<float>(params.tolerance)));
 }
 
 typedef SubtractTest<double> SubtractTestD;
-TEST_P(SubtractTestD, Result) {
-  ASSERT_TRUE(raft::devArrMatch(out_ref.data(), out.data(), params.len,
-                                raft::CompareApprox<double>(params.tolerance)));
+TEST_P(SubtractTestD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    out_ref.data(), out.data(), params.len, raft::CompareApprox<double>(params.tolerance)));
 
-  ASSERT_TRUE(raft::devArrMatch(out_ref.data(), in1.data(), params.len,
-                                raft::CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(raft::devArrMatch(
+    out_ref.data(), in1.data(), params.len, raft::CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestF,
-                         ::testing::ValuesIn(inputsf2));
+INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestF, ::testing::ValuesIn(inputsf2));
 
-INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestD,
-                         ::testing::ValuesIn(inputsd2));
+INSTANTIATE_TEST_SUITE_P(SubtractTests, SubtractTestD, ::testing::ValuesIn(inputsd2));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/svd.cu b/cpp/test/linalg/svd.cu
index 8ebbf19683..61c2c2e3db 100644
--- a/cpp/test/linalg/svd.cu
+++ b/cpp/test/linalg/svd.cu
@@ -35,7 +35,8 @@ struct SvdInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os, const SvdInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const SvdInputs<T>& dims)
+{
   return os;
 }
 
@@ -51,10 +52,13 @@ class SvdTest : public ::testing::TestWithParam<SvdInputs<T>> {
       sing_vals_qr(params.n_col, stream),
       left_eig_vectors_ref(params.n_row * params.n_col, stream),
       right_eig_vectors_ref(params.n_col * params.n_col, stream),
-      sing_vals_ref(params.len, stream) {}
+      sing_vals_ref(params.len, stream)
+  {
+  }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     raft::random::Rng r(params.seed);
     int len = params.len;
 
@@ -63,26 +67,30 @@ class SvdTest : public ::testing::TestWithParam<SvdInputs<T>> {
     T data_h[] = {1.0, 4.0, 2.0, 2.0, 5.0, 1.0};
     raft::update_device(data.data(), data_h, len, stream);
 
-    int left_evl = params.n_row * params.n_col;
+    int left_evl  = params.n_row * params.n_col;
     int right_evl = params.n_col * params.n_col;
 
-    T left_eig_vectors_ref_h[] = {-0.308219, -0.906133, -0.289695,
-                                  0.488195,  0.110706,  -0.865685};
+    T left_eig_vectors_ref_h[] = {-0.308219, -0.906133, -0.289695, 0.488195, 0.110706, -0.865685};
 
     T right_eig_vectors_ref_h[] = {-0.638636, -0.769509, -0.769509, 0.638636};
 
     T sing_vals_ref_h[] = {7.065283, 1.040081};
 
-    raft::update_device(left_eig_vectors_ref.data(), left_eig_vectors_ref_h,
-                        left_evl, stream);
-    raft::update_device(right_eig_vectors_ref.data(), right_eig_vectors_ref_h,
-                        right_evl, stream);
-    raft::update_device(sing_vals_ref.data(), sing_vals_ref_h, params.n_col,
-                        stream);
-
-    svdQR(handle, data.data(), params.n_row, params.n_col, sing_vals_qr.data(),
-          left_eig_vectors_qr.data(), right_eig_vectors_trans_qr.data(), true,
-          true, true, stream);
+    raft::update_device(left_eig_vectors_ref.data(), left_eig_vectors_ref_h, left_evl, stream);
+    raft::update_device(right_eig_vectors_ref.data(), right_eig_vectors_ref_h, right_evl, stream);
+    raft::update_device(sing_vals_ref.data(), sing_vals_ref_h, params.n_col, stream);
+
+    svdQR(handle,
+          data.data(),
+          params.n_row,
+          params.n_col,
+          sing_vals_qr.data(),
+          left_eig_vectors_qr.data(),
+          right_eig_vectors_trans_qr.data(),
+          true,
+          true,
+          true,
+          stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
@@ -91,71 +99,75 @@ class SvdTest : public ::testing::TestWithParam<SvdInputs<T>> {
   cudaStream_t stream;
 
   SvdInputs<T> params;
-  rmm::device_uvector<T> data, left_eig_vectors_qr, right_eig_vectors_trans_qr,
-    sing_vals_qr, left_eig_vectors_ref, right_eig_vectors_ref, sing_vals_ref;
+  rmm::device_uvector<T> data, left_eig_vectors_qr, right_eig_vectors_trans_qr, sing_vals_qr,
+    left_eig_vectors_ref, right_eig_vectors_ref, sing_vals_ref;
 };
 
-const std::vector<SvdInputs<float>> inputsf2 = {
-  {0.00001f, 3 * 2, 3, 2, 1234ULL}};
+const std::vector<SvdInputs<float>> inputsf2 = {{0.00001f, 3 * 2, 3, 2, 1234ULL}};
 
-const std::vector<SvdInputs<double>> inputsd2 = {
-  {0.00001, 3 * 2, 3, 2, 1234ULL}};
+const std::vector<SvdInputs<double>> inputsd2 = {{0.00001, 3 * 2, 3, 2, 1234ULL}};
 
 typedef SvdTest<float> SvdTestValF;
-TEST_P(SvdTestValF, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(sing_vals_ref.data(), sing_vals_qr.data(), params.n_col,
-                      raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(SvdTestValF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(sing_vals_ref.data(),
+                                sing_vals_qr.data(),
+                                params.n_col,
+                                raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef SvdTest<double> SvdTestValD;
-TEST_P(SvdTestValD, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(sing_vals_ref.data(), sing_vals_qr.data(), params.n_col,
-                      raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(SvdTestValD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(sing_vals_ref.data(),
+                                sing_vals_qr.data(),
+                                params.n_col,
+                                raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
 typedef SvdTest<float> SvdTestLeftVecF;
-TEST_P(SvdTestLeftVecF, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(left_eig_vectors_ref.data(), left_eig_vectors_qr.data(),
-                      params.n_row * params.n_col,
-                      raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(SvdTestLeftVecF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(left_eig_vectors_ref.data(),
+                                left_eig_vectors_qr.data(),
+                                params.n_row * params.n_col,
+                                raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef SvdTest<double> SvdTestLeftVecD;
-TEST_P(SvdTestLeftVecD, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(left_eig_vectors_ref.data(), left_eig_vectors_qr.data(),
-                      params.n_row * params.n_col,
-                      raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(SvdTestLeftVecD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(left_eig_vectors_ref.data(),
+                                left_eig_vectors_qr.data(),
+                                params.n_row * params.n_col,
+                                raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
 typedef SvdTest<float> SvdTestRightVecF;
-TEST_P(SvdTestRightVecF, Result) {
-  ASSERT_TRUE(raft::devArrMatch(
-    right_eig_vectors_ref.data(), right_eig_vectors_trans_qr.data(),
-    params.n_col * params.n_col,
-    raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(SvdTestRightVecF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(right_eig_vectors_ref.data(),
+                                right_eig_vectors_trans_qr.data(),
+                                params.n_col * params.n_col,
+                                raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef SvdTest<double> SvdTestRightVecD;
-TEST_P(SvdTestRightVecD, Result) {
-  ASSERT_TRUE(raft::devArrMatch(
-    right_eig_vectors_ref.data(), right_eig_vectors_trans_qr.data(),
-    params.n_col * params.n_col,
-    raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(SvdTestRightVecD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(right_eig_vectors_ref.data(),
+                                right_eig_vectors_trans_qr.data(),
+                                params.n_col * params.n_col,
+                                raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
 INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestValF, ::testing::ValuesIn(inputsf2));
 
 INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestValD, ::testing::ValuesIn(inputsd2));
 
-INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecF,
-                         ::testing::ValuesIn(inputsf2));
+INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecF, ::testing::ValuesIn(inputsf2));
 
-INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecD,
-                         ::testing::ValuesIn(inputsd2));
+INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestLeftVecD, ::testing::ValuesIn(inputsd2));
 
 // INSTANTIATE_TEST_SUITE_P(SvdTests, SvdTestRightVecF,
 // ::testing::ValuesIn(inputsf2));
diff --git a/cpp/test/linalg/transpose.cu b/cpp/test/linalg/transpose.cu
index 1d8ef08673..fde5599bc1 100644
--- a/cpp/test/linalg/transpose.cu
+++ b/cpp/test/linalg/transpose.cu
@@ -34,7 +34,8 @@ struct TranposeInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os, const TranposeInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const TranposeInputs<T>& dims)
+{
   return os;
 }
 
@@ -46,10 +47,13 @@ class TransposeTest : public ::testing::TestWithParam<TranposeInputs<T>> {
       stream(handle.get_stream()),
       data(params.len, stream),
       data_trans_ref(params.len, stream),
-      data_trans(params.len, stream) {}
+      data_trans(params.len, stream)
+  {
+  }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     int len = params.len;
     ASSERT(params.len == 9, "This test works only with len=9!");
     T data_h[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0};
@@ -57,8 +61,7 @@ class TransposeTest : public ::testing::TestWithParam<TranposeInputs<T>> {
     T data_ref_h[] = {1.0, 4.0, 7.0, 2.0, 5.0, 8.0, 3.0, 6.0, 9.0};
     raft::update_device(data_trans_ref.data(), data_ref_h, len, stream);
 
-    transpose(handle, data.data(), data_trans.data(), params.n_row,
-              params.n_col, stream);
+    transpose(handle, data.data(), data_trans.data(), params.n_row, params.n_col, stream);
     transpose(data.data(), params.n_row, stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
@@ -71,39 +74,41 @@ class TransposeTest : public ::testing::TestWithParam<TranposeInputs<T>> {
   rmm::device_uvector<T> data, data_trans, data_trans_ref;
 };
 
-const std::vector<TranposeInputs<float>> inputsf2 = {
-  {0.1f, 3 * 3, 3, 3, 1234ULL}};
+const std::vector<TranposeInputs<float>> inputsf2 = {{0.1f, 3 * 3, 3, 3, 1234ULL}};
 
-const std::vector<TranposeInputs<double>> inputsd2 = {
-  {0.1, 3 * 3, 3, 3, 1234ULL}};
+const std::vector<TranposeInputs<double>> inputsd2 = {{0.1, 3 * 3, 3, 3, 1234ULL}};
 
 typedef TransposeTest<float> TransposeTestValF;
-TEST_P(TransposeTestValF, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(data_trans_ref.data(), data_trans.data(), params.len,
-                      raft::CompareApproxAbs<float>(params.tolerance)));
-
-  ASSERT_TRUE(
-    raft::devArrMatch(data_trans_ref.data(), data.data(), params.len,
-                      raft::CompareApproxAbs<float>(params.tolerance)));
+TEST_P(TransposeTestValF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(data_trans_ref.data(),
+                                data_trans.data(),
+                                params.len,
+                                raft::CompareApproxAbs<float>(params.tolerance)));
+
+  ASSERT_TRUE(raft::devArrMatch(data_trans_ref.data(),
+                                data.data(),
+                                params.len,
+                                raft::CompareApproxAbs<float>(params.tolerance)));
 }
 
 typedef TransposeTest<double> TransposeTestValD;
-TEST_P(TransposeTestValD, Result) {
-  ASSERT_TRUE(
-    raft::devArrMatch(data_trans_ref.data(), data_trans.data(), params.len,
-                      raft::CompareApproxAbs<double>(params.tolerance)));
-
-  ASSERT_TRUE(
-    raft::devArrMatch(data_trans_ref.data(), data.data(), params.len,
-                      raft::CompareApproxAbs<double>(params.tolerance)));
+TEST_P(TransposeTestValD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(data_trans_ref.data(),
+                                data_trans.data(),
+                                params.len,
+                                raft::CompareApproxAbs<double>(params.tolerance)));
+
+  ASSERT_TRUE(raft::devArrMatch(data_trans_ref.data(),
+                                data.data(),
+                                params.len,
+                                raft::CompareApproxAbs<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValF,
-                         ::testing::ValuesIn(inputsf2));
+INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValF, ::testing::ValuesIn(inputsf2));
 
-INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValD,
-                         ::testing::ValuesIn(inputsd2));
+INSTANTIATE_TEST_SUITE_P(TransposeTests, TransposeTestValD, ::testing::ValuesIn(inputsd2));
 
 }  // end namespace linalg
 }  // end namespace raft
diff --git a/cpp/test/linalg/unary_op.cu b/cpp/test/linalg/unary_op.cu
index 0fcf465150..ff6723973d 100644
--- a/cpp/test/linalg/unary_op.cu
+++ b/cpp/test/linalg/unary_op.cu
@@ -28,49 +28,49 @@ namespace linalg {
 // for an extended __device__ lambda cannot have private or protected access
 // within its class
 template <typename InType, typename IdxType = int, typename OutType = InType>
-void unaryOpLaunch(OutType *out, const InType *in, InType scalar, IdxType len,
-                   cudaStream_t stream) {
+void unaryOpLaunch(OutType* out, const InType* in, InType scalar, IdxType len, cudaStream_t stream)
+{
   if (in == nullptr) {
     auto op = [scalar] __device__(OutType * ptr, IdxType idx) {
       *ptr = static_cast<OutType>(scalar * idx);
     };
     writeOnlyUnaryOp<OutType, decltype(op), IdxType>(out, len, op, stream);
   } else {
-    auto op = [scalar] __device__(InType in) {
-      return static_cast<OutType>(in * scalar);
-    };
+    auto op = [scalar] __device__(InType in) { return static_cast<OutType>(in * scalar); };
     unaryOp<InType, decltype(op), IdxType, OutType>(out, in, len, op, stream);
   }
 }
 
 template <typename InType, typename IdxType, typename OutType = InType>
-class UnaryOpTest
-  : public ::testing::TestWithParam<UnaryOpInputs<InType, IdxType, OutType>> {
+class UnaryOpTest : public ::testing::TestWithParam<UnaryOpInputs<InType, IdxType, OutType>> {
  public:
   UnaryOpTest()
-    : params(::testing::TestWithParam<
-             UnaryOpInputs<InType, IdxType, OutType>>::GetParam()),
+    : params(::testing::TestWithParam<UnaryOpInputs<InType, IdxType, OutType>>::GetParam()),
       stream(handle.get_stream()),
       in(params.len, stream),
       out_ref(params.len, stream),
-      out(params.len, stream) {}
+      out(params.len, stream)
+  {
+  }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     raft::random::Rng r(params.seed);
     auto len = params.len;
     r.uniform(in.data(), len, InType(-1.0), InType(1.0), stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
-  virtual void DoTest() {
-    auto len = params.len;
+  virtual void DoTest()
+  {
+    auto len    = params.len;
     auto scalar = params.scalar;
     naiveScale(out_ref.data(), in.data(), scalar, len, stream);
     unaryOpLaunch(out.data(), in.data(), scalar, len, stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
-    ASSERT_TRUE(devArrMatch(out_ref.data(), out.data(), params.len,
-                            CompareApprox<OutType>(params.tolerance)));
+    ASSERT_TRUE(devArrMatch(
+      out_ref.data(), out.data(), params.len, CompareApprox<OutType>(params.tolerance)));
   }
 
  protected:
@@ -85,15 +85,15 @@ class UnaryOpTest
 template <typename OutType, typename IdxType>
 class WriteOnlyUnaryOpTest : public UnaryOpTest<OutType, IdxType, OutType> {
  protected:
-  void DoTest() override {
-    auto len = this->params.len;
+  void DoTest() override
+  {
+    auto len    = this->params.len;
     auto scalar = this->params.scalar;
-    naiveScale(this->out_ref.data(), (OutType *)nullptr, scalar, len,
-               this->stream);
-    unaryOpLaunch(this->out.data(), (OutType *)nullptr, scalar, len,
-                  this->stream);
+    naiveScale(this->out_ref.data(), (OutType*)nullptr, scalar, len, this->stream);
+    unaryOpLaunch(this->out.data(), (OutType*)nullptr, scalar, len, this->stream);
     CUDA_CHECK(cudaStreamSynchronize(this->stream));
-    ASSERT_TRUE(devArrMatch(this->out_ref.data(), this->out.data(),
+    ASSERT_TRUE(devArrMatch(this->out_ref.data(),
+                            this->out.data(),
                             this->params.len,
                             CompareApprox<OutType>(this->params.tolerance)));
   }
@@ -103,8 +103,7 @@ class WriteOnlyUnaryOpTest : public UnaryOpTest<OutType, IdxType, OutType> {
   TEST_P(Name, Result) { DoTest(); } \
   INSTANTIATE_TEST_SUITE_P(UnaryOpTests, Name, ::testing::ValuesIn(inputs))
 
-const std::vector<UnaryOpInputs<float, int>> inputsf_i32 = {
-  {0.000001f, 1024 * 1024, 2.f, 1234ULL}};
+const std::vector<UnaryOpInputs<float, int>> inputsf_i32 = {{0.000001f, 1024 * 1024, 2.f, 1234ULL}};
 typedef UnaryOpTest<float, int> UnaryOpTestF_i32;
 UNARY_OP_TEST(UnaryOpTestF_i32, inputsf_i32);
 typedef WriteOnlyUnaryOpTest<float, int> WriteOnlyUnaryOpTestF_i32;
diff --git a/cpp/test/linalg/unary_op.cuh b/cpp/test/linalg/unary_op.cuh
index be3f1124c5..3343389af8 100644
--- a/cpp/test/linalg/unary_op.cuh
+++ b/cpp/test/linalg/unary_op.cuh
@@ -24,8 +24,8 @@ namespace raft {
 namespace linalg {
 
 template <typename InType, typename OutType, typename IdxType>
-__global__ void naiveScaleKernel(OutType *out, const InType *in, InType scalar,
-                                 IdxType len) {
+__global__ void naiveScaleKernel(OutType* out, const InType* in, InType scalar, IdxType len)
+{
   IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * (IdxType)blockDim.x);
   if (idx < len) {
     if (in == nullptr) {
@@ -38,12 +38,11 @@ __global__ void naiveScaleKernel(OutType *out, const InType *in, InType scalar,
 }
 
 template <typename InType, typename IdxType = int, typename OutType = InType>
-void naiveScale(OutType *out, const InType *in, InType scalar, int len,
-                cudaStream_t stream) {
+void naiveScale(OutType* out, const InType* in, InType scalar, int len, cudaStream_t stream)
+{
   static const int TPB = 64;
-  int nblks = raft::ceildiv(len, TPB);
-  naiveScaleKernel<InType, OutType, IdxType>
-    <<<nblks, TPB, 0, stream>>>(out, in, scalar, len);
+  int nblks            = raft::ceildiv(len, TPB);
+  naiveScaleKernel<InType, OutType, IdxType><<<nblks, TPB, 0, stream>>>(out, in, scalar, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
@@ -56,8 +55,8 @@ struct UnaryOpInputs {
 };
 
 template <typename InType, typename IdxType = int, typename OutType = InType>
-::std::ostream &operator<<(::std::ostream &os,
-                           const UnaryOpInputs<InType, IdxType, OutType> &d) {
+::std::ostream& operator<<(::std::ostream& os, const UnaryOpInputs<InType, IdxType, OutType>& d)
+{
   return os;
 }
 
diff --git a/cpp/test/matrix/math.cu b/cpp/test/matrix/math.cu
index 7c7f29815b..7042f5b48d 100644
--- a/cpp/test/matrix/math.cu
+++ b/cpp/test/matrix/math.cu
@@ -24,53 +24,51 @@ namespace raft {
 namespace matrix {
 
 template <typename Type>
-__global__ void nativePowerKernel(Type *in, Type *out, int len) {
+__global__ void nativePowerKernel(Type* in, Type* out, int len)
+{
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < len) {
-    out[idx] = in[idx] * in[idx];
-  }
+  if (idx < len) { out[idx] = in[idx] * in[idx]; }
 }
 
 template <typename Type>
-void naivePower(Type *in, Type *out, int len, cudaStream_t stream) {
+void naivePower(Type* in, Type* out, int len, cudaStream_t stream)
+{
   static const int TPB = 64;
-  int nblks = raft::ceildiv(len, TPB);
+  int nblks            = raft::ceildiv(len, TPB);
   nativePowerKernel<Type><<<nblks, TPB, 0, stream>>>(in, out, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
 template <typename Type>
-__global__ void nativeSqrtKernel(Type *in, Type *out, int len) {
+__global__ void nativeSqrtKernel(Type* in, Type* out, int len)
+{
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < len) {
-    out[idx] = sqrt(in[idx]);
-  }
+  if (idx < len) { out[idx] = sqrt(in[idx]); }
 }
 
 template <typename Type>
-void naiveSqrt(Type *in, Type *out, int len) {
+void naiveSqrt(Type* in, Type* out, int len)
+{
   static const int TPB = 64;
-  int nblks = raft::ceildiv(len, TPB);
+  int nblks            = raft::ceildiv(len, TPB);
   nativeSqrtKernel<Type><<<nblks, TPB>>>(in, out, len);
   CUDA_CHECK(cudaPeekAtLastError());
 }
 
 template <typename Type>
-__global__ void naiveSignFlipKernel(Type *in, Type *out, int rowCount,
-                                    int colCount) {
+__global__ void naiveSignFlipKernel(Type* in, Type* out, int rowCount, int colCount)
+{
   int d_i = blockIdx.x * rowCount;
   int end = d_i + rowCount;
 
   if (blockIdx.x < colCount) {
-    Type max = 0.0;
+    Type max      = 0.0;
     int max_index = 0;
     for (int i = d_i; i < end; i++) {
       Type val = in[i];
-      if (val < 0.0) {
-        val = -val;
-      }
+      if (val < 0.0) { val = -val; }
       if (val > max) {
-        max = val;
+        max       = val;
         max_index = i;
       }
     }
@@ -88,7 +86,8 @@ __global__ void naiveSignFlipKernel(Type *in, Type *out, int rowCount,
 }
 
 template <typename Type>
-void naiveSignFlip(Type *in, Type *out, int rowCount, int colCount) {
+void naiveSignFlip(Type* in, Type* out, int rowCount, int colCount)
+{
   naiveSignFlipKernel<Type><<<colCount, 1>>>(in, out, rowCount, colCount);
   CUDA_CHECK(cudaPeekAtLastError());
 }
@@ -103,7 +102,8 @@ struct MathInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os, const MathInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const MathInputs<T>& dims)
+{
   return os;
 }
 
@@ -126,12 +126,15 @@ class MathTest : public ::testing::TestWithParam<MathInputs<T>> {
       out_recip(4, stream),
       in_smallzero(4, stream),
       out_smallzero(4, stream),
-      out_smallzero_ref(4, stream) {}
+      out_smallzero_ref(4, stream)
+  {
+  }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     random::Rng r(params.seed);
-    int len = params.len;
+    int len         = params.len;
     T in_ratio_h[4] = {1.0, 2.0, 2.0, 3.0};
     update_device(in_ratio.data(), in_ratio_h, 4, stream);
 
@@ -151,12 +154,11 @@ class MathTest : public ::testing::TestWithParam<MathInputs<T>> {
 
     ratio(handle, in_ratio.data(), in_ratio.data(), 4, stream);
 
-    naiveSignFlip(in_sign_flip.data(), out_sign_flip_ref.data(), params.n_row,
-                  params.n_col);
+    naiveSignFlip(in_sign_flip.data(), out_sign_flip_ref.data(), params.n_row, params.n_col);
     signFlip(in_sign_flip.data(), params.n_row, params.n_col, stream);
 
     // default threshold is 1e-15
-    std::vector<T> in_recip_h = {0.1, 0.01, -0.01, 0.1e-16};
+    std::vector<T> in_recip_h     = {0.1, 0.01, -0.01, 0.1e-16};
     std::vector<T> in_recip_ref_h = {10.0, 100.0, -100.0, 0.0};
     update_device(in_recip.data(), in_recip_h.data(), 4, stream);
     update_device(in_recip_ref.data(), in_recip_ref_h.data(), 4, stream);
@@ -167,12 +169,11 @@ class MathTest : public ::testing::TestWithParam<MathInputs<T>> {
 
     reciprocal(in_recip.data(), recip_scalar, 4, stream, true);
 
-    std::vector<T> in_small_val_zero_h = {0.1, 1e-16, -1e-16, -0.1};
+    std::vector<T> in_small_val_zero_h     = {0.1, 1e-16, -1e-16, -0.1};
     std::vector<T> in_small_val_zero_ref_h = {0.1, 0.0, 0.0, -0.1};
 
     update_device(in_smallzero.data(), in_small_val_zero_h.data(), 4, stream);
-    update_device(out_smallzero_ref.data(), in_small_val_zero_ref_h.data(), 4,
-                  stream);
+    update_device(out_smallzero_ref.data(), in_small_val_zero_ref_h.data(), 4, stream);
     setSmallValuesZero(out_smallzero.data(), in_smallzero.data(), 4, stream);
     setSmallValuesZero(in_smallzero.data(), 4, stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
@@ -183,137 +184,139 @@ class MathTest : public ::testing::TestWithParam<MathInputs<T>> {
   cudaStream_t stream;
 
   MathInputs<T> params;
-  rmm::device_uvector<T> in_power, out_power_ref, in_sqrt, out_sqrt_ref,
-    in_ratio, out_ratio_ref, in_sign_flip, out_sign_flip_ref, in_recip,
-    in_recip_ref, out_recip, in_smallzero, out_smallzero, out_smallzero_ref;
+  rmm::device_uvector<T> in_power, out_power_ref, in_sqrt, out_sqrt_ref, in_ratio, out_ratio_ref,
+    in_sign_flip, out_sign_flip_ref, in_recip, in_recip_ref, out_recip, in_smallzero, out_smallzero,
+    out_smallzero_ref;
 };
 
-const std::vector<MathInputs<float>> inputsf = {
-  {0.00001f, 1024, 1024, 1024 * 1024, 1234ULL}};
+const std::vector<MathInputs<float>> inputsf = {{0.00001f, 1024, 1024, 1024 * 1024, 1234ULL}};
 
-const std::vector<MathInputs<double>> inputsd = {
-  {0.00001, 1024, 1024, 1024 * 1024, 1234ULL}};
+const std::vector<MathInputs<double>> inputsd = {{0.00001, 1024, 1024, 1024 * 1024, 1234ULL}};
 
 typedef MathTest<float> MathPowerTestF;
-TEST_P(MathPowerTestF, Result) {
-  ASSERT_TRUE(devArrMatch(in_power.data(), out_power_ref.data(), params.len,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(MathPowerTestF, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    in_power.data(), out_power_ref.data(), params.len, CompareApprox<float>(params.tolerance)));
 }
 
 typedef MathTest<double> MathPowerTestD;
-TEST_P(MathPowerTestD, Result) {
-  ASSERT_TRUE(devArrMatch(in_power.data(), out_power_ref.data(), params.len,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(MathPowerTestD, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    in_power.data(), out_power_ref.data(), params.len, CompareApprox<double>(params.tolerance)));
 }
 
 typedef MathTest<float> MathSqrtTestF;
-TEST_P(MathSqrtTestF, Result) {
-  ASSERT_TRUE(devArrMatch(in_sqrt.data(), out_sqrt_ref.data(), params.len,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(MathSqrtTestF, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    in_sqrt.data(), out_sqrt_ref.data(), params.len, CompareApprox<float>(params.tolerance)));
 }
 
 typedef MathTest<double> MathSqrtTestD;
-TEST_P(MathSqrtTestD, Result) {
-  ASSERT_TRUE(devArrMatch(in_sqrt.data(), out_sqrt_ref.data(), params.len,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(MathSqrtTestD, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    in_sqrt.data(), out_sqrt_ref.data(), params.len, CompareApprox<double>(params.tolerance)));
 }
 
 typedef MathTest<float> MathRatioTestF;
-TEST_P(MathRatioTestF, Result) {
-  ASSERT_TRUE(devArrMatch(in_ratio.data(), out_ratio_ref.data(), 4,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(MathRatioTestF, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(in_ratio.data(), out_ratio_ref.data(), 4, CompareApprox<float>(params.tolerance)));
 }
 
 typedef MathTest<double> MathRatioTestD;
-TEST_P(MathRatioTestD, Result) {
-  ASSERT_TRUE(devArrMatch(in_ratio.data(), out_ratio_ref.data(), 4,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(MathRatioTestD, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(in_ratio.data(), out_ratio_ref.data(), 4, CompareApprox<double>(params.tolerance)));
 }
 
 typedef MathTest<float> MathSignFlipTestF;
-TEST_P(MathSignFlipTestF, Result) {
-  ASSERT_TRUE(devArrMatch(in_sign_flip.data(), out_sign_flip_ref.data(),
-                          params.len, CompareApprox<float>(params.tolerance)));
+TEST_P(MathSignFlipTestF, Result)
+{
+  ASSERT_TRUE(devArrMatch(in_sign_flip.data(),
+                          out_sign_flip_ref.data(),
+                          params.len,
+                          CompareApprox<float>(params.tolerance)));
 }
 
 typedef MathTest<double> MathSignFlipTestD;
-TEST_P(MathSignFlipTestD, Result) {
-  ASSERT_TRUE(devArrMatch(in_sign_flip.data(), out_sign_flip_ref.data(),
-                          params.len, CompareApprox<double>(params.tolerance)));
+TEST_P(MathSignFlipTestD, Result)
+{
+  ASSERT_TRUE(devArrMatch(in_sign_flip.data(),
+                          out_sign_flip_ref.data(),
+                          params.len,
+                          CompareApprox<double>(params.tolerance)));
 }
 
 typedef MathTest<float> MathReciprocalTestF;
-TEST_P(MathReciprocalTestF, Result) {
-  ASSERT_TRUE(devArrMatch(in_recip.data(), in_recip_ref.data(), 4,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(MathReciprocalTestF, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(in_recip.data(), in_recip_ref.data(), 4, CompareApprox<float>(params.tolerance)));
 
   // 4-th term tests `setzero=true` functionality, not present in this version of `reciprocal`.
-  ASSERT_TRUE(devArrMatch(out_recip.data(), in_recip_ref.data(), 3,
-                          CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(
+    devArrMatch(out_recip.data(), in_recip_ref.data(), 3, CompareApprox<float>(params.tolerance)));
 }
 
 typedef MathTest<double> MathReciprocalTestD;
-TEST_P(MathReciprocalTestD, Result) {
-  ASSERT_TRUE(devArrMatch(in_recip.data(), in_recip_ref.data(), 4,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(MathReciprocalTestD, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(in_recip.data(), in_recip_ref.data(), 4, CompareApprox<double>(params.tolerance)));
 
   // 4-th term tests `setzero=true` functionality, not present in this version of `reciprocal`.
-  ASSERT_TRUE(devArrMatch(out_recip.data(), in_recip_ref.data(), 3,
-                          CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(
+    devArrMatch(out_recip.data(), in_recip_ref.data(), 3, CompareApprox<double>(params.tolerance)));
 }
 
 typedef MathTest<float> MathSetSmallZeroTestF;
-TEST_P(MathSetSmallZeroTestF, Result) {
-  ASSERT_TRUE(devArrMatch(in_smallzero.data(), out_smallzero_ref.data(), 4,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(MathSetSmallZeroTestF, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    in_smallzero.data(), out_smallzero_ref.data(), 4, CompareApprox<float>(params.tolerance)));
 
-  ASSERT_TRUE(devArrMatch(out_smallzero.data(), out_smallzero_ref.data(), 4,
-                          CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(
+    out_smallzero.data(), out_smallzero_ref.data(), 4, CompareApprox<float>(params.tolerance)));
 }
 
 typedef MathTest<double> MathSetSmallZeroTestD;
-TEST_P(MathSetSmallZeroTestD, Result) {
-  ASSERT_TRUE(devArrMatch(in_smallzero.data(), out_smallzero_ref.data(), 4,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(MathSetSmallZeroTestD, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    in_smallzero.data(), out_smallzero_ref.data(), 4, CompareApprox<double>(params.tolerance)));
 
-  ASSERT_TRUE(devArrMatch(out_smallzero.data(), out_smallzero_ref.data(), 4,
-                          CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(
+    out_smallzero.data(), out_smallzero_ref.data(), 4, CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestF,
-                         ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestF, ::testing::ValuesIn(inputsf));
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestD,
-                         ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathPowerTestD, ::testing::ValuesIn(inputsd));
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestF,
-                         ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestF, ::testing::ValuesIn(inputsf));
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestD,
-                         ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathSqrtTestD, ::testing::ValuesIn(inputsd));
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestF,
-                         ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestF, ::testing::ValuesIn(inputsf));
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestD,
-                         ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathRatioTestD, ::testing::ValuesIn(inputsd));
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestF,
-                         ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestF, ::testing::ValuesIn(inputsf));
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestD,
-                         ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathSignFlipTestD, ::testing::ValuesIn(inputsd));
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestF,
-                         ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestF, ::testing::ValuesIn(inputsf));
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestD,
-                         ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathReciprocalTestD, ::testing::ValuesIn(inputsd));
 
-INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestF,
-                         ::testing::ValuesIn(inputsf));
-INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestD,
-                         ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestF, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(MathTests, MathSetSmallZeroTestD, ::testing::ValuesIn(inputsd));
 
 }  // namespace matrix
 }  // namespace raft
diff --git a/cpp/test/matrix/matrix.cu b/cpp/test/matrix/matrix.cu
index e247abad1e..6f052f7b46 100644
--- a/cpp/test/matrix/matrix.cu
+++ b/cpp/test/matrix/matrix.cu
@@ -33,7 +33,8 @@ struct MatrixInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os, const MatrixInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const MatrixInputs<T>& dims)
+{
   return os;
 }
 
@@ -45,10 +46,13 @@ class MatrixTest : public ::testing::TestWithParam<MatrixInputs<T>> {
       stream(handle.get_stream()),
       in1(params.n_row * params.n_col, stream),
       in2(params.n_row * params.n_col, stream),
-      in1_revr(params.n_row * params.n_col, stream) {}
+      in1_revr(params.n_row * params.n_col, stream)
+  {
+  }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     raft::random::Rng r(params.seed);
     int len = params.n_row * params.n_col;
     r.uniform(in1.data(), len, T(-1.0), T(1.0), stream);
@@ -72,87 +76,84 @@ class MatrixTest : public ::testing::TestWithParam<MatrixInputs<T>> {
 
 const std::vector<MatrixInputs<float>> inputsf2 = {{0.000001f, 4, 4, 1234ULL}};
 
-const std::vector<MatrixInputs<double>> inputsd2 = {
-  {0.00000001, 4, 4, 1234ULL}};
+const std::vector<MatrixInputs<double>> inputsd2 = {{0.00000001, 4, 4, 1234ULL}};
 
 typedef MatrixTest<float> MatrixTestF;
-TEST_P(MatrixTestF, Result) {
-  ASSERT_TRUE(raft::devArrMatch(in1.data(), in2.data(),
+TEST_P(MatrixTestF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(in1.data(),
+                                in2.data(),
                                 params.n_row * params.n_col,
                                 raft::CompareApprox<float>(params.tolerance)));
 }
 
 typedef MatrixTest<double> MatrixTestD;
-TEST_P(MatrixTestD, Result) {
-  ASSERT_TRUE(raft::devArrMatch(in1.data(), in2.data(),
+TEST_P(MatrixTestD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(in1.data(),
+                                in2.data(),
                                 params.n_row * params.n_col,
                                 raft::CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestF,
-                         ::testing::ValuesIn(inputsf2));
+INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestF, ::testing::ValuesIn(inputsf2));
 
-INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestD,
-                         ::testing::ValuesIn(inputsd2));
+INSTANTIATE_TEST_SUITE_P(MatrixTests, MatrixTestD, ::testing::ValuesIn(inputsd2));
 
 template <typename T>
 class MatrixCopyRowsTest : public ::testing::Test {
-  using math_t = typename std::tuple_element<0, T>::type;
-  using idx_t = typename std::tuple_element<1, T>::type;
+  using math_t      = typename std::tuple_element<0, T>::type;
+  using idx_t       = typename std::tuple_element<1, T>::type;
   using idx_array_t = typename std::tuple_element<2, T>::type;
 
  protected:
   MatrixCopyRowsTest()
     : input(n_cols * n_rows, handle.get_stream()),
       indices(n_selected, handle.get_stream()),
-      output(n_cols * n_selected, handle.get_stream()) {
+      output(n_cols * n_selected, handle.get_stream())
+  {
     CUDA_CHECK(cudaStreamCreate(&stream));
     handle.set_stream(stream);
     raft::update_device(indices.data(), indices_host, n_selected, stream);
     // Init input array
     thrust::counting_iterator<idx_t> first(0);
     thrust::device_ptr<math_t> ptr(input.data());
-    thrust::copy(handle.get_thrust_policy(), first, first + n_cols * n_rows,
-                 ptr);
+    thrust::copy(handle.get_thrust_policy(), first, first + n_cols * n_rows, ptr);
   }
 
   void TearDown() override { CUDA_CHECK(cudaStreamDestroy(stream)); }
 
-  void testCopyRows() {
-    copyRows(input.data(), n_rows, n_cols, output.data(), indices.data(),
-             n_selected, stream, false);
-    EXPECT_TRUE(raft::devArrMatchHost(output_exp_colmajor, output.data(),
-                                      n_selected * n_cols,
-                                      raft::Compare<math_t>()));
-    copyRows(input.data(), n_rows, n_cols, output.data(), indices.data(),
-             n_selected, stream, true);
-    EXPECT_TRUE(raft::devArrMatchHost(output_exp_rowmajor, output.data(),
-                                      n_selected * n_cols,
-                                      raft::Compare<math_t>()));
+  void testCopyRows()
+  {
+    copyRows(
+      input.data(), n_rows, n_cols, output.data(), indices.data(), n_selected, stream, false);
+    EXPECT_TRUE(raft::devArrMatchHost(
+      output_exp_colmajor, output.data(), n_selected * n_cols, raft::Compare<math_t>()));
+    copyRows(input.data(), n_rows, n_cols, output.data(), indices.data(), n_selected, stream, true);
+    EXPECT_TRUE(raft::devArrMatchHost(
+      output_exp_rowmajor, output.data(), n_selected * n_cols, raft::Compare<math_t>()));
   }
 
  protected:
   raft::handle_t handle;
   cudaStream_t stream;
 
-  int n_rows = 10;
-  int n_cols = 3;
+  int n_rows     = 10;
+  int n_cols     = 3;
   int n_selected = 5;
 
-  idx_array_t indices_host[5] = {0, 3, 4, 7, 9};
-  math_t output_exp_colmajor[15] = {0,  3,  4,  7,  9,  10, 13, 14,
-                                    17, 19, 20, 23, 24, 27, 29};
-  math_t output_exp_rowmajor[15] = {0,  1,  2,  9,  10, 11, 12, 13,
-                                    14, 21, 22, 23, 27, 28, 29};
+  idx_array_t indices_host[5]    = {0, 3, 4, 7, 9};
+  math_t output_exp_colmajor[15] = {0, 3, 4, 7, 9, 10, 13, 14, 17, 19, 20, 23, 24, 27, 29};
+  math_t output_exp_rowmajor[15] = {0, 1, 2, 9, 10, 11, 12, 13, 14, 21, 22, 23, 27, 28, 29};
   rmm::device_uvector<math_t> input;
   rmm::device_uvector<math_t> output;
   rmm::device_uvector<idx_array_t> indices;
 };
 
-using TypeTuple =
-  ::testing::Types<std::tuple<float, int, int>, std::tuple<float, int64_t, int>,
-                   std::tuple<double, int, int>,
-                   std::tuple<double, int64_t, int>>;
+using TypeTuple = ::testing::Types<std::tuple<float, int, int>,
+                                   std::tuple<float, int64_t, int>,
+                                   std::tuple<double, int, int>,
+                                   std::tuple<double, int64_t, int>>;
 
 TYPED_TEST_CASE(MatrixCopyRowsTest, TypeTuple);
 TYPED_TEST(MatrixCopyRowsTest, CopyRows) { this->testCopyRows(); }
diff --git a/cpp/test/mr/device/buffer.cpp b/cpp/test/mr/device/buffer.cpp
index fe42cea8b3..5cfcc910fd 100644
--- a/cpp/test/mr/device/buffer.cpp
+++ b/cpp/test/mr/device/buffer.cpp
@@ -25,7 +25,8 @@ namespace raft {
 namespace mr {
 namespace device {
 
-TEST(Raft, DeviceBufferAlloc) {
+TEST(Raft, DeviceBufferAlloc)
+{
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
   // no allocation at construction
@@ -51,13 +52,14 @@ TEST(Raft, DeviceBufferAlloc) {
   CUDA_CHECK(cudaStreamDestroy(stream));
 }
 
-TEST(Raft, DeviceBufferZeroResize) {
+TEST(Raft, DeviceBufferZeroResize)
+{
   // Create a limiting_resource_adaptor to track allocations
-  auto curr_mr = dynamic_cast<rmm::mr::cuda_memory_resource*>(
-    rmm::mr::get_current_device_resource());
-  auto limit_mr = std::make_shared<
-    rmm::mr::limiting_resource_adaptor<rmm::mr::cuda_memory_resource>>(curr_mr,
-                                                                       1000);
+  auto curr_mr =
+    dynamic_cast<rmm::mr::cuda_memory_resource*>(rmm::mr::get_current_device_resource());
+  auto limit_mr =
+    std::make_shared<rmm::mr::limiting_resource_adaptor<rmm::mr::cuda_memory_resource>>(curr_mr,
+                                                                                        1000);
 
   rmm::mr::set_current_device_resource(limit_mr.get());
 
diff --git a/cpp/test/mr/host/buffer.cpp b/cpp/test/mr/host/buffer.cpp
index 953f65ddfb..aadf05285c 100644
--- a/cpp/test/mr/host/buffer.cpp
+++ b/cpp/test/mr/host/buffer.cpp
@@ -24,7 +24,8 @@ namespace raft {
 namespace mr {
 namespace host {
 
-TEST(Raft, HostBuffer) {
+TEST(Raft, HostBuffer)
+{
   auto alloc = std::make_shared<default_allocator>();
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
@@ -51,14 +52,14 @@ TEST(Raft, HostBuffer) {
   CUDA_CHECK(cudaStreamDestroy(stream));
 }
 
-TEST(Raft, DeviceToHostBuffer) {
+TEST(Raft, DeviceToHostBuffer)
+{
   auto d_alloc = std::make_shared<device::default_allocator>();
   auto h_alloc = std::make_shared<default_allocator>();
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
   device::buffer<char> d_buff(d_alloc, stream, 32);
-  CUDA_CHECK(
-    cudaMemsetAsync(d_buff.data(), 0, sizeof(char) * d_buff.size(), stream));
+  CUDA_CHECK(cudaMemsetAsync(d_buff.data(), 0, sizeof(char) * d_buff.size(), stream));
   buffer<char> h_buff(h_alloc, d_buff);
   ASSERT_EQ(d_buff.size(), h_buff.size());
   CUDA_CHECK(cudaStreamSynchronize(stream));
diff --git a/cpp/test/mst.cu b/cpp/test/mst.cu
index 781e6d1d3f..90a6d7bd87 100644
--- a/cpp/test/mst.cu
+++ b/cpp/test/mst.cu
@@ -61,7 +61,8 @@ namespace mst {
 // Sequential prims function
 // Returns total weight of MST
 template <typename vertex_t, typename edge_t, typename weight_t>
-weight_t prims(CSRHost<vertex_t, edge_t, weight_t> &csr_h) {
+weight_t prims(CSRHost<vertex_t, edge_t, weight_t>& csr_h)
+{
   std::size_t n_vertices = csr_h.offsets.size() - 1;
 
   bool active_vertex[n_vertices];
@@ -70,19 +71,18 @@ weight_t prims(CSRHost<vertex_t, edge_t, weight_t> &csr_h) {
 
   for (std::size_t i = 0; i < n_vertices; i++) {
     active_vertex[i] = false;
-    curr_edge[i] = static_cast<weight_t>(std::numeric_limits<int>::max());
+    curr_edge[i]     = static_cast<weight_t>(std::numeric_limits<int>::max());
   }
   curr_edge[0] = 0;
 
   // function to pick next min vertex-edge
-  auto min_vertex_edge = [](auto *curr_edge, auto *active_vertex,
-                            auto n_vertices) {
+  auto min_vertex_edge = [](auto* curr_edge, auto* active_vertex, auto n_vertices) {
     auto min = static_cast<weight_t>(std::numeric_limits<int>::max());
     vertex_t min_vertex{};
 
     for (std::size_t v = 0; v < n_vertices; v++) {
       if (!active_vertex[v] && curr_edge[v] < min) {
-        min = curr_edge[v];
+        min        = curr_edge[v];
         min_vertex = v;
       }
     }
@@ -98,14 +98,13 @@ weight_t prims(CSRHost<vertex_t, edge_t, weight_t> &csr_h) {
     active_vertex[curr_v] = true;  // set to active
 
     // iterate through edges of current active vertex
-    auto edge_st = csr_h.offsets[curr_v];
+    auto edge_st  = csr_h.offsets[curr_v];
     auto edge_end = csr_h.offsets[curr_v + 1];
 
     for (auto e = edge_st; e < edge_end; e++) {
       // put edges to be considered for next iteration
       auto neighbor_idx = csr_h.indices[e];
-      if (!active_vertex[neighbor_idx] &&
-          csr_h.weights[e] < curr_edge[neighbor_idx]) {
+      if (!active_vertex[neighbor_idx] && csr_h.weights[e] < curr_edge[neighbor_idx]) {
         curr_edge[neighbor_idx] = csr_h.weights[e];
       }
     }
@@ -121,15 +120,15 @@ weight_t prims(CSRHost<vertex_t, edge_t, weight_t> &csr_h) {
 }
 
 template <typename vertex_t, typename edge_t, typename weight_t>
-class MSTTest
-  : public ::testing::TestWithParam<MSTTestInput<vertex_t, edge_t, weight_t>> {
+class MSTTest : public ::testing::TestWithParam<MSTTestInput<vertex_t, edge_t, weight_t>> {
  protected:
   std::pair<raft::Graph_COO<vertex_t, edge_t, weight_t>,
             raft::Graph_COO<vertex_t, edge_t, weight_t>>
-  mst_gpu() {
-    edge_t *offsets = static_cast<edge_t *>(csr_d.offsets.data());
-    vertex_t *indices = static_cast<vertex_t *>(csr_d.indices.data());
-    weight_t *weights = static_cast<weight_t *>(csr_d.weights.data());
+  mst_gpu()
+  {
+    edge_t* offsets   = static_cast<edge_t*>(csr_d.offsets.data());
+    vertex_t* indices = static_cast<vertex_t*>(csr_d.indices.data());
+    weight_t* weights = static_cast<weight_t*>(csr_d.weights.data());
 
     v = static_cast<vertex_t>((csr_d.offsets.size() / sizeof(vertex_t)) - 1);
     e = static_cast<edge_t>(csr_d.indices.size() / sizeof(edge_t));
@@ -138,89 +137,95 @@ class MSTTest
     rmm::device_uvector<vertex_t> mst_dst(2 * v - 2, handle.get_stream());
     rmm::device_uvector<vertex_t> color(v, handle.get_stream());
 
-    CUDA_CHECK(
-      cudaMemsetAsync(mst_src.data(), std::numeric_limits<vertex_t>::max(),
-                      mst_src.size() * sizeof(vertex_t), handle.get_stream()));
-    CUDA_CHECK(
-      cudaMemsetAsync(mst_dst.data(), std::numeric_limits<vertex_t>::max(),
-                      mst_dst.size() * sizeof(vertex_t), handle.get_stream()));
-    CUDA_CHECK(cudaMemsetAsync(color.data(), 0, color.size() * sizeof(vertex_t),
+    CUDA_CHECK(cudaMemsetAsync(mst_src.data(),
+                               std::numeric_limits<vertex_t>::max(),
+                               mst_src.size() * sizeof(vertex_t),
+                               handle.get_stream()));
+    CUDA_CHECK(cudaMemsetAsync(mst_dst.data(),
+                               std::numeric_limits<vertex_t>::max(),
+                               mst_dst.size() * sizeof(vertex_t),
                                handle.get_stream()));
+    CUDA_CHECK(
+      cudaMemsetAsync(color.data(), 0, color.size() * sizeof(vertex_t), handle.get_stream()));
 
-    vertex_t *color_ptr = thrust::raw_pointer_cast(color.data());
+    vertex_t* color_ptr = thrust::raw_pointer_cast(color.data());
 
     if (iterations == 0) {
       MST_solver<vertex_t, edge_t, weight_t, float> symmetric_solver(
-        handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(),
-        true, true, 0);
+        handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), true, true, 0);
       auto symmetric_result = symmetric_solver.solve();
 
       MST_solver<vertex_t, edge_t, weight_t, float> non_symmetric_solver(
-        handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(),
-        false, true, 0);
+        handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), false, true, 0);
       auto non_symmetric_result = non_symmetric_solver.solve();
 
       EXPECT_LE(symmetric_result.n_edges, 2 * v - 2);
       EXPECT_LE(non_symmetric_result.n_edges, v - 1);
 
-      return std::make_pair(std::move(symmetric_result),
-                            std::move(non_symmetric_result));
+      return std::make_pair(std::move(symmetric_result), std::move(non_symmetric_result));
     } else {
-      MST_solver<vertex_t, edge_t, weight_t, float> intermediate_solver(
-        handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(),
-        true, true, iterations);
+      MST_solver<vertex_t, edge_t, weight_t, float> intermediate_solver(handle,
+                                                                        offsets,
+                                                                        indices,
+                                                                        weights,
+                                                                        v,
+                                                                        e,
+                                                                        color_ptr,
+                                                                        handle.get_stream(),
+                                                                        true,
+                                                                        true,
+                                                                        iterations);
       auto intermediate_result = intermediate_solver.solve();
 
       MST_solver<vertex_t, edge_t, weight_t, float> symmetric_solver(
-        handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(),
-        true, false, 0);
+        handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), true, false, 0);
       auto symmetric_result = symmetric_solver.solve();
 
       // symmetric_result.n_edges += intermediate_result.n_edges;
-      auto total_edge_size =
-        symmetric_result.n_edges + intermediate_result.n_edges;
+      auto total_edge_size = symmetric_result.n_edges + intermediate_result.n_edges;
       symmetric_result.src.resize(total_edge_size, handle.get_stream());
       symmetric_result.dst.resize(total_edge_size, handle.get_stream());
       symmetric_result.weights.resize(total_edge_size, handle.get_stream());
 
       raft::copy(symmetric_result.src.data() + symmetric_result.n_edges,
-                 intermediate_result.src.data(), intermediate_result.n_edges,
+                 intermediate_result.src.data(),
+                 intermediate_result.n_edges,
                  handle.get_stream());
       raft::copy(symmetric_result.dst.data() + symmetric_result.n_edges,
-                 intermediate_result.dst.data(), intermediate_result.n_edges,
+                 intermediate_result.dst.data(),
+                 intermediate_result.n_edges,
                  handle.get_stream());
       raft::copy(symmetric_result.weights.data() + symmetric_result.n_edges,
                  intermediate_result.weights.data(),
-                 intermediate_result.n_edges, handle.get_stream());
+                 intermediate_result.n_edges,
+                 handle.get_stream());
       symmetric_result.n_edges = total_edge_size;
 
       MST_solver<vertex_t, edge_t, weight_t, float> non_symmetric_solver(
-        handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(),
-        false, true, 0);
+        handle, offsets, indices, weights, v, e, color_ptr, handle.get_stream(), false, true, 0);
       auto non_symmetric_result = non_symmetric_solver.solve();
 
       EXPECT_LE(symmetric_result.n_edges, 2 * v - 2);
       EXPECT_LE(non_symmetric_result.n_edges, v - 1);
 
-      return std::make_pair(std::move(symmetric_result),
-                            std::move(non_symmetric_result));
+      return std::make_pair(std::move(symmetric_result), std::move(non_symmetric_result));
     }
   }
 
-  void SetUp() override {
-    mst_input = ::testing::TestWithParam<
-      MSTTestInput<vertex_t, edge_t, weight_t>>::GetParam();
+  void SetUp() override
+  {
+    mst_input  = ::testing::TestWithParam<MSTTestInput<vertex_t, edge_t, weight_t>>::GetParam();
     iterations = mst_input.iterations;
 
-    csr_d.offsets = rmm::device_buffer(
-      mst_input.csr_h.offsets.data(),
-      mst_input.csr_h.offsets.size() * sizeof(edge_t), handle.get_stream());
-    csr_d.indices = rmm::device_buffer(
-      mst_input.csr_h.indices.data(),
-      mst_input.csr_h.indices.size() * sizeof(vertex_t), handle.get_stream());
-    csr_d.weights = rmm::device_buffer(
-      mst_input.csr_h.weights.data(),
-      mst_input.csr_h.weights.size() * sizeof(weight_t), handle.get_stream());
+    csr_d.offsets = rmm::device_buffer(mst_input.csr_h.offsets.data(),
+                                       mst_input.csr_h.offsets.size() * sizeof(edge_t),
+                                       handle.get_stream());
+    csr_d.indices = rmm::device_buffer(mst_input.csr_h.indices.data(),
+                                       mst_input.csr_h.indices.size() * sizeof(vertex_t),
+                                       handle.get_stream());
+    csr_d.weights = rmm::device_buffer(mst_input.csr_h.weights.data(),
+                                       mst_input.csr_h.weights.size() * sizeof(weight_t),
+                                       handle.get_stream());
   }
 
   void TearDown() override {}
@@ -272,41 +277,68 @@ const std::vector<MSTTestInput<int, int, float>> csr_in_h = {
 const std::vector<CSRHost<int, int, float>> csr_in4_h = {
   {{0, 3, 5, 8, 10, 12, 14, 16},
    {2, 4, 5, 3, 6, 0, 4, 5, 1, 6, 0, 2, 0, 2, 1, 3},
-   {5.0f, 9.0f, 1.0f, 8.0f, 7.0f, 5.0f, 2.0f, 6.0f, 8.0f, 10.0f, 9.0f, 2.0f,
-    1.0f, 6.0f, 7.0f, 10.0f}}};
+   {5.0f,
+    9.0f,
+    1.0f,
+    8.0f,
+    7.0f,
+    5.0f,
+    2.0f,
+    6.0f,
+    8.0f,
+    10.0f,
+    9.0f,
+    2.0f,
+    1.0f,
+    6.0f,
+    7.0f,
+    10.0f}}};
 
 //  singletons
 const std::vector<CSRHost<int, int, float>> csr_in5_h = {
   {{0, 3, 5, 8, 10, 10, 10, 12, 14, 16, 16},
    {2, 8, 7, 3, 8, 0, 8, 7, 1, 8, 0, 2, 0, 2, 1, 3},
-   {5.0f, 9.0f, 1.0f, 8.0f, 7.0f, 5.0f, 2.0f, 6.0f, 8.0f, 10.0f, 9.0f, 2.0f,
-    1.0f, 6.0f, 7.0f, 10.0f}}};
+   {5.0f,
+    9.0f,
+    1.0f,
+    8.0f,
+    7.0f,
+    5.0f,
+    2.0f,
+    6.0f,
+    8.0f,
+    10.0f,
+    9.0f,
+    2.0f,
+    1.0f,
+    6.0f,
+    7.0f,
+    10.0f}}};
 
 typedef MSTTest<int, int, float> MSTTestSequential;
-TEST_P(MSTTestSequential, Sequential) {
-  auto results_pair = mst_gpu();
-  auto &symmetric_result = results_pair.first;
-  auto &non_symmetric_result = results_pair.second;
+TEST_P(MSTTestSequential, Sequential)
+{
+  auto results_pair          = mst_gpu();
+  auto& symmetric_result     = results_pair.first;
+  auto& non_symmetric_result = results_pair.second;
 
   // do assertions here
   // in this case, running sequential MST
   auto prims_result = prims(mst_input.csr_h);
 
-  auto symmetric_sum =
-    thrust::reduce(thrust::device, symmetric_result.weights.data(),
-                   symmetric_result.weights.data() + symmetric_result.n_edges);
-  auto non_symmetric_sum = thrust::reduce(
-    thrust::device, non_symmetric_result.weights.data(),
-    non_symmetric_result.weights.data() + non_symmetric_result.n_edges);
-
-  ASSERT_TRUE(raft::match(2 * prims_result, symmetric_sum,
-                          raft::CompareApprox<float>(0.1)));
-  ASSERT_TRUE(raft::match(prims_result, non_symmetric_sum,
-                          raft::CompareApprox<float>(0.1)));
+  auto symmetric_sum = thrust::reduce(thrust::device,
+                                      symmetric_result.weights.data(),
+                                      symmetric_result.weights.data() + symmetric_result.n_edges);
+  auto non_symmetric_sum =
+    thrust::reduce(thrust::device,
+                   non_symmetric_result.weights.data(),
+                   non_symmetric_result.weights.data() + non_symmetric_result.n_edges);
+
+  ASSERT_TRUE(raft::match(2 * prims_result, symmetric_sum, raft::CompareApprox<float>(0.1)));
+  ASSERT_TRUE(raft::match(prims_result, non_symmetric_sum, raft::CompareApprox<float>(0.1)));
 }
 
-INSTANTIATE_TEST_SUITE_P(MSTTests, MSTTestSequential,
-                         ::testing::ValuesIn(csr_in_h));
+INSTANTIATE_TEST_SUITE_P(MSTTests, MSTTestSequential, ::testing::ValuesIn(csr_in_h));
 
 }  // namespace mst
 }  // namespace raft
diff --git a/cpp/test/pow2_utils.cu b/cpp/test/pow2_utils.cu
index 92976e5c61..c76064ade7 100644
--- a/cpp/test/pow2_utils.cu
+++ b/cpp/test/pow2_utils.cu
@@ -24,7 +24,8 @@ struct Pow2Test : public ::testing::Test {
   typedef Pow2<Val> P;
   std::vector<TargetT> data;
 
-  void SetUp() override {
+  void SetUp() override
+  {
     std::vector<TargetT> pos = {0, 1, 2, 7, 15, 16, 17, 31, 35, 1024, 1623};
     data.insert(data.end(), pos.begin(), pos.end());
     if constexpr (std::is_signed<TargetT>::value) {
@@ -35,7 +36,8 @@ struct Pow2Test : public ::testing::Test {
     data.push_back(std::numeric_limits<TargetT>::max());
   }
 
-  void quotRem() {
+  void quotRem()
+  {
     for (auto x : data) {
       ASSERT_EQ(P::quot(x), x / P::Value) << "  where x = " << x;
       ASSERT_EQ(P::rem(x), x % P::Value) << "  where x = " << x;
@@ -43,31 +45,32 @@ struct Pow2Test : public ::testing::Test {
     }
   }
 
-  void divMod() {
+  void divMod()
+  {
     for (auto x : data) {
       ASSERT_GE(P::mod(x), 0) << "  where x = " << x;
       ASSERT_EQ(x, P::div(x) * P::Value + P::mod(x));
     }
   }
 
-  void round() {
+  void round()
+  {
     for (auto x : data) {
-      if (x <= std::numeric_limits<TargetT>::max() - TargetT(P::Value))
-        ASSERT_GE(P::roundUp(x), x);
+      if (x <= std::numeric_limits<TargetT>::max() - TargetT(P::Value)) ASSERT_GE(P::roundUp(x), x);
       if (x >= std::numeric_limits<TargetT>::min() + TargetT(P::Value))
         ASSERT_LE(P::roundDown(x), x);
       ASSERT_EQ(x - P::roundDown(x), P::mod(x)) << "  where x = " << x;
-      ASSERT_EQ(P::mod(P::roundUp(x) + P::mod(x) - x), 0)
-        << "  where x = " << x;
+      ASSERT_EQ(P::mod(P::roundUp(x) + P::mod(x) - x), 0) << "  where x = " << x;
     }
   }
 
-  void alignment() {
+  void alignment()
+  {
     for (auto x : data) {
       ASSERT_TRUE(P::areSameAlignOffsets(x, x));
       if (x <= std::numeric_limits<TargetT>::max() - TargetT(P::Value)) {
         ASSERT_TRUE(P::areSameAlignOffsets(x, x + TargetT(P::Value)));
-        int aligned_count = 0;
+        int aligned_count      = 0;
         int same_aligned_count = 0;
         for (int i = 0; i < int(P::Value); i++) {
           aligned_count += P::isAligned(x + i);
@@ -97,10 +100,11 @@ TEST_IT(Pow2_u64_i32_128);
 TEST_IT(Pow2_ll_u16_32);
 TEST_IT(Pow2_i32_u64_16);
 
-TEST(Pow2, pointers) {
+TEST(Pow2, pointers)
+{
   typedef Pow2<32UL> P;
   for (ptrdiff_t i = 0; i <= ptrdiff_t(P::Value); i++) {
-    auto *p = reinterpret_cast<float *>(16345 + i);
+    auto* p = reinterpret_cast<float*>(16345 + i);
     ASSERT_GE(P::roundUp(p), p);
     ASSERT_LE(P::roundDown(p), p);
   }
diff --git a/cpp/test/random/rng.cu b/cpp/test/random/rng.cu
index 810d6cb871..69dc146486 100644
--- a/cpp/test/random/rng.cu
+++ b/cpp/test/random/rng.cu
@@ -40,12 +40,13 @@ enum RandomType {
 };
 
 template <typename T, int TPB>
-__global__ void meanKernel(T* out, const T* data, int len) {
+__global__ void meanKernel(T* out, const T* data, int len)
+{
   typedef cub::BlockReduce<T, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  T val = tid < len ? data[tid] : T(0);
-  T x = BlockReduce(temp_storage).Sum(val);
+  T val   = tid < len ? data[tid] : T(0);
+  T x     = BlockReduce(temp_storage).Sum(val);
   __syncthreads();
   T xx = BlockReduce(temp_storage).Sum(val * val);
   __syncthreads();
@@ -72,7 +73,8 @@ struct RngInputs {
 };
 
 template <typename T>
-::std::ostream& operator<<(::std::ostream& os, const RngInputs<T>& dims) {
+::std::ostream& operator<<(::std::ostream& os, const RngInputs<T>& dims)
+{
   return os;
 }
 
@@ -86,47 +88,36 @@ class RngTest : public ::testing::TestWithParam<RngInputs<T>> {
     : params(::testing::TestWithParam<RngInputs<T>>::GetParam()),
       stream(handle.get_stream()),
       data(0, stream),
-      stats(2, stream) {
+      stats(2, stream)
+  {
     data.resize(params.len, stream);
     CUDA_CHECK(cudaMemsetAsync(stats.data(), 0, 2 * sizeof(T), stream));
   }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     // Tests are configured with their expected test-values sigma. For example,
     // 4 x sigma indicates the test shouldn't fail 99.9% of the time.
     num_sigma = 10;
     Rng r(params.seed, params.gtype);
     switch (params.type) {
-      case RNG_Normal:
-        r.normal(data.data(), params.len, params.start, params.end, stream);
-        break;
+      case RNG_Normal: r.normal(data.data(), params.len, params.start, params.end, stream); break;
       case RNG_LogNormal:
         r.lognormal(data.data(), params.len, params.start, params.end, stream);
         break;
-      case RNG_Uniform:
-        r.uniform(data.data(), params.len, params.start, params.end, stream);
-        break;
-      case RNG_Gumbel:
-        r.gumbel(data.data(), params.len, params.start, params.end, stream);
-        break;
+      case RNG_Uniform: r.uniform(data.data(), params.len, params.start, params.end, stream); break;
+      case RNG_Gumbel: r.gumbel(data.data(), params.len, params.start, params.end, stream); break;
       case RNG_Logistic:
         r.logistic(data.data(), params.len, params.start, params.end, stream);
         break;
-      case RNG_Exp:
-        r.exponential(data.data(), params.len, params.start, stream);
-        break;
-      case RNG_Rayleigh:
-        r.rayleigh(data.data(), params.len, params.start, stream);
-        break;
-      case RNG_Laplace:
-        r.laplace(data.data(), params.len, params.start, params.end, stream);
-        break;
+      case RNG_Exp: r.exponential(data.data(), params.len, params.start, stream); break;
+      case RNG_Rayleigh: r.rayleigh(data.data(), params.len, params.start, stream); break;
+      case RNG_Laplace: r.laplace(data.data(), params.len, params.start, params.end, stream); break;
     };
     static const int threads = 128;
-    meanKernel<T, threads>
-      <<<raft::ceildiv(params.len, threads), threads, 0, stream>>>(
-        stats.data(), data.data(), params.len);
+    meanKernel<T, threads><<<raft::ceildiv(params.len, threads), threads, 0, stream>>>(
+      stats.data(), data.data(), params.len);
     update_host<T>(h_stats, stats.data(), 2, stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
     h_stats[0] /= params.len;
@@ -134,18 +125,18 @@ class RngTest : public ::testing::TestWithParam<RngInputs<T>> {
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
-  void getExpectedMeanVar(T meanvar[2]) {
+  void getExpectedMeanVar(T meanvar[2])
+  {
     switch (params.type) {
       case RNG_Normal:
         meanvar[0] = params.start;
         meanvar[1] = params.end * params.end;
         break;
       case RNG_LogNormal: {
-        auto var = params.end * params.end;
-        auto mu = params.start;
+        auto var   = params.end * params.end;
+        auto mu    = params.start;
         meanvar[0] = raft::myExp(mu + var * T(0.5));
-        meanvar[1] =
-          (raft::myExp(var) - T(1.0)) * raft::myExp(T(2.0) * mu + var);
+        meanvar[1] = (raft::myExp(var) - T(1.0)) * raft::myExp(T(2.0) * mu + var);
         break;
       }
       case RNG_Uniform:
@@ -169,8 +160,7 @@ class RngTest : public ::testing::TestWithParam<RngInputs<T>> {
         break;
       case RNG_Rayleigh:
         meanvar[0] = params.start * raft::mySqrt(T(3.1415 / 2.0));
-        meanvar[1] =
-          ((T(4.0) - T(3.1415)) / T(2.0)) * params.start * params.start;
+        meanvar[1] = ((T(4.0) - T(3.1415)) / T(2.0)) * params.start * params.start;
         break;
       case RNG_Laplace:
         meanvar[0] = params.start;
@@ -264,13 +254,12 @@ const std::vector<RngInputs<float>> inputsf = {
   {0.02, 32 * 1024, 1.f, 1.f, RNG_Laplace, GenKiss99, 1234ULL},
   {0.04, 8 * 1024, 1.f, 1.f, RNG_Laplace, GenKiss99, 1234ULL}};
 
-TEST_P(RngTestF, Result) {
+TEST_P(RngTestF, Result)
+{
   float meanvar[2];
   getExpectedMeanVar(meanvar);
-  ASSERT_TRUE(match(meanvar[0], h_stats[0],
-                    CompareApprox<float>(num_sigma * params.tolerance)));
-  ASSERT_TRUE(match(meanvar[1], h_stats[1],
-                    CompareApprox<float>(num_sigma * params.tolerance)));
+  ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox<float>(num_sigma * params.tolerance)));
+  ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox<float>(num_sigma * params.tolerance)));
 }
 INSTANTIATE_TEST_SUITE_P(RngTests, RngTestF, ::testing::ValuesIn(inputsf));
 
@@ -326,13 +315,12 @@ const std::vector<RngInputs<double>> inputsd = {
   {0.025, 8 * 1024, 1.0, 1.0, RNG_Rayleigh, GenKiss99, 1234ULL},
   {0.02, 32 * 1024, 1.0, 1.0, RNG_Laplace, GenKiss99, 1234ULL},
   {0.04, 8 * 1024, 1.0, 1.0, RNG_Laplace, GenKiss99, 1234ULL}};
-TEST_P(RngTestD, Result) {
+TEST_P(RngTestD, Result)
+{
   double meanvar[2];
   getExpectedMeanVar(meanvar);
-  ASSERT_TRUE(match(meanvar[0], h_stats[0],
-                    CompareApprox<double>(num_sigma * params.tolerance)));
-  ASSERT_TRUE(match(meanvar[1], h_stats[1],
-                    CompareApprox<double>(num_sigma * params.tolerance)));
+  ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox<double>(num_sigma * params.tolerance)));
+  ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox<double>(num_sigma * params.tolerance)));
 }
 INSTANTIATE_TEST_SUITE_P(RngTests, RngTestD, ::testing::ValuesIn(inputsd));
 
@@ -340,7 +328,8 @@ INSTANTIATE_TEST_SUITE_P(RngTests, RngTestD, ::testing::ValuesIn(inputsd));
 // Test for expected variance in mean calculations
 
 template <typename T>
-T quick_mean(const std::vector<T>& d) {
+T quick_mean(const std::vector<T>& d)
+{
   T acc = T(0);
   for (const auto& di : d) {
     acc += di;
@@ -349,8 +338,9 @@ T quick_mean(const std::vector<T>& d) {
 }
 
 template <typename T>
-T quick_std(const std::vector<T>& d) {
-  T acc = T(0);
+T quick_std(const std::vector<T>& d)
+{
+  T acc    = T(0);
   T d_mean = quick_mean(d);
   for (const auto& di : d) {
     acc += ((di - d_mean) * (di - d_mean));
@@ -359,7 +349,8 @@ T quick_std(const std::vector<T>& d) {
 }
 
 template <typename T>
-std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
+std::ostream& operator<<(std::ostream& out, const std::vector<T>& v)
+{
   if (!v.empty()) {
     out << '[';
     std::copy(v.begin(), v.end(), std::ostream_iterator<T>(out, ", "));
@@ -374,13 +365,14 @@ std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
 // experiments computing the mean, giving us a distribution of the mean
 // itself. The mean error is simply the standard deviation of this
 // distribution (the standard deviation of the mean).
-TEST(Rng, MeanError) {
+TEST(Rng, MeanError)
+{
   timeb time_struct;
   ftime(&time_struct);
-  int seed = time_struct.millitm;
-  int num_samples = 1024;
+  int seed            = time_struct.millitm;
+  int num_samples     = 1024;
   int num_experiments = 1024;
-  int len = num_samples * num_experiments;
+  int len             = num_samples * num_experiments;
 
   cudaStream_t stream;
   CUDA_CHECK(cudaStreamCreate(&stream));
@@ -393,22 +385,26 @@ TEST(Rng, MeanError) {
     Rng r(seed, rtype);
     r.normal(data.data(), len, 3.3f, 0.23f, stream);
     // r.uniform(data, len, -1.0, 2.0);
-    raft::stats::mean(mean_result.data(), data.data(), num_samples,
-                      num_experiments, false, false, stream);
-    raft::stats::stddev(std_result.data(), data.data(), mean_result.data(),
-                        num_samples, num_experiments, false, false, stream);
+    raft::stats::mean(
+      mean_result.data(), data.data(), num_samples, num_experiments, false, false, stream);
+    raft::stats::stddev(std_result.data(),
+                        data.data(),
+                        mean_result.data(),
+                        num_samples,
+                        num_experiments,
+                        false,
+                        false,
+                        stream);
     std::vector<float> h_mean_result(num_experiments);
     std::vector<float> h_std_result(num_experiments);
-    update_host(h_mean_result.data(), mean_result.data(), num_experiments,
-                stream);
-    update_host(h_std_result.data(), std_result.data(), num_experiments,
-                stream);
+    update_host(h_mean_result.data(), mean_result.data(), num_experiments, stream);
+    update_host(h_std_result.data(), std_result.data(), num_experiments, stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
     auto d_mean = quick_mean(h_mean_result);
 
     // std-dev of mean; also known as mean error
-    auto d_std_of_mean = quick_std(h_mean_result);
-    auto d_std = quick_mean(h_std_result);
+    auto d_std_of_mean            = quick_std(h_mean_result);
+    auto d_std                    = quick_mean(h_std_result);
     auto d_std_of_mean_analytical = d_std / std::sqrt(num_samples);
 
     // std::cout << "measured mean error: " << d_std_of_mean << "\n";
@@ -417,8 +413,7 @@ TEST(Rng, MeanError) {
     auto diff_expected_vs_measured_mean_error =
       std::abs(d_std_of_mean - d_std / std::sqrt(num_samples));
 
-    ASSERT_TRUE(
-      (diff_expected_vs_measured_mean_error / d_std_of_mean_analytical < 0.5));
+    ASSERT_TRUE((diff_expected_vs_measured_mean_error / d_std_of_mean_analytical < 0.5));
   }
   CUDA_CHECK(cudaStreamDestroy(stream));
 
@@ -431,18 +426,19 @@ class ScaledBernoulliTest : public ::testing::Test {
   ScaledBernoulliTest() : stream(handle.get_stream()), data(len, stream) {}
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     CUDA_CHECK(cudaStreamCreate(&stream));
     Rng r(42);
     r.scaled_bernoulli(data.data(), len, T(0.5), T(scale), stream);
   }
 
-  void rangeCheck() {
+  void rangeCheck()
+  {
     T* h_data = new T[len];
     update_host(h_data, data.data(), len, stream);
-    ASSERT_TRUE(std::none_of(h_data, h_data + len, [](const T& a) {
-      return a < -scale || a > scale;
-    }));
+    ASSERT_TRUE(
+      std::none_of(h_data, h_data + len, [](const T& a) { return a < -scale || a > scale; }));
     delete[] h_data;
   }
 
@@ -464,13 +460,15 @@ class BernoulliTest : public ::testing::Test {
   BernoulliTest() : stream(handle.get_stream()), data(len, stream) {}
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     Rng r(42);
     r.bernoulli(data.data(), len, T(0.5), stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
-  void trueFalseCheck() {
+  void trueFalseCheck()
+  {
     // both true and false values must be present
     bool* h_data = new bool[len];
     update_host(h_data, data.data(), len, stream);
@@ -502,38 +500,39 @@ struct RngNormalTableInputs {
 };
 
 template <typename T>
-::std::ostream& operator<<(::std::ostream& os,
-                           const RngNormalTableInputs<T>& dims) {
+::std::ostream& operator<<(::std::ostream& os, const RngNormalTableInputs<T>& dims)
+{
   return os;
 }
 
 template <typename T>
-class RngNormalTableTest
-  : public ::testing::TestWithParam<RngNormalTableInputs<T>> {
+class RngNormalTableTest : public ::testing::TestWithParam<RngNormalTableInputs<T>> {
  public:
   RngNormalTableTest()
     : params(::testing::TestWithParam<RngNormalTableInputs<T>>::GetParam()),
       stream(handle.get_stream()),
       data(params.rows * params.cols, stream),
       stats(2, stream),
-      mu_vec(params.cols, stream) {
+      mu_vec(params.cols, stream)
+  {
     CUDA_CHECK(cudaMemsetAsync(stats.data(), 0, 2 * sizeof(T), stream));
   }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     // Tests are configured with their expected test-values sigma. For example,
     // 4 x sigma indicates the test shouldn't fail 99.9% of the time.
     num_sigma = 10;
-    int len = params.rows * params.cols;
+    int len   = params.rows * params.cols;
     Rng r(params.seed, params.gtype);
     r.fill(mu_vec.data(), params.cols, params.mu, stream);
     T* sigma_vec = nullptr;
-    r.normalTable(data.data(), params.rows, params.cols, mu_vec.data(),
-                  sigma_vec, params.sigma, stream);
+    r.normalTable(
+      data.data(), params.rows, params.cols, mu_vec.data(), sigma_vec, params.sigma, stream);
     static const int threads = 128;
-    meanKernel<T, threads><<<raft::ceildiv(len, threads), threads, 0, stream>>>(
-      stats.data(), data.data(), len);
+    meanKernel<T, threads>
+      <<<raft::ceildiv(len, threads), threads, 0, stream>>>(stats.data(), data.data(), len);
     update_host<T>(h_stats, stats.data(), 2, stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
     h_stats[0] /= len;
@@ -541,7 +540,8 @@ class RngNormalTableTest
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
-  void getExpectedMeanVar(T meanvar[2]) {
+  void getExpectedMeanVar(T meanvar[2])
+  {
     meanvar[0] = params.mu;
     meanvar[1] = params.sigma * params.sigma;
   }
@@ -565,16 +565,14 @@ const std::vector<RngNormalTableInputs<float>> inputsf_t = {
   {0.0055, 32, 1024, 1.f, 1.f, GenKiss99, 1234ULL},
   {0.011, 8, 1024, 1.f, 1.f, GenKiss99, 1234ULL}};
 
-TEST_P(RngNormalTableTestF, Result) {
+TEST_P(RngNormalTableTestF, Result)
+{
   float meanvar[2];
   getExpectedMeanVar(meanvar);
-  ASSERT_TRUE(match(meanvar[0], h_stats[0],
-                    CompareApprox<float>(num_sigma * params.tolerance)));
-  ASSERT_TRUE(match(meanvar[1], h_stats[1],
-                    CompareApprox<float>(num_sigma * params.tolerance)));
+  ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox<float>(num_sigma * params.tolerance)));
+  ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox<float>(num_sigma * params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestF,
-                         ::testing::ValuesIn(inputsf_t));
+INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestF, ::testing::ValuesIn(inputsf_t));
 
 typedef RngNormalTableTest<double> RngNormalTableTestD;
 const std::vector<RngNormalTableInputs<double>> inputsd_t = {
@@ -584,16 +582,14 @@ const std::vector<RngNormalTableInputs<double>> inputsd_t = {
   {0.011, 8, 1024, 1.0, 1.0, GenTaps, 1234ULL},
   {0.0055, 32, 1024, 1.0, 1.0, GenKiss99, 1234ULL},
   {0.011, 8, 1024, 1.0, 1.0, GenKiss99, 1234ULL}};
-TEST_P(RngNormalTableTestD, Result) {
+TEST_P(RngNormalTableTestD, Result)
+{
   double meanvar[2];
   getExpectedMeanVar(meanvar);
-  ASSERT_TRUE(match(meanvar[0], h_stats[0],
-                    CompareApprox<double>(num_sigma * params.tolerance)));
-  ASSERT_TRUE(match(meanvar[1], h_stats[1],
-                    CompareApprox<double>(num_sigma * params.tolerance)));
+  ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox<double>(num_sigma * params.tolerance)));
+  ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox<double>(num_sigma * params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestD,
-                         ::testing::ValuesIn(inputsd_t));
+INSTANTIATE_TEST_SUITE_P(RngNormalTableTests, RngNormalTableTestD, ::testing::ValuesIn(inputsd_t));
 
 struct RngAffineInputs {
   int n;
@@ -602,13 +598,15 @@ struct RngAffineInputs {
 
 class RngAffineTest : public ::testing::TestWithParam<RngAffineInputs> {
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     params = ::testing::TestWithParam<RngAffineInputs>::GetParam();
     Rng r(params.seed);
     r.affine_transform_params(params.n, a, b);
   }
 
-  void check() {
+  void check()
+  {
     ASSERT_TRUE(gcd(a, params.n) == 1);
     ASSERT_TRUE(0 <= b && b < params.n);
   }
@@ -619,13 +617,17 @@ class RngAffineTest : public ::testing::TestWithParam<RngAffineInputs> {
 };  // RngAffineTest
 
 const std::vector<RngAffineInputs> inputs_affine = {
-  {100, 123456ULL},     {100, 1234567890ULL},  {101, 123456ULL},
-  {101, 1234567890ULL}, {7, 123456ULL},        {7, 1234567890ULL},
-  {2568, 123456ULL},    {2568, 1234567890ULL},
+  {100, 123456ULL},
+  {100, 1234567890ULL},
+  {101, 123456ULL},
+  {101, 1234567890ULL},
+  {7, 123456ULL},
+  {7, 1234567890ULL},
+  {2568, 123456ULL},
+  {2568, 1234567890ULL},
 };
 TEST_P(RngAffineTest, Result) { check(); }
-INSTANTIATE_TEST_SUITE_P(RngAffineTests, RngAffineTest,
-                         ::testing::ValuesIn(inputs_affine));
+INSTANTIATE_TEST_SUITE_P(RngAffineTests, RngAffineTest, ::testing::ValuesIn(inputs_affine));
 
 }  // namespace random
 }  // namespace raft
diff --git a/cpp/test/random/rng_int.cu b/cpp/test/random/rng_int.cu
index cef2d47276..f0331b7746 100644
--- a/cpp/test/random/rng_int.cu
+++ b/cpp/test/random/rng_int.cu
@@ -29,12 +29,13 @@ using namespace raft::random::detail;
 enum RandomType { RNG_Uniform };
 
 template <typename T, int TPB>
-__global__ void meanKernel(float *out, const T *data, int len) {
+__global__ void meanKernel(float* out, const T* data, int len)
+{
   typedef cub::BlockReduce<float, TPB> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  int tid   = threadIdx.x + blockIdx.x * blockDim.x;
   float val = tid < len ? data[tid] : T(0);
-  float x = BlockReduce(temp_storage).Sum(val);
+  float x   = BlockReduce(temp_storage).Sum(val);
   __syncthreads();
   float xx = BlockReduce(temp_storage).Sum(val * val);
   __syncthreads();
@@ -61,7 +62,8 @@ struct RngInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os, const RngInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const RngInputs<T>& dims)
+{
   return os;
 }
 
@@ -72,13 +74,15 @@ class RngTest : public ::testing::TestWithParam<RngInputs<T>> {
     : params(::testing::TestWithParam<RngInputs<T>>::GetParam()),
       stream(handle.get_stream()),
       data(0, stream),
-      stats(2, stream) {
+      stats(2, stream)
+  {
     data.resize(params.len, stream);
     CUDA_CHECK(cudaMemsetAsync(stats.data(), 0, 2 * sizeof(float), stream));
   }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     Rng r(params.seed, params.gtype);
 
     switch (params.type) {
@@ -87,9 +91,8 @@ class RngTest : public ::testing::TestWithParam<RngInputs<T>> {
         break;
     };
     static const int threads = 128;
-    meanKernel<T, threads>
-      <<<raft::ceildiv(params.len, threads), threads, 0, stream>>>(
-        stats.data(), data.data(), params.len);
+    meanKernel<T, threads><<<raft::ceildiv(params.len, threads), threads, 0, stream>>>(
+      stats.data(), data.data(), params.len);
     update_host<float>(h_stats, stats.data(), 2, stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
     h_stats[0] /= params.len;
@@ -97,7 +100,8 @@ class RngTest : public ::testing::TestWithParam<RngInputs<T>> {
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
-  void getExpectedMeanVar(float meanvar[2]) {
+  void getExpectedMeanVar(float meanvar[2])
+  {
     switch (params.type) {
       case RNG_Uniform:
         meanvar[0] = (params.start + params.end) * 0.5f;
@@ -125,13 +129,12 @@ const std::vector<RngInputs<uint32_t>> inputs_u32 = {
   {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL},
   {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL},
   {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}};
-TEST_P(RngTestU32, Result) {
+TEST_P(RngTestU32, Result)
+{
   float meanvar[2];
   getExpectedMeanVar(meanvar);
-  ASSERT_TRUE(
-    match(meanvar[0], h_stats[0], CompareApprox<float>(params.tolerance)));
-  ASSERT_TRUE(
-    match(meanvar[1], h_stats[1], CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox<float>(params.tolerance)));
 }
 INSTANTIATE_TEST_SUITE_P(RngTests, RngTestU32, ::testing::ValuesIn(inputs_u32));
 
@@ -143,13 +146,12 @@ const std::vector<RngInputs<uint64_t>> inputs_u64 = {
   {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL},
   {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL},
   {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}};
-TEST_P(RngTestU64, Result) {
+TEST_P(RngTestU64, Result)
+{
   float meanvar[2];
   getExpectedMeanVar(meanvar);
-  ASSERT_TRUE(
-    match(meanvar[0], h_stats[0], CompareApprox<float>(params.tolerance)));
-  ASSERT_TRUE(
-    match(meanvar[1], h_stats[1], CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox<float>(params.tolerance)));
 }
 INSTANTIATE_TEST_SUITE_P(RngTests, RngTestU64, ::testing::ValuesIn(inputs_u64));
 
@@ -161,13 +163,12 @@ const std::vector<RngInputs<int32_t>> inputs_s32 = {
   {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL},
   {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL},
   {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}};
-TEST_P(RngTestS32, Result) {
+TEST_P(RngTestS32, Result)
+{
   float meanvar[2];
   getExpectedMeanVar(meanvar);
-  ASSERT_TRUE(
-    match(meanvar[0], h_stats[0], CompareApprox<float>(params.tolerance)));
-  ASSERT_TRUE(
-    match(meanvar[1], h_stats[1], CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox<float>(params.tolerance)));
 }
 INSTANTIATE_TEST_SUITE_P(RngTests, RngTestS32, ::testing::ValuesIn(inputs_s32));
 
@@ -179,13 +180,12 @@ const std::vector<RngInputs<int64_t>> inputs_s64 = {
   {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenTaps, 1234ULL},
   {0.1f, 32 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL},
   {0.1f, 8 * 1024, 0, 20, RNG_Uniform, GenKiss99, 1234ULL}};
-TEST_P(RngTestS64, Result) {
+TEST_P(RngTestS64, Result)
+{
   float meanvar[2];
   getExpectedMeanVar(meanvar);
-  ASSERT_TRUE(
-    match(meanvar[0], h_stats[0], CompareApprox<float>(params.tolerance)));
-  ASSERT_TRUE(
-    match(meanvar[1], h_stats[1], CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(match(meanvar[0], h_stats[0], CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(match(meanvar[1], h_stats[1], CompareApprox<float>(params.tolerance)));
 }
 INSTANTIATE_TEST_SUITE_P(RngTests, RngTestS64, ::testing::ValuesIn(inputs_s64));
 
diff --git a/cpp/test/random/sample_without_replacement.cu b/cpp/test/random/sample_without_replacement.cu
index 1d33f08c62..a681bbb07d 100644
--- a/cpp/test/random/sample_without_replacement.cu
+++ b/cpp/test/random/sample_without_replacement.cu
@@ -40,7 +40,8 @@ struct SWoRInputs {
 };
 
 template <typename T>
-::std::ostream& operator<<(::std::ostream& os, const SWoRInputs<T>& dims) {
+::std::ostream& operator<<(::std::ostream& os, const SWoRInputs<T>& dims)
+{
   return os;
 }
 
@@ -53,20 +54,27 @@ class SWoRTest : public ::testing::TestWithParam<SWoRInputs<T>> {
       in(params.len, stream),
       wts(params.len, stream),
       out(params.sampledLen, stream),
-      outIdx(params.sampledLen, stream) {}
+      outIdx(params.sampledLen, stream)
+  {
+  }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     Rng r(params.seed, params.gtype);
     h_outIdx.resize(params.sampledLen);
     r.uniform(in.data(), params.len, T(-1.0), T(1.0), stream);
     r.uniform(wts.data(), params.len, T(1.0), T(2.0), stream);
     if (params.largeWeightIndex >= 0) {
-      update_device(wts.data() + params.largeWeightIndex, &params.largeWeight,
-                    1, stream);
+      update_device(wts.data() + params.largeWeightIndex, &params.largeWeight, 1, stream);
     }
-    r.sampleWithoutReplacement(handle, out.data(), outIdx.data(), in.data(),
-                               wts.data(), params.sampledLen, params.len,
+    r.sampleWithoutReplacement(handle,
+                               out.data(),
+                               outIdx.data(),
+                               in.data(),
+                               wts.data(),
+                               params.sampledLen,
+                               params.len,
                                stream);
     update_host(&(h_outIdx[0]), outIdx.data(), params.sampledLen, stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
@@ -145,14 +153,14 @@ const std::vector<SWoRInputs<float>> inputsf = {
   {1024, 512, 10, 100000.f, GenKiss99, 1234ULL},
 };
 
-TEST_P(SWoRTestF, Result) {
+TEST_P(SWoRTestF, Result)
+{
   std::set<int> occurence;
   for (int i = 0; i < params.sampledLen; ++i) {
     auto val = h_outIdx[i];
     // indices must be in the given range
     ASSERT_TRUE(0 <= val && val < params.len)
-      << "out-of-range index @i=" << i << " val=" << val
-      << " sampledLen=" << params.sampledLen;
+      << "out-of-range index @i=" << i << " val=" << val << " sampledLen=" << params.sampledLen;
     // indices should not repeat
     ASSERT_TRUE(occurence.find(val) == occurence.end())
       << "repeated index @i=" << i << " idx=" << val;
@@ -160,9 +168,7 @@ TEST_P(SWoRTestF, Result) {
   }
   // if there's a skewed distribution, the top index should correspond to the
   // particular item with a large weight
-  if (params.largeWeightIndex >= 0) {
-    ASSERT_EQ(h_outIdx[0], params.largeWeightIndex);
-  }
+  if (params.largeWeightIndex >= 0) { ASSERT_EQ(h_outIdx[0], params.largeWeightIndex); }
 }
 INSTANTIATE_TEST_SUITE_P(SWoRTests, SWoRTestF, ::testing::ValuesIn(inputsf));
 
@@ -229,14 +235,14 @@ const std::vector<SWoRInputs<double>> inputsd = {
   {1024, 512, 10, 100000.0, GenKiss99, 1234ULL},
 };
 
-TEST_P(SWoRTestD, Result) {
+TEST_P(SWoRTestD, Result)
+{
   std::set<int> occurence;
   for (int i = 0; i < params.sampledLen; ++i) {
     auto val = h_outIdx[i];
     // indices must be in the given range
     ASSERT_TRUE(0 <= val && val < params.len)
-      << "out-of-range index @i=" << i << " val=" << val
-      << " sampledLen=" << params.sampledLen;
+      << "out-of-range index @i=" << i << " val=" << val << " sampledLen=" << params.sampledLen;
     // indices should not repeat
     ASSERT_TRUE(occurence.find(val) == occurence.end())
       << "repeated index @i=" << i << " idx=" << val;
@@ -244,9 +250,7 @@ TEST_P(SWoRTestD, Result) {
   }
   // if there's a skewed distribution, the top index should correspond to the
   // particular item with a large weight
-  if (params.largeWeightIndex >= 0) {
-    ASSERT_EQ(h_outIdx[0], params.largeWeightIndex);
-  }
+  if (params.largeWeightIndex >= 0) { ASSERT_EQ(h_outIdx[0], params.largeWeightIndex); }
 }
 INSTANTIATE_TEST_SUITE_P(SWoRTests, SWoRTestD, ::testing::ValuesIn(inputsd));
 
diff --git a/cpp/test/sparse/add.cu b/cpp/test/sparse/add.cu
index a5f08489f1..d7e11e8fef 100644
--- a/cpp/test/sparse/add.cu
+++ b/cpp/test/sparse/add.cu
@@ -44,12 +44,10 @@ struct CSRAddInputs {
 };
 
 template <typename Type_f, typename Index_>
-class CSRAddTest
-  : public ::testing::TestWithParam<CSRAddInputs<Type_f, Index_>> {
+class CSRAddTest : public ::testing::TestWithParam<CSRAddInputs<Type_f, Index_>> {
  public:
   CSRAddTest()
-    : params(
-        ::testing::TestWithParam<CSRAddInputs<Type_f, Index_>>::GetParam()),
+    : params(::testing::TestWithParam<CSRAddInputs<Type_f, Index_>>::GetParam()),
       stream(handle.get_stream()),
       ind_a(params.matrix_a.row_ind.size(), stream),
       ind_ptr_a(params.matrix_a.row_ind_ptr.size(), stream),
@@ -62,59 +60,69 @@ class CSRAddTest
       values_verify(params.matrix_verify.row_ind_ptr.size(), stream),
       ind_result(params.matrix_a.row_ind.size(), stream),
       ind_ptr_result(params.matrix_verify.row_ind_ptr.size(), stream),
-      values_result(params.matrix_verify.row_ind_ptr.size(), stream) {}
+      values_result(params.matrix_verify.row_ind_ptr.size(), stream)
+  {
+  }
 
  protected:
-  void SetUp() override {
-    n_rows = params.matrix_a.row_ind.size();
-    nnz_a = params.matrix_a.row_ind_ptr.size();
-    nnz_b = params.matrix_b.row_ind_ptr.size();
+  void SetUp() override
+  {
+    n_rows     = params.matrix_a.row_ind.size();
+    nnz_a      = params.matrix_a.row_ind_ptr.size();
+    nnz_b      = params.matrix_b.row_ind_ptr.size();
     nnz_result = params.matrix_verify.row_ind_ptr.size();
   }
 
-  void Run() {
-    raft::update_device(ind_a.data(), params.matrix_a.row_ind.data(), n_rows,
-                        stream);
-    raft::update_device(ind_ptr_a.data(), params.matrix_a.row_ind_ptr.data(),
-                        nnz_a, stream);
-    raft::update_device(values_a.data(), params.matrix_a.values.data(), nnz_a,
-                        stream);
-
-    raft::update_device(ind_b.data(), params.matrix_b.row_ind.data(), n_rows,
-                        stream);
-    raft::update_device(ind_ptr_b.data(), params.matrix_b.row_ind_ptr.data(),
-                        nnz_b, stream);
-    raft::update_device(values_b.data(), params.matrix_b.values.data(), nnz_b,
-                        stream);
-
-    raft::update_device(ind_verify.data(), params.matrix_verify.row_ind.data(),
-                        n_rows, stream);
-    raft::update_device(ind_ptr_verify.data(),
-                        params.matrix_verify.row_ind_ptr.data(), nnz_result,
-                        stream);
-    raft::update_device(values_verify.data(),
-                        params.matrix_verify.values.data(), nnz_result, stream);
-
-    Index_ nnz = linalg::csr_add_calc_inds<Type_f, 32>(
-      ind_a.data(), ind_ptr_a.data(), values_a.data(), nnz_a, ind_b.data(),
-      ind_ptr_b.data(), values_b.data(), nnz_b, n_rows, ind_result.data(),
-      stream);
+  void Run()
+  {
+    raft::update_device(ind_a.data(), params.matrix_a.row_ind.data(), n_rows, stream);
+    raft::update_device(ind_ptr_a.data(), params.matrix_a.row_ind_ptr.data(), nnz_a, stream);
+    raft::update_device(values_a.data(), params.matrix_a.values.data(), nnz_a, stream);
+
+    raft::update_device(ind_b.data(), params.matrix_b.row_ind.data(), n_rows, stream);
+    raft::update_device(ind_ptr_b.data(), params.matrix_b.row_ind_ptr.data(), nnz_b, stream);
+    raft::update_device(values_b.data(), params.matrix_b.values.data(), nnz_b, stream);
+
+    raft::update_device(ind_verify.data(), params.matrix_verify.row_ind.data(), n_rows, stream);
+    raft::update_device(
+      ind_ptr_verify.data(), params.matrix_verify.row_ind_ptr.data(), nnz_result, stream);
+    raft::update_device(
+      values_verify.data(), params.matrix_verify.values.data(), nnz_result, stream);
+
+    Index_ nnz = linalg::csr_add_calc_inds<Type_f, 32>(ind_a.data(),
+                                                       ind_ptr_a.data(),
+                                                       values_a.data(),
+                                                       nnz_a,
+                                                       ind_b.data(),
+                                                       ind_ptr_b.data(),
+                                                       values_b.data(),
+                                                       nnz_b,
+                                                       n_rows,
+                                                       ind_result.data(),
+                                                       stream);
 
     ASSERT_TRUE(nnz == nnz_result);
-    ASSERT_TRUE(raft::devArrMatch<Index_>(ind_verify.data(), ind_result.data(),
-                                          n_rows, raft::Compare<Index_>()));
-
-    linalg::csr_add_finalize<Type_f, 32>(
-      ind_a.data(), ind_ptr_a.data(), values_a.data(), nnz_a, ind_b.data(),
-      ind_ptr_b.data(), values_b.data(), nnz_b, n_rows, ind_result.data(),
-      ind_ptr_result.data(), values_result.data(), stream);
-
-    ASSERT_TRUE(raft::devArrMatch<Index_>(ind_ptr_verify.data(),
-                                          ind_ptr_result.data(), nnz,
-                                          raft::Compare<Index_>()));
-    ASSERT_TRUE(raft::devArrMatch<Type_f>(values_verify.data(),
-                                          values_result.data(), nnz,
-                                          raft::Compare<Type_f>()));
+    ASSERT_TRUE(raft::devArrMatch<Index_>(
+      ind_verify.data(), ind_result.data(), n_rows, raft::Compare<Index_>()));
+
+    linalg::csr_add_finalize<Type_f, 32>(ind_a.data(),
+                                         ind_ptr_a.data(),
+                                         values_a.data(),
+                                         nnz_a,
+                                         ind_b.data(),
+                                         ind_ptr_b.data(),
+                                         values_b.data(),
+                                         nnz_b,
+                                         n_rows,
+                                         ind_result.data(),
+                                         ind_ptr_result.data(),
+                                         values_result.data(),
+                                         stream);
+
+    ASSERT_TRUE(raft::devArrMatch<Index_>(
+      ind_ptr_verify.data(), ind_ptr_result.data(), nnz, raft::Compare<Index_>()));
+    ASSERT_TRUE(raft::devArrMatch<Type_f>(
+      values_verify.data(), values_result.data(), nnz, raft::Compare<Type_f>()));
   }
 
  protected:
@@ -123,8 +131,8 @@ class CSRAddTest
 
   CSRAddInputs<Type_f, Index_> params;
   Index_ n_rows, nnz_a, nnz_b, nnz_result;
-  rmm::device_uvector<Index_> ind_a, ind_b, ind_verify, ind_result, ind_ptr_a,
-    ind_ptr_b, ind_ptr_verify, ind_ptr_result;
+  rmm::device_uvector<Index_> ind_a, ind_b, ind_verify, ind_result, ind_ptr_a, ind_ptr_b,
+    ind_ptr_verify, ind_ptr_result;
   rmm::device_uvector<Type_f> values_a, values_b, values_verify, values_result;
 };
 
@@ -157,10 +165,8 @@ const std::vector<CSRAddInputs<double, int>> csradd_inputs_d = {
     {2.0, 2.0, 0.5, 1.0, 0.5, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}}},
 };
 
-INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestF,
-                        ::testing::ValuesIn(csradd_inputs_f));
-INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestD,
-                        ::testing::ValuesIn(csradd_inputs_d));
+INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestF, ::testing::ValuesIn(csradd_inputs_f));
+INSTANTIATE_TEST_CASE_P(SparseAddTest, CSRAddTestD, ::testing::ValuesIn(csradd_inputs_d));
 
 }  // namespace sparse
 }  // namespace raft
diff --git a/cpp/test/sparse/connect_components.cu b/cpp/test/sparse/connect_components.cu
index dd6ba1479e..5e4b164b37 100644
--- a/cpp/test/sparse/connect_components.cu
+++ b/cpp/test/sparse/connect_components.cu
@@ -50,24 +50,22 @@ struct ConnectComponentsInputs {
 };
 
 template <typename value_idx, typename value_t>
-class ConnectComponentsTest : public ::testing::TestWithParam<
-                                ConnectComponentsInputs<value_t, value_idx>> {
+class ConnectComponentsTest
+  : public ::testing::TestWithParam<ConnectComponentsInputs<value_t, value_idx>> {
  protected:
-  void basicTest() {
+  void basicTest()
+  {
     raft::handle_t handle;
 
     auto stream = handle.get_stream();
 
-    params = ::testing::TestWithParam<
-      ConnectComponentsInputs<value_t, value_idx>>::GetParam();
+    params = ::testing::TestWithParam<ConnectComponentsInputs<value_t, value_idx>>::GetParam();
 
     raft::sparse::COO<value_t, value_idx> out_edges(handle.get_stream());
 
-    rmm::device_uvector<value_t> data(params.n_row * params.n_col,
-                                      handle.get_stream());
+    rmm::device_uvector<value_t> data(params.n_row * params.n_col, handle.get_stream());
 
-    raft::copy(data.data(), params.data.data(), data.size(),
-               handle.get_stream());
+    raft::copy(data.data(), params.data.data(), data.size(), handle.get_stream());
 
     rmm::device_uvector<value_idx> indptr(params.n_row + 1, stream);
 
@@ -76,44 +74,58 @@ class ConnectComponentsTest : public ::testing::TestWithParam<
      */
     raft::sparse::COO<value_t, value_idx> knn_graph_coo(stream);
 
-    raft::sparse::selection::knn_graph(
-      handle, data.data(), params.n_row, params.n_col,
-      raft::distance::DistanceType::L2SqrtExpanded, knn_graph_coo, params.c);
+    raft::sparse::selection::knn_graph(handle,
+                                       data.data(),
+                                       params.n_row,
+                                       params.n_col,
+                                       raft::distance::DistanceType::L2SqrtExpanded,
+                                       knn_graph_coo,
+                                       params.c);
 
-    raft::sparse::convert::sorted_coo_to_csr(knn_graph_coo.rows(),
-                                             knn_graph_coo.nnz, indptr.data(),
-                                             params.n_row + 1, stream);
+    raft::sparse::convert::sorted_coo_to_csr(
+      knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), params.n_row + 1, stream);
 
     /**
      * 2. Construct MST, sorted by weights
      */
     rmm::device_uvector<value_idx> colors(params.n_row, stream);
 
-    auto mst_coo = raft::mst::mst<value_idx, value_idx, value_t, double>(
-      handle, indptr.data(), knn_graph_coo.cols(), knn_graph_coo.vals(),
-      params.n_row, knn_graph_coo.nnz, colors.data(), stream, false, true);
+    auto mst_coo = raft::mst::mst<value_idx, value_idx, value_t, double>(handle,
+                                                                         indptr.data(),
+                                                                         knn_graph_coo.cols(),
+                                                                         knn_graph_coo.vals(),
+                                                                         params.n_row,
+                                                                         knn_graph_coo.nnz,
+                                                                         colors.data(),
+                                                                         stream,
+                                                                         false,
+                                                                         true);
 
     /**
      * 3. connect_components to fix connectivities
      */
-    raft::linkage::FixConnectivitiesRedOp<value_idx, value_t> red_op(
-      colors.data(), params.n_row);
+    raft::linkage::FixConnectivitiesRedOp<value_idx, value_t> red_op(colors.data(), params.n_row);
     raft::linkage::connect_components<value_idx, value_t>(
-      handle, out_edges, data.data(), colors.data(), params.n_row, params.n_col,
-      red_op);
+      handle, out_edges, data.data(), colors.data(), params.n_row, params.n_col, red_op);
 
     /**
      * Construct final edge list
      */
     rmm::device_uvector<value_idx> indptr2(params.n_row + 1, stream);
 
-    raft::sparse::convert::sorted_coo_to_csr(out_edges.rows(), out_edges.nnz,
-                                             indptr2.data(), params.n_row + 1,
-                                             stream);
+    raft::sparse::convert::sorted_coo_to_csr(
+      out_edges.rows(), out_edges.nnz, indptr2.data(), params.n_row + 1, stream);
 
-    auto output_mst = raft::mst::mst<value_idx, value_idx, value_t>(
-      handle, indptr2.data(), out_edges.cols(), out_edges.vals(), params.n_row,
-      out_edges.nnz, colors.data(), stream, false, false);
+    auto output_mst = raft::mst::mst<value_idx, value_idx, value_t>(handle,
+                                                                    indptr2.data(),
+                                                                    out_edges.cols(),
+                                                                    out_edges.vals(),
+                                                                    params.n_row,
+                                                                    out_edges.nnz,
+                                                                    colors.data(),
+                                                                    stream,
+                                                                    false,
+                                                                    false);
 
     CUDA_CHECK(cudaStreamSynchronize(stream));
 
@@ -135,366 +147,199 @@ const std::vector<ConnectComponentsInputs<float, int>> fix_conn_inputsf2 = {
   // Test n_clusters == n_points
   {10,
    5,
-   {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392,
-    0.77782677, 0.43772379, 0.4035871,  0.3282796,  0.47544681, 0.59862974,
-    0.12319357, 0.06239463, 0.28200272, 0.1345717,  0.50498218, 0.5113505,
-    0.16233086, 0.62165332, 0.42281548, 0.933117,   0.41386077, 0.23264562,
-    0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674,
-    0.84854131, 0.28890216, 0.85267903, 0.74703138, 0.83842071, 0.34942792,
-    0.27864171, 0.70911132, 0.21338564, 0.32035554, 0.73788331, 0.46926692,
-    0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396,
+   {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, 0.77782677, 0.43772379,
+    0.4035871,  0.3282796,  0.47544681, 0.59862974, 0.12319357, 0.06239463, 0.28200272, 0.1345717,
+    0.50498218, 0.5113505,  0.16233086, 0.62165332, 0.42281548, 0.933117,   0.41386077, 0.23264562,
+    0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674,  0.84854131, 0.28890216,
+    0.85267903, 0.74703138, 0.83842071, 0.34942792, 0.27864171, 0.70911132, 0.21338564, 0.32035554,
+    0.73788331, 0.46926692, 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396,
     0.76166195, 0.66613745},
    -1},
   // Test n_points == 100
   {100,
    10,
-   {6.26168372e-01, 9.30437651e-01, 6.02450208e-01,
-    2.73025296e-01, 9.53050619e-01, 3.32164396e-01,
-    6.88942598e-01, 5.79163537e-01, 6.70341547e-01,
-    2.70140602e-02, 9.30429671e-01, 7.17721157e-01,
-    9.89948537e-01, 7.75253347e-01, 1.34491522e-02,
-    2.48522428e-02, 3.51413378e-01, 7.64405834e-01,
-    7.86373507e-01, 7.18748577e-01, 8.66998621e-01,
-    6.80316582e-01, 2.51288712e-01, 4.91078420e-01,
-    3.76246281e-01, 4.86828710e-01, 5.67464772e-01,
-    5.30734742e-01, 8.99478296e-01, 7.66699088e-01,
-    9.49339111e-01, 3.55248484e-01, 9.06046929e-01,
-    4.48407772e-01, 6.96395305e-01, 2.44277335e-01,
-    7.74840000e-01, 5.21046603e-01, 4.66423971e-02,
-    5.12019638e-02, 8.95019614e-01, 5.28956953e-01,
-    4.31536306e-01, 5.83857744e-01, 4.41787364e-01,
-    4.68656523e-01, 5.73971433e-01, 6.79989654e-01,
-    3.19650588e-01, 6.12579596e-01, 6.49126442e-02,
-    8.39131142e-01, 2.85252117e-01, 5.84848929e-01,
-    9.46507115e-01, 8.58440748e-01, 3.61528940e-01,
-    2.44215959e-01, 3.80101125e-01, 4.57128957e-02,
-    8.82216988e-01, 8.31498633e-01, 7.23474381e-01,
-    7.75788607e-01, 1.40864146e-01, 6.62092382e-01,
-    5.13985168e-01, 3.00686418e-01, 8.70109949e-01,
-    2.43187753e-01, 2.89391938e-01, 2.84214238e-01,
-    8.70985521e-01, 8.77491176e-01, 6.72537226e-01,
-    3.30929686e-01, 1.85934324e-01, 9.16222614e-01,
-    6.18239142e-01, 2.64768597e-01, 5.76145451e-01,
-    8.62961369e-01, 6.84757925e-01, 7.60549082e-01,
-    1.27645356e-01, 4.51004673e-01, 3.92292980e-01,
-    4.63170803e-01, 4.35449330e-02, 2.17583404e-01,
-    5.71832605e-02, 2.06763039e-01, 3.70116249e-01,
-    2.09750028e-01, 6.17283019e-01, 8.62549231e-01,
-    9.84156240e-02, 2.66249156e-01, 3.87635103e-01,
-    2.85591012e-02, 4.24826068e-01, 4.45795088e-01,
-    6.86227676e-01, 1.08848960e-01, 5.96731841e-02,
-    3.71770228e-01, 1.91548833e-01, 6.95136078e-01,
-    9.00700636e-01, 8.76363105e-01, 2.67334632e-01,
-    1.80619709e-01, 7.94060419e-01, 1.42854171e-02,
-    1.09372387e-01, 8.74028108e-01, 6.46403232e-01,
-    4.86588834e-01, 5.93446175e-02, 6.11886291e-01,
-    8.83865057e-01, 3.15879821e-01, 2.27043992e-01,
-    9.76764951e-01, 6.15620336e-01, 9.76199360e-01,
-    2.40548962e-01, 3.21795663e-01, 8.75087904e-02,
-    8.11234663e-01, 6.96070480e-01, 8.12062321e-01,
-    1.21958818e-01, 3.44348628e-02, 8.72630414e-01,
-    3.06162776e-01, 1.76043529e-02, 9.45894971e-01,
-    5.33896401e-01, 6.21642973e-01, 4.93062535e-01,
-    4.48984262e-01, 2.24560379e-01, 4.24052195e-02,
-    4.43447610e-01, 8.95646149e-01, 6.05220676e-01,
-    1.81840491e-01, 9.70831206e-01, 2.12563586e-02,
-    6.92582693e-01, 7.55946922e-01, 7.95086143e-01,
-    6.05328941e-01, 3.99350764e-01, 4.32846636e-01,
-    9.81114529e-01, 4.98266428e-01, 6.37127930e-03,
-    1.59085889e-01, 6.34682067e-05, 5.59429440e-01,
-    7.38827633e-01, 8.93214770e-01, 2.16494306e-01,
-    9.35430573e-02, 4.75665868e-02, 7.80503518e-01,
-    7.86240041e-01, 7.06854594e-01, 2.13725879e-02,
-    7.68246091e-01, 4.50234808e-01, 5.21231104e-01,
-    5.01989826e-03, 4.22081572e-02, 1.65337732e-01,
-    8.54134740e-01, 4.99430262e-01, 8.94525601e-01,
-    1.14028379e-01, 3.69739861e-01, 1.32955599e-01,
-    2.65563824e-01, 2.52811151e-01, 1.44792843e-01,
-    6.88449594e-01, 4.44921417e-01, 8.23296587e-01,
-    1.93266317e-01, 1.19033309e-01, 1.36368966e-01,
-    3.42600285e-01, 5.64505195e-01, 5.57594559e-01,
-    7.44257892e-01, 8.38231569e-02, 4.11548847e-01,
-    3.21010077e-01, 8.55081359e-01, 4.30105779e-01,
-    1.16229135e-01, 9.87731964e-02, 3.14712335e-01,
-    4.50880592e-01, 2.72289598e-01, 6.31615256e-01,
-    8.97432958e-01, 4.44764250e-01, 8.03776440e-01,
-    2.68767748e-02, 2.43374608e-01, 4.02141103e-01,
-    4.98881209e-01, 5.33173003e-01, 8.82890436e-01,
-    7.16149148e-01, 4.19664401e-01, 2.29335357e-01,
-    2.88637806e-01, 3.44696803e-01, 6.78171906e-01,
-    5.69849716e-01, 5.86454477e-01, 3.54474989e-01,
-    9.03876540e-01, 6.45980000e-01, 6.34887593e-01,
-    7.88039746e-02, 2.04814126e-01, 7.82251754e-01,
-    2.43147074e-01, 7.50951808e-01, 1.72799092e-02,
-    2.95349590e-01, 6.57991826e-01, 8.81214312e-01,
-    5.73970708e-01, 2.77610881e-01, 1.82155097e-01,
-    7.69797417e-02, 6.44792402e-01, 9.46950998e-01,
-    7.73064845e-01, 6.04733624e-01, 5.80094567e-01,
-    1.67498426e-01, 2.66514296e-01, 6.50140368e-01,
-    1.91170299e-01, 2.08752199e-01, 3.01664091e-01,
-    9.85033484e-01, 2.92909152e-01, 8.65816607e-01,
-    1.85222119e-01, 2.28814559e-01, 1.34286382e-02,
-    2.89234322e-01, 8.18668708e-01, 4.71706924e-01,
-    9.23199803e-01, 2.80879188e-01, 1.47319284e-01,
-    4.13915748e-01, 9.31274932e-02, 6.66322195e-01,
-    9.66953974e-01, 3.19405786e-01, 6.69486551e-01,
-    5.03096313e-02, 6.95225201e-01, 5.78469859e-01,
-    6.29481655e-01, 1.39252534e-01, 1.22564968e-01,
-    6.80663678e-01, 6.34607157e-01, 6.42765834e-01,
-    1.57127410e-02, 2.92132086e-01, 5.24423878e-01,
-    4.68676824e-01, 2.86003928e-01, 7.18608322e-01,
-    8.95617933e-01, 5.48844309e-01, 1.74517278e-01,
-    5.24379196e-01, 2.13526524e-01, 5.88375435e-01,
-    9.88560185e-01, 4.17435771e-01, 6.14438688e-01,
-    9.53760881e-01, 5.27151288e-01, 7.03017278e-01,
-    3.44448559e-01, 4.47059676e-01, 2.83414901e-01,
-    1.98979011e-01, 4.24917361e-01, 5.73172761e-01,
-    2.32398853e-02, 1.65887230e-01, 4.05552785e-01,
-    9.29665524e-01, 2.26135696e-01, 9.20563384e-01,
-    7.65259963e-01, 4.54820075e-01, 8.97710267e-01,
-    3.78559302e-03, 9.15219382e-01, 3.55705698e-01,
-    6.94905124e-01, 8.58540202e-01, 3.89790666e-01,
-    2.49478206e-01, 7.93679304e-01, 4.75830027e-01,
-    4.40425353e-01, 3.70579459e-01, 1.40578049e-01,
-    1.70386675e-01, 7.04056121e-01, 4.85963102e-01,
-    9.68450060e-01, 6.77178001e-01, 2.65934654e-01,
-    2.58915007e-01, 6.70052890e-01, 2.61945109e-01,
-    8.46207759e-01, 1.01928951e-01, 2.85611334e-01,
-    2.45776933e-01, 2.66658783e-01, 3.71724077e-01,
-    4.34319025e-01, 4.24407347e-01, 7.15417683e-01,
-    8.07997684e-01, 1.64296275e-01, 6.01638065e-01,
-    8.60606804e-02, 2.68719187e-01, 5.11764101e-01,
-    9.75844338e-01, 7.81226782e-01, 2.20925515e-01,
-    7.18135040e-01, 9.82395577e-01, 8.39160243e-01,
-    9.08058083e-01, 6.88010677e-01, 8.14271847e-01,
-    5.12460821e-01, 1.17311345e-01, 5.96075228e-01,
-    9.17455497e-01, 2.12052706e-01, 7.04074603e-01,
-    8.72872565e-02, 8.76047818e-01, 6.96235046e-01,
-    8.54801557e-01, 2.49729159e-01, 9.76594604e-01,
-    2.87386363e-01, 2.36461559e-02, 9.94075254e-01,
-    4.25193986e-01, 7.61869994e-01, 5.13334255e-01,
-    6.44711165e-02, 8.92156689e-01, 3.55235167e-01,
-    1.08154647e-01, 8.78446825e-01, 2.43833016e-01,
-    9.23071293e-01, 2.72724115e-01, 9.46631338e-01,
-    3.74510294e-01, 4.08451278e-02, 9.78392777e-01,
-    3.65079221e-01, 6.37199516e-01, 5.51144906e-01,
-    5.25978080e-01, 1.42803678e-01, 4.05451674e-01,
-    7.79788219e-01, 6.26009784e-01, 3.35249497e-01,
-    1.43159543e-02, 1.80363779e-01, 5.05096904e-01,
-    2.82619947e-01, 5.83561392e-01, 3.10951324e-01,
-    8.73223968e-01, 4.38545619e-01, 4.81348800e-01,
-    6.68497085e-01, 3.79345401e-01, 9.58832501e-01,
-    1.89869550e-01, 2.34083070e-01, 2.94066207e-01,
-    5.74892667e-02, 6.92106828e-02, 9.61127686e-02,
-    6.72650672e-02, 8.47345378e-01, 2.80916761e-01,
-    7.32177357e-03, 9.80785961e-01, 5.73192225e-02,
-    8.48781331e-01, 8.83225408e-01, 7.34398275e-01,
-    7.70381941e-01, 6.20778343e-01, 8.96822048e-01,
-    5.40732486e-01, 3.69704071e-01, 5.77305837e-01,
-    2.08221827e-01, 7.34275341e-01, 1.06110900e-01,
-    3.49496706e-01, 8.34948910e-01, 1.56403291e-02,
-    6.78576376e-01, 8.96141268e-01, 5.94835119e-01,
-    1.43943153e-01, 3.49618530e-01, 2.10440392e-01,
-    3.46585620e-01, 1.05153093e-01, 3.45446174e-01,
-    2.72177079e-01, 7.07946300e-01, 4.33717726e-02,
-    3.31232203e-01, 3.91874320e-01, 4.76338141e-01,
-    6.22777789e-01, 2.95989228e-02, 4.32855769e-01,
-    7.61049310e-01, 3.63279149e-01, 9.47210350e-01,
-    6.43721247e-01, 6.58025802e-01, 1.05247633e-02,
-    5.29974442e-01, 7.30675767e-01, 4.30041079e-01,
-    6.62634841e-01, 8.25936616e-01, 9.91253704e-01,
-    6.79399281e-01, 5.44177006e-01, 7.52876048e-01,
-    3.32139049e-01, 7.98732398e-01, 7.38865223e-01,
-    9.16055132e-01, 6.11736493e-01, 9.63672879e-01,
-    1.83778839e-01, 7.27558919e-02, 5.91602822e-01,
-    3.25235484e-01, 2.34741217e-01, 9.52346277e-01,
-    9.18556407e-01, 9.35373324e-01, 6.89209070e-01,
-    2.56049054e-01, 6.17975395e-01, 7.82285691e-01,
-    9.84983432e-01, 6.62322741e-01, 2.04144457e-01,
-    3.98446577e-01, 1.38918297e-01, 3.05919921e-01,
-    3.14043787e-01, 5.91072666e-01, 7.44703771e-01,
-    8.92272567e-01, 9.78017873e-01, 9.01203161e-01,
-    1.41526372e-01, 4.14878484e-01, 6.80683651e-01,
-    5.01733152e-02, 8.14635389e-01, 2.27926375e-01,
-    9.03269815e-01, 8.68443745e-01, 9.86939190e-01,
-    7.40779486e-01, 2.61005311e-01, 3.19276232e-01,
-    9.69509248e-01, 1.11908818e-01, 4.49198556e-01,
-    1.27056715e-01, 3.84064823e-01, 5.14591811e-01,
-    2.10747488e-01, 9.53884090e-01, 8.43167950e-01,
-    4.51187972e-01, 3.75331782e-01, 6.23566461e-01,
-    3.55290379e-01, 2.95705968e-01, 1.69622690e-01,
-    1.42981830e-01, 2.72180991e-01, 9.46468040e-01,
-    3.70932500e-01, 9.94292830e-01, 4.62587505e-01,
-    7.14817405e-01, 2.45370540e-02, 3.00906377e-01,
-    5.75768304e-01, 9.71448393e-01, 6.95574827e-02,
-    3.93693854e-01, 5.29306116e-01, 5.04694554e-01,
-    6.73797120e-02, 6.76596969e-01, 5.50948898e-01,
-    3.24909641e-01, 7.70337719e-01, 6.51842631e-03,
-    3.03264879e-01, 7.61037886e-03, 2.72289601e-01,
-    1.50502041e-01, 6.71103888e-02, 7.41503703e-01,
-    1.92088941e-01, 2.19043977e-01, 9.09320161e-01,
-    2.37993569e-01, 6.18107973e-02, 8.31447852e-01,
-    2.23355609e-01, 1.84789435e-01, 4.16104518e-01,
-    4.21573859e-01, 8.72446305e-02, 2.97294197e-01,
-    4.50328256e-01, 8.72199917e-01, 2.51279916e-01,
-    4.86219272e-01, 7.57071329e-01, 4.85655942e-01,
-    1.06187277e-01, 4.92341327e-01, 1.46017513e-01,
-    5.25421017e-01, 4.22637906e-01, 2.24685018e-01,
-    8.72648431e-01, 5.54051490e-01, 1.80745062e-01,
-    2.12756336e-01, 5.20883169e-01, 7.60363654e-01,
-    8.30254678e-01, 5.00003328e-01, 4.69017439e-01,
-    6.38105527e-01, 3.50638261e-02, 5.22217353e-02,
-    9.06516882e-02, 8.52975842e-01, 1.19985883e-01,
-    3.74926753e-01, 6.50302066e-01, 1.98875727e-01,
-    6.28362507e-02, 4.32693501e-01, 3.10500685e-01,
-    6.20732833e-01, 4.58503272e-01, 3.20790034e-01,
-    7.91284868e-01, 7.93054570e-01, 2.93406765e-01,
-    8.95399023e-01, 1.06441034e-01, 7.53085241e-02,
-    8.67523104e-01, 1.47963482e-01, 1.25584706e-01,
-    3.81545040e-02, 6.34338619e-01, 1.76368938e-02,
-    5.75553531e-02, 5.31607516e-01, 2.63869588e-01,
-    9.41945823e-01, 9.24028838e-02, 5.21496463e-01,
-    7.74866558e-01, 5.65210610e-01, 7.28015327e-02,
-    6.51963790e-01, 8.94727453e-01, 4.49571590e-01,
-    1.29932405e-01, 8.64026259e-01, 9.92599934e-01,
-    7.43721560e-01, 8.87300215e-01, 1.06369925e-01,
-    8.11335531e-01, 7.87734900e-01, 9.87344678e-01,
-    5.32502820e-01, 4.42612382e-01, 9.64041183e-01,
-    1.66085871e-01, 1.12937664e-01, 5.24423470e-01,
-    6.54689333e-01, 4.59119726e-01, 5.22774091e-01,
-    3.08722276e-02, 6.26979315e-01, 4.49754105e-01,
-    8.07495757e-01, 2.34199499e-01, 1.67765675e-01,
-    9.22168418e-01, 3.73210378e-01, 8.04432575e-01,
-    5.61890354e-01, 4.47025593e-01, 6.43155678e-01,
-    2.40407640e-01, 5.91631279e-01, 1.59369206e-01,
-    7.75799090e-01, 8.32067212e-01, 5.59791576e-02,
-    6.39105224e-01, 4.85274738e-01, 2.12630838e-01,
-    2.81431312e-02, 7.16205363e-01, 6.83885011e-01,
-    5.23869697e-01, 9.99418314e-01, 8.35331599e-01,
-    4.69877463e-02, 6.74712562e-01, 7.99273684e-01,
-    2.77001890e-02, 5.75809742e-01, 2.78513031e-01,
-    8.36209905e-01, 7.25472379e-01, 4.87173943e-01,
-    7.88311357e-01, 9.64676177e-01, 1.75752651e-01,
-    4.98112580e-01, 8.08850418e-02, 6.40981131e-01,
-    4.06647450e-01, 8.46539387e-01, 2.12620694e-01,
-    9.11012851e-01, 8.25041445e-01, 8.90065575e-01,
-    9.63626055e-01, 5.96689242e-01, 1.63372670e-01,
-    4.51640148e-01, 3.43026542e-01, 5.80658851e-01,
-    2.82327625e-01, 4.75535418e-01, 6.27760926e-01,
-    8.46314115e-01, 9.61961932e-01, 3.19806094e-01,
-    5.05508062e-01, 5.28102944e-01, 6.13045057e-01,
-    7.44714938e-01, 1.50586073e-01, 7.91878033e-01,
-    4.89839179e-01, 3.10496849e-01, 8.82309038e-01,
-    2.86922314e-01, 4.84687559e-01, 5.20838630e-01,
-    4.62955493e-01, 2.38185305e-01, 5.47259907e-02,
-    7.10916137e-01, 7.31887202e-01, 6.25602317e-01,
-    8.77741168e-01, 4.19881322e-01, 4.81222328e-01,
-    1.28224501e-01, 2.46034010e-01, 3.34971854e-01,
-    7.37216484e-01, 5.62134821e-02, 7.14089724e-01,
-    9.85549393e-01, 4.66295827e-01, 3.08722434e-03,
-    4.70237690e-01, 2.66524167e-01, 7.93875484e-01,
-    4.54795911e-02, 8.09702944e-01, 1.47709735e-02,
-    1.70082405e-01, 6.35905179e-01, 3.75379109e-01,
-    4.30315011e-01, 3.15788760e-01, 5.58065230e-01,
-    2.24643800e-01, 2.42142981e-01, 6.57283636e-01,
-    3.34921891e-01, 1.26588975e-01, 7.68064155e-01,
-    9.43856291e-01, 4.47518596e-01, 5.44453573e-01,
-    9.95764932e-01, 7.16444391e-01, 8.51019765e-01,
-    1.01179183e-01, 4.45473958e-01, 4.60327322e-01,
-    4.96895844e-02, 4.72907738e-01, 5.58987444e-01,
-    3.41027487e-01, 1.56175026e-01, 7.58283148e-01,
-    6.83600909e-01, 2.14623396e-01, 3.27348880e-01,
-    3.92517893e-01, 6.70418431e-01, 5.16440832e-01,
-    8.63140348e-01, 5.73277464e-01, 3.46608058e-01,
-    7.39396341e-01, 7.20852434e-01, 2.35653246e-02,
-    3.89935659e-01, 7.53783745e-01, 6.34563528e-01,
-    8.79339335e-01, 7.41599159e-02, 5.62433904e-01,
-    6.15553852e-01, 4.56956324e-01, 5.20047447e-01,
-    5.26845015e-02, 5.58471266e-01, 1.63632233e-01,
-    5.38936665e-02, 6.49593683e-01, 2.56838748e-01,
-    8.99035326e-01, 7.20847756e-01, 5.68954684e-01,
-    7.43684755e-01, 5.70924238e-01, 3.82318724e-01,
-    4.89328290e-01, 5.62208561e-01, 4.97540804e-02,
-    4.18011085e-01, 6.88041565e-01, 2.16234653e-01,
-    7.89548214e-01, 8.46136387e-01, 8.46816189e-01,
-    1.73842353e-01, 6.11627842e-02, 8.44440559e-01,
-    4.50646654e-01, 3.74785037e-01, 4.87196697e-01,
-    4.56276448e-01, 9.13284391e-01, 4.15715464e-01,
-    7.13597697e-01, 1.23641270e-02, 5.10031271e-01,
-    4.74601930e-02, 2.55731159e-01, 3.22090006e-01,
-    1.91165703e-01, 4.51170940e-01, 7.50843157e-01,
-    4.42420576e-01, 4.25380660e-01, 4.50667257e-01,
-    6.55689206e-01, 9.68257670e-02, 1.96528793e-01,
-    8.97343028e-01, 4.99940904e-01, 6.65504083e-01,
-    9.41828079e-01, 4.54397338e-01, 5.61893331e-01,
-    5.09839880e-01, 4.53117514e-01, 8.96804127e-02,
-    1.74888861e-01, 6.65641378e-01, 2.81668336e-01,
-    1.89532742e-01, 5.61668382e-01, 8.68330157e-02,
-    8.25092797e-01, 5.18106324e-01, 1.71904024e-01,
-    3.68385523e-01, 1.62005436e-01, 7.48507399e-01,
-    9.30274827e-01, 2.38198517e-01, 9.52222901e-01,
-    5.23587800e-01, 6.94384557e-01, 1.09338652e-01,
-    4.83356794e-01, 2.73050402e-01, 3.68027050e-01,
-    5.92366466e-01, 1.83192289e-01, 8.60376029e-01,
-    7.13926203e-01, 8.16750052e-01, 1.57890291e-01,
-    6.25691951e-01, 5.24831646e-01, 1.73873797e-01,
-    1.02429784e-01, 9.17488471e-01, 4.03584434e-01,
-    9.31170884e-01, 2.79386137e-01, 8.77745206e-01,
-    2.45200576e-01, 1.28896951e-01, 3.15713052e-01,
-    5.27874291e-01, 2.16444335e-01, 7.03883817e-01,
-    7.74738919e-02, 8.42422142e-01, 3.75598924e-01,
-    3.51002411e-01, 6.22752776e-01, 4.82407943e-01,
-    7.43107867e-01, 9.46182666e-01, 9.44344819e-01,
-    3.28124763e-01, 1.06147431e-01, 1.65102684e-01,
-    3.84060507e-01, 2.91057722e-01, 7.68173662e-02,
-    1.03543651e-01, 6.76698940e-01, 1.43141994e-01,
-    7.21342202e-01, 6.69471294e-03, 9.07298311e-01,
-    5.57080171e-01, 8.10954489e-01, 4.11120526e-01,
-    2.06407453e-01, 2.59590556e-01, 7.58512718e-01,
-    5.79873897e-01, 2.92875650e-01, 2.83686529e-01,
-    2.42829343e-01, 9.19323719e-01, 3.46832864e-01,
-    3.58238858e-01, 7.42827585e-01, 2.05760059e-01,
-    9.58438860e-01, 5.66326411e-01, 6.60292846e-01,
-    5.61095078e-02, 6.79465531e-01, 7.05118513e-01,
-    4.44713264e-01, 2.09732933e-01, 5.22732436e-01,
-    1.74396512e-01, 5.29356748e-01, 4.38475687e-01,
-    4.94036404e-01, 4.09785794e-01, 6.40025507e-01,
-    5.79371821e-01, 1.57726118e-01, 6.04572263e-01,
-    5.41072639e-01, 5.18847173e-01, 1.97093284e-01,
-    8.91767002e-01, 4.29050835e-01, 8.25490570e-01,
-    3.87699807e-01, 4.50705808e-01, 2.49371643e-01,
-    3.36074898e-01, 9.29925118e-01, 6.65393649e-01,
-    9.07275994e-01, 3.73075859e-01, 4.14044139e-03,
-    2.37463702e-01, 2.25893784e-01, 2.46900245e-01,
-    4.50350196e-01, 3.48618117e-01, 5.07193932e-01,
-    5.23435142e-01, 8.13611417e-01, 8.92715622e-01,
-    1.02623450e-01, 3.06088345e-01, 7.80461650e-01,
-    2.21453645e-01, 2.01419652e-01, 2.84254457e-01,
-    3.68286735e-01, 7.39358243e-01, 8.97879394e-01,
-    9.81599566e-01, 7.56526442e-01, 7.37645545e-01,
-    4.23976657e-02, 8.25922012e-01, 2.60956996e-01,
-    2.90702065e-01, 8.98388344e-01, 3.03733299e-01,
-    8.49071471e-01, 3.45835425e-01, 7.65458276e-01,
-    5.68094872e-01, 8.93770930e-01, 9.93161641e-01,
-    5.63368667e-02, 4.26548945e-01, 5.46745780e-01,
-    5.75674571e-01, 7.94599487e-01, 7.18935553e-02,
-    4.46492976e-01, 6.40240123e-01, 2.73246969e-01,
-    2.00465968e-01, 1.30718835e-01, 1.92492005e-01,
-    1.96617189e-01, 6.61271644e-01, 8.12687657e-01,
-    8.66342445e-01
+   {6.26168372e-01, 9.30437651e-01, 6.02450208e-01, 2.73025296e-01, 9.53050619e-01, 3.32164396e-01,
+    6.88942598e-01, 5.79163537e-01, 6.70341547e-01, 2.70140602e-02, 9.30429671e-01, 7.17721157e-01,
+    9.89948537e-01, 7.75253347e-01, 1.34491522e-02, 2.48522428e-02, 3.51413378e-01, 7.64405834e-01,
+    7.86373507e-01, 7.18748577e-01, 8.66998621e-01, 6.80316582e-01, 2.51288712e-01, 4.91078420e-01,
+    3.76246281e-01, 4.86828710e-01, 5.67464772e-01, 5.30734742e-01, 8.99478296e-01, 7.66699088e-01,
+    9.49339111e-01, 3.55248484e-01, 9.06046929e-01, 4.48407772e-01, 6.96395305e-01, 2.44277335e-01,
+    7.74840000e-01, 5.21046603e-01, 4.66423971e-02, 5.12019638e-02, 8.95019614e-01, 5.28956953e-01,
+    4.31536306e-01, 5.83857744e-01, 4.41787364e-01, 4.68656523e-01, 5.73971433e-01, 6.79989654e-01,
+    3.19650588e-01, 6.12579596e-01, 6.49126442e-02, 8.39131142e-01, 2.85252117e-01, 5.84848929e-01,
+    9.46507115e-01, 8.58440748e-01, 3.61528940e-01, 2.44215959e-01, 3.80101125e-01, 4.57128957e-02,
+    8.82216988e-01, 8.31498633e-01, 7.23474381e-01, 7.75788607e-01, 1.40864146e-01, 6.62092382e-01,
+    5.13985168e-01, 3.00686418e-01, 8.70109949e-01, 2.43187753e-01, 2.89391938e-01, 2.84214238e-01,
+    8.70985521e-01, 8.77491176e-01, 6.72537226e-01, 3.30929686e-01, 1.85934324e-01, 9.16222614e-01,
+    6.18239142e-01, 2.64768597e-01, 5.76145451e-01, 8.62961369e-01, 6.84757925e-01, 7.60549082e-01,
+    1.27645356e-01, 4.51004673e-01, 3.92292980e-01, 4.63170803e-01, 4.35449330e-02, 2.17583404e-01,
+    5.71832605e-02, 2.06763039e-01, 3.70116249e-01, 2.09750028e-01, 6.17283019e-01, 8.62549231e-01,
+    9.84156240e-02, 2.66249156e-01, 3.87635103e-01, 2.85591012e-02, 4.24826068e-01, 4.45795088e-01,
+    6.86227676e-01, 1.08848960e-01, 5.96731841e-02, 3.71770228e-01, 1.91548833e-01, 6.95136078e-01,
+    9.00700636e-01, 8.76363105e-01, 2.67334632e-01, 1.80619709e-01, 7.94060419e-01, 1.42854171e-02,
+    1.09372387e-01, 8.74028108e-01, 6.46403232e-01, 4.86588834e-01, 5.93446175e-02, 6.11886291e-01,
+    8.83865057e-01, 3.15879821e-01, 2.27043992e-01, 9.76764951e-01, 6.15620336e-01, 9.76199360e-01,
+    2.40548962e-01, 3.21795663e-01, 8.75087904e-02, 8.11234663e-01, 6.96070480e-01, 8.12062321e-01,
+    1.21958818e-01, 3.44348628e-02, 8.72630414e-01, 3.06162776e-01, 1.76043529e-02, 9.45894971e-01,
+    5.33896401e-01, 6.21642973e-01, 4.93062535e-01, 4.48984262e-01, 2.24560379e-01, 4.24052195e-02,
+    4.43447610e-01, 8.95646149e-01, 6.05220676e-01, 1.81840491e-01, 9.70831206e-01, 2.12563586e-02,
+    6.92582693e-01, 7.55946922e-01, 7.95086143e-01, 6.05328941e-01, 3.99350764e-01, 4.32846636e-01,
+    9.81114529e-01, 4.98266428e-01, 6.37127930e-03, 1.59085889e-01, 6.34682067e-05, 5.59429440e-01,
+    7.38827633e-01, 8.93214770e-01, 2.16494306e-01, 9.35430573e-02, 4.75665868e-02, 7.80503518e-01,
+    7.86240041e-01, 7.06854594e-01, 2.13725879e-02, 7.68246091e-01, 4.50234808e-01, 5.21231104e-01,
+    5.01989826e-03, 4.22081572e-02, 1.65337732e-01, 8.54134740e-01, 4.99430262e-01, 8.94525601e-01,
+    1.14028379e-01, 3.69739861e-01, 1.32955599e-01, 2.65563824e-01, 2.52811151e-01, 1.44792843e-01,
+    6.88449594e-01, 4.44921417e-01, 8.23296587e-01, 1.93266317e-01, 1.19033309e-01, 1.36368966e-01,
+    3.42600285e-01, 5.64505195e-01, 5.57594559e-01, 7.44257892e-01, 8.38231569e-02, 4.11548847e-01,
+    3.21010077e-01, 8.55081359e-01, 4.30105779e-01, 1.16229135e-01, 9.87731964e-02, 3.14712335e-01,
+    4.50880592e-01, 2.72289598e-01, 6.31615256e-01, 8.97432958e-01, 4.44764250e-01, 8.03776440e-01,
+    2.68767748e-02, 2.43374608e-01, 4.02141103e-01, 4.98881209e-01, 5.33173003e-01, 8.82890436e-01,
+    7.16149148e-01, 4.19664401e-01, 2.29335357e-01, 2.88637806e-01, 3.44696803e-01, 6.78171906e-01,
+    5.69849716e-01, 5.86454477e-01, 3.54474989e-01, 9.03876540e-01, 6.45980000e-01, 6.34887593e-01,
+    7.88039746e-02, 2.04814126e-01, 7.82251754e-01, 2.43147074e-01, 7.50951808e-01, 1.72799092e-02,
+    2.95349590e-01, 6.57991826e-01, 8.81214312e-01, 5.73970708e-01, 2.77610881e-01, 1.82155097e-01,
+    7.69797417e-02, 6.44792402e-01, 9.46950998e-01, 7.73064845e-01, 6.04733624e-01, 5.80094567e-01,
+    1.67498426e-01, 2.66514296e-01, 6.50140368e-01, 1.91170299e-01, 2.08752199e-01, 3.01664091e-01,
+    9.85033484e-01, 2.92909152e-01, 8.65816607e-01, 1.85222119e-01, 2.28814559e-01, 1.34286382e-02,
+    2.89234322e-01, 8.18668708e-01, 4.71706924e-01, 9.23199803e-01, 2.80879188e-01, 1.47319284e-01,
+    4.13915748e-01, 9.31274932e-02, 6.66322195e-01, 9.66953974e-01, 3.19405786e-01, 6.69486551e-01,
+    5.03096313e-02, 6.95225201e-01, 5.78469859e-01, 6.29481655e-01, 1.39252534e-01, 1.22564968e-01,
+    6.80663678e-01, 6.34607157e-01, 6.42765834e-01, 1.57127410e-02, 2.92132086e-01, 5.24423878e-01,
+    4.68676824e-01, 2.86003928e-01, 7.18608322e-01, 8.95617933e-01, 5.48844309e-01, 1.74517278e-01,
+    5.24379196e-01, 2.13526524e-01, 5.88375435e-01, 9.88560185e-01, 4.17435771e-01, 6.14438688e-01,
+    9.53760881e-01, 5.27151288e-01, 7.03017278e-01, 3.44448559e-01, 4.47059676e-01, 2.83414901e-01,
+    1.98979011e-01, 4.24917361e-01, 5.73172761e-01, 2.32398853e-02, 1.65887230e-01, 4.05552785e-01,
+    9.29665524e-01, 2.26135696e-01, 9.20563384e-01, 7.65259963e-01, 4.54820075e-01, 8.97710267e-01,
+    3.78559302e-03, 9.15219382e-01, 3.55705698e-01, 6.94905124e-01, 8.58540202e-01, 3.89790666e-01,
+    2.49478206e-01, 7.93679304e-01, 4.75830027e-01, 4.40425353e-01, 3.70579459e-01, 1.40578049e-01,
+    1.70386675e-01, 7.04056121e-01, 4.85963102e-01, 9.68450060e-01, 6.77178001e-01, 2.65934654e-01,
+    2.58915007e-01, 6.70052890e-01, 2.61945109e-01, 8.46207759e-01, 1.01928951e-01, 2.85611334e-01,
+    2.45776933e-01, 2.66658783e-01, 3.71724077e-01, 4.34319025e-01, 4.24407347e-01, 7.15417683e-01,
+    8.07997684e-01, 1.64296275e-01, 6.01638065e-01, 8.60606804e-02, 2.68719187e-01, 5.11764101e-01,
+    9.75844338e-01, 7.81226782e-01, 2.20925515e-01, 7.18135040e-01, 9.82395577e-01, 8.39160243e-01,
+    9.08058083e-01, 6.88010677e-01, 8.14271847e-01, 5.12460821e-01, 1.17311345e-01, 5.96075228e-01,
+    9.17455497e-01, 2.12052706e-01, 7.04074603e-01, 8.72872565e-02, 8.76047818e-01, 6.96235046e-01,
+    8.54801557e-01, 2.49729159e-01, 9.76594604e-01, 2.87386363e-01, 2.36461559e-02, 9.94075254e-01,
+    4.25193986e-01, 7.61869994e-01, 5.13334255e-01, 6.44711165e-02, 8.92156689e-01, 3.55235167e-01,
+    1.08154647e-01, 8.78446825e-01, 2.43833016e-01, 9.23071293e-01, 2.72724115e-01, 9.46631338e-01,
+    3.74510294e-01, 4.08451278e-02, 9.78392777e-01, 3.65079221e-01, 6.37199516e-01, 5.51144906e-01,
+    5.25978080e-01, 1.42803678e-01, 4.05451674e-01, 7.79788219e-01, 6.26009784e-01, 3.35249497e-01,
+    1.43159543e-02, 1.80363779e-01, 5.05096904e-01, 2.82619947e-01, 5.83561392e-01, 3.10951324e-01,
+    8.73223968e-01, 4.38545619e-01, 4.81348800e-01, 6.68497085e-01, 3.79345401e-01, 9.58832501e-01,
+    1.89869550e-01, 2.34083070e-01, 2.94066207e-01, 5.74892667e-02, 6.92106828e-02, 9.61127686e-02,
+    6.72650672e-02, 8.47345378e-01, 2.80916761e-01, 7.32177357e-03, 9.80785961e-01, 5.73192225e-02,
+    8.48781331e-01, 8.83225408e-01, 7.34398275e-01, 7.70381941e-01, 6.20778343e-01, 8.96822048e-01,
+    5.40732486e-01, 3.69704071e-01, 5.77305837e-01, 2.08221827e-01, 7.34275341e-01, 1.06110900e-01,
+    3.49496706e-01, 8.34948910e-01, 1.56403291e-02, 6.78576376e-01, 8.96141268e-01, 5.94835119e-01,
+    1.43943153e-01, 3.49618530e-01, 2.10440392e-01, 3.46585620e-01, 1.05153093e-01, 3.45446174e-01,
+    2.72177079e-01, 7.07946300e-01, 4.33717726e-02, 3.31232203e-01, 3.91874320e-01, 4.76338141e-01,
+    6.22777789e-01, 2.95989228e-02, 4.32855769e-01, 7.61049310e-01, 3.63279149e-01, 9.47210350e-01,
+    6.43721247e-01, 6.58025802e-01, 1.05247633e-02, 5.29974442e-01, 7.30675767e-01, 4.30041079e-01,
+    6.62634841e-01, 8.25936616e-01, 9.91253704e-01, 6.79399281e-01, 5.44177006e-01, 7.52876048e-01,
+    3.32139049e-01, 7.98732398e-01, 7.38865223e-01, 9.16055132e-01, 6.11736493e-01, 9.63672879e-01,
+    1.83778839e-01, 7.27558919e-02, 5.91602822e-01, 3.25235484e-01, 2.34741217e-01, 9.52346277e-01,
+    9.18556407e-01, 9.35373324e-01, 6.89209070e-01, 2.56049054e-01, 6.17975395e-01, 7.82285691e-01,
+    9.84983432e-01, 6.62322741e-01, 2.04144457e-01, 3.98446577e-01, 1.38918297e-01, 3.05919921e-01,
+    3.14043787e-01, 5.91072666e-01, 7.44703771e-01, 8.92272567e-01, 9.78017873e-01, 9.01203161e-01,
+    1.41526372e-01, 4.14878484e-01, 6.80683651e-01, 5.01733152e-02, 8.14635389e-01, 2.27926375e-01,
+    9.03269815e-01, 8.68443745e-01, 9.86939190e-01, 7.40779486e-01, 2.61005311e-01, 3.19276232e-01,
+    9.69509248e-01, 1.11908818e-01, 4.49198556e-01, 1.27056715e-01, 3.84064823e-01, 5.14591811e-01,
+    2.10747488e-01, 9.53884090e-01, 8.43167950e-01, 4.51187972e-01, 3.75331782e-01, 6.23566461e-01,
+    3.55290379e-01, 2.95705968e-01, 1.69622690e-01, 1.42981830e-01, 2.72180991e-01, 9.46468040e-01,
+    3.70932500e-01, 9.94292830e-01, 4.62587505e-01, 7.14817405e-01, 2.45370540e-02, 3.00906377e-01,
+    5.75768304e-01, 9.71448393e-01, 6.95574827e-02, 3.93693854e-01, 5.29306116e-01, 5.04694554e-01,
+    6.73797120e-02, 6.76596969e-01, 5.50948898e-01, 3.24909641e-01, 7.70337719e-01, 6.51842631e-03,
+    3.03264879e-01, 7.61037886e-03, 2.72289601e-01, 1.50502041e-01, 6.71103888e-02, 7.41503703e-01,
+    1.92088941e-01, 2.19043977e-01, 9.09320161e-01, 2.37993569e-01, 6.18107973e-02, 8.31447852e-01,
+    2.23355609e-01, 1.84789435e-01, 4.16104518e-01, 4.21573859e-01, 8.72446305e-02, 2.97294197e-01,
+    4.50328256e-01, 8.72199917e-01, 2.51279916e-01, 4.86219272e-01, 7.57071329e-01, 4.85655942e-01,
+    1.06187277e-01, 4.92341327e-01, 1.46017513e-01, 5.25421017e-01, 4.22637906e-01, 2.24685018e-01,
+    8.72648431e-01, 5.54051490e-01, 1.80745062e-01, 2.12756336e-01, 5.20883169e-01, 7.60363654e-01,
+    8.30254678e-01, 5.00003328e-01, 4.69017439e-01, 6.38105527e-01, 3.50638261e-02, 5.22217353e-02,
+    9.06516882e-02, 8.52975842e-01, 1.19985883e-01, 3.74926753e-01, 6.50302066e-01, 1.98875727e-01,
+    6.28362507e-02, 4.32693501e-01, 3.10500685e-01, 6.20732833e-01, 4.58503272e-01, 3.20790034e-01,
+    7.91284868e-01, 7.93054570e-01, 2.93406765e-01, 8.95399023e-01, 1.06441034e-01, 7.53085241e-02,
+    8.67523104e-01, 1.47963482e-01, 1.25584706e-01, 3.81545040e-02, 6.34338619e-01, 1.76368938e-02,
+    5.75553531e-02, 5.31607516e-01, 2.63869588e-01, 9.41945823e-01, 9.24028838e-02, 5.21496463e-01,
+    7.74866558e-01, 5.65210610e-01, 7.28015327e-02, 6.51963790e-01, 8.94727453e-01, 4.49571590e-01,
+    1.29932405e-01, 8.64026259e-01, 9.92599934e-01, 7.43721560e-01, 8.87300215e-01, 1.06369925e-01,
+    8.11335531e-01, 7.87734900e-01, 9.87344678e-01, 5.32502820e-01, 4.42612382e-01, 9.64041183e-01,
+    1.66085871e-01, 1.12937664e-01, 5.24423470e-01, 6.54689333e-01, 4.59119726e-01, 5.22774091e-01,
+    3.08722276e-02, 6.26979315e-01, 4.49754105e-01, 8.07495757e-01, 2.34199499e-01, 1.67765675e-01,
+    9.22168418e-01, 3.73210378e-01, 8.04432575e-01, 5.61890354e-01, 4.47025593e-01, 6.43155678e-01,
+    2.40407640e-01, 5.91631279e-01, 1.59369206e-01, 7.75799090e-01, 8.32067212e-01, 5.59791576e-02,
+    6.39105224e-01, 4.85274738e-01, 2.12630838e-01, 2.81431312e-02, 7.16205363e-01, 6.83885011e-01,
+    5.23869697e-01, 9.99418314e-01, 8.35331599e-01, 4.69877463e-02, 6.74712562e-01, 7.99273684e-01,
+    2.77001890e-02, 5.75809742e-01, 2.78513031e-01, 8.36209905e-01, 7.25472379e-01, 4.87173943e-01,
+    7.88311357e-01, 9.64676177e-01, 1.75752651e-01, 4.98112580e-01, 8.08850418e-02, 6.40981131e-01,
+    4.06647450e-01, 8.46539387e-01, 2.12620694e-01, 9.11012851e-01, 8.25041445e-01, 8.90065575e-01,
+    9.63626055e-01, 5.96689242e-01, 1.63372670e-01, 4.51640148e-01, 3.43026542e-01, 5.80658851e-01,
+    2.82327625e-01, 4.75535418e-01, 6.27760926e-01, 8.46314115e-01, 9.61961932e-01, 3.19806094e-01,
+    5.05508062e-01, 5.28102944e-01, 6.13045057e-01, 7.44714938e-01, 1.50586073e-01, 7.91878033e-01,
+    4.89839179e-01, 3.10496849e-01, 8.82309038e-01, 2.86922314e-01, 4.84687559e-01, 5.20838630e-01,
+    4.62955493e-01, 2.38185305e-01, 5.47259907e-02, 7.10916137e-01, 7.31887202e-01, 6.25602317e-01,
+    8.77741168e-01, 4.19881322e-01, 4.81222328e-01, 1.28224501e-01, 2.46034010e-01, 3.34971854e-01,
+    7.37216484e-01, 5.62134821e-02, 7.14089724e-01, 9.85549393e-01, 4.66295827e-01, 3.08722434e-03,
+    4.70237690e-01, 2.66524167e-01, 7.93875484e-01, 4.54795911e-02, 8.09702944e-01, 1.47709735e-02,
+    1.70082405e-01, 6.35905179e-01, 3.75379109e-01, 4.30315011e-01, 3.15788760e-01, 5.58065230e-01,
+    2.24643800e-01, 2.42142981e-01, 6.57283636e-01, 3.34921891e-01, 1.26588975e-01, 7.68064155e-01,
+    9.43856291e-01, 4.47518596e-01, 5.44453573e-01, 9.95764932e-01, 7.16444391e-01, 8.51019765e-01,
+    1.01179183e-01, 4.45473958e-01, 4.60327322e-01, 4.96895844e-02, 4.72907738e-01, 5.58987444e-01,
+    3.41027487e-01, 1.56175026e-01, 7.58283148e-01, 6.83600909e-01, 2.14623396e-01, 3.27348880e-01,
+    3.92517893e-01, 6.70418431e-01, 5.16440832e-01, 8.63140348e-01, 5.73277464e-01, 3.46608058e-01,
+    7.39396341e-01, 7.20852434e-01, 2.35653246e-02, 3.89935659e-01, 7.53783745e-01, 6.34563528e-01,
+    8.79339335e-01, 7.41599159e-02, 5.62433904e-01, 6.15553852e-01, 4.56956324e-01, 5.20047447e-01,
+    5.26845015e-02, 5.58471266e-01, 1.63632233e-01, 5.38936665e-02, 6.49593683e-01, 2.56838748e-01,
+    8.99035326e-01, 7.20847756e-01, 5.68954684e-01, 7.43684755e-01, 5.70924238e-01, 3.82318724e-01,
+    4.89328290e-01, 5.62208561e-01, 4.97540804e-02, 4.18011085e-01, 6.88041565e-01, 2.16234653e-01,
+    7.89548214e-01, 8.46136387e-01, 8.46816189e-01, 1.73842353e-01, 6.11627842e-02, 8.44440559e-01,
+    4.50646654e-01, 3.74785037e-01, 4.87196697e-01, 4.56276448e-01, 9.13284391e-01, 4.15715464e-01,
+    7.13597697e-01, 1.23641270e-02, 5.10031271e-01, 4.74601930e-02, 2.55731159e-01, 3.22090006e-01,
+    1.91165703e-01, 4.51170940e-01, 7.50843157e-01, 4.42420576e-01, 4.25380660e-01, 4.50667257e-01,
+    6.55689206e-01, 9.68257670e-02, 1.96528793e-01, 8.97343028e-01, 4.99940904e-01, 6.65504083e-01,
+    9.41828079e-01, 4.54397338e-01, 5.61893331e-01, 5.09839880e-01, 4.53117514e-01, 8.96804127e-02,
+    1.74888861e-01, 6.65641378e-01, 2.81668336e-01, 1.89532742e-01, 5.61668382e-01, 8.68330157e-02,
+    8.25092797e-01, 5.18106324e-01, 1.71904024e-01, 3.68385523e-01, 1.62005436e-01, 7.48507399e-01,
+    9.30274827e-01, 2.38198517e-01, 9.52222901e-01, 5.23587800e-01, 6.94384557e-01, 1.09338652e-01,
+    4.83356794e-01, 2.73050402e-01, 3.68027050e-01, 5.92366466e-01, 1.83192289e-01, 8.60376029e-01,
+    7.13926203e-01, 8.16750052e-01, 1.57890291e-01, 6.25691951e-01, 5.24831646e-01, 1.73873797e-01,
+    1.02429784e-01, 9.17488471e-01, 4.03584434e-01, 9.31170884e-01, 2.79386137e-01, 8.77745206e-01,
+    2.45200576e-01, 1.28896951e-01, 3.15713052e-01, 5.27874291e-01, 2.16444335e-01, 7.03883817e-01,
+    7.74738919e-02, 8.42422142e-01, 3.75598924e-01, 3.51002411e-01, 6.22752776e-01, 4.82407943e-01,
+    7.43107867e-01, 9.46182666e-01, 9.44344819e-01, 3.28124763e-01, 1.06147431e-01, 1.65102684e-01,
+    3.84060507e-01, 2.91057722e-01, 7.68173662e-02, 1.03543651e-01, 6.76698940e-01, 1.43141994e-01,
+    7.21342202e-01, 6.69471294e-03, 9.07298311e-01, 5.57080171e-01, 8.10954489e-01, 4.11120526e-01,
+    2.06407453e-01, 2.59590556e-01, 7.58512718e-01, 5.79873897e-01, 2.92875650e-01, 2.83686529e-01,
+    2.42829343e-01, 9.19323719e-01, 3.46832864e-01, 3.58238858e-01, 7.42827585e-01, 2.05760059e-01,
+    9.58438860e-01, 5.66326411e-01, 6.60292846e-01, 5.61095078e-02, 6.79465531e-01, 7.05118513e-01,
+    4.44713264e-01, 2.09732933e-01, 5.22732436e-01, 1.74396512e-01, 5.29356748e-01, 4.38475687e-01,
+    4.94036404e-01, 4.09785794e-01, 6.40025507e-01, 5.79371821e-01, 1.57726118e-01, 6.04572263e-01,
+    5.41072639e-01, 5.18847173e-01, 1.97093284e-01, 8.91767002e-01, 4.29050835e-01, 8.25490570e-01,
+    3.87699807e-01, 4.50705808e-01, 2.49371643e-01, 3.36074898e-01, 9.29925118e-01, 6.65393649e-01,
+    9.07275994e-01, 3.73075859e-01, 4.14044139e-03, 2.37463702e-01, 2.25893784e-01, 2.46900245e-01,
+    4.50350196e-01, 3.48618117e-01, 5.07193932e-01, 5.23435142e-01, 8.13611417e-01, 8.92715622e-01,
+    1.02623450e-01, 3.06088345e-01, 7.80461650e-01, 2.21453645e-01, 2.01419652e-01, 2.84254457e-01,
+    3.68286735e-01, 7.39358243e-01, 8.97879394e-01, 9.81599566e-01, 7.56526442e-01, 7.37645545e-01,
+    4.23976657e-02, 8.25922012e-01, 2.60956996e-01, 2.90702065e-01, 8.98388344e-01, 3.03733299e-01,
+    8.49071471e-01, 3.45835425e-01, 7.65458276e-01, 5.68094872e-01, 8.93770930e-01, 9.93161641e-01,
+    5.63368667e-02, 4.26548945e-01, 5.46745780e-01, 5.75674571e-01, 7.94599487e-01, 7.18935553e-02,
+    4.46492976e-01, 6.40240123e-01, 2.73246969e-01, 2.00465968e-01, 1.30718835e-01, 1.92492005e-01,
+    1.96617189e-01, 6.61271644e-01, 8.12687657e-01, 8.66342445e-01
 
    },
    -4}};
 
 typedef ConnectComponentsTest<int, float> ConnectComponentsTestF_Int;
-TEST_P(ConnectComponentsTestF_Int, Result) {
+TEST_P(ConnectComponentsTestF_Int, Result)
+{
   /**
-     * Verify the src & dst vertices on each edge have different colors
-     */
+   * Verify the src & dst vertices on each edge have different colors
+   */
   EXPECT_TRUE(final_edges == params.n_row - 1);
 }
 
-INSTANTIATE_TEST_CASE_P(ConnectComponentsTest, ConnectComponentsTestF_Int,
+INSTANTIATE_TEST_CASE_P(ConnectComponentsTest,
+                        ConnectComponentsTestF_Int,
                         ::testing::ValuesIn(fix_conn_inputsf2));
 };  // namespace sparse
 };  // end namespace raft
diff --git a/cpp/test/sparse/convert_coo.cu b/cpp/test/sparse/convert_coo.cu
index d30114bbcb..2028513010 100644
--- a/cpp/test/sparse/convert_coo.cu
+++ b/cpp/test/sparse/convert_coo.cu
@@ -44,23 +44,25 @@ class CSRtoCOOTest : public ::testing::TestWithParam<CSRtoCOOInputs<Index_>> {
       stream(handle.get_stream()),
       ex_scan(params.ex_scan.size(), stream),
       verify(params.verify.size(), stream),
-      result(params.verify.size(), stream) {}
+      result(params.verify.size(), stream)
+  {
+  }
 
  protected:
   void SetUp() override {}
 
-  void Run() {
+  void Run()
+  {
     Index_ n_rows = params.ex_scan.size();
-    Index_ nnz = params.verify.size();
+    Index_ nnz    = params.verify.size();
 
     raft::update_device(ex_scan.data(), params.ex_scan.data(), n_rows, stream);
     raft::update_device(verify.data(), params.verify.data(), nnz, stream);
 
-    convert::csr_to_coo<Index_, 32>(ex_scan.data(), n_rows, result.data(), nnz,
-                                    stream);
+    convert::csr_to_coo<Index_, 32>(ex_scan.data(), n_rows, result.data(), nnz, stream);
 
-    ASSERT_TRUE(raft::devArrMatch<Index_>(verify.data(), result.data(), nnz,
-                                          raft::Compare<float>(), stream));
+    ASSERT_TRUE(
+      raft::devArrMatch<Index_>(verify.data(), result.data(), nnz, raft::Compare<float>(), stream));
   }
 
  protected:
@@ -86,9 +88,11 @@ const std::vector<CSRtoCOOInputs<int64_t>> csrtocoo_inputs_64 = {
   {{0, 4, 8, 9}, {0, 0, 0, 0, 1, 1, 1, 1, 2, 3}},
 };
 
-INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest, CSRtoCOOTestI,
+INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest,
+                        CSRtoCOOTestI,
                         ::testing::ValuesIn(csrtocoo_inputs_32));
-INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest, CSRtoCOOTestL,
+INSTANTIATE_TEST_CASE_P(SparseConvertCOOTest,
+                        CSRtoCOOTestL,
                         ::testing::ValuesIn(csrtocoo_inputs_64));
 
 }  // namespace sparse
diff --git a/cpp/test/sparse/convert_csr.cu b/cpp/test/sparse/convert_csr.cu
index cd665934c2..18e8b874bb 100644
--- a/cpp/test/sparse/convert_csr.cu
+++ b/cpp/test/sparse/convert_csr.cu
@@ -36,14 +36,13 @@ struct SparseConvertCSRInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os,
-                           const SparseConvertCSRInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const SparseConvertCSRInputs<T>& dims)
+{
   return os;
 }
 
 template <typename T>
-class SparseConvertCSRTest
-  : public ::testing::TestWithParam<SparseConvertCSRInputs<T>> {
+class SparseConvertCSRTest : public ::testing::TestWithParam<SparseConvertCSRInputs<T>> {
  protected:
   void SetUp() override {}
 
@@ -53,18 +52,18 @@ class SparseConvertCSRTest
   SparseConvertCSRInputs<T> params;
 };
 
-const std::vector<SparseConvertCSRInputs<float>> inputsf = {
-  {5, 10, 5, 1234ULL}};
+const std::vector<SparseConvertCSRInputs<float>> inputsf = {{5, 10, 5, 1234ULL}};
 
 typedef SparseConvertCSRTest<float> SortedCOOToCSR;
-TEST_P(SortedCOOToCSR, Result) {
+TEST_P(SortedCOOToCSR, Result)
+{
   cudaStream_t stream;
   cudaStreamCreate(&stream);
 
   int nnz = 8;
 
-  int *in_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3};
-  int *exp_h = new int[4]{0, 2, 4, 6};
+  int* in_h  = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3};
+  int* exp_h = new int[4]{0, 2, 4, 6};
 
   rmm::device_uvector<int> in(nnz, stream);
   rmm::device_uvector<int> exp(4, stream);
@@ -78,8 +77,7 @@ TEST_P(SortedCOOToCSR, Result) {
 
   convert::sorted_coo_to_csr<int>(in.data(), nnz, out.data(), 4, stream);
 
-  ASSERT_TRUE(
-    raft::devArrMatch<int>(out.data(), exp.data(), 4, raft::Compare<int>()));
+  ASSERT_TRUE(raft::devArrMatch<int>(out.data(), exp.data(), 4, raft::Compare<int>()));
 
   cudaStreamDestroy(stream);
 
@@ -87,8 +85,7 @@ TEST_P(SortedCOOToCSR, Result) {
   delete[] exp_h;
 }
 
-INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, SortedCOOToCSR,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, SortedCOOToCSR, ::testing::ValuesIn(inputsf));
 
 /******************************** adj graph ********************************/
 
@@ -102,8 +99,7 @@ struct CSRAdjGraphInputs {
 };
 
 template <typename Index_>
-class CSRAdjGraphTest
-  : public ::testing::TestWithParam<CSRAdjGraphInputs<Index_>> {
+class CSRAdjGraphTest : public ::testing::TestWithParam<CSRAdjGraphInputs<Index_>> {
  public:
   CSRAdjGraphTest()
     : params(::testing::TestWithParam<CSRAdjGraphInputs<Index_>>::GetParam()),
@@ -111,24 +107,27 @@ class CSRAdjGraphTest
       row_ind(params.n_rows, stream),
       adj(params.n_rows * params.n_cols, stream),
       result(params.verify.size(), stream),
-      verify(params.verify.size(), stream) {}
+      verify(params.verify.size(), stream)
+  {
+  }
 
  protected:
   void SetUp() override { nnz = params.verify.size(); }
 
-  void Run() {
-    raft::update_device(row_ind.data(), params.row_ind.data(), params.n_rows,
+  void Run()
+  {
+    raft::update_device(row_ind.data(), params.row_ind.data(), params.n_rows, stream);
+    raft::update_device(adj.data(),
+                        reinterpret_cast<bool*>(params.adj.data()),
+                        params.n_rows * params.n_cols,
                         stream);
-    raft::update_device(adj.data(), reinterpret_cast<bool *>(params.adj.data()),
-                        params.n_rows * params.n_cols, stream);
     raft::update_device(verify.data(), params.verify.data(), nnz, stream);
 
-    convert::csr_adj_graph_batched<Index_, 32>(row_ind.data(), params.n_cols,
-                                               nnz, params.n_rows, adj.data(),
-                                               result.data(), stream);
+    convert::csr_adj_graph_batched<Index_, 32>(
+      row_ind.data(), params.n_cols, nnz, params.n_rows, adj.data(), result.data(), stream);
 
-    ASSERT_TRUE(raft::devArrMatch<Index_>(verify.data(), result.data(), nnz,
-                                          raft::Compare<Index_>()));
+    ASSERT_TRUE(
+      raft::devArrMatch<Index_>(verify.data(), result.data(), nnz, raft::Compare<Index_>()));
   }
 
  protected:
@@ -162,9 +161,11 @@ const std::vector<CSRAdjGraphInputs<int64_t>> csradjgraph_inputs_l = {
    {0, 1, 2, 0, 1, 2, 0, 1, 2}},
 };
 
-INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, CSRAdjGraphTestI,
+INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest,
+                        CSRAdjGraphTestI,
                         ::testing::ValuesIn(csradjgraph_inputs_i));
-INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest, CSRAdjGraphTestL,
+INSTANTIATE_TEST_CASE_P(SparseConvertCSRTest,
+                        CSRAdjGraphTestL,
                         ::testing::ValuesIn(csradjgraph_inputs_l));
 
 }  // namespace sparse
diff --git a/cpp/test/sparse/csr_row_slice.cu b/cpp/test/sparse/csr_row_slice.cu
index 33893649bd..16372dc0f6 100644
--- a/cpp/test/sparse/csr_row_slice.cu
+++ b/cpp/test/sparse/csr_row_slice.cu
@@ -47,18 +47,16 @@ struct CSRRowSliceInputs {
 };
 
 template <typename value_idx, typename value_t>
-::std::ostream &operator<<(::std::ostream &os,
-                           const CSRRowSliceInputs<value_idx, value_t> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const CSRRowSliceInputs<value_idx, value_t>& dims)
+{
   return os;
 }
 
 template <typename value_idx, typename value_t>
-class CSRRowSliceTest
-  : public ::testing::TestWithParam<CSRRowSliceInputs<value_idx, value_t>> {
+class CSRRowSliceTest : public ::testing::TestWithParam<CSRRowSliceInputs<value_idx, value_t>> {
  public:
   CSRRowSliceTest()
-    : params(::testing::TestWithParam<
-             CSRRowSliceInputs<value_idx, value_t>>::GetParam()),
+    : params(::testing::TestWithParam<CSRRowSliceInputs<value_idx, value_t>>::GetParam()),
       stream(handle.get_stream()),
       indptr(0, stream),
       indices(0, stream),
@@ -68,7 +66,8 @@ class CSRRowSliceTest
       out_data_ref(0, stream),
       out_indptr(0, stream),
       out_indices(0, stream),
-      out_data(0, stream) {
+      out_data(0, stream)
+  {
     indptr.resize(params.indptr_h.size(), stream);
     indices.resize(params.indices_h.size(), stream);
     data.resize(params.data_h.size(), stream);
@@ -81,54 +80,65 @@ class CSRRowSliceTest
   }
 
  protected:
-  void make_data() {
-    std::vector<value_idx> indptr_h = params.indptr_h;
+  void make_data()
+  {
+    std::vector<value_idx> indptr_h  = params.indptr_h;
     std::vector<value_idx> indices_h = params.indices_h;
-    std::vector<value_t> data_h = params.data_h;
+    std::vector<value_t> data_h      = params.data_h;
 
     update_device(indptr.data(), indptr_h.data(), indptr_h.size(), stream);
     update_device(indices.data(), indices_h.data(), indices_h.size(), stream);
     update_device(data.data(), data_h.data(), data_h.size(), stream);
 
-    std::vector<value_idx> out_indptr_ref_h = params.out_indptr_ref_h;
+    std::vector<value_idx> out_indptr_ref_h  = params.out_indptr_ref_h;
     std::vector<value_idx> out_indices_ref_h = params.out_indices_ref_h;
-    std::vector<value_t> out_data_ref_h = params.out_data_ref_h;
-
-    update_device(out_indptr_ref.data(), out_indptr_ref_h.data(),
-                  out_indptr_ref_h.size(), stream);
-    update_device(out_indices_ref.data(), out_indices_ref_h.data(),
-                  out_indices_ref_h.size(), stream);
-    update_device(out_data_ref.data(), out_data_ref_h.data(),
-                  out_data_ref_h.size(), stream);
+    std::vector<value_t> out_data_ref_h      = params.out_data_ref_h;
+
+    update_device(out_indptr_ref.data(), out_indptr_ref_h.data(), out_indptr_ref_h.size(), stream);
+    update_device(
+      out_indices_ref.data(), out_indices_ref_h.data(), out_indices_ref_h.size(), stream);
+    update_device(out_data_ref.data(), out_data_ref_h.data(), out_data_ref_h.size(), stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
-  void SetUp() override {
+  void SetUp() override
+  {
     make_data();
 
     int csr_start_offset;
     int csr_stop_offset;
 
-    raft::sparse::op::csr_row_slice_indptr(
-      params.start_row, params.stop_row, indptr.data(), out_indptr.data(),
-      &csr_start_offset, &csr_stop_offset, stream);
-
-    raft::sparse::op::csr_row_slice_populate(
-      csr_start_offset, csr_stop_offset, indices.data(), data.data(),
-      out_indices.data(), out_data.data(), stream);
+    raft::sparse::op::csr_row_slice_indptr(params.start_row,
+                                           params.stop_row,
+                                           indptr.data(),
+                                           out_indptr.data(),
+                                           &csr_start_offset,
+                                           &csr_stop_offset,
+                                           stream);
+
+    raft::sparse::op::csr_row_slice_populate(csr_start_offset,
+                                             csr_stop_offset,
+                                             indices.data(),
+                                             data.data(),
+                                             out_indices.data(),
+                                             out_data.data(),
+                                             stream);
 
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
-  void compare() {
-    ASSERT_TRUE(devArrMatch(out_indptr.data(), out_indptr_ref.data(),
+  void compare()
+  {
+    ASSERT_TRUE(devArrMatch(out_indptr.data(),
+                            out_indptr_ref.data(),
                             params.out_indptr_ref_h.size(),
                             Compare<value_t>()));
-    ASSERT_TRUE(devArrMatch(out_indices.data(), out_indices_ref.data(),
+    ASSERT_TRUE(devArrMatch(out_indices.data(),
+                            out_indices_ref.data(),
                             params.out_indices_ref_h.size(),
                             Compare<value_t>()));
-    ASSERT_TRUE(devArrMatch(out_data.data(), out_data_ref.data(),
-                            params.out_data_ref_h.size(), Compare<value_t>()));
+    ASSERT_TRUE(devArrMatch(
+      out_data.data(), out_data_ref.data(), params.out_data_ref_h.size(), Compare<value_t>()));
   }
 
  protected:
@@ -173,8 +183,7 @@ const std::vector<CSRRowSliceInputs<int, float>> inputs_i32_f = {
 };
 typedef CSRRowSliceTest<int, float> CSRRowSliceTestF;
 TEST_P(CSRRowSliceTestF, Result) { compare(); }
-INSTANTIATE_TEST_CASE_P(CSRRowSliceTest, CSRRowSliceTestF,
-                        ::testing::ValuesIn(inputs_i32_f));
+INSTANTIATE_TEST_CASE_P(CSRRowSliceTest, CSRRowSliceTestF, ::testing::ValuesIn(inputs_i32_f));
 
 };  // end namespace sparse
 };  // end namespace raft
diff --git a/cpp/test/sparse/csr_to_dense.cu b/cpp/test/sparse/csr_to_dense.cu
index 1a206c8499..85f00cdd27 100644
--- a/cpp/test/sparse/csr_to_dense.cu
+++ b/cpp/test/sparse/csr_to_dense.cu
@@ -45,24 +45,23 @@ struct CSRToDenseInputs {
 };
 
 template <typename value_idx, typename value_t>
-::std::ostream &operator<<(::std::ostream &os,
-                           const CSRToDenseInputs<value_idx, value_t> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const CSRToDenseInputs<value_idx, value_t>& dims)
+{
   return os;
 }
 
 template <typename value_idx, typename value_t>
-class CSRToDenseTest
-  : public ::testing::TestWithParam<CSRToDenseInputs<value_idx, value_t>> {
+class CSRToDenseTest : public ::testing::TestWithParam<CSRToDenseInputs<value_idx, value_t>> {
  public:
   CSRToDenseTest()
-    : params(::testing::TestWithParam<
-             CSRToDenseInputs<value_idx, value_t>>::GetParam()),
+    : params(::testing::TestWithParam<CSRToDenseInputs<value_idx, value_t>>::GetParam()),
       stream(raft_handle.get_stream()),
       indptr(0, stream),
       indices(0, stream),
       data(0, stream),
       out_ref(0, stream),
-      out(0, stream) {
+      out(0, stream)
+  {
     indptr.resize(params.indptr_h.size(), stream);
     indices.resize(params.indices_h.size(), stream);
     data.resize(params.data_h.size(), stream);
@@ -71,10 +70,11 @@ class CSRToDenseTest
   }
 
  protected:
-  void make_data() {
-    std::vector<value_idx> indptr_h = params.indptr_h;
+  void make_data()
+  {
+    std::vector<value_idx> indptr_h  = params.indptr_h;
     std::vector<value_idx> indices_h = params.indices_h;
-    std::vector<value_t> data_h = params.data_h;
+    std::vector<value_t> data_h      = params.data_h;
 
     update_device(indptr.data(), indptr_h.data(), indptr_h.size(), stream);
     update_device(indices.data(), indices_h.data(), indices_h.size(), stream);
@@ -86,22 +86,31 @@ class CSRToDenseTest
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
-  void SetUp() override {
+  void SetUp() override
+  {
     CUSPARSE_CHECK(cusparseCreate(&handle));
 
     make_data();
 
-    convert::csr_to_dense(handle, params.nrows, params.ncols, indptr.data(),
-                          indices.data(), data.data(), params.nrows, out.data(),
-                          stream, true);
+    convert::csr_to_dense(handle,
+                          params.nrows,
+                          params.ncols,
+                          indptr.data(),
+                          indices.data(),
+                          data.data(),
+                          params.nrows,
+                          out.data(),
+                          stream,
+                          true);
 
     CUDA_CHECK(cudaStreamSynchronize(stream));
     CUSPARSE_CHECK(cusparseDestroy(handle));
   }
 
-  void compare() {
-    ASSERT_TRUE(devArrMatch(out.data(), out_ref.data(), params.out_ref_h.size(),
-                            Compare<value_t>()));
+  void compare()
+  {
+    ASSERT_TRUE(
+      devArrMatch(out.data(), out_ref.data(), params.out_ref_h.size(), Compare<value_t>()));
   }
 
  protected:
@@ -129,13 +138,26 @@ const std::vector<CSRToDenseInputs<int, float>> inputs_i32_f = {
    {0, 2, 4, 6, 8},
    {0, 1, 2, 3, 0, 1, 2, 3},  // indices
    {1.0f, 3.0f, 1.0f, 5.0f, 50.0f, 28.0f, 16.0f, 2.0f},
-   {1.0f, 3.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 5.0f, 50.0f, 28.0f, 0.0f, 0.0f,
-    0.0f, 0.0f, 16.0f, 2.0f}},
+   {1.0f,
+    3.0f,
+    0.0f,
+    0.0f,
+    0.0f,
+    0.0f,
+    1.0f,
+    5.0f,
+    50.0f,
+    28.0f,
+    0.0f,
+    0.0f,
+    0.0f,
+    0.0f,
+    16.0f,
+    2.0f}},
 };
 typedef CSRToDenseTest<int, float> CSRToDenseTestF;
 TEST_P(CSRToDenseTestF, Result) { compare(); }
-INSTANTIATE_TEST_CASE_P(CSRToDenseTest, CSRToDenseTestF,
-                        ::testing::ValuesIn(inputs_i32_f));
+INSTANTIATE_TEST_CASE_P(CSRToDenseTest, CSRToDenseTestF, ::testing::ValuesIn(inputs_i32_f));
 
 };  // end namespace sparse
 };  // end namespace raft
diff --git a/cpp/test/sparse/csr_transpose.cu b/cpp/test/sparse/csr_transpose.cu
index 8983f10d2b..3380eaa6fb 100644
--- a/cpp/test/sparse/csr_transpose.cu
+++ b/cpp/test/sparse/csr_transpose.cu
@@ -47,18 +47,16 @@ struct CSRTransposeInputs {
 };
 
 template <typename value_idx, typename value_t>
-::std::ostream &operator<<(::std::ostream &os,
-                           const CSRTransposeInputs<value_idx, value_t> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const CSRTransposeInputs<value_idx, value_t>& dims)
+{
   return os;
 }
 
 template <typename value_idx, typename value_t>
-class CSRTransposeTest
-  : public ::testing::TestWithParam<CSRTransposeInputs<value_idx, value_t>> {
+class CSRTransposeTest : public ::testing::TestWithParam<CSRTransposeInputs<value_idx, value_t>> {
  public:
   CSRTransposeTest()
-    : params(::testing::TestWithParam<
-             CSRTransposeInputs<value_idx, value_t>>::GetParam()),
+    : params(::testing::TestWithParam<CSRTransposeInputs<value_idx, value_t>>::GetParam()),
       stream(raft_handle.get_stream()),
       indptr(0, stream),
       indices(0, stream),
@@ -68,7 +66,8 @@ class CSRTransposeTest
       out_data_ref(0, stream),
       out_indptr(0, stream),
       out_indices(0, stream),
-      out_data(0, stream) {
+      out_data(0, stream)
+  {
     indptr.resize(params.indptr_h.size(), stream);
     indices.resize(params.indices_h.size(), stream);
     data.resize(params.data_h.size(), stream);
@@ -81,50 +80,60 @@ class CSRTransposeTest
   }
 
  protected:
-  void make_data() {
-    std::vector<value_idx> indptr_h = params.indptr_h;
+  void make_data()
+  {
+    std::vector<value_idx> indptr_h  = params.indptr_h;
     std::vector<value_idx> indices_h = params.indices_h;
-    std::vector<value_t> data_h = params.data_h;
+    std::vector<value_t> data_h      = params.data_h;
 
     update_device(indptr.data(), indptr_h.data(), indptr_h.size(), stream);
     update_device(indices.data(), indices_h.data(), indices_h.size(), stream);
     update_device(data.data(), data_h.data(), data_h.size(), stream);
 
-    std::vector<value_idx> out_indptr_ref_h = params.out_indptr_ref_h;
+    std::vector<value_idx> out_indptr_ref_h  = params.out_indptr_ref_h;
     std::vector<value_idx> out_indices_ref_h = params.out_indices_ref_h;
-    std::vector<value_t> out_data_ref_h = params.out_data_ref_h;
-
-    update_device(out_indptr_ref.data(), out_indptr_ref_h.data(),
-                  out_indptr_ref_h.size(), stream);
-    update_device(out_indices_ref.data(), out_indices_ref_h.data(),
-                  out_indices_ref_h.size(), stream);
-    update_device(out_data_ref.data(), out_data_ref_h.data(),
-                  out_data_ref_h.size(), stream);
+    std::vector<value_t> out_data_ref_h      = params.out_data_ref_h;
+
+    update_device(out_indptr_ref.data(), out_indptr_ref_h.data(), out_indptr_ref_h.size(), stream);
+    update_device(
+      out_indices_ref.data(), out_indices_ref_h.data(), out_indices_ref_h.size(), stream);
+    update_device(out_data_ref.data(), out_data_ref_h.data(), out_data_ref_h.size(), stream);
   }
 
-  void SetUp() override {
+  void SetUp() override
+  {
     CUSPARSE_CHECK(cusparseCreate(&handle));
 
     make_data();
 
-    raft::sparse::linalg::csr_transpose(
-      handle, indptr.data(), indices.data(), data.data(), out_indptr.data(),
-      out_indices.data(), out_data.data(), params.nrows, params.ncols,
-      params.nnz, stream);
+    raft::sparse::linalg::csr_transpose(handle,
+                                        indptr.data(),
+                                        indices.data(),
+                                        data.data(),
+                                        out_indptr.data(),
+                                        out_indices.data(),
+                                        out_data.data(),
+                                        params.nrows,
+                                        params.ncols,
+                                        params.nnz,
+                                        stream);
 
     CUDA_CHECK(cudaStreamSynchronize(stream));
     CUSPARSE_CHECK(cusparseDestroy(handle));
   }
 
-  void compare() {
-    ASSERT_TRUE(devArrMatch(out_indptr.data(), out_indptr_ref.data(),
+  void compare()
+  {
+    ASSERT_TRUE(devArrMatch(out_indptr.data(),
+                            out_indptr_ref.data(),
                             params.out_indptr_ref_h.size(),
                             Compare<value_t>()));
-    ASSERT_TRUE(devArrMatch(out_indices.data(), out_indices_ref.data(),
+    ASSERT_TRUE(devArrMatch(out_indices.data(),
+                            out_indices_ref.data(),
                             params.out_indices_ref_h.size(),
                             Compare<value_t>()));
-    ASSERT_TRUE(devArrMatch(out_data.data(), out_data_ref.data(),
-                            params.out_data_ref_h.size(), Compare<value_t>()));
+    ASSERT_TRUE(devArrMatch(
+      out_data.data(), out_data_ref.data(), params.out_data_ref_h.size(), Compare<value_t>()));
   }
 
  protected:
@@ -163,8 +172,7 @@ const std::vector<CSRTransposeInputs<int, float>> inputs_i32_f = {
 };
 typedef CSRTransposeTest<int, float> CSRTransposeTestF;
 TEST_P(CSRTransposeTestF, Result) { compare(); }
-INSTANTIATE_TEST_CASE_P(CSRTransposeTest, CSRTransposeTestF,
-                        ::testing::ValuesIn(inputs_i32_f));
+INSTANTIATE_TEST_CASE_P(CSRTransposeTest, CSRTransposeTestF, ::testing::ValuesIn(inputs_i32_f));
 
 };  // end namespace sparse
 };  // end namespace raft
diff --git a/cpp/test/sparse/degree.cu b/cpp/test/sparse/degree.cu
index fbadadb29d..8b1c7988d6 100644
--- a/cpp/test/sparse/degree.cu
+++ b/cpp/test/sparse/degree.cu
@@ -33,8 +33,7 @@ struct SparseDegreeInputs {
 };
 
 template <typename T>
-class SparseDegreeTests
-  : public ::testing::TestWithParam<SparseDegreeInputs<T>> {
+class SparseDegreeTests : public ::testing::TestWithParam<SparseDegreeInputs<T>> {
  protected:
   void SetUp() override {}
 
@@ -47,20 +46,19 @@ class SparseDegreeTests
 const std::vector<SparseDegreeInputs<float>> inputsf = {{5, 10, 5, 1234ULL}};
 
 typedef SparseDegreeTests<float> COODegree;
-TEST_P(COODegree, Result) {
+TEST_P(COODegree, Result)
+{
   cudaStream_t stream;
   cudaStreamCreate(&stream);
 
   int in_rows_h[5] = {0, 0, 1, 2, 2};
-  int verify_h[5] = {2, 1, 2, 0, 0};
+  int verify_h[5]  = {2, 1, 2, 0, 0};
 
   rmm::device_uvector<int> in_rows(5, stream);
   rmm::device_uvector<int> verify(5, stream);
   rmm::device_uvector<int> results(5, stream);
-  CUDA_CHECK(
-    cudaMemsetAsync(verify.data(), 0, verify.size() * sizeof(int), stream));
-  CUDA_CHECK(
-    cudaMemsetAsync(results.data(), 0, results.size() * sizeof(int), stream));
+  CUDA_CHECK(cudaMemsetAsync(verify.data(), 0, verify.size() * sizeof(int), stream));
+  CUDA_CHECK(cudaMemsetAsync(results.data(), 0, results.size() * sizeof(int), stream));
 
   raft::update_device(in_rows.data(), *&in_rows_h, 5, stream);
   raft::update_device(verify.data(), *&verify_h, 5, stream);
@@ -68,50 +66,43 @@ TEST_P(COODegree, Result) {
   linalg::coo_degree<32>(in_rows.data(), 5, results.data(), stream);
   cudaDeviceSynchronize();
 
-  ASSERT_TRUE(raft::devArrMatch<int>(verify.data(), results.data(), 5,
-                                     raft::Compare<int>()));
+  ASSERT_TRUE(raft::devArrMatch<int>(verify.data(), results.data(), 5, raft::Compare<int>()));
 
   CUDA_CHECK(cudaStreamDestroy(stream));
 }
 
 typedef SparseDegreeTests<float> COODegreeNonzero;
-TEST_P(COODegreeNonzero, Result) {
+TEST_P(COODegreeNonzero, Result)
+{
   cudaStream_t stream;
   cudaStreamCreate(&stream);
 
-  int in_rows_h[5] = {0, 0, 1, 2, 2};
+  int in_rows_h[5]   = {0, 0, 1, 2, 2};
   float in_vals_h[5] = {0.0, 5.0, 0.0, 1.0, 1.0};
-  int verify_h[5] = {1, 0, 2, 0, 0};
+  int verify_h[5]    = {1, 0, 2, 0, 0};
 
   rmm::device_uvector<int> in_rows(5, stream);
   rmm::device_uvector<int> verify(5, stream);
   rmm::device_uvector<int> results(5, stream);
   rmm::device_uvector<float> in_vals(5, stream);
-  CUDA_CHECK(
-    cudaMemsetAsync(verify.data(), 0, verify.size() * sizeof(int), stream));
-  CUDA_CHECK(
-    cudaMemsetAsync(results.data(), 0, results.size() * sizeof(int), stream));
-  CUDA_CHECK(
-    cudaMemsetAsync(in_vals.data(), 0, in_vals.size() * sizeof(float), stream));
+  CUDA_CHECK(cudaMemsetAsync(verify.data(), 0, verify.size() * sizeof(int), stream));
+  CUDA_CHECK(cudaMemsetAsync(results.data(), 0, results.size() * sizeof(int), stream));
+  CUDA_CHECK(cudaMemsetAsync(in_vals.data(), 0, in_vals.size() * sizeof(float), stream));
 
   raft::update_device(in_rows.data(), *&in_rows_h, 5, stream);
   raft::update_device(verify.data(), *&verify_h, 5, stream);
   raft::update_device(in_vals.data(), *&in_vals_h, 5, stream);
 
-  linalg::coo_degree_nz<32, float>(in_rows.data(), in_vals.data(), 5,
-                                   results.data(), stream);
+  linalg::coo_degree_nz<32, float>(in_rows.data(), in_vals.data(), 5, results.data(), stream);
   cudaDeviceSynchronize();
 
-  ASSERT_TRUE(raft::devArrMatch<int>(verify.data(), results.data(), 5,
-                                     raft::Compare<int>()));
+  ASSERT_TRUE(raft::devArrMatch<int>(verify.data(), results.data(), 5, raft::Compare<int>()));
 
   CUDA_CHECK(cudaStreamDestroy(stream));
 }
 
-INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegree,
-                        ::testing::ValuesIn(inputsf));
-INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegreeNonzero,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegree, ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(SparseDegreeTests, COODegreeNonzero, ::testing::ValuesIn(inputsf));
 
 }  // namespace sparse
 }  // namespace raft
diff --git a/cpp/test/sparse/dist_coo_spmv.cu b/cpp/test/sparse/dist_coo_spmv.cu
index d24199c5fc..000d58d029 100644
--- a/cpp/test/sparse/dist_coo_spmv.cu
+++ b/cpp/test/sparse/dist_coo_spmv.cu
@@ -55,28 +55,26 @@ struct InputConfiguration {
 };
 
 using dense_smem_strategy_t = detail::dense_smem_strategy<int, float, 1024>;
-using hash_strategy_t = detail::hash_strategy<int, float, 1024>;
+using hash_strategy_t       = detail::hash_strategy<int, float, 1024>;
 
 template <typename value_idx, typename value_t, typename strategy_t>
 struct SparseDistanceCOOSPMVInputs {
   InputConfiguration<value_idx, value_t> input_configuration;
 
   float capacity_threshold = 0.5;
-  int map_size =
-    detail::hash_strategy<value_idx, value_t, 1024>::get_map_size();
+  int map_size             = detail::hash_strategy<value_idx, value_t, 1024>::get_map_size();
 };
 
 template <typename value_idx, typename value_t, typename strategy_t>
-::std::ostream &operator<<(
-  ::std::ostream &os,
-  const SparseDistanceCOOSPMVInputs<value_idx, value_t, strategy_t> &dims) {
+::std::ostream& operator<<(::std::ostream& os,
+                           const SparseDistanceCOOSPMVInputs<value_idx, value_t, strategy_t>& dims)
+{
   return os;
 }
 
 template <typename value_idx, typename value_t, typename strategy_t>
 class SparseDistanceCOOSPMVTest
-  : public ::testing::TestWithParam<
-      SparseDistanceCOOSPMVInputs<value_idx, value_t, strategy_t>> {
+  : public ::testing::TestWithParam<SparseDistanceCOOSPMVInputs<value_idx, value_t, strategy_t>> {
  public:
   SparseDistanceCOOSPMVTest()
     : dist_config(handle),
@@ -84,62 +82,74 @@ class SparseDistanceCOOSPMVTest
       indices(0, handle.get_stream()),
       data(0, handle.get_stream()),
       out_dists(0, handle.get_stream()),
-      out_dists_ref(0, handle.get_stream()) {}
+      out_dists_ref(0, handle.get_stream())
+  {
+  }
 
-  template <typename U,
-            std::enable_if_t<std::is_same_v<U, hash_strategy_t>> * = nullptr>
-  U make_strategy() {
+  template <typename U, std::enable_if_t<std::is_same_v<U, hash_strategy_t>>* = nullptr>
+  U make_strategy()
+  {
     return strategy_t(dist_config, params.capacity_threshold, params.map_size);
   }
 
-  template <typename U, std::enable_if_t<
-                          std::is_same_v<U, dense_smem_strategy_t>> * = nullptr>
-  U make_strategy() {
+  template <typename U, std::enable_if_t<std::is_same_v<U, dense_smem_strategy_t>>* = nullptr>
+  U make_strategy()
+  {
     return strategy_t(dist_config);
   }
 
   template <typename reduce_f, typename accum_f, typename write_f>
-  void compute_dist(reduce_f reduce_func, accum_f accum_func,
-                    write_f write_func, bool rev = true) {
-    rmm::device_uvector<value_idx> coo_rows(
-      max(dist_config.b_nnz, dist_config.a_nnz),
-      dist_config.handle.get_stream());
-
-    raft::sparse::convert::csr_to_coo(dist_config.b_indptr, dist_config.b_nrows,
-                                      coo_rows.data(), dist_config.b_nnz,
+  void compute_dist(reduce_f reduce_func, accum_f accum_func, write_f write_func, bool rev = true)
+  {
+    rmm::device_uvector<value_idx> coo_rows(max(dist_config.b_nnz, dist_config.a_nnz),
+                                            dist_config.handle.get_stream());
+
+    raft::sparse::convert::csr_to_coo(dist_config.b_indptr,
+                                      dist_config.b_nrows,
+                                      coo_rows.data(),
+                                      dist_config.b_nnz,
                                       dist_config.handle.get_stream());
 
     strategy_t selected_strategy = make_strategy<strategy_t>();
-    detail::balanced_coo_pairwise_generalized_spmv<value_idx, value_t>(
-      out_dists.data(), dist_config, coo_rows.data(), reduce_func, accum_func,
-      write_func, selected_strategy);
+    detail::balanced_coo_pairwise_generalized_spmv<value_idx, value_t>(out_dists.data(),
+                                                                       dist_config,
+                                                                       coo_rows.data(),
+                                                                       reduce_func,
+                                                                       accum_func,
+                                                                       write_func,
+                                                                       selected_strategy);
 
     if (rev) {
-      raft::sparse::convert::csr_to_coo(
-        dist_config.a_indptr, dist_config.a_nrows, coo_rows.data(),
-        dist_config.a_nnz, dist_config.handle.get_stream());
-
-      detail::balanced_coo_pairwise_generalized_spmv_rev<value_idx, value_t>(
-        out_dists.data(), dist_config, coo_rows.data(), reduce_func, accum_func,
-        write_func, selected_strategy);
+      raft::sparse::convert::csr_to_coo(dist_config.a_indptr,
+                                        dist_config.a_nrows,
+                                        coo_rows.data(),
+                                        dist_config.a_nnz,
+                                        dist_config.handle.get_stream());
+
+      detail::balanced_coo_pairwise_generalized_spmv_rev<value_idx, value_t>(out_dists.data(),
+                                                                             dist_config,
+                                                                             coo_rows.data(),
+                                                                             reduce_func,
+                                                                             accum_func,
+                                                                             write_func,
+                                                                             selected_strategy);
     }
   }
 
-  void run_spmv() {
+  void run_spmv()
+  {
     switch (params.input_configuration.metric) {
       case raft::distance::DistanceType::InnerProduct:
-        compute_dist(detail::Product(), detail::Sum(), detail::AtomicAdd(),
-                     true);
+        compute_dist(detail::Product(), detail::Sum(), detail::AtomicAdd(), true);
         break;
       case raft::distance::DistanceType::L2Unexpanded:
         compute_dist(detail::SqDiff(), detail::Sum(), detail::AtomicAdd());
         break;
       case raft::distance::DistanceType::Canberra:
         compute_dist(
-          [] __device__(value_t a, value_t b) {
-            return fabsf(a - b) / (fabsf(a) + fabsf(b));
-          },
-          detail::Sum(), detail::AtomicAdd());
+          [] __device__(value_t a, value_t b) { return fabsf(a - b) / (fabsf(a) + fabsf(b)); },
+          detail::Sum(),
+          detail::AtomicAdd());
         break;
       case raft::distance::DistanceType::L1:
         compute_dist(detail::AbsDiff(), detail::Sum(), detail::AtomicAdd());
@@ -148,26 +158,27 @@ class SparseDistanceCOOSPMVTest
         compute_dist(detail::AbsDiff(), detail::Max(), detail::AtomicMax());
         break;
       case raft::distance::DistanceType::LpUnexpanded: {
-        compute_dist(detail::PDiff(params.input_configuration.metric_arg),
-                     detail::Sum(), detail::AtomicAdd());
+        compute_dist(
+          detail::PDiff(params.input_configuration.metric_arg), detail::Sum(), detail::AtomicAdd());
         float p = 1.0f / params.input_configuration.metric_arg;
         raft::linalg::unaryOp<value_t>(
-          out_dists.data(), out_dists.data(),
+          out_dists.data(),
+          out_dists.data(),
           dist_config.a_nrows * dist_config.b_nrows,
           [=] __device__(value_t input) { return powf(input, p); },
           dist_config.handle.get_stream());
 
       } break;
-      default:
-        throw raft::exception("Unknown distance");
+      default: throw raft::exception("Unknown distance");
     }
   }
 
  protected:
-  void make_data() {
-    std::vector<value_idx> indptr_h = params.input_configuration.indptr_h;
+  void make_data()
+  {
+    std::vector<value_idx> indptr_h  = params.input_configuration.indptr_h;
     std::vector<value_idx> indices_h = params.input_configuration.indices_h;
-    std::vector<value_t> data_h = params.input_configuration.data_h;
+    std::vector<value_t> data_h      = params.input_configuration.data_h;
 
     auto stream = handle.get_stream();
     indptr.resize(indptr_h.size(), stream);
@@ -178,33 +189,32 @@ class SparseDistanceCOOSPMVTest
     update_device(indices.data(), indices_h.data(), indices_h.size(), stream);
     update_device(data.data(), data_h.data(), data_h.size(), stream);
 
-    std::vector<value_t> out_dists_ref_h =
-      params.input_configuration.out_dists_ref_h;
+    std::vector<value_t> out_dists_ref_h = params.input_configuration.out_dists_ref_h;
 
     out_dists_ref.resize((indptr_h.size() - 1) * (indptr_h.size() - 1), stream);
 
-    update_device(out_dists_ref.data(), out_dists_ref_h.data(),
-                  out_dists_ref_h.size(), stream);
+    update_device(out_dists_ref.data(), out_dists_ref_h.data(), out_dists_ref_h.size(), stream);
   }
 
-  void SetUp() override {
+  void SetUp() override
+  {
     params = ::testing::TestWithParam<
       SparseDistanceCOOSPMVInputs<value_idx, value_t, strategy_t>>::GetParam();
 
     make_data();
 
-    dist_config.b_nrows = params.input_configuration.indptr_h.size() - 1;
-    dist_config.b_ncols = params.input_configuration.n_cols;
-    dist_config.b_nnz = params.input_configuration.indices_h.size();
-    dist_config.b_indptr = indptr.data();
+    dist_config.b_nrows   = params.input_configuration.indptr_h.size() - 1;
+    dist_config.b_ncols   = params.input_configuration.n_cols;
+    dist_config.b_nnz     = params.input_configuration.indices_h.size();
+    dist_config.b_indptr  = indptr.data();
     dist_config.b_indices = indices.data();
-    dist_config.b_data = data.data();
-    dist_config.a_nrows = params.input_configuration.indptr_h.size() - 1;
-    dist_config.a_ncols = params.input_configuration.n_cols;
-    dist_config.a_nnz = params.input_configuration.indices_h.size();
-    dist_config.a_indptr = indptr.data();
+    dist_config.b_data    = data.data();
+    dist_config.a_nrows   = params.input_configuration.indptr_h.size() - 1;
+    dist_config.a_ncols   = params.input_configuration.n_cols;
+    dist_config.a_nnz     = params.input_configuration.indices_h.size();
+    dist_config.a_indptr  = indptr.data();
     dist_config.a_indices = indices.data();
-    dist_config.a_data = data.data();
+    dist_config.a_data    = data.data();
 
     int out_size = dist_config.a_nrows * dist_config.b_nrows;
 
@@ -215,8 +225,10 @@ class SparseDistanceCOOSPMVTest
     CUDA_CHECK(cudaStreamSynchronize(handle.get_stream()));
   }
 
-  void compare() {
-    ASSERT_TRUE(devArrMatch(out_dists_ref.data(), out_dists.data(),
+  void compare()
+  {
+    ASSERT_TRUE(devArrMatch(out_dists_ref.data(),
+                            out_dists.data(),
                             params.input_configuration.out_dists_ref_h.size(),
                             CompareApprox<value_t>(1e-3)));
   }
@@ -241,8 +253,7 @@ const InputConfiguration<int, float> input_inner_product = {
   {0, 2, 4, 6, 8},
   {0, 1, 0, 1, 0, 1, 0, 1},
   {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f},
-  {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0,
-   5.0},
+  {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0},
   raft::distance::DistanceType::InnerProduct,
   0.0};
 
@@ -273,384 +284,379 @@ const InputConfiguration<int, float> input_l2_unexpanded = {
   raft::distance::DistanceType::L2Unexpanded,
   0.0};
 
-const InputConfiguration<int, float> input_canberra =
-  {10,
-   {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
-   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
-    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
-    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
-   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
-    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
-    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
-    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
-    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
-    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
-   {0.0,
-    3.3954660629919076,
-    5.6469232737388815,
-    6.373112846266441,
-    4.0212880272531715,
-    6.916281504639404,
-    5.741508386786526,
-    5.411470999663036,
-    9.0,
-    4.977014354725805,
-    3.3954660629919076,
-    0.0,
-    7.56256082439209,
-    5.540261147481582,
-    4.832322929216881,
-    4.62003193872216,
-    6.498056792320361,
-    4.309846252268695,
-    6.317531174829905,
-    6.016362684141827,
-    5.6469232737388815,
-    7.56256082439209,
-    0.0,
-    5.974878731322299,
-    4.898357301336036,
-    6.442097410320605,
-    5.227077347287883,
-    7.134101195584642,
-    5.457753923371659,
-    7.0,
-    6.373112846266441,
-    5.540261147481582,
-    5.974878731322299,
-    0.0,
-    5.5507273748583,
-    4.897749658726415,
-    9.0,
-    8.398776718824767,
-    3.908281400328807,
-    4.83431066343688,
-    4.0212880272531715,
-    4.832322929216881,
-    4.898357301336036,
-    5.5507273748583,
-    0.0,
-    6.632989819428174,
-    7.438852294822894,
-    5.6631570310967465,
-    7.579428202635459,
-    6.760811985364303,
-    6.916281504639404,
-    4.62003193872216,
-    6.442097410320605,
-    4.897749658726415,
-    6.632989819428174,
-    0.0,
-    5.249404187382862,
-    6.072559523278559,
-    4.07661278488929,
-    6.19678948003145,
-    5.741508386786526,
-    6.498056792320361,
-    5.227077347287883,
-    9.0,
-    7.438852294822894,
-    5.249404187382862,
-    0.0,
-    3.854811639654704,
-    6.652724827169063,
-    5.298236851430971,
-    5.411470999663036,
-    4.309846252268695,
-    7.134101195584642,
-    8.398776718824767,
-    5.6631570310967465,
-    6.072559523278559,
-    3.854811639654704,
-    0.0,
-    7.529184598969917,
-    6.903282911791188,
-    9.0,
-    6.317531174829905,
-    5.457753923371659,
-    3.908281400328807,
-    7.579428202635459,
-    4.07661278488929,
-    6.652724827169063,
-    7.529184598969917,
-    0.0,
-    7.0,
-    4.977014354725805,
-    6.016362684141827,
-    7.0,
-    4.83431066343688,
-    6.760811985364303,
-    6.19678948003145,
-    5.298236851430971,
-    6.903282911791188,
-    7.0,
-    0.0},
-   raft::distance::DistanceType::Canberra,
-   0.0};
-
-const InputConfiguration<int, float> input_lp_unexpanded =
-  {10,
-   {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
-   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
-    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
-    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
-   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
-    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
-    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
-    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
-    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
-    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
-   {0.0,
-    1.31462855332296,
-    1.3690307816129905,
-    1.698603990921237,
-    1.3460470789553531,
-    1.6636670712582544,
-    1.2651744044972217,
-    1.1938329352055201,
-    1.8811409082590185,
-    1.3653115050624267,
-    1.31462855332296,
-    0.0,
-    1.9447722703291133,
-    1.42818777206562,
-    1.4685491458946494,
-    1.3071999866010466,
-    1.4988622861692171,
-    0.9698559287406783,
-    1.4972023224597841,
-    1.5243383567266802,
-    1.3690307816129905,
-    1.9447722703291133,
-    0.0,
-    1.2748400840107568,
-    1.0599569946448246,
-    1.546591282841402,
-    1.147526531928459,
-    1.447002179128145,
-    1.5982242387673176,
-    1.3112533607072414,
-    1.698603990921237,
-    1.42818777206562,
-    1.2748400840107568,
-    0.0,
-    1.038121552545461,
-    1.011788365364402,
-    1.3907391109256988,
-    1.3128200942311496,
-    1.19595706584447,
-    1.3233328139624725,
-    1.3460470789553531,
-    1.4685491458946494,
-    1.0599569946448246,
-    1.038121552545461,
-    0.0,
-    1.3642741698145529,
-    1.3493868683808095,
-    1.394942694628328,
-    1.572881849642552,
-    1.380122665319464,
-    1.6636670712582544,
-    1.3071999866010466,
-    1.546591282841402,
-    1.011788365364402,
-    1.3642741698145529,
-    0.0,
-    1.018961640373018,
-    1.0114394258945634,
-    0.8338711034820684,
-    1.1247823842299223,
-    1.2651744044972217,
-    1.4988622861692171,
-    1.147526531928459,
-    1.3907391109256988,
-    1.3493868683808095,
-    1.018961640373018,
-    0.0,
-    0.7701238110357329,
-    1.245486437864406,
-    0.5551259549534626,
-    1.1938329352055201,
-    0.9698559287406783,
-    1.447002179128145,
-    1.3128200942311496,
-    1.394942694628328,
-    1.0114394258945634,
-    0.7701238110357329,
-    0.0,
-    1.1886800117391216,
-    1.0083692448135637,
-    1.8811409082590185,
-    1.4972023224597841,
-    1.5982242387673176,
-    1.19595706584447,
-    1.572881849642552,
-    0.8338711034820684,
-    1.245486437864406,
-    1.1886800117391216,
-    0.0,
-    1.3661374102525012,
-    1.3653115050624267,
-    1.5243383567266802,
-    1.3112533607072414,
-    1.3233328139624725,
-    1.380122665319464,
-    1.1247823842299223,
-    0.5551259549534626,
-    1.0083692448135637,
-    1.3661374102525012,
-    0.0},
-   raft::distance::DistanceType::LpUnexpanded,
-   2.0};
-
-const InputConfiguration<int, float> input_linf =
-  {10,
-   {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
-   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
-    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
-    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
-   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
-    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
-    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
-    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
-    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
-    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
-   {0.0,
-    0.9251771844789913,
-    0.9036452083899731,
-    0.9251771844789913,
-    0.8706483735804971,
-    0.9251771844789913,
-    0.717493881903289,
-    0.6920214832303888,
-    0.9251771844789913,
-    0.9251771844789913,
-    0.9251771844789913,
-    0.0,
-    0.9036452083899731,
-    0.8655339692155823,
-    0.8706483735804971,
-    0.8655339692155823,
-    0.8655339692155823,
-    0.6329837991017668,
-    0.8655339692155823,
-    0.8655339692155823,
-    0.9036452083899731,
-    0.9036452083899731,
-    0.0,
-    0.7988276152181608,
-    0.7028075145996631,
-    0.9036452083899731,
-    0.9036452083899731,
-    0.9036452083899731,
-    0.8429599432532096,
-    0.9036452083899731,
-    0.9251771844789913,
-    0.8655339692155823,
-    0.7988276152181608,
-    0.0,
-    0.48376552205293305,
-    0.8206394616536681,
-    0.8206394616536681,
-    0.8206394616536681,
-    0.8429599432532096,
-    0.8206394616536681,
-    0.8706483735804971,
-    0.8706483735804971,
-    0.7028075145996631,
-    0.48376552205293305,
-    0.0,
-    0.8706483735804971,
-    0.8706483735804971,
-    0.8706483735804971,
-    0.8429599432532096,
-    0.8706483735804971,
-    0.9251771844789913,
-    0.8655339692155823,
-    0.9036452083899731,
-    0.8206394616536681,
-    0.8706483735804971,
-    0.0,
-    0.8853924473642432,
-    0.535821510936138,
-    0.6497196601457607,
-    0.8853924473642432,
-    0.717493881903289,
-    0.8655339692155823,
-    0.9036452083899731,
-    0.8206394616536681,
-    0.8706483735804971,
-    0.8853924473642432,
-    0.0,
-    0.5279604218147174,
-    0.6658348373853169,
-    0.33799874888632914,
-    0.6920214832303888,
-    0.6329837991017668,
-    0.9036452083899731,
-    0.8206394616536681,
-    0.8706483735804971,
-    0.535821510936138,
-    0.5279604218147174,
-    0.0,
-    0.662579808115858,
-    0.5079750812968089,
-    0.9251771844789913,
-    0.8655339692155823,
-    0.8429599432532096,
-    0.8429599432532096,
-    0.8429599432532096,
-    0.6497196601457607,
-    0.6658348373853169,
-    0.662579808115858,
-    0.0,
-    0.8429599432532096,
-    0.9251771844789913,
-    0.8655339692155823,
-    0.9036452083899731,
-    0.8206394616536681,
-    0.8706483735804971,
-    0.8853924473642432,
-    0.33799874888632914,
-    0.5079750812968089,
-    0.8429599432532096,
-    0.0},
-   raft::distance::DistanceType::Linf,
-   0.0};
-
-const InputConfiguration<int, float> input_l1 = {
-  4,
-  {0, 1, 1, 2, 4},
-  {3, 2, 0, 1},  // indices
-  {0.99296, 0.42180, 0.11687, 0.305869},
-  {
-    // dense output
-    0.0,
-    0.99296,
-    1.41476,
-    1.415707,
-    0.99296,
-    0.0,
-    0.42180,
-    0.42274,
-    1.41476,
-    0.42180,
-    0.0,
-    0.84454,
-    1.41570,
-    0.42274,
-    0.84454,
-    0.0,
-  },
-  raft::distance::DistanceType::L1,
+const InputConfiguration<int, float> input_canberra = {
+  10,
+  {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
+  {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4,
+   6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+  {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167,
+   0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625,
+   0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853,
+   0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,  0.5279, 0.4885, 0.3495, 0.5079,
+   0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+  {0.0,
+   3.3954660629919076,
+   5.6469232737388815,
+   6.373112846266441,
+   4.0212880272531715,
+   6.916281504639404,
+   5.741508386786526,
+   5.411470999663036,
+   9.0,
+   4.977014354725805,
+   3.3954660629919076,
+   0.0,
+   7.56256082439209,
+   5.540261147481582,
+   4.832322929216881,
+   4.62003193872216,
+   6.498056792320361,
+   4.309846252268695,
+   6.317531174829905,
+   6.016362684141827,
+   5.6469232737388815,
+   7.56256082439209,
+   0.0,
+   5.974878731322299,
+   4.898357301336036,
+   6.442097410320605,
+   5.227077347287883,
+   7.134101195584642,
+   5.457753923371659,
+   7.0,
+   6.373112846266441,
+   5.540261147481582,
+   5.974878731322299,
+   0.0,
+   5.5507273748583,
+   4.897749658726415,
+   9.0,
+   8.398776718824767,
+   3.908281400328807,
+   4.83431066343688,
+   4.0212880272531715,
+   4.832322929216881,
+   4.898357301336036,
+   5.5507273748583,
+   0.0,
+   6.632989819428174,
+   7.438852294822894,
+   5.6631570310967465,
+   7.579428202635459,
+   6.760811985364303,
+   6.916281504639404,
+   4.62003193872216,
+   6.442097410320605,
+   4.897749658726415,
+   6.632989819428174,
+   0.0,
+   5.249404187382862,
+   6.072559523278559,
+   4.07661278488929,
+   6.19678948003145,
+   5.741508386786526,
+   6.498056792320361,
+   5.227077347287883,
+   9.0,
+   7.438852294822894,
+   5.249404187382862,
+   0.0,
+   3.854811639654704,
+   6.652724827169063,
+   5.298236851430971,
+   5.411470999663036,
+   4.309846252268695,
+   7.134101195584642,
+   8.398776718824767,
+   5.6631570310967465,
+   6.072559523278559,
+   3.854811639654704,
+   0.0,
+   7.529184598969917,
+   6.903282911791188,
+   9.0,
+   6.317531174829905,
+   5.457753923371659,
+   3.908281400328807,
+   7.579428202635459,
+   4.07661278488929,
+   6.652724827169063,
+   7.529184598969917,
+   0.0,
+   7.0,
+   4.977014354725805,
+   6.016362684141827,
+   7.0,
+   4.83431066343688,
+   6.760811985364303,
+   6.19678948003145,
+   5.298236851430971,
+   6.903282911791188,
+   7.0,
+   0.0},
+  raft::distance::DistanceType::Canberra,
   0.0};
 
+const InputConfiguration<int, float> input_lp_unexpanded = {
+  10,
+  {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
+  {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4,
+   6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+  {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167,
+   0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625,
+   0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853,
+   0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,  0.5279, 0.4885, 0.3495, 0.5079,
+   0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+  {0.0,
+   1.31462855332296,
+   1.3690307816129905,
+   1.698603990921237,
+   1.3460470789553531,
+   1.6636670712582544,
+   1.2651744044972217,
+   1.1938329352055201,
+   1.8811409082590185,
+   1.3653115050624267,
+   1.31462855332296,
+   0.0,
+   1.9447722703291133,
+   1.42818777206562,
+   1.4685491458946494,
+   1.3071999866010466,
+   1.4988622861692171,
+   0.9698559287406783,
+   1.4972023224597841,
+   1.5243383567266802,
+   1.3690307816129905,
+   1.9447722703291133,
+   0.0,
+   1.2748400840107568,
+   1.0599569946448246,
+   1.546591282841402,
+   1.147526531928459,
+   1.447002179128145,
+   1.5982242387673176,
+   1.3112533607072414,
+   1.698603990921237,
+   1.42818777206562,
+   1.2748400840107568,
+   0.0,
+   1.038121552545461,
+   1.011788365364402,
+   1.3907391109256988,
+   1.3128200942311496,
+   1.19595706584447,
+   1.3233328139624725,
+   1.3460470789553531,
+   1.4685491458946494,
+   1.0599569946448246,
+   1.038121552545461,
+   0.0,
+   1.3642741698145529,
+   1.3493868683808095,
+   1.394942694628328,
+   1.572881849642552,
+   1.380122665319464,
+   1.6636670712582544,
+   1.3071999866010466,
+   1.546591282841402,
+   1.011788365364402,
+   1.3642741698145529,
+   0.0,
+   1.018961640373018,
+   1.0114394258945634,
+   0.8338711034820684,
+   1.1247823842299223,
+   1.2651744044972217,
+   1.4988622861692171,
+   1.147526531928459,
+   1.3907391109256988,
+   1.3493868683808095,
+   1.018961640373018,
+   0.0,
+   0.7701238110357329,
+   1.245486437864406,
+   0.5551259549534626,
+   1.1938329352055201,
+   0.9698559287406783,
+   1.447002179128145,
+   1.3128200942311496,
+   1.394942694628328,
+   1.0114394258945634,
+   0.7701238110357329,
+   0.0,
+   1.1886800117391216,
+   1.0083692448135637,
+   1.8811409082590185,
+   1.4972023224597841,
+   1.5982242387673176,
+   1.19595706584447,
+   1.572881849642552,
+   0.8338711034820684,
+   1.245486437864406,
+   1.1886800117391216,
+   0.0,
+   1.3661374102525012,
+   1.3653115050624267,
+   1.5243383567266802,
+   1.3112533607072414,
+   1.3233328139624725,
+   1.380122665319464,
+   1.1247823842299223,
+   0.5551259549534626,
+   1.0083692448135637,
+   1.3661374102525012,
+   0.0},
+  raft::distance::DistanceType::LpUnexpanded,
+  2.0};
+
+const InputConfiguration<int, float> input_linf = {
+  10,
+  {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
+  {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4,
+   6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+  {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167,
+   0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625,
+   0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853,
+   0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,  0.5279, 0.4885, 0.3495, 0.5079,
+   0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+  {0.0,
+   0.9251771844789913,
+   0.9036452083899731,
+   0.9251771844789913,
+   0.8706483735804971,
+   0.9251771844789913,
+   0.717493881903289,
+   0.6920214832303888,
+   0.9251771844789913,
+   0.9251771844789913,
+   0.9251771844789913,
+   0.0,
+   0.9036452083899731,
+   0.8655339692155823,
+   0.8706483735804971,
+   0.8655339692155823,
+   0.8655339692155823,
+   0.6329837991017668,
+   0.8655339692155823,
+   0.8655339692155823,
+   0.9036452083899731,
+   0.9036452083899731,
+   0.0,
+   0.7988276152181608,
+   0.7028075145996631,
+   0.9036452083899731,
+   0.9036452083899731,
+   0.9036452083899731,
+   0.8429599432532096,
+   0.9036452083899731,
+   0.9251771844789913,
+   0.8655339692155823,
+   0.7988276152181608,
+   0.0,
+   0.48376552205293305,
+   0.8206394616536681,
+   0.8206394616536681,
+   0.8206394616536681,
+   0.8429599432532096,
+   0.8206394616536681,
+   0.8706483735804971,
+   0.8706483735804971,
+   0.7028075145996631,
+   0.48376552205293305,
+   0.0,
+   0.8706483735804971,
+   0.8706483735804971,
+   0.8706483735804971,
+   0.8429599432532096,
+   0.8706483735804971,
+   0.9251771844789913,
+   0.8655339692155823,
+   0.9036452083899731,
+   0.8206394616536681,
+   0.8706483735804971,
+   0.0,
+   0.8853924473642432,
+   0.535821510936138,
+   0.6497196601457607,
+   0.8853924473642432,
+   0.717493881903289,
+   0.8655339692155823,
+   0.9036452083899731,
+   0.8206394616536681,
+   0.8706483735804971,
+   0.8853924473642432,
+   0.0,
+   0.5279604218147174,
+   0.6658348373853169,
+   0.33799874888632914,
+   0.6920214832303888,
+   0.6329837991017668,
+   0.9036452083899731,
+   0.8206394616536681,
+   0.8706483735804971,
+   0.535821510936138,
+   0.5279604218147174,
+   0.0,
+   0.662579808115858,
+   0.5079750812968089,
+   0.9251771844789913,
+   0.8655339692155823,
+   0.8429599432532096,
+   0.8429599432532096,
+   0.8429599432532096,
+   0.6497196601457607,
+   0.6658348373853169,
+   0.662579808115858,
+   0.0,
+   0.8429599432532096,
+   0.9251771844789913,
+   0.8655339692155823,
+   0.9036452083899731,
+   0.8206394616536681,
+   0.8706483735804971,
+   0.8853924473642432,
+   0.33799874888632914,
+   0.5079750812968089,
+   0.8429599432532096,
+   0.0},
+  raft::distance::DistanceType::Linf,
+  0.0};
+
+const InputConfiguration<int, float> input_l1 = {4,
+                                                 {0, 1, 1, 2, 4},
+                                                 {3, 2, 0, 1},  // indices
+                                                 {0.99296, 0.42180, 0.11687, 0.305869},
+                                                 {
+                                                   // dense output
+                                                   0.0,
+                                                   0.99296,
+                                                   1.41476,
+                                                   1.415707,
+                                                   0.99296,
+                                                   0.0,
+                                                   0.42180,
+                                                   0.42274,
+                                                   1.41476,
+                                                   0.42180,
+                                                   0.0,
+                                                   0.84454,
+                                                   1.41570,
+                                                   0.42274,
+                                                   0.84454,
+                                                   0.0,
+                                                 },
+                                                 raft::distance::DistanceType::L1,
+                                                 0.0};
+
 // test dense smem strategy
-const std::vector<
-  SparseDistanceCOOSPMVInputs<int, float, dense_smem_strategy_t>>
-  inputs_dense_strategy = {{input_inner_product}, {input_l2_unexpanded},
-                           {input_canberra},      {input_lp_unexpanded},
-                           {input_linf},          {input_l1}};
+const std::vector<SparseDistanceCOOSPMVInputs<int, float, dense_smem_strategy_t>>
+  inputs_dense_strategy = {{input_inner_product},
+                           {input_l2_unexpanded},
+                           {input_canberra},
+                           {input_lp_unexpanded},
+                           {input_linf},
+                           {input_l1}};
 
 typedef SparseDistanceCOOSPMVTest<int, float, dense_smem_strategy_t>
   SparseDistanceCOOSPMVTestDenseStrategyF;
@@ -660,22 +666,22 @@ INSTANTIATE_TEST_CASE_P(SparseDistanceCOOSPMVTests,
                         ::testing::ValuesIn(inputs_dense_strategy));
 
 // test hash and chunk strategy
-const std::vector<SparseDistanceCOOSPMVInputs<int, float, hash_strategy_t>>
-  inputs_hash_strategy = {{input_inner_product},
-                          {input_inner_product, 0.5, 2},
-                          {input_l2_unexpanded},
-                          {input_l2_unexpanded, 0.5, 2},
-                          {input_canberra},
-                          {input_canberra, 0.5, 2},
-                          {input_canberra, 0.5, 6},
-                          {input_lp_unexpanded},
-                          {input_lp_unexpanded, 0.5, 2},
-                          {input_lp_unexpanded, 0.5, 6},
-                          {input_linf},
-                          {input_linf, 0.5, 2},
-                          {input_linf, 0.5, 6},
-                          {input_l1},
-                          {input_l1, 0.5, 2}};
+const std::vector<SparseDistanceCOOSPMVInputs<int, float, hash_strategy_t>> inputs_hash_strategy = {
+  {input_inner_product},
+  {input_inner_product, 0.5, 2},
+  {input_l2_unexpanded},
+  {input_l2_unexpanded, 0.5, 2},
+  {input_canberra},
+  {input_canberra, 0.5, 2},
+  {input_canberra, 0.5, 6},
+  {input_lp_unexpanded},
+  {input_lp_unexpanded, 0.5, 2},
+  {input_lp_unexpanded, 0.5, 6},
+  {input_linf},
+  {input_linf, 0.5, 2},
+  {input_linf, 0.5, 6},
+  {input_l1},
+  {input_l1, 0.5, 2}};
 
 typedef SparseDistanceCOOSPMVTest<int, float, hash_strategy_t>
   SparseDistanceCOOSPMVTestHashStrategyF;
diff --git a/cpp/test/sparse/distance.cu b/cpp/test/sparse/distance.cu
index 3bc562bb68..8538c9cf39 100644
--- a/cpp/test/sparse/distance.cu
+++ b/cpp/test/sparse/distance.cu
@@ -49,8 +49,8 @@ struct SparseDistanceInputs {
 };
 
 template <typename value_idx, typename value_t>
-::std::ostream &operator<<(
-  ::std::ostream &os, const SparseDistanceInputs<value_idx, value_t> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const SparseDistanceInputs<value_idx, value_t>& dims)
+{
   return os;
 }
 
@@ -59,52 +59,56 @@ class SparseDistanceTest
   : public ::testing::TestWithParam<SparseDistanceInputs<value_idx, value_t>> {
  public:
   SparseDistanceTest()
-    : params(::testing::TestWithParam<
-             SparseDistanceInputs<value_idx, value_t>>::GetParam()),
+    : params(::testing::TestWithParam<SparseDistanceInputs<value_idx, value_t>>::GetParam()),
       dist_config(handle),
       indptr(0, handle.get_stream()),
       indices(0, handle.get_stream()),
       data(0, handle.get_stream()),
       out_dists(0, handle.get_stream()),
-      out_dists_ref(0, handle.get_stream()) {}
+      out_dists_ref(0, handle.get_stream())
+  {
+  }
 
-  void SetUp() override {
+  void SetUp() override
+  {
     make_data();
 
-    dist_config.b_nrows = params.indptr_h.size() - 1;
-    dist_config.b_ncols = params.n_cols;
-    dist_config.b_nnz = params.indices_h.size();
-    dist_config.b_indptr = indptr.data();
+    dist_config.b_nrows   = params.indptr_h.size() - 1;
+    dist_config.b_ncols   = params.n_cols;
+    dist_config.b_nnz     = params.indices_h.size();
+    dist_config.b_indptr  = indptr.data();
     dist_config.b_indices = indices.data();
-    dist_config.b_data = data.data();
-    dist_config.a_nrows = params.indptr_h.size() - 1;
-    dist_config.a_ncols = params.n_cols;
-    dist_config.a_nnz = params.indices_h.size();
-    dist_config.a_indptr = indptr.data();
+    dist_config.b_data    = data.data();
+    dist_config.a_nrows   = params.indptr_h.size() - 1;
+    dist_config.a_ncols   = params.n_cols;
+    dist_config.a_nnz     = params.indices_h.size();
+    dist_config.a_indptr  = indptr.data();
     dist_config.a_indices = indices.data();
-    dist_config.a_data = data.data();
+    dist_config.a_data    = data.data();
 
     int out_size = dist_config.a_nrows * dist_config.b_nrows;
 
     out_dists.resize(out_size, handle.get_stream());
 
-    pairwiseDistance(out_dists.data(), dist_config, params.metric,
-                     params.metric_arg);
+    pairwiseDistance(out_dists.data(), dist_config, params.metric, params.metric_arg);
 
     CUDA_CHECK(cudaStreamSynchronize(handle.get_stream()));
   }
 
-  void compare() {
-    ASSERT_TRUE(devArrMatch(out_dists_ref.data(), out_dists.data(),
+  void compare()
+  {
+    ASSERT_TRUE(devArrMatch(out_dists_ref.data(),
+                            out_dists.data(),
                             params.out_dists_ref_h.size(),
                             CompareApprox<value_t>(1e-3)));
   }
 
  protected:
-  void make_data() {
-    std::vector<value_idx> indptr_h = params.indptr_h;
+  void make_data()
+  {
+    std::vector<value_idx> indptr_h  = params.indptr_h;
     std::vector<value_idx> indices_h = params.indices_h;
-    std::vector<value_t> data_h = params.data_h;
+    std::vector<value_t> data_h      = params.data_h;
 
     auto stream = handle.get_stream();
     indptr.resize(indptr_h.size(), stream);
@@ -119,8 +123,10 @@ class SparseDistanceTest
 
     out_dists_ref.resize((indptr_h.size() - 1) * (indptr_h.size() - 1), stream);
 
-    update_device(out_dists_ref.data(), out_dists_ref_h.data(),
-                  out_dists_ref_h.size(), dist_config.handle.get_stream());
+    update_device(out_dists_ref.data(),
+                  out_dists_ref_h.data(),
+                  out_dists_ref_h.size(),
+                  dist_config.handle.get_stream());
   }
 
   raft::handle_t handle;
@@ -182,8 +188,7 @@ const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
    {0, 2, 4, 6, 8},
    {0, 1, 0, 1, 0, 1, 0, 1},
    {1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f, 1.0f, 2.0f},
-   {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0,
-    5.0},
+   {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0},
    raft::distance::DistanceType::InnerProduct,
    0.0},
   {2,
@@ -214,40 +219,33 @@ const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
 
   {10,
    {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
-   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
-    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
-    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
-   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
-    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
-    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
-    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
-    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
-    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
-   {0.,         0.39419924, 0.54823225, 0.79593037, 0.45658883, 0.93634219,
-    0.58146987, 0.44940102, 1.,         0.76978799, 0.39419924, 0.,
-    0.97577154, 0.48904013, 0.48300801, 0.45087445, 0.73323749, 0.21050481,
-    0.54847744, 0.78021386, 0.54823225, 0.97577154, 0.,         0.51413997,
-    0.31195441, 0.96546343, 0.67534399, 0.81665436, 0.8321819,  1.,
-    0.79593037, 0.48904013, 0.51413997, 0.,         0.28605559, 0.35772784,
-    1.,         0.60889396, 0.43324829, 0.84923694, 0.45658883, 0.48300801,
-    0.31195441, 0.28605559, 0.,         0.58623212, 0.6745457,  0.60287165,
-    0.67676228, 0.73155632, 0.93634219, 0.45087445, 0.96546343, 0.35772784,
-    0.58623212, 0.,         0.77917274, 0.48390993, 0.24558392, 0.99166225,
-    0.58146987, 0.73323749, 0.67534399, 1.,         0.6745457,  0.77917274,
-    0.,         0.27605686, 0.76064776, 0.61547536, 0.44940102, 0.21050481,
-    0.81665436, 0.60889396, 0.60287165, 0.48390993, 0.27605686, 0.,
-    0.51360432, 0.68185144, 1.,         0.54847744, 0.8321819,  0.43324829,
-    0.67676228, 0.24558392, 0.76064776, 0.51360432, 0.,         1.,
-    0.76978799, 0.78021386, 1.,         0.84923694, 0.73155632, 0.99166225,
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4,
+    6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167,
+    0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625,
+    0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853,
+    0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,  0.5279, 0.4885, 0.3495, 0.5079,
+    0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+   {0.,         0.39419924, 0.54823225, 0.79593037, 0.45658883, 0.93634219, 0.58146987, 0.44940102,
+    1.,         0.76978799, 0.39419924, 0.,         0.97577154, 0.48904013, 0.48300801, 0.45087445,
+    0.73323749, 0.21050481, 0.54847744, 0.78021386, 0.54823225, 0.97577154, 0.,         0.51413997,
+    0.31195441, 0.96546343, 0.67534399, 0.81665436, 0.8321819,  1.,         0.79593037, 0.48904013,
+    0.51413997, 0.,         0.28605559, 0.35772784, 1.,         0.60889396, 0.43324829, 0.84923694,
+    0.45658883, 0.48300801, 0.31195441, 0.28605559, 0.,         0.58623212, 0.6745457,  0.60287165,
+    0.67676228, 0.73155632, 0.93634219, 0.45087445, 0.96546343, 0.35772784, 0.58623212, 0.,
+    0.77917274, 0.48390993, 0.24558392, 0.99166225, 0.58146987, 0.73323749, 0.67534399, 1.,
+    0.6745457,  0.77917274, 0.,         0.27605686, 0.76064776, 0.61547536, 0.44940102, 0.21050481,
+    0.81665436, 0.60889396, 0.60287165, 0.48390993, 0.27605686, 0.,         0.51360432, 0.68185144,
+    1.,         0.54847744, 0.8321819,  0.43324829, 0.67676228, 0.24558392, 0.76064776, 0.51360432,
+    0.,         1.,         0.76978799, 0.78021386, 1.,         0.84923694, 0.73155632, 0.99166225,
     0.61547536, 0.68185144, 1.,         0.},
    raft::distance::DistanceType::CosineExpanded,
    0.0},
 
   {10,
    {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
-   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
-    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
-    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4,
+    6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
    {1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
     1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
     1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.},
@@ -356,15 +354,13 @@ const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
 
   {10,
    {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
-   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
-    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
-    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
-   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
-    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
-    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
-    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
-    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
-    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4,
+    6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167,
+    0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625,
+    0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853,
+    0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,  0.5279, 0.4885, 0.3495, 0.5079,
+    0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
    {0.0,
     3.3954660629919076,
     5.6469232737388815,
@@ -470,15 +466,13 @@ const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
 
   {10,
    {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
-   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
-    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
-    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
-   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
-    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
-    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
-    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
-    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
-    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4,
+    6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167,
+    0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625,
+    0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853,
+    0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,  0.5279, 0.4885, 0.3495, 0.5079,
+    0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
    {0.0,
     1.31462855332296,
     1.3690307816129905,
@@ -584,15 +578,13 @@ const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
 
   {10,
    {0, 5, 11, 15, 20, 27, 32, 36, 43, 47, 50},
-   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2,
-    3, 4, 7, 0, 1, 2, 3, 4, 6, 8, 0, 1, 2, 5, 7, 1, 5,
-    8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
-   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131,
-    0.5167, 0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190,
-    0.8206, 0.3625, 0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431,
-    0.0535, 0.2225, 0.8853, 0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,
-    0.5279, 0.4885, 0.3495, 0.5079, 0.2325, 0.2331, 0.3018, 0.6231, 0.2645,
-    0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
+   {0, 1, 3, 6, 8, 0, 1, 2, 3, 5, 6, 1, 2, 4, 8, 0, 2, 3, 4, 7, 0, 1, 2, 3, 4,
+    6, 8, 0, 1, 2, 5, 7, 1, 5, 8, 9, 0, 1, 2, 5, 6, 8, 9, 2, 4, 5, 7, 0, 3, 9},  // indices
+   {0.5438, 0.2695, 0.4377, 0.7174, 0.9251, 0.7648, 0.3322, 0.7279, 0.4131, 0.5167,
+    0.8655, 0.0730, 0.0291, 0.9036, 0.7988, 0.5019, 0.7663, 0.2190, 0.8206, 0.3625,
+    0.0411, 0.3995, 0.5688, 0.7028, 0.8706, 0.3199, 0.4431, 0.0535, 0.2225, 0.8853,
+    0.1932, 0.3761, 0.3379, 0.1771, 0.2107, 0.228,  0.5279, 0.4885, 0.3495, 0.5079,
+    0.2325, 0.2331, 0.3018, 0.6231, 0.2645, 0.8429, 0.6625, 0.0797, 0.2724, 0.4218},
    {0.0,
     0.9251771844789913,
     0.9036452083899731,
@@ -698,17 +690,14 @@ const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
 
   {15,
    {0, 5, 8, 9, 15, 20, 26, 31, 34, 38, 45},
-   {0,  1, 5,  6,  9,  1, 4, 14, 7, 3, 4,  7, 9, 11, 14,
-    0,  3, 7,  8,  12, 0, 2, 5,  7, 8, 14, 4, 9, 10, 11,
-    13, 4, 10, 14, 5,  6, 8, 9,  0, 2, 3,  4, 6, 10, 11},
-   {0.13537497, 0.51440163, 0.17231936, 0.02417618, 0.15372786, 0.17760507,
-    0.73789274, 0.08450219, 1.,         0.20184723, 0.18036963, 0.12581403,
-    0.13867603, 0.24040536, 0.11288773, 0.00290246, 0.09120187, 0.31190555,
-    0.43245423, 0.16153588, 0.3233026,  0.05279589, 0.1387149,  0.05962761,
-    0.41751856, 0.00804045, 0.03262381, 0.27507131, 0.37245804, 0.16378881,
-    0.15605804, 0.3867739,  0.24908977, 0.36413632, 0.37643732, 0.28910679,
-    0.0198409,  0.31461499, 0.24412279, 0.08327667, 0.04444576, 0.05047969,
-    0.26190054, 0.2077349,  0.10803964},
+   {0, 1, 5,  6, 9, 1,  4,  14, 7, 3,  4,  7, 9, 11, 14, 0, 3, 7, 8, 12, 0,  2, 5,
+    7, 8, 14, 4, 9, 10, 11, 13, 4, 10, 14, 5, 6, 8,  9,  0, 2, 3, 4, 6,  10, 11},
+   {0.13537497, 0.51440163, 0.17231936, 0.02417618, 0.15372786, 0.17760507, 0.73789274, 0.08450219,
+    1.,         0.20184723, 0.18036963, 0.12581403, 0.13867603, 0.24040536, 0.11288773, 0.00290246,
+    0.09120187, 0.31190555, 0.43245423, 0.16153588, 0.3233026,  0.05279589, 0.1387149,  0.05962761,
+    0.41751856, 0.00804045, 0.03262381, 0.27507131, 0.37245804, 0.16378881, 0.15605804, 0.3867739,
+    0.24908977, 0.36413632, 0.37643732, 0.28910679, 0.0198409,  0.31461499, 0.24412279, 0.08327667,
+    0.04444576, 0.05047969, 0.26190054, 0.2077349,  0.10803964},
    {1.05367121e-08, 8.35309089e-01, 1.00000000e+00, 9.24116813e-01,
     9.90039274e-01, 7.97613546e-01, 8.91271059e-01, 1.00000000e+00,
     6.64669302e-01, 8.59439512e-01, 8.35309089e-01, 1.05367121e-08,
@@ -767,31 +756,25 @@ const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
    {0, 3, 8, 12, 16, 20, 25, 30, 35, 40, 45},
    {0, 3, 4, 0, 1, 2, 3, 4, 1, 2, 3, 4, 0, 2, 3, 4, 0, 1, 3, 4, 0, 1, 2,
     3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4},
-   {0.70862347, 0.8232774,  0.12108795, 0.84527547, 0.94937088, 0.03258545,
-    0.99584118, 0.76835667, 0.34426657, 0.2357925,  0.01274851, 0.11422017,
-    0.3437756,  0.31967718, 0.5956055,  0.31610373, 0.04147273, 0.03724415,
-    0.21515727, 0.04751052, 0.50283183, 0.99957274, 0.01395933, 0.96032529,
-    0.88438711, 0.46095378, 0.27432481, 0.54294211, 0.54280225, 0.59503329,
-    0.61364678, 0.22837736, 0.56609561, 0.29809423, 0.76736686, 0.56460608,
-    0.98165371, 0.02140123, 0.19881268, 0.26057815, 0.31648823, 0.89874295,
-    0.27366735, 0.5119944,  0.11416134},
+   {0.70862347, 0.8232774,  0.12108795, 0.84527547, 0.94937088, 0.03258545, 0.99584118, 0.76835667,
+    0.34426657, 0.2357925,  0.01274851, 0.11422017, 0.3437756,  0.31967718, 0.5956055,  0.31610373,
+    0.04147273, 0.03724415, 0.21515727, 0.04751052, 0.50283183, 0.99957274, 0.01395933, 0.96032529,
+    0.88438711, 0.46095378, 0.27432481, 0.54294211, 0.54280225, 0.59503329, 0.61364678, 0.22837736,
+    0.56609561, 0.29809423, 0.76736686, 0.56460608, 0.98165371, 0.02140123, 0.19881268, 0.26057815,
+    0.31648823, 0.89874295, 0.27366735, 0.5119944,  0.11416134},
    {// dense output
-    0.,         0.48769777, 1.88014197, 0.26127048, 0.26657011, 0.7874794,
-    0.76962708, 1.122858,   1.1232498,  1.08166081, 0.48769777, 0.,
-    1.31332116, 0.98318907, 0.42661815, 0.09279052, 1.35187836, 1.38429055,
-    0.40658897, 0.56136388, 1.88014197, 1.31332116, 0.,         1.82943642,
-    1.54826077, 1.05918884, 1.59360067, 1.34698954, 0.60215168, 0.46993848,
-    0.26127048, 0.98318907, 1.82943642, 0.,         0.29945563, 1.08494093,
-    0.22934281, 0.82801925, 1.74288748, 1.50610116, 0.26657011, 0.42661815,
-    1.54826077, 0.29945563, 0.,         0.45060069, 0.77814948, 1.45245711,
-    1.18328348, 0.82486987, 0.7874794,  0.09279052, 1.05918884, 1.08494093,
-    0.45060069, 0.,         1.29899154, 1.40683824, 0.48505269, 0.53862363,
-    0.76962708, 1.35187836, 1.59360067, 0.22934281, 0.77814948, 1.29899154,
-    0.,         0.33202426, 1.92108999, 1.88812175, 1.122858,   1.38429055,
-    1.34698954, 0.82801925, 1.45245711, 1.40683824, 0.33202426, 0.,
-    1.47318624, 1.92660889, 1.1232498,  0.40658897, 0.60215168, 1.74288748,
-    1.18328348, 0.48505269, 1.92108999, 1.47318624, 0.,         0.24992619,
-    1.08166081, 0.56136388, 0.46993848, 1.50610116, 0.82486987, 0.53862363,
+    0.,         0.48769777, 1.88014197, 0.26127048, 0.26657011, 0.7874794,  0.76962708, 1.122858,
+    1.1232498,  1.08166081, 0.48769777, 0.,         1.31332116, 0.98318907, 0.42661815, 0.09279052,
+    1.35187836, 1.38429055, 0.40658897, 0.56136388, 1.88014197, 1.31332116, 0.,         1.82943642,
+    1.54826077, 1.05918884, 1.59360067, 1.34698954, 0.60215168, 0.46993848, 0.26127048, 0.98318907,
+    1.82943642, 0.,         0.29945563, 1.08494093, 0.22934281, 0.82801925, 1.74288748, 1.50610116,
+    0.26657011, 0.42661815, 1.54826077, 0.29945563, 0.,         0.45060069, 0.77814948, 1.45245711,
+    1.18328348, 0.82486987, 0.7874794,  0.09279052, 1.05918884, 1.08494093, 0.45060069, 0.,
+    1.29899154, 1.40683824, 0.48505269, 0.53862363, 0.76962708, 1.35187836, 1.59360067, 0.22934281,
+    0.77814948, 1.29899154, 0.,         0.33202426, 1.92108999, 1.88812175, 1.122858,   1.38429055,
+    1.34698954, 0.82801925, 1.45245711, 1.40683824, 0.33202426, 0.,         1.47318624, 1.92660889,
+    1.1232498,  0.40658897, 0.60215168, 1.74288748, 1.18328348, 0.48505269, 1.92108999, 1.47318624,
+    0.,         0.24992619, 1.08166081, 0.56136388, 0.46993848, 1.50610116, 0.82486987, 0.53862363,
     1.88812175, 1.92660889, 0.24992619, 0.},
    raft::distance::DistanceType::CorrelationExpanded,
    0.0},
@@ -800,12 +783,11 @@ const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
    {1, 4, 0, 4, 1, 3, 0, 1, 3, 0},
    {1., 1., 1., 1., 1., 1., 1., 1., 1., 1.},
    {// dense output
-    0., 1.,  1.,  1., 0.8, 1., 1.,  0.8, 1., 1.,  1., 0.,  0.8, 1.,  1., 1., 1.,
-    1., 1.,  1.,  1., 0.8, 0., 1.,  1.,  1., 0.8, 1., 1.,  0.8, 1.,  1., 1., 0.,
-    1., 1.,  1.,  1., 1.,  1., 0.8, 1.,  1., 1.,  0., 1.,  1.,  0.8, 1., 1., 1.,
-    1., 1.,  1.,  1., 0.,  1., 0.8, 1.,  1., 1.,  1., 0.8, 1.,  1.,  1., 0., 1.,
-    1., 0.8, 0.8, 1., 1.,  1., 0.8, 0.8, 1., 0.,  1., 1.,  1.,  1.,  1., 1., 1.,
-    1., 1.,  1.,  0., 1.,  1., 1.,  0.8, 1., 1.,  1., 0.8, 1.,  1.,  0.},
+    0.,  1.,  1.,  1., 0.8, 1., 1.,  0.8, 1., 1.,  1.,  0., 0.8, 1., 1.,  1.,  1.,  1.,  1., 1.,
+    1.,  0.8, 0.,  1., 1.,  1., 0.8, 1.,  1., 0.8, 1.,  1., 1.,  0., 1.,  1.,  1.,  1.,  1., 1.,
+    0.8, 1.,  1.,  1., 0.,  1., 1.,  0.8, 1., 1.,  1.,  1., 1.,  1., 1.,  0.,  1.,  0.8, 1., 1.,
+    1.,  1.,  0.8, 1., 1.,  1., 0.,  1.,  1., 0.8, 0.8, 1., 1.,  1., 0.8, 0.8, 1.,  0.,  1., 1.,
+    1.,  1.,  1.,  1., 1.,  1., 1.,  1.,  0., 1.,  1.,  1., 0.8, 1., 1.,  1.,  0.8, 1.,  1., 0.},
    raft::distance::DistanceType::RusselRaoExpanded,
    0.0},
   {5,
@@ -813,13 +795,12 @@ const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
    {0, 3, 4, 4, 2, 3, 0, 2, 3, 2},
    {1., 1., 1., 1., 1., 1., 1., 1., 1., 1.},
    {// dense output
-    0.,  0.2, 0.6, 0.2, 0.4, 0.2, 0.6, 0.4, 0.4, 0.2, 0.2, 0.,  0.4, 0.,  0.2,
-    0.,  0.4, 0.6, 0.2, 0.,  0.6, 0.4, 0.,  0.4, 0.2, 0.4, 0.4, 0.6, 0.6, 0.4,
-    0.2, 0.,  0.4, 0.,  0.2, 0.,  0.4, 0.6, 0.2, 0.,  0.4, 0.2, 0.2, 0.2, 0.,
-    0.2, 0.6, 0.8, 0.4, 0.2, 0.2, 0.,  0.4, 0.,  0.2, 0.,  0.4, 0.6, 0.2, 0.,
-    0.6, 0.4, 0.4, 0.4, 0.6, 0.4, 0.,  0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 0.6, 0.8,
-    0.6, 0.2, 0.,  0.4, 0.6, 0.4, 0.2, 0.6, 0.2, 0.4, 0.2, 0.2, 0.4, 0.,  0.2,
-    0.2, 0.,  0.4, 0.,  0.2, 0.,  0.4, 0.6, 0.2, 0.},
+    0.,  0.2, 0.6, 0.2, 0.4, 0.2, 0.6, 0.4, 0.4, 0.2, 0.2, 0.,  0.4, 0.,  0.2, 0.,  0.4,
+    0.6, 0.2, 0.,  0.6, 0.4, 0.,  0.4, 0.2, 0.4, 0.4, 0.6, 0.6, 0.4, 0.2, 0.,  0.4, 0.,
+    0.2, 0.,  0.4, 0.6, 0.2, 0.,  0.4, 0.2, 0.2, 0.2, 0.,  0.2, 0.6, 0.8, 0.4, 0.2, 0.2,
+    0.,  0.4, 0.,  0.2, 0.,  0.4, 0.6, 0.2, 0.,  0.6, 0.4, 0.4, 0.4, 0.6, 0.4, 0.,  0.2,
+    0.2, 0.4, 0.4, 0.6, 0.6, 0.6, 0.8, 0.6, 0.2, 0.,  0.4, 0.6, 0.4, 0.2, 0.6, 0.2, 0.4,
+    0.2, 0.2, 0.4, 0.,  0.2, 0.2, 0.,  0.4, 0.,  0.2, 0.,  0.4, 0.6, 0.2, 0.},
    raft::distance::DistanceType::HammingUnexpanded,
    0.0},
   {3,
@@ -863,7 +844,8 @@ const std::vector<SparseDistanceInputs<int, float>> inputs_i32_f = {
 
 typedef SparseDistanceTest<int, float> SparseDistanceTestF;
 TEST_P(SparseDistanceTestF, Result) { compare(); }
-INSTANTIATE_TEST_CASE_P(SparseDistanceTests, SparseDistanceTestF,
+INSTANTIATE_TEST_CASE_P(SparseDistanceTests,
+                        SparseDistanceTestF,
                         ::testing::ValuesIn(inputs_i32_f));
 
 };  // namespace distance
diff --git a/cpp/test/sparse/filter.cu b/cpp/test/sparse/filter.cu
index 58ad9cf803..63245a63b0 100644
--- a/cpp/test/sparse/filter.cu
+++ b/cpp/test/sparse/filter.cu
@@ -35,8 +35,7 @@ struct SparseFilterInputs {
 };
 
 template <typename T>
-class SparseFilterTests
-  : public ::testing::TestWithParam<SparseFilterInputs<T>> {
+class SparseFilterTests : public ::testing::TestWithParam<SparseFilterInputs<T>> {
  protected:
   void SetUp() override {}
 
@@ -49,12 +48,13 @@ class SparseFilterTests
 const std::vector<SparseFilterInputs<float>> inputsf = {{5, 10, 5, 1234ULL}};
 
 typedef SparseFilterTests<float> COORemoveZeros;
-TEST_P(COORemoveZeros, Result) {
+TEST_P(COORemoveZeros, Result)
+{
   cudaStream_t stream;
   cudaStreamCreate(&stream);
   params = ::testing::TestWithParam<SparseFilterInputs<float>>::GetParam();
 
-  float *in_h_vals = new float[params.nnz];
+  float* in_h_vals = new float[params.nnz];
 
   COO<float> in(stream, params.nnz, 5, 5);
 
@@ -67,8 +67,8 @@ TEST_P(COORemoveZeros, Result) {
   in_h_vals[2] = 0;
   in_h_vals[3] = 0;
 
-  int *in_h_rows = new int[params.nnz];
-  int *in_h_cols = new int[params.nnz];
+  int* in_h_rows = new int[params.nnz];
+  int* in_h_cols = new int[params.nnz];
 
   for (int i = 0; i < params.nnz; i++) {
     in_h_rows[i] = params.nnz - i - 1;
@@ -84,9 +84,9 @@ TEST_P(COORemoveZeros, Result) {
   int out_rows_ref_h[2] = {0, 3};
   int out_cols_ref_h[2] = {4, 1};
 
-  float *out_vals_ref_h = (float *)malloc(2 * sizeof(float));
-  out_vals_ref_h[0] = in_h_vals[4];
-  out_vals_ref_h[1] = in_h_vals[1];
+  float* out_vals_ref_h = (float*)malloc(2 * sizeof(float));
+  out_vals_ref_h[0]     = in_h_vals[4];
+  out_vals_ref_h[1]     = in_h_vals[1];
 
   COO<float> out_ref(stream, 2, 5, 5);
   COO<float> out(stream);
@@ -97,12 +97,9 @@ TEST_P(COORemoveZeros, Result) {
 
   op::coo_remove_zeros<32, float>(&in, &out, stream);
 
-  ASSERT_TRUE(raft::devArrMatch<int>(out_ref.rows(), out.rows(), 2,
-                                     raft::Compare<int>()));
-  ASSERT_TRUE(raft::devArrMatch<int>(out_ref.cols(), out.cols(), 2,
-                                     raft::Compare<int>()));
-  ASSERT_TRUE(raft::devArrMatch<float>(out_ref.vals(), out.vals(), 2,
-                                       raft::Compare<float>()));
+  ASSERT_TRUE(raft::devArrMatch<int>(out_ref.rows(), out.rows(), 2, raft::Compare<int>()));
+  ASSERT_TRUE(raft::devArrMatch<int>(out_ref.cols(), out.cols(), 2, raft::Compare<int>()));
+  ASSERT_TRUE(raft::devArrMatch<float>(out_ref.vals(), out.vals(), 2, raft::Compare<float>()));
 
   CUDA_CHECK(cudaStreamDestroy(stream));
   free(out_vals_ref_h);
@@ -112,8 +109,7 @@ TEST_P(COORemoveZeros, Result) {
   delete[] in_h_vals;
 }
 
-INSTANTIATE_TEST_CASE_P(SparseFilterTests, COORemoveZeros,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(SparseFilterTests, COORemoveZeros, ::testing::ValuesIn(inputsf));
 
 }  // namespace sparse
 }  // namespace raft
diff --git a/cpp/test/sparse/knn.cu b/cpp/test/sparse/knn.cu
index 86b3b3d382..a693262193 100644
--- a/cpp/test/sparse/knn.cu
+++ b/cpp/test/sparse/knn.cu
@@ -48,60 +48,76 @@ struct SparseKNNInputs {
   int batch_size_index = 2;
   int batch_size_query = 2;
 
-  raft::distance::DistanceType metric =
-    raft::distance::DistanceType::L2SqrtExpanded;
+  raft::distance::DistanceType metric = raft::distance::DistanceType::L2SqrtExpanded;
 };
 
 template <typename value_idx, typename value_t>
-::std::ostream &operator<<(::std::ostream &os,
-                           const SparseKNNInputs<value_idx, value_t> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const SparseKNNInputs<value_idx, value_t>& dims)
+{
   return os;
 }
 
 template <typename value_idx, typename value_t>
-class SparseKNNTest
-  : public ::testing::TestWithParam<SparseKNNInputs<value_idx, value_t>> {
+class SparseKNNTest : public ::testing::TestWithParam<SparseKNNInputs<value_idx, value_t>> {
  public:
   SparseKNNTest()
-    : params(::testing::TestWithParam<
-             SparseKNNInputs<value_idx, value_t>>::GetParam()),
+    : params(::testing::TestWithParam<SparseKNNInputs<value_idx, value_t>>::GetParam()),
       indptr(0, handle.get_stream()),
       indices(0, handle.get_stream()),
       data(0, handle.get_stream()),
       out_indices(0, handle.get_stream()),
       out_dists(0, handle.get_stream()),
       out_indices_ref(0, handle.get_stream()),
-      out_dists_ref(0, handle.get_stream()) {}
+      out_dists_ref(0, handle.get_stream())
+  {
+  }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     n_rows = params.indptr_h.size() - 1;
-    nnz = params.indices_h.size();
-    k = params.k;
+    nnz    = params.indices_h.size();
+    k      = params.k;
 
     make_data();
 
-    raft::sparse::selection::brute_force_knn<value_idx, value_t>(
-      indptr.data(), indices.data(), data.data(), nnz, n_rows, params.n_cols,
-      indptr.data(), indices.data(), data.data(), nnz, n_rows, params.n_cols,
-      out_indices.data(), out_dists.data(), k, handle, params.batch_size_index,
-      params.batch_size_query, params.metric);
+    raft::sparse::selection::brute_force_knn<value_idx, value_t>(indptr.data(),
+                                                                 indices.data(),
+                                                                 data.data(),
+                                                                 nnz,
+                                                                 n_rows,
+                                                                 params.n_cols,
+                                                                 indptr.data(),
+                                                                 indices.data(),
+                                                                 data.data(),
+                                                                 nnz,
+                                                                 n_rows,
+                                                                 params.n_cols,
+                                                                 out_indices.data(),
+                                                                 out_dists.data(),
+                                                                 k,
+                                                                 handle,
+                                                                 params.batch_size_index,
+                                                                 params.batch_size_query,
+                                                                 params.metric);
 
     CUDA_CHECK(cudaStreamSynchronize(handle.get_stream()));
   }
 
-  void compare() {
-    ASSERT_TRUE(devArrMatch(out_dists_ref.data(), out_dists.data(), n_rows * k,
-                            CompareApprox<value_t>(1e-4)));
-    ASSERT_TRUE(devArrMatch(out_indices_ref.data(), out_indices.data(),
-                            n_rows * k, Compare<value_idx>()));
+  void compare()
+  {
+    ASSERT_TRUE(devArrMatch(
+      out_dists_ref.data(), out_dists.data(), n_rows * k, CompareApprox<value_t>(1e-4)));
+    ASSERT_TRUE(
+      devArrMatch(out_indices_ref.data(), out_indices.data(), n_rows * k, Compare<value_idx>()));
   }
 
  protected:
-  void make_data() {
-    std::vector<value_idx> indptr_h = params.indptr_h;
+  void make_data()
+  {
+    std::vector<value_idx> indptr_h  = params.indptr_h;
     std::vector<value_idx> indices_h = params.indices_h;
-    std::vector<value_t> data_h = params.data_h;
+    std::vector<value_t> data_h      = params.data_h;
 
     auto stream = handle.get_stream();
     indptr.resize(indptr_h.size(), stream);
@@ -112,16 +128,15 @@ class SparseKNNTest
     update_device(indices.data(), indices_h.data(), indices_h.size(), stream);
     update_device(data.data(), data_h.data(), data_h.size(), stream);
 
-    std::vector<value_t> out_dists_ref_h = params.out_dists_ref_h;
+    std::vector<value_t> out_dists_ref_h     = params.out_dists_ref_h;
     std::vector<value_idx> out_indices_ref_h = params.out_indices_ref_h;
 
     out_indices_ref.resize(out_indices_ref_h.size(), stream);
     out_dists_ref.resize(out_dists_ref_h.size(), stream);
 
-    update_device(out_indices_ref.data(), out_indices_ref_h.data(),
-                  out_indices_ref_h.size(), stream);
-    update_device(out_dists_ref.data(), out_dists_ref_h.data(),
-                  out_dists_ref_h.size(), stream);
+    update_device(
+      out_indices_ref.data(), out_indices_ref_h.data(), out_indices_ref_h.size(), stream);
+    update_device(out_dists_ref.data(), out_dists_ref_h.data(), out_dists_ref_h.size(), stream);
 
     out_dists.resize(n_rows * k, stream);
     out_indices.resize(n_rows * k, stream);
@@ -158,8 +173,7 @@ const std::vector<SparseKNNInputs<int, float>> inputs_i32_f = {
    raft::distance::DistanceType::L2SqrtExpanded}};
 typedef SparseKNNTest<int, float> SparseKNNTestF;
 TEST_P(SparseKNNTestF, Result) { compare(); }
-INSTANTIATE_TEST_CASE_P(SparseKNNTest, SparseKNNTestF,
-                        ::testing::ValuesIn(inputs_i32_f));
+INSTANTIATE_TEST_CASE_P(SparseKNNTest, SparseKNNTestF, ::testing::ValuesIn(inputs_i32_f));
 
 };  // end namespace selection
 };  // end namespace sparse
diff --git a/cpp/test/sparse/knn_graph.cu b/cpp/test/sparse/knn_graph.cu
index c2a1c4b93c..1ed017f40a 100644
--- a/cpp/test/sparse/knn_graph.cu
+++ b/cpp/test/sparse/knn_graph.cu
@@ -30,8 +30,9 @@ namespace raft {
 namespace sparse {
 
 template <typename value_idx, typename value_t>
-__global__ void assert_symmetry(value_idx *rows, value_idx *cols, value_t *vals,
-                                value_idx nnz, value_idx *sum) {
+__global__ void assert_symmetry(
+  value_idx* rows, value_idx* cols, value_t* vals, value_idx nnz, value_idx* sum)
+{
   int tid = blockDim.x * blockIdx.x + threadIdx.x;
 
   if (tid >= nnz) return;
@@ -51,32 +52,31 @@ struct KNNGraphInputs {
 };
 
 template <typename value_idx, typename value_t>
-::std::ostream &operator<<(::std::ostream &os,
-                           const KNNGraphInputs<value_idx, value_t> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const KNNGraphInputs<value_idx, value_t>& dims)
+{
   return os;
 }
 
 template <typename value_idx, typename value_t>
-class KNNGraphTest
-  : public ::testing::TestWithParam<KNNGraphInputs<value_idx, value_t>> {
+class KNNGraphTest : public ::testing::TestWithParam<KNNGraphInputs<value_idx, value_t>> {
  public:
   KNNGraphTest()
-    : params(::testing::TestWithParam<
-             KNNGraphInputs<value_idx, value_t>>::GetParam()),
+    : params(::testing::TestWithParam<KNNGraphInputs<value_idx, value_t>>::GetParam()),
       stream(handle.get_stream()),
-      X(0, stream) {
+      X(0, stream)
+  {
     X.resize(params.X.size(), stream);
   }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     out = new raft::sparse::COO<value_t, value_idx>(stream);
 
     update_device(X.data(), params.X.data(), params.X.size(), stream);
 
     raft::sparse::selection::knn_graph(
-      handle, X.data(), params.m, params.n,
-      raft::distance::DistanceType::L2Unexpanded, *out);
+      handle, X.data(), params.m, params.n, raft::distance::DistanceType::L2Unexpanded, *out);
 
     rmm::device_scalar<value_idx> sum(stream);
     sum.set_value_to_zero_async(stream);
@@ -98,7 +98,7 @@ class KNNGraphTest
   cudaStream_t stream;
 
   // input data
-  raft::sparse::COO<value_t, value_idx> *out;
+  raft::sparse::COO<value_t, value_idx>* out;
 
   rmm::device_uvector<value_t> X;
 
@@ -112,13 +112,15 @@ const std::vector<KNNGraphInputs<int, float>> knn_graph_inputs_fint = {
   {4, 2, {0, 100, 0.01, 0.02, 5000, 10000, -5, -2}, 2}};
 
 typedef KNNGraphTest<int, float> KNNGraphTestF_int;
-TEST_P(KNNGraphTestF_int, Result) {
+TEST_P(KNNGraphTestF_int, Result)
+{
   // nnz should not be larger than twice m * k
   ASSERT_TRUE(out->nnz <= (params.m * params.k * 2));
   ASSERT_TRUE(sum_h == 0);
 }
 
-INSTANTIATE_TEST_CASE_P(KNNGraphTest, KNNGraphTestF_int,
+INSTANTIATE_TEST_CASE_P(KNNGraphTest,
+                        KNNGraphTestF_int,
                         ::testing::ValuesIn(knn_graph_inputs_fint));
 
 }  // namespace sparse
diff --git a/cpp/test/sparse/linkage.cu b/cpp/test/sparse/linkage.cu
index 6d4af7f016..50401e5b7a 100644
--- a/cpp/test/sparse/linkage.cu
+++ b/cpp/test/sparse/linkage.cu
@@ -55,45 +55,44 @@ struct LinkageInputs {
  * @param b: number of pairs of points that both the clusters have classified differently
  */
 template <typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y>
-__global__ void computeTheNumerator(const T* firstClusterArray,
-                                    const T* secondClusterArray, uint64_t size,
-                                    uint64_t* a, uint64_t* b) {
-  //calculating the indices of pairs of datapoints compared by the current thread
+__global__ void computeTheNumerator(
+  const T* firstClusterArray, const T* secondClusterArray, uint64_t size, uint64_t* a, uint64_t* b)
+{
+  // calculating the indices of pairs of datapoints compared by the current thread
   uint64_t j = threadIdx.x + blockIdx.x * blockDim.x;
   uint64_t i = threadIdx.y + blockIdx.y * blockDim.y;
 
-  //thread-local variables to count a and b
+  // thread-local variables to count a and b
   uint64_t myA = 0, myB = 0;
 
   if (i < size && j < size && j < i) {
-    //checking if the pair have been classified the same by both the clusters
+    // checking if the pair have been classified the same by both the clusters
     if (firstClusterArray[i] == firstClusterArray[j] &&
         secondClusterArray[i] == secondClusterArray[j]) {
       ++myA;
     }
 
-    //checking if the pair have been classified differently by both the clusters
+    // checking if the pair have been classified differently by both the clusters
     else if (firstClusterArray[i] != firstClusterArray[j] &&
              secondClusterArray[i] != secondClusterArray[j]) {
       ++myB;
     }
   }
 
-  //specialize blockReduce for a 2D block of 1024 threads of type uint64_t
-  typedef cub::BlockReduce<uint64_t, BLOCK_DIM_X,
-                           cub::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y>
+  // specialize blockReduce for a 2D block of 1024 threads of type uint64_t
+  typedef cub::BlockReduce<uint64_t, BLOCK_DIM_X, cub::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y>
     BlockReduce;
 
-  //Allocate shared memory for blockReduce
+  // Allocate shared memory for blockReduce
   __shared__ typename BlockReduce::TempStorage temp_storage;
 
-  //summing up thread-local counts specific to a block
+  // summing up thread-local counts specific to a block
   myA = BlockReduce(temp_storage).Sum(myA);
   __syncthreads();
   myB = BlockReduce(temp_storage).Sum(myB);
   __syncthreads();
 
-  //executed once per block
+  // executed once per block
   if (threadIdx.x == 0 && threadIdx.y == 0) {
     raft::myAtomicAdd<unsigned long long int>((unsigned long long int*)a, myA);
     raft::myAtomicAdd<unsigned long long int>((unsigned long long int*)b, myB);
@@ -101,53 +100,54 @@ __global__ void computeTheNumerator(const T* firstClusterArray,
 }
 
 /**
-* @brief Function to calculate RandIndex
-* <a href="https://en.wikipedia.org/wiki/Rand_index">more info on rand index</a>
-* @param firstClusterArray: the array of classes of type T
-* @param secondClusterArray: the array of classes of type T
-* @param size: the size of the data points of type uint64_t
-* @param stream: the cudaStream object
-*/
+ * @brief Function to calculate RandIndex
+ * <a href="https://en.wikipedia.org/wiki/Rand_index">more info on rand index</a>
+ * @param firstClusterArray: the array of classes of type T
+ * @param secondClusterArray: the array of classes of type T
+ * @param size: the size of the data points of type uint64_t
+ * @param stream: the cudaStream object
+ */
 template <typename T>
-double compute_rand_index(T* firstClusterArray, T* secondClusterArray,
-                          uint64_t size, cudaStream_t stream) {
-  //rand index for size less than 2 is not defined
+double compute_rand_index(T* firstClusterArray,
+                          T* secondClusterArray,
+                          uint64_t size,
+                          cudaStream_t stream)
+{
+  // rand index for size less than 2 is not defined
   ASSERT(size >= 2, "Rand Index for size less than 2 not defined!");
 
-  //allocating and initializing memory for a and b in the GPU
+  // allocating and initializing memory for a and b in the GPU
   rmm::device_uvector<uint64_t> arr_buf(2, stream);
   CUDA_CHECK(cudaMemsetAsync(arr_buf.data(), 0, 2 * sizeof(uint64_t), stream));
 
-  //kernel configuration
+  // kernel configuration
   static const int BLOCK_DIM_Y = 16, BLOCK_DIM_X = 16;
   dim3 numThreadsPerBlock(BLOCK_DIM_X, BLOCK_DIM_Y);
   dim3 numBlocks(raft::ceildiv<int>(size, numThreadsPerBlock.x),
                  raft::ceildiv<int>(size, numThreadsPerBlock.y));
 
-  //calling the kernel
-  computeTheNumerator<T, BLOCK_DIM_X, BLOCK_DIM_Y>
-    <<<numBlocks, numThreadsPerBlock, 0, stream>>>(
-      firstClusterArray, secondClusterArray, size, arr_buf.data(),
-      arr_buf.data() + 1);
+  // calling the kernel
+  computeTheNumerator<T, BLOCK_DIM_X, BLOCK_DIM_Y><<<numBlocks, numThreadsPerBlock, 0, stream>>>(
+    firstClusterArray, secondClusterArray, size, arr_buf.data(), arr_buf.data() + 1);
 
-  //synchronizing and updating the calculated values of a and b from device to host
+  // synchronizing and updating the calculated values of a and b from device to host
   uint64_t ab_host[2] = {0};
   raft::update_host(ab_host, arr_buf.data(), 2, stream);
   CUDA_CHECK(cudaStreamSynchronize(stream));
 
-  //error handling
+  // error handling
   CUDA_CHECK(cudaGetLastError());
 
-  //denominator
+  // denominator
   uint64_t nChooseTwo = size * (size - 1) / 2;
 
-  //calculating the rand_index
+  // calculating the rand_index
   return (double)(((double)(ab_host[0] + ab_host[1])) / (double)nChooseTwo);
 }
 
 template <typename T, typename IdxT>
-::std::ostream& operator<<(::std::ostream& os,
-                           const LinkageInputs<T, IdxT>& dims) {
+::std::ostream& operator<<(::std::ostream& os, const LinkageInputs<T, IdxT>& dims)
+{
   return os;
 }
 
@@ -158,15 +158,17 @@ class LinkageTest : public ::testing::TestWithParam<LinkageInputs<T, IdxT>> {
     : params(::testing::TestWithParam<LinkageInputs<T, IdxT>>::GetParam()),
       stream(handle.get_stream()),
       labels(params.n_row, stream),
-      labels_ref(params.n_row, stream) {}
+      labels_ref(params.n_row, stream)
+  {
+  }
 
  protected:
-  void basicTest() {
+  void basicTest()
+  {
     rmm::device_uvector<T> data(params.n_row * params.n_col, stream);
 
     raft::copy(data.data(), params.data.data(), data.size(), stream);
-    raft::copy(labels_ref.data(), params.expected_labels.data(), params.n_row,
-               stream);
+    raft::copy(labels_ref.data(), params.expected_labels.data(), params.n_row, stream);
 
     raft::hierarchy::linkage_output<IdxT, T> out_arrs;
     out_arrs.labels = labels.data();
@@ -176,16 +178,19 @@ class LinkageTest : public ::testing::TestWithParam<LinkageInputs<T, IdxT>> {
     out_arrs.children = out_children.data();
 
     raft::handle_t handle;
-    raft::hierarchy::single_linkage<
-      IdxT, T, raft::hierarchy::LinkageDistance::KNN_GRAPH>(
-      handle, data.data(), params.n_row, params.n_col,
-      raft::distance::DistanceType::L2SqrtExpanded, &out_arrs, params.c,
+    raft::hierarchy::single_linkage<IdxT, T, raft::hierarchy::LinkageDistance::KNN_GRAPH>(
+      handle,
+      data.data(),
+      params.n_row,
+      params.n_col,
+      raft::distance::DistanceType::L2SqrtExpanded,
+      &out_arrs,
+      params.c,
       params.n_clusters);
 
     CUDA_CHECK(cudaStreamSynchronize(stream));
 
-    score = compute_rand_index(labels.data(), labels_ref.data(), params.n_row,
-                               stream);
+    score = compute_rand_index(labels.data(), labels_ref.data(), params.n_row, stream);
   }
 
   void SetUp() override { basicTest(); }
@@ -203,14 +208,12 @@ const std::vector<LinkageInputs<float, int>> linkage_inputsf2 = {
   // Test n_clusters == n_points
   {10,
    5,
-   {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392,
-    0.77782677, 0.43772379, 0.4035871,  0.3282796,  0.47544681, 0.59862974,
-    0.12319357, 0.06239463, 0.28200272, 0.1345717,  0.50498218, 0.5113505,
-    0.16233086, 0.62165332, 0.42281548, 0.933117,   0.41386077, 0.23264562,
-    0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674,
-    0.84854131, 0.28890216, 0.85267903, 0.74703138, 0.83842071, 0.34942792,
-    0.27864171, 0.70911132, 0.21338564, 0.32035554, 0.73788331, 0.46926692,
-    0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396,
+   {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, 0.77782677, 0.43772379,
+    0.4035871,  0.3282796,  0.47544681, 0.59862974, 0.12319357, 0.06239463, 0.28200272, 0.1345717,
+    0.50498218, 0.5113505,  0.16233086, 0.62165332, 0.42281548, 0.933117,   0.41386077, 0.23264562,
+    0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674,  0.84854131, 0.28890216,
+    0.85267903, 0.74703138, 0.83842071, 0.34942792, 0.27864171, 0.70911132, 0.21338564, 0.32035554,
+    0.73788331, 0.46926692, 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396,
     0.76166195, 0.66613745},
    {9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
    10,
@@ -218,8 +221,7 @@ const std::vector<LinkageInputs<float, int>> linkage_inputsf2 = {
   //  // Test outlier points
   {9,
    2,
-   {-1, -50, 3, 4, 5000, 10000, 1, 3, 4, 5, 0.000005, 0.00002, 2000000, 500000,
-    10, 50, 30, 5},
+   {-1, -50, 3, 4, 5000, 10000, 1, 3, 4, 5, 0.000005, 0.00002, 2000000, 500000, 10, 50, 30, 5},
    {6, 0, 5, 0, 0, 4, 3, 2, 1},
    7,
    -1},
@@ -227,14 +229,12 @@ const std::vector<LinkageInputs<float, int>> linkage_inputsf2 = {
   // Test n_clusters == (n_points / 2)
   {10,
    5,
-   {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392,
-    0.77782677, 0.43772379, 0.4035871,  0.3282796,  0.47544681, 0.59862974,
-    0.12319357, 0.06239463, 0.28200272, 0.1345717,  0.50498218, 0.5113505,
-    0.16233086, 0.62165332, 0.42281548, 0.933117,   0.41386077, 0.23264562,
-    0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674,
-    0.84854131, 0.28890216, 0.85267903, 0.74703138, 0.83842071, 0.34942792,
-    0.27864171, 0.70911132, 0.21338564, 0.32035554, 0.73788331, 0.46926692,
-    0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396,
+   {0.21390334, 0.50261639, 0.91036676, 0.59166485, 0.71162682, 0.10248392, 0.77782677, 0.43772379,
+    0.4035871,  0.3282796,  0.47544681, 0.59862974, 0.12319357, 0.06239463, 0.28200272, 0.1345717,
+    0.50498218, 0.5113505,  0.16233086, 0.62165332, 0.42281548, 0.933117,   0.41386077, 0.23264562,
+    0.73325968, 0.37537541, 0.70719873, 0.14522645, 0.73279625, 0.9126674,  0.84854131, 0.28890216,
+    0.85267903, 0.74703138, 0.83842071, 0.34942792, 0.27864171, 0.70911132, 0.21338564, 0.32035554,
+    0.73788331, 0.46926692, 0.57570162, 0.42559178, 0.87120209, 0.22734951, 0.01847905, 0.75549396,
     0.76166195, 0.66613745},
    {1, 0, 4, 0, 0, 3, 2, 0, 2, 1},
    5,
@@ -243,340 +243,173 @@ const std::vector<LinkageInputs<float, int>> linkage_inputsf2 = {
   // Test n_points == 100
   {100,
    10,
-   {6.26168372e-01, 9.30437651e-01, 6.02450208e-01,
-    2.73025296e-01, 9.53050619e-01, 3.32164396e-01,
-    6.88942598e-01, 5.79163537e-01, 6.70341547e-01,
-    2.70140602e-02, 9.30429671e-01, 7.17721157e-01,
-    9.89948537e-01, 7.75253347e-01, 1.34491522e-02,
-    2.48522428e-02, 3.51413378e-01, 7.64405834e-01,
-    7.86373507e-01, 7.18748577e-01, 8.66998621e-01,
-    6.80316582e-01, 2.51288712e-01, 4.91078420e-01,
-    3.76246281e-01, 4.86828710e-01, 5.67464772e-01,
-    5.30734742e-01, 8.99478296e-01, 7.66699088e-01,
-    9.49339111e-01, 3.55248484e-01, 9.06046929e-01,
-    4.48407772e-01, 6.96395305e-01, 2.44277335e-01,
-    7.74840000e-01, 5.21046603e-01, 4.66423971e-02,
-    5.12019638e-02, 8.95019614e-01, 5.28956953e-01,
-    4.31536306e-01, 5.83857744e-01, 4.41787364e-01,
-    4.68656523e-01, 5.73971433e-01, 6.79989654e-01,
-    3.19650588e-01, 6.12579596e-01, 6.49126442e-02,
-    8.39131142e-01, 2.85252117e-01, 5.84848929e-01,
-    9.46507115e-01, 8.58440748e-01, 3.61528940e-01,
-    2.44215959e-01, 3.80101125e-01, 4.57128957e-02,
-    8.82216988e-01, 8.31498633e-01, 7.23474381e-01,
-    7.75788607e-01, 1.40864146e-01, 6.62092382e-01,
-    5.13985168e-01, 3.00686418e-01, 8.70109949e-01,
-    2.43187753e-01, 2.89391938e-01, 2.84214238e-01,
-    8.70985521e-01, 8.77491176e-01, 6.72537226e-01,
-    3.30929686e-01, 1.85934324e-01, 9.16222614e-01,
-    6.18239142e-01, 2.64768597e-01, 5.76145451e-01,
-    8.62961369e-01, 6.84757925e-01, 7.60549082e-01,
-    1.27645356e-01, 4.51004673e-01, 3.92292980e-01,
-    4.63170803e-01, 4.35449330e-02, 2.17583404e-01,
-    5.71832605e-02, 2.06763039e-01, 3.70116249e-01,
-    2.09750028e-01, 6.17283019e-01, 8.62549231e-01,
-    9.84156240e-02, 2.66249156e-01, 3.87635103e-01,
-    2.85591012e-02, 4.24826068e-01, 4.45795088e-01,
-    6.86227676e-01, 1.08848960e-01, 5.96731841e-02,
-    3.71770228e-01, 1.91548833e-01, 6.95136078e-01,
-    9.00700636e-01, 8.76363105e-01, 2.67334632e-01,
-    1.80619709e-01, 7.94060419e-01, 1.42854171e-02,
-    1.09372387e-01, 8.74028108e-01, 6.46403232e-01,
-    4.86588834e-01, 5.93446175e-02, 6.11886291e-01,
-    8.83865057e-01, 3.15879821e-01, 2.27043992e-01,
-    9.76764951e-01, 6.15620336e-01, 9.76199360e-01,
-    2.40548962e-01, 3.21795663e-01, 8.75087904e-02,
-    8.11234663e-01, 6.96070480e-01, 8.12062321e-01,
-    1.21958818e-01, 3.44348628e-02, 8.72630414e-01,
-    3.06162776e-01, 1.76043529e-02, 9.45894971e-01,
-    5.33896401e-01, 6.21642973e-01, 4.93062535e-01,
-    4.48984262e-01, 2.24560379e-01, 4.24052195e-02,
-    4.43447610e-01, 8.95646149e-01, 6.05220676e-01,
-    1.81840491e-01, 9.70831206e-01, 2.12563586e-02,
-    6.92582693e-01, 7.55946922e-01, 7.95086143e-01,
-    6.05328941e-01, 3.99350764e-01, 4.32846636e-01,
-    9.81114529e-01, 4.98266428e-01, 6.37127930e-03,
-    1.59085889e-01, 6.34682067e-05, 5.59429440e-01,
-    7.38827633e-01, 8.93214770e-01, 2.16494306e-01,
-    9.35430573e-02, 4.75665868e-02, 7.80503518e-01,
-    7.86240041e-01, 7.06854594e-01, 2.13725879e-02,
-    7.68246091e-01, 4.50234808e-01, 5.21231104e-01,
-    5.01989826e-03, 4.22081572e-02, 1.65337732e-01,
-    8.54134740e-01, 4.99430262e-01, 8.94525601e-01,
-    1.14028379e-01, 3.69739861e-01, 1.32955599e-01,
-    2.65563824e-01, 2.52811151e-01, 1.44792843e-01,
-    6.88449594e-01, 4.44921417e-01, 8.23296587e-01,
-    1.93266317e-01, 1.19033309e-01, 1.36368966e-01,
-    3.42600285e-01, 5.64505195e-01, 5.57594559e-01,
-    7.44257892e-01, 8.38231569e-02, 4.11548847e-01,
-    3.21010077e-01, 8.55081359e-01, 4.30105779e-01,
-    1.16229135e-01, 9.87731964e-02, 3.14712335e-01,
-    4.50880592e-01, 2.72289598e-01, 6.31615256e-01,
-    8.97432958e-01, 4.44764250e-01, 8.03776440e-01,
-    2.68767748e-02, 2.43374608e-01, 4.02141103e-01,
-    4.98881209e-01, 5.33173003e-01, 8.82890436e-01,
-    7.16149148e-01, 4.19664401e-01, 2.29335357e-01,
-    2.88637806e-01, 3.44696803e-01, 6.78171906e-01,
-    5.69849716e-01, 5.86454477e-01, 3.54474989e-01,
-    9.03876540e-01, 6.45980000e-01, 6.34887593e-01,
-    7.88039746e-02, 2.04814126e-01, 7.82251754e-01,
-    2.43147074e-01, 7.50951808e-01, 1.72799092e-02,
-    2.95349590e-01, 6.57991826e-01, 8.81214312e-01,
-    5.73970708e-01, 2.77610881e-01, 1.82155097e-01,
-    7.69797417e-02, 6.44792402e-01, 9.46950998e-01,
-    7.73064845e-01, 6.04733624e-01, 5.80094567e-01,
-    1.67498426e-01, 2.66514296e-01, 6.50140368e-01,
-    1.91170299e-01, 2.08752199e-01, 3.01664091e-01,
-    9.85033484e-01, 2.92909152e-01, 8.65816607e-01,
-    1.85222119e-01, 2.28814559e-01, 1.34286382e-02,
-    2.89234322e-01, 8.18668708e-01, 4.71706924e-01,
-    9.23199803e-01, 2.80879188e-01, 1.47319284e-01,
-    4.13915748e-01, 9.31274932e-02, 6.66322195e-01,
-    9.66953974e-01, 3.19405786e-01, 6.69486551e-01,
-    5.03096313e-02, 6.95225201e-01, 5.78469859e-01,
-    6.29481655e-01, 1.39252534e-01, 1.22564968e-01,
-    6.80663678e-01, 6.34607157e-01, 6.42765834e-01,
-    1.57127410e-02, 2.92132086e-01, 5.24423878e-01,
-    4.68676824e-01, 2.86003928e-01, 7.18608322e-01,
-    8.95617933e-01, 5.48844309e-01, 1.74517278e-01,
-    5.24379196e-01, 2.13526524e-01, 5.88375435e-01,
-    9.88560185e-01, 4.17435771e-01, 6.14438688e-01,
-    9.53760881e-01, 5.27151288e-01, 7.03017278e-01,
-    3.44448559e-01, 4.47059676e-01, 2.83414901e-01,
-    1.98979011e-01, 4.24917361e-01, 5.73172761e-01,
-    2.32398853e-02, 1.65887230e-01, 4.05552785e-01,
-    9.29665524e-01, 2.26135696e-01, 9.20563384e-01,
-    7.65259963e-01, 4.54820075e-01, 8.97710267e-01,
-    3.78559302e-03, 9.15219382e-01, 3.55705698e-01,
-    6.94905124e-01, 8.58540202e-01, 3.89790666e-01,
-    2.49478206e-01, 7.93679304e-01, 4.75830027e-01,
-    4.40425353e-01, 3.70579459e-01, 1.40578049e-01,
-    1.70386675e-01, 7.04056121e-01, 4.85963102e-01,
-    9.68450060e-01, 6.77178001e-01, 2.65934654e-01,
-    2.58915007e-01, 6.70052890e-01, 2.61945109e-01,
-    8.46207759e-01, 1.01928951e-01, 2.85611334e-01,
-    2.45776933e-01, 2.66658783e-01, 3.71724077e-01,
-    4.34319025e-01, 4.24407347e-01, 7.15417683e-01,
-    8.07997684e-01, 1.64296275e-01, 6.01638065e-01,
-    8.60606804e-02, 2.68719187e-01, 5.11764101e-01,
-    9.75844338e-01, 7.81226782e-01, 2.20925515e-01,
-    7.18135040e-01, 9.82395577e-01, 8.39160243e-01,
-    9.08058083e-01, 6.88010677e-01, 8.14271847e-01,
-    5.12460821e-01, 1.17311345e-01, 5.96075228e-01,
-    9.17455497e-01, 2.12052706e-01, 7.04074603e-01,
-    8.72872565e-02, 8.76047818e-01, 6.96235046e-01,
-    8.54801557e-01, 2.49729159e-01, 9.76594604e-01,
-    2.87386363e-01, 2.36461559e-02, 9.94075254e-01,
-    4.25193986e-01, 7.61869994e-01, 5.13334255e-01,
-    6.44711165e-02, 8.92156689e-01, 3.55235167e-01,
-    1.08154647e-01, 8.78446825e-01, 2.43833016e-01,
-    9.23071293e-01, 2.72724115e-01, 9.46631338e-01,
-    3.74510294e-01, 4.08451278e-02, 9.78392777e-01,
-    3.65079221e-01, 6.37199516e-01, 5.51144906e-01,
-    5.25978080e-01, 1.42803678e-01, 4.05451674e-01,
-    7.79788219e-01, 6.26009784e-01, 3.35249497e-01,
-    1.43159543e-02, 1.80363779e-01, 5.05096904e-01,
-    2.82619947e-01, 5.83561392e-01, 3.10951324e-01,
-    8.73223968e-01, 4.38545619e-01, 4.81348800e-01,
-    6.68497085e-01, 3.79345401e-01, 9.58832501e-01,
-    1.89869550e-01, 2.34083070e-01, 2.94066207e-01,
-    5.74892667e-02, 6.92106828e-02, 9.61127686e-02,
-    6.72650672e-02, 8.47345378e-01, 2.80916761e-01,
-    7.32177357e-03, 9.80785961e-01, 5.73192225e-02,
-    8.48781331e-01, 8.83225408e-01, 7.34398275e-01,
-    7.70381941e-01, 6.20778343e-01, 8.96822048e-01,
-    5.40732486e-01, 3.69704071e-01, 5.77305837e-01,
-    2.08221827e-01, 7.34275341e-01, 1.06110900e-01,
-    3.49496706e-01, 8.34948910e-01, 1.56403291e-02,
-    6.78576376e-01, 8.96141268e-01, 5.94835119e-01,
-    1.43943153e-01, 3.49618530e-01, 2.10440392e-01,
-    3.46585620e-01, 1.05153093e-01, 3.45446174e-01,
-    2.72177079e-01, 7.07946300e-01, 4.33717726e-02,
-    3.31232203e-01, 3.91874320e-01, 4.76338141e-01,
-    6.22777789e-01, 2.95989228e-02, 4.32855769e-01,
-    7.61049310e-01, 3.63279149e-01, 9.47210350e-01,
-    6.43721247e-01, 6.58025802e-01, 1.05247633e-02,
-    5.29974442e-01, 7.30675767e-01, 4.30041079e-01,
-    6.62634841e-01, 8.25936616e-01, 9.91253704e-01,
-    6.79399281e-01, 5.44177006e-01, 7.52876048e-01,
-    3.32139049e-01, 7.98732398e-01, 7.38865223e-01,
-    9.16055132e-01, 6.11736493e-01, 9.63672879e-01,
-    1.83778839e-01, 7.27558919e-02, 5.91602822e-01,
-    3.25235484e-01, 2.34741217e-01, 9.52346277e-01,
-    9.18556407e-01, 9.35373324e-01, 6.89209070e-01,
-    2.56049054e-01, 6.17975395e-01, 7.82285691e-01,
-    9.84983432e-01, 6.62322741e-01, 2.04144457e-01,
-    3.98446577e-01, 1.38918297e-01, 3.05919921e-01,
-    3.14043787e-01, 5.91072666e-01, 7.44703771e-01,
-    8.92272567e-01, 9.78017873e-01, 9.01203161e-01,
-    1.41526372e-01, 4.14878484e-01, 6.80683651e-01,
-    5.01733152e-02, 8.14635389e-01, 2.27926375e-01,
-    9.03269815e-01, 8.68443745e-01, 9.86939190e-01,
-    7.40779486e-01, 2.61005311e-01, 3.19276232e-01,
-    9.69509248e-01, 1.11908818e-01, 4.49198556e-01,
-    1.27056715e-01, 3.84064823e-01, 5.14591811e-01,
-    2.10747488e-01, 9.53884090e-01, 8.43167950e-01,
-    4.51187972e-01, 3.75331782e-01, 6.23566461e-01,
-    3.55290379e-01, 2.95705968e-01, 1.69622690e-01,
-    1.42981830e-01, 2.72180991e-01, 9.46468040e-01,
-    3.70932500e-01, 9.94292830e-01, 4.62587505e-01,
-    7.14817405e-01, 2.45370540e-02, 3.00906377e-01,
-    5.75768304e-01, 9.71448393e-01, 6.95574827e-02,
-    3.93693854e-01, 5.29306116e-01, 5.04694554e-01,
-    6.73797120e-02, 6.76596969e-01, 5.50948898e-01,
-    3.24909641e-01, 7.70337719e-01, 6.51842631e-03,
-    3.03264879e-01, 7.61037886e-03, 2.72289601e-01,
-    1.50502041e-01, 6.71103888e-02, 7.41503703e-01,
-    1.92088941e-01, 2.19043977e-01, 9.09320161e-01,
-    2.37993569e-01, 6.18107973e-02, 8.31447852e-01,
-    2.23355609e-01, 1.84789435e-01, 4.16104518e-01,
-    4.21573859e-01, 8.72446305e-02, 2.97294197e-01,
-    4.50328256e-01, 8.72199917e-01, 2.51279916e-01,
-    4.86219272e-01, 7.57071329e-01, 4.85655942e-01,
-    1.06187277e-01, 4.92341327e-01, 1.46017513e-01,
-    5.25421017e-01, 4.22637906e-01, 2.24685018e-01,
-    8.72648431e-01, 5.54051490e-01, 1.80745062e-01,
-    2.12756336e-01, 5.20883169e-01, 7.60363654e-01,
-    8.30254678e-01, 5.00003328e-01, 4.69017439e-01,
-    6.38105527e-01, 3.50638261e-02, 5.22217353e-02,
-    9.06516882e-02, 8.52975842e-01, 1.19985883e-01,
-    3.74926753e-01, 6.50302066e-01, 1.98875727e-01,
-    6.28362507e-02, 4.32693501e-01, 3.10500685e-01,
-    6.20732833e-01, 4.58503272e-01, 3.20790034e-01,
-    7.91284868e-01, 7.93054570e-01, 2.93406765e-01,
-    8.95399023e-01, 1.06441034e-01, 7.53085241e-02,
-    8.67523104e-01, 1.47963482e-01, 1.25584706e-01,
-    3.81545040e-02, 6.34338619e-01, 1.76368938e-02,
-    5.75553531e-02, 5.31607516e-01, 2.63869588e-01,
-    9.41945823e-01, 9.24028838e-02, 5.21496463e-01,
-    7.74866558e-01, 5.65210610e-01, 7.28015327e-02,
-    6.51963790e-01, 8.94727453e-01, 4.49571590e-01,
-    1.29932405e-01, 8.64026259e-01, 9.92599934e-01,
-    7.43721560e-01, 8.87300215e-01, 1.06369925e-01,
-    8.11335531e-01, 7.87734900e-01, 9.87344678e-01,
-    5.32502820e-01, 4.42612382e-01, 9.64041183e-01,
-    1.66085871e-01, 1.12937664e-01, 5.24423470e-01,
-    6.54689333e-01, 4.59119726e-01, 5.22774091e-01,
-    3.08722276e-02, 6.26979315e-01, 4.49754105e-01,
-    8.07495757e-01, 2.34199499e-01, 1.67765675e-01,
-    9.22168418e-01, 3.73210378e-01, 8.04432575e-01,
-    5.61890354e-01, 4.47025593e-01, 6.43155678e-01,
-    2.40407640e-01, 5.91631279e-01, 1.59369206e-01,
-    7.75799090e-01, 8.32067212e-01, 5.59791576e-02,
-    6.39105224e-01, 4.85274738e-01, 2.12630838e-01,
-    2.81431312e-02, 7.16205363e-01, 6.83885011e-01,
-    5.23869697e-01, 9.99418314e-01, 8.35331599e-01,
-    4.69877463e-02, 6.74712562e-01, 7.99273684e-01,
-    2.77001890e-02, 5.75809742e-01, 2.78513031e-01,
-    8.36209905e-01, 7.25472379e-01, 4.87173943e-01,
-    7.88311357e-01, 9.64676177e-01, 1.75752651e-01,
-    4.98112580e-01, 8.08850418e-02, 6.40981131e-01,
-    4.06647450e-01, 8.46539387e-01, 2.12620694e-01,
-    9.11012851e-01, 8.25041445e-01, 8.90065575e-01,
-    9.63626055e-01, 5.96689242e-01, 1.63372670e-01,
-    4.51640148e-01, 3.43026542e-01, 5.80658851e-01,
-    2.82327625e-01, 4.75535418e-01, 6.27760926e-01,
-    8.46314115e-01, 9.61961932e-01, 3.19806094e-01,
-    5.05508062e-01, 5.28102944e-01, 6.13045057e-01,
-    7.44714938e-01, 1.50586073e-01, 7.91878033e-01,
-    4.89839179e-01, 3.10496849e-01, 8.82309038e-01,
-    2.86922314e-01, 4.84687559e-01, 5.20838630e-01,
-    4.62955493e-01, 2.38185305e-01, 5.47259907e-02,
-    7.10916137e-01, 7.31887202e-01, 6.25602317e-01,
-    8.77741168e-01, 4.19881322e-01, 4.81222328e-01,
-    1.28224501e-01, 2.46034010e-01, 3.34971854e-01,
-    7.37216484e-01, 5.62134821e-02, 7.14089724e-01,
-    9.85549393e-01, 4.66295827e-01, 3.08722434e-03,
-    4.70237690e-01, 2.66524167e-01, 7.93875484e-01,
-    4.54795911e-02, 8.09702944e-01, 1.47709735e-02,
-    1.70082405e-01, 6.35905179e-01, 3.75379109e-01,
-    4.30315011e-01, 3.15788760e-01, 5.58065230e-01,
-    2.24643800e-01, 2.42142981e-01, 6.57283636e-01,
-    3.34921891e-01, 1.26588975e-01, 7.68064155e-01,
-    9.43856291e-01, 4.47518596e-01, 5.44453573e-01,
-    9.95764932e-01, 7.16444391e-01, 8.51019765e-01,
-    1.01179183e-01, 4.45473958e-01, 4.60327322e-01,
-    4.96895844e-02, 4.72907738e-01, 5.58987444e-01,
-    3.41027487e-01, 1.56175026e-01, 7.58283148e-01,
-    6.83600909e-01, 2.14623396e-01, 3.27348880e-01,
-    3.92517893e-01, 6.70418431e-01, 5.16440832e-01,
-    8.63140348e-01, 5.73277464e-01, 3.46608058e-01,
-    7.39396341e-01, 7.20852434e-01, 2.35653246e-02,
-    3.89935659e-01, 7.53783745e-01, 6.34563528e-01,
-    8.79339335e-01, 7.41599159e-02, 5.62433904e-01,
-    6.15553852e-01, 4.56956324e-01, 5.20047447e-01,
-    5.26845015e-02, 5.58471266e-01, 1.63632233e-01,
-    5.38936665e-02, 6.49593683e-01, 2.56838748e-01,
-    8.99035326e-01, 7.20847756e-01, 5.68954684e-01,
-    7.43684755e-01, 5.70924238e-01, 3.82318724e-01,
-    4.89328290e-01, 5.62208561e-01, 4.97540804e-02,
-    4.18011085e-01, 6.88041565e-01, 2.16234653e-01,
-    7.89548214e-01, 8.46136387e-01, 8.46816189e-01,
-    1.73842353e-01, 6.11627842e-02, 8.44440559e-01,
-    4.50646654e-01, 3.74785037e-01, 4.87196697e-01,
-    4.56276448e-01, 9.13284391e-01, 4.15715464e-01,
-    7.13597697e-01, 1.23641270e-02, 5.10031271e-01,
-    4.74601930e-02, 2.55731159e-01, 3.22090006e-01,
-    1.91165703e-01, 4.51170940e-01, 7.50843157e-01,
-    4.42420576e-01, 4.25380660e-01, 4.50667257e-01,
-    6.55689206e-01, 9.68257670e-02, 1.96528793e-01,
-    8.97343028e-01, 4.99940904e-01, 6.65504083e-01,
-    9.41828079e-01, 4.54397338e-01, 5.61893331e-01,
-    5.09839880e-01, 4.53117514e-01, 8.96804127e-02,
-    1.74888861e-01, 6.65641378e-01, 2.81668336e-01,
-    1.89532742e-01, 5.61668382e-01, 8.68330157e-02,
-    8.25092797e-01, 5.18106324e-01, 1.71904024e-01,
-    3.68385523e-01, 1.62005436e-01, 7.48507399e-01,
-    9.30274827e-01, 2.38198517e-01, 9.52222901e-01,
-    5.23587800e-01, 6.94384557e-01, 1.09338652e-01,
-    4.83356794e-01, 2.73050402e-01, 3.68027050e-01,
-    5.92366466e-01, 1.83192289e-01, 8.60376029e-01,
-    7.13926203e-01, 8.16750052e-01, 1.57890291e-01,
-    6.25691951e-01, 5.24831646e-01, 1.73873797e-01,
-    1.02429784e-01, 9.17488471e-01, 4.03584434e-01,
-    9.31170884e-01, 2.79386137e-01, 8.77745206e-01,
-    2.45200576e-01, 1.28896951e-01, 3.15713052e-01,
-    5.27874291e-01, 2.16444335e-01, 7.03883817e-01,
-    7.74738919e-02, 8.42422142e-01, 3.75598924e-01,
-    3.51002411e-01, 6.22752776e-01, 4.82407943e-01,
-    7.43107867e-01, 9.46182666e-01, 9.44344819e-01,
-    3.28124763e-01, 1.06147431e-01, 1.65102684e-01,
-    3.84060507e-01, 2.91057722e-01, 7.68173662e-02,
-    1.03543651e-01, 6.76698940e-01, 1.43141994e-01,
-    7.21342202e-01, 6.69471294e-03, 9.07298311e-01,
-    5.57080171e-01, 8.10954489e-01, 4.11120526e-01,
-    2.06407453e-01, 2.59590556e-01, 7.58512718e-01,
-    5.79873897e-01, 2.92875650e-01, 2.83686529e-01,
-    2.42829343e-01, 9.19323719e-01, 3.46832864e-01,
-    3.58238858e-01, 7.42827585e-01, 2.05760059e-01,
-    9.58438860e-01, 5.66326411e-01, 6.60292846e-01,
-    5.61095078e-02, 6.79465531e-01, 7.05118513e-01,
-    4.44713264e-01, 2.09732933e-01, 5.22732436e-01,
-    1.74396512e-01, 5.29356748e-01, 4.38475687e-01,
-    4.94036404e-01, 4.09785794e-01, 6.40025507e-01,
-    5.79371821e-01, 1.57726118e-01, 6.04572263e-01,
-    5.41072639e-01, 5.18847173e-01, 1.97093284e-01,
-    8.91767002e-01, 4.29050835e-01, 8.25490570e-01,
-    3.87699807e-01, 4.50705808e-01, 2.49371643e-01,
-    3.36074898e-01, 9.29925118e-01, 6.65393649e-01,
-    9.07275994e-01, 3.73075859e-01, 4.14044139e-03,
-    2.37463702e-01, 2.25893784e-01, 2.46900245e-01,
-    4.50350196e-01, 3.48618117e-01, 5.07193932e-01,
-    5.23435142e-01, 8.13611417e-01, 8.92715622e-01,
-    1.02623450e-01, 3.06088345e-01, 7.80461650e-01,
-    2.21453645e-01, 2.01419652e-01, 2.84254457e-01,
-    3.68286735e-01, 7.39358243e-01, 8.97879394e-01,
-    9.81599566e-01, 7.56526442e-01, 7.37645545e-01,
-    4.23976657e-02, 8.25922012e-01, 2.60956996e-01,
-    2.90702065e-01, 8.98388344e-01, 3.03733299e-01,
-    8.49071471e-01, 3.45835425e-01, 7.65458276e-01,
-    5.68094872e-01, 8.93770930e-01, 9.93161641e-01,
-    5.63368667e-02, 4.26548945e-01, 5.46745780e-01,
-    5.75674571e-01, 7.94599487e-01, 7.18935553e-02,
-    4.46492976e-01, 6.40240123e-01, 2.73246969e-01,
-    2.00465968e-01, 1.30718835e-01, 1.92492005e-01,
-    1.96617189e-01, 6.61271644e-01, 8.12687657e-01,
-    8.66342445e-01
+   {6.26168372e-01, 9.30437651e-01, 6.02450208e-01, 2.73025296e-01, 9.53050619e-01, 3.32164396e-01,
+    6.88942598e-01, 5.79163537e-01, 6.70341547e-01, 2.70140602e-02, 9.30429671e-01, 7.17721157e-01,
+    9.89948537e-01, 7.75253347e-01, 1.34491522e-02, 2.48522428e-02, 3.51413378e-01, 7.64405834e-01,
+    7.86373507e-01, 7.18748577e-01, 8.66998621e-01, 6.80316582e-01, 2.51288712e-01, 4.91078420e-01,
+    3.76246281e-01, 4.86828710e-01, 5.67464772e-01, 5.30734742e-01, 8.99478296e-01, 7.66699088e-01,
+    9.49339111e-01, 3.55248484e-01, 9.06046929e-01, 4.48407772e-01, 6.96395305e-01, 2.44277335e-01,
+    7.74840000e-01, 5.21046603e-01, 4.66423971e-02, 5.12019638e-02, 8.95019614e-01, 5.28956953e-01,
+    4.31536306e-01, 5.83857744e-01, 4.41787364e-01, 4.68656523e-01, 5.73971433e-01, 6.79989654e-01,
+    3.19650588e-01, 6.12579596e-01, 6.49126442e-02, 8.39131142e-01, 2.85252117e-01, 5.84848929e-01,
+    9.46507115e-01, 8.58440748e-01, 3.61528940e-01, 2.44215959e-01, 3.80101125e-01, 4.57128957e-02,
+    8.82216988e-01, 8.31498633e-01, 7.23474381e-01, 7.75788607e-01, 1.40864146e-01, 6.62092382e-01,
+    5.13985168e-01, 3.00686418e-01, 8.70109949e-01, 2.43187753e-01, 2.89391938e-01, 2.84214238e-01,
+    8.70985521e-01, 8.77491176e-01, 6.72537226e-01, 3.30929686e-01, 1.85934324e-01, 9.16222614e-01,
+    6.18239142e-01, 2.64768597e-01, 5.76145451e-01, 8.62961369e-01, 6.84757925e-01, 7.60549082e-01,
+    1.27645356e-01, 4.51004673e-01, 3.92292980e-01, 4.63170803e-01, 4.35449330e-02, 2.17583404e-01,
+    5.71832605e-02, 2.06763039e-01, 3.70116249e-01, 2.09750028e-01, 6.17283019e-01, 8.62549231e-01,
+    9.84156240e-02, 2.66249156e-01, 3.87635103e-01, 2.85591012e-02, 4.24826068e-01, 4.45795088e-01,
+    6.86227676e-01, 1.08848960e-01, 5.96731841e-02, 3.71770228e-01, 1.91548833e-01, 6.95136078e-01,
+    9.00700636e-01, 8.76363105e-01, 2.67334632e-01, 1.80619709e-01, 7.94060419e-01, 1.42854171e-02,
+    1.09372387e-01, 8.74028108e-01, 6.46403232e-01, 4.86588834e-01, 5.93446175e-02, 6.11886291e-01,
+    8.83865057e-01, 3.15879821e-01, 2.27043992e-01, 9.76764951e-01, 6.15620336e-01, 9.76199360e-01,
+    2.40548962e-01, 3.21795663e-01, 8.75087904e-02, 8.11234663e-01, 6.96070480e-01, 8.12062321e-01,
+    1.21958818e-01, 3.44348628e-02, 8.72630414e-01, 3.06162776e-01, 1.76043529e-02, 9.45894971e-01,
+    5.33896401e-01, 6.21642973e-01, 4.93062535e-01, 4.48984262e-01, 2.24560379e-01, 4.24052195e-02,
+    4.43447610e-01, 8.95646149e-01, 6.05220676e-01, 1.81840491e-01, 9.70831206e-01, 2.12563586e-02,
+    6.92582693e-01, 7.55946922e-01, 7.95086143e-01, 6.05328941e-01, 3.99350764e-01, 4.32846636e-01,
+    9.81114529e-01, 4.98266428e-01, 6.37127930e-03, 1.59085889e-01, 6.34682067e-05, 5.59429440e-01,
+    7.38827633e-01, 8.93214770e-01, 2.16494306e-01, 9.35430573e-02, 4.75665868e-02, 7.80503518e-01,
+    7.86240041e-01, 7.06854594e-01, 2.13725879e-02, 7.68246091e-01, 4.50234808e-01, 5.21231104e-01,
+    5.01989826e-03, 4.22081572e-02, 1.65337732e-01, 8.54134740e-01, 4.99430262e-01, 8.94525601e-01,
+    1.14028379e-01, 3.69739861e-01, 1.32955599e-01, 2.65563824e-01, 2.52811151e-01, 1.44792843e-01,
+    6.88449594e-01, 4.44921417e-01, 8.23296587e-01, 1.93266317e-01, 1.19033309e-01, 1.36368966e-01,
+    3.42600285e-01, 5.64505195e-01, 5.57594559e-01, 7.44257892e-01, 8.38231569e-02, 4.11548847e-01,
+    3.21010077e-01, 8.55081359e-01, 4.30105779e-01, 1.16229135e-01, 9.87731964e-02, 3.14712335e-01,
+    4.50880592e-01, 2.72289598e-01, 6.31615256e-01, 8.97432958e-01, 4.44764250e-01, 8.03776440e-01,
+    2.68767748e-02, 2.43374608e-01, 4.02141103e-01, 4.98881209e-01, 5.33173003e-01, 8.82890436e-01,
+    7.16149148e-01, 4.19664401e-01, 2.29335357e-01, 2.88637806e-01, 3.44696803e-01, 6.78171906e-01,
+    5.69849716e-01, 5.86454477e-01, 3.54474989e-01, 9.03876540e-01, 6.45980000e-01, 6.34887593e-01,
+    7.88039746e-02, 2.04814126e-01, 7.82251754e-01, 2.43147074e-01, 7.50951808e-01, 1.72799092e-02,
+    2.95349590e-01, 6.57991826e-01, 8.81214312e-01, 5.73970708e-01, 2.77610881e-01, 1.82155097e-01,
+    7.69797417e-02, 6.44792402e-01, 9.46950998e-01, 7.73064845e-01, 6.04733624e-01, 5.80094567e-01,
+    1.67498426e-01, 2.66514296e-01, 6.50140368e-01, 1.91170299e-01, 2.08752199e-01, 3.01664091e-01,
+    9.85033484e-01, 2.92909152e-01, 8.65816607e-01, 1.85222119e-01, 2.28814559e-01, 1.34286382e-02,
+    2.89234322e-01, 8.18668708e-01, 4.71706924e-01, 9.23199803e-01, 2.80879188e-01, 1.47319284e-01,
+    4.13915748e-01, 9.31274932e-02, 6.66322195e-01, 9.66953974e-01, 3.19405786e-01, 6.69486551e-01,
+    5.03096313e-02, 6.95225201e-01, 5.78469859e-01, 6.29481655e-01, 1.39252534e-01, 1.22564968e-01,
+    6.80663678e-01, 6.34607157e-01, 6.42765834e-01, 1.57127410e-02, 2.92132086e-01, 5.24423878e-01,
+    4.68676824e-01, 2.86003928e-01, 7.18608322e-01, 8.95617933e-01, 5.48844309e-01, 1.74517278e-01,
+    5.24379196e-01, 2.13526524e-01, 5.88375435e-01, 9.88560185e-01, 4.17435771e-01, 6.14438688e-01,
+    9.53760881e-01, 5.27151288e-01, 7.03017278e-01, 3.44448559e-01, 4.47059676e-01, 2.83414901e-01,
+    1.98979011e-01, 4.24917361e-01, 5.73172761e-01, 2.32398853e-02, 1.65887230e-01, 4.05552785e-01,
+    9.29665524e-01, 2.26135696e-01, 9.20563384e-01, 7.65259963e-01, 4.54820075e-01, 8.97710267e-01,
+    3.78559302e-03, 9.15219382e-01, 3.55705698e-01, 6.94905124e-01, 8.58540202e-01, 3.89790666e-01,
+    2.49478206e-01, 7.93679304e-01, 4.75830027e-01, 4.40425353e-01, 3.70579459e-01, 1.40578049e-01,
+    1.70386675e-01, 7.04056121e-01, 4.85963102e-01, 9.68450060e-01, 6.77178001e-01, 2.65934654e-01,
+    2.58915007e-01, 6.70052890e-01, 2.61945109e-01, 8.46207759e-01, 1.01928951e-01, 2.85611334e-01,
+    2.45776933e-01, 2.66658783e-01, 3.71724077e-01, 4.34319025e-01, 4.24407347e-01, 7.15417683e-01,
+    8.07997684e-01, 1.64296275e-01, 6.01638065e-01, 8.60606804e-02, 2.68719187e-01, 5.11764101e-01,
+    9.75844338e-01, 7.81226782e-01, 2.20925515e-01, 7.18135040e-01, 9.82395577e-01, 8.39160243e-01,
+    9.08058083e-01, 6.88010677e-01, 8.14271847e-01, 5.12460821e-01, 1.17311345e-01, 5.96075228e-01,
+    9.17455497e-01, 2.12052706e-01, 7.04074603e-01, 8.72872565e-02, 8.76047818e-01, 6.96235046e-01,
+    8.54801557e-01, 2.49729159e-01, 9.76594604e-01, 2.87386363e-01, 2.36461559e-02, 9.94075254e-01,
+    4.25193986e-01, 7.61869994e-01, 5.13334255e-01, 6.44711165e-02, 8.92156689e-01, 3.55235167e-01,
+    1.08154647e-01, 8.78446825e-01, 2.43833016e-01, 9.23071293e-01, 2.72724115e-01, 9.46631338e-01,
+    3.74510294e-01, 4.08451278e-02, 9.78392777e-01, 3.65079221e-01, 6.37199516e-01, 5.51144906e-01,
+    5.25978080e-01, 1.42803678e-01, 4.05451674e-01, 7.79788219e-01, 6.26009784e-01, 3.35249497e-01,
+    1.43159543e-02, 1.80363779e-01, 5.05096904e-01, 2.82619947e-01, 5.83561392e-01, 3.10951324e-01,
+    8.73223968e-01, 4.38545619e-01, 4.81348800e-01, 6.68497085e-01, 3.79345401e-01, 9.58832501e-01,
+    1.89869550e-01, 2.34083070e-01, 2.94066207e-01, 5.74892667e-02, 6.92106828e-02, 9.61127686e-02,
+    6.72650672e-02, 8.47345378e-01, 2.80916761e-01, 7.32177357e-03, 9.80785961e-01, 5.73192225e-02,
+    8.48781331e-01, 8.83225408e-01, 7.34398275e-01, 7.70381941e-01, 6.20778343e-01, 8.96822048e-01,
+    5.40732486e-01, 3.69704071e-01, 5.77305837e-01, 2.08221827e-01, 7.34275341e-01, 1.06110900e-01,
+    3.49496706e-01, 8.34948910e-01, 1.56403291e-02, 6.78576376e-01, 8.96141268e-01, 5.94835119e-01,
+    1.43943153e-01, 3.49618530e-01, 2.10440392e-01, 3.46585620e-01, 1.05153093e-01, 3.45446174e-01,
+    2.72177079e-01, 7.07946300e-01, 4.33717726e-02, 3.31232203e-01, 3.91874320e-01, 4.76338141e-01,
+    6.22777789e-01, 2.95989228e-02, 4.32855769e-01, 7.61049310e-01, 3.63279149e-01, 9.47210350e-01,
+    6.43721247e-01, 6.58025802e-01, 1.05247633e-02, 5.29974442e-01, 7.30675767e-01, 4.30041079e-01,
+    6.62634841e-01, 8.25936616e-01, 9.91253704e-01, 6.79399281e-01, 5.44177006e-01, 7.52876048e-01,
+    3.32139049e-01, 7.98732398e-01, 7.38865223e-01, 9.16055132e-01, 6.11736493e-01, 9.63672879e-01,
+    1.83778839e-01, 7.27558919e-02, 5.91602822e-01, 3.25235484e-01, 2.34741217e-01, 9.52346277e-01,
+    9.18556407e-01, 9.35373324e-01, 6.89209070e-01, 2.56049054e-01, 6.17975395e-01, 7.82285691e-01,
+    9.84983432e-01, 6.62322741e-01, 2.04144457e-01, 3.98446577e-01, 1.38918297e-01, 3.05919921e-01,
+    3.14043787e-01, 5.91072666e-01, 7.44703771e-01, 8.92272567e-01, 9.78017873e-01, 9.01203161e-01,
+    1.41526372e-01, 4.14878484e-01, 6.80683651e-01, 5.01733152e-02, 8.14635389e-01, 2.27926375e-01,
+    9.03269815e-01, 8.68443745e-01, 9.86939190e-01, 7.40779486e-01, 2.61005311e-01, 3.19276232e-01,
+    9.69509248e-01, 1.11908818e-01, 4.49198556e-01, 1.27056715e-01, 3.84064823e-01, 5.14591811e-01,
+    2.10747488e-01, 9.53884090e-01, 8.43167950e-01, 4.51187972e-01, 3.75331782e-01, 6.23566461e-01,
+    3.55290379e-01, 2.95705968e-01, 1.69622690e-01, 1.42981830e-01, 2.72180991e-01, 9.46468040e-01,
+    3.70932500e-01, 9.94292830e-01, 4.62587505e-01, 7.14817405e-01, 2.45370540e-02, 3.00906377e-01,
+    5.75768304e-01, 9.71448393e-01, 6.95574827e-02, 3.93693854e-01, 5.29306116e-01, 5.04694554e-01,
+    6.73797120e-02, 6.76596969e-01, 5.50948898e-01, 3.24909641e-01, 7.70337719e-01, 6.51842631e-03,
+    3.03264879e-01, 7.61037886e-03, 2.72289601e-01, 1.50502041e-01, 6.71103888e-02, 7.41503703e-01,
+    1.92088941e-01, 2.19043977e-01, 9.09320161e-01, 2.37993569e-01, 6.18107973e-02, 8.31447852e-01,
+    2.23355609e-01, 1.84789435e-01, 4.16104518e-01, 4.21573859e-01, 8.72446305e-02, 2.97294197e-01,
+    4.50328256e-01, 8.72199917e-01, 2.51279916e-01, 4.86219272e-01, 7.57071329e-01, 4.85655942e-01,
+    1.06187277e-01, 4.92341327e-01, 1.46017513e-01, 5.25421017e-01, 4.22637906e-01, 2.24685018e-01,
+    8.72648431e-01, 5.54051490e-01, 1.80745062e-01, 2.12756336e-01, 5.20883169e-01, 7.60363654e-01,
+    8.30254678e-01, 5.00003328e-01, 4.69017439e-01, 6.38105527e-01, 3.50638261e-02, 5.22217353e-02,
+    9.06516882e-02, 8.52975842e-01, 1.19985883e-01, 3.74926753e-01, 6.50302066e-01, 1.98875727e-01,
+    6.28362507e-02, 4.32693501e-01, 3.10500685e-01, 6.20732833e-01, 4.58503272e-01, 3.20790034e-01,
+    7.91284868e-01, 7.93054570e-01, 2.93406765e-01, 8.95399023e-01, 1.06441034e-01, 7.53085241e-02,
+    8.67523104e-01, 1.47963482e-01, 1.25584706e-01, 3.81545040e-02, 6.34338619e-01, 1.76368938e-02,
+    5.75553531e-02, 5.31607516e-01, 2.63869588e-01, 9.41945823e-01, 9.24028838e-02, 5.21496463e-01,
+    7.74866558e-01, 5.65210610e-01, 7.28015327e-02, 6.51963790e-01, 8.94727453e-01, 4.49571590e-01,
+    1.29932405e-01, 8.64026259e-01, 9.92599934e-01, 7.43721560e-01, 8.87300215e-01, 1.06369925e-01,
+    8.11335531e-01, 7.87734900e-01, 9.87344678e-01, 5.32502820e-01, 4.42612382e-01, 9.64041183e-01,
+    1.66085871e-01, 1.12937664e-01, 5.24423470e-01, 6.54689333e-01, 4.59119726e-01, 5.22774091e-01,
+    3.08722276e-02, 6.26979315e-01, 4.49754105e-01, 8.07495757e-01, 2.34199499e-01, 1.67765675e-01,
+    9.22168418e-01, 3.73210378e-01, 8.04432575e-01, 5.61890354e-01, 4.47025593e-01, 6.43155678e-01,
+    2.40407640e-01, 5.91631279e-01, 1.59369206e-01, 7.75799090e-01, 8.32067212e-01, 5.59791576e-02,
+    6.39105224e-01, 4.85274738e-01, 2.12630838e-01, 2.81431312e-02, 7.16205363e-01, 6.83885011e-01,
+    5.23869697e-01, 9.99418314e-01, 8.35331599e-01, 4.69877463e-02, 6.74712562e-01, 7.99273684e-01,
+    2.77001890e-02, 5.75809742e-01, 2.78513031e-01, 8.36209905e-01, 7.25472379e-01, 4.87173943e-01,
+    7.88311357e-01, 9.64676177e-01, 1.75752651e-01, 4.98112580e-01, 8.08850418e-02, 6.40981131e-01,
+    4.06647450e-01, 8.46539387e-01, 2.12620694e-01, 9.11012851e-01, 8.25041445e-01, 8.90065575e-01,
+    9.63626055e-01, 5.96689242e-01, 1.63372670e-01, 4.51640148e-01, 3.43026542e-01, 5.80658851e-01,
+    2.82327625e-01, 4.75535418e-01, 6.27760926e-01, 8.46314115e-01, 9.61961932e-01, 3.19806094e-01,
+    5.05508062e-01, 5.28102944e-01, 6.13045057e-01, 7.44714938e-01, 1.50586073e-01, 7.91878033e-01,
+    4.89839179e-01, 3.10496849e-01, 8.82309038e-01, 2.86922314e-01, 4.84687559e-01, 5.20838630e-01,
+    4.62955493e-01, 2.38185305e-01, 5.47259907e-02, 7.10916137e-01, 7.31887202e-01, 6.25602317e-01,
+    8.77741168e-01, 4.19881322e-01, 4.81222328e-01, 1.28224501e-01, 2.46034010e-01, 3.34971854e-01,
+    7.37216484e-01, 5.62134821e-02, 7.14089724e-01, 9.85549393e-01, 4.66295827e-01, 3.08722434e-03,
+    4.70237690e-01, 2.66524167e-01, 7.93875484e-01, 4.54795911e-02, 8.09702944e-01, 1.47709735e-02,
+    1.70082405e-01, 6.35905179e-01, 3.75379109e-01, 4.30315011e-01, 3.15788760e-01, 5.58065230e-01,
+    2.24643800e-01, 2.42142981e-01, 6.57283636e-01, 3.34921891e-01, 1.26588975e-01, 7.68064155e-01,
+    9.43856291e-01, 4.47518596e-01, 5.44453573e-01, 9.95764932e-01, 7.16444391e-01, 8.51019765e-01,
+    1.01179183e-01, 4.45473958e-01, 4.60327322e-01, 4.96895844e-02, 4.72907738e-01, 5.58987444e-01,
+    3.41027487e-01, 1.56175026e-01, 7.58283148e-01, 6.83600909e-01, 2.14623396e-01, 3.27348880e-01,
+    3.92517893e-01, 6.70418431e-01, 5.16440832e-01, 8.63140348e-01, 5.73277464e-01, 3.46608058e-01,
+    7.39396341e-01, 7.20852434e-01, 2.35653246e-02, 3.89935659e-01, 7.53783745e-01, 6.34563528e-01,
+    8.79339335e-01, 7.41599159e-02, 5.62433904e-01, 6.15553852e-01, 4.56956324e-01, 5.20047447e-01,
+    5.26845015e-02, 5.58471266e-01, 1.63632233e-01, 5.38936665e-02, 6.49593683e-01, 2.56838748e-01,
+    8.99035326e-01, 7.20847756e-01, 5.68954684e-01, 7.43684755e-01, 5.70924238e-01, 3.82318724e-01,
+    4.89328290e-01, 5.62208561e-01, 4.97540804e-02, 4.18011085e-01, 6.88041565e-01, 2.16234653e-01,
+    7.89548214e-01, 8.46136387e-01, 8.46816189e-01, 1.73842353e-01, 6.11627842e-02, 8.44440559e-01,
+    4.50646654e-01, 3.74785037e-01, 4.87196697e-01, 4.56276448e-01, 9.13284391e-01, 4.15715464e-01,
+    7.13597697e-01, 1.23641270e-02, 5.10031271e-01, 4.74601930e-02, 2.55731159e-01, 3.22090006e-01,
+    1.91165703e-01, 4.51170940e-01, 7.50843157e-01, 4.42420576e-01, 4.25380660e-01, 4.50667257e-01,
+    6.55689206e-01, 9.68257670e-02, 1.96528793e-01, 8.97343028e-01, 4.99940904e-01, 6.65504083e-01,
+    9.41828079e-01, 4.54397338e-01, 5.61893331e-01, 5.09839880e-01, 4.53117514e-01, 8.96804127e-02,
+    1.74888861e-01, 6.65641378e-01, 2.81668336e-01, 1.89532742e-01, 5.61668382e-01, 8.68330157e-02,
+    8.25092797e-01, 5.18106324e-01, 1.71904024e-01, 3.68385523e-01, 1.62005436e-01, 7.48507399e-01,
+    9.30274827e-01, 2.38198517e-01, 9.52222901e-01, 5.23587800e-01, 6.94384557e-01, 1.09338652e-01,
+    4.83356794e-01, 2.73050402e-01, 3.68027050e-01, 5.92366466e-01, 1.83192289e-01, 8.60376029e-01,
+    7.13926203e-01, 8.16750052e-01, 1.57890291e-01, 6.25691951e-01, 5.24831646e-01, 1.73873797e-01,
+    1.02429784e-01, 9.17488471e-01, 4.03584434e-01, 9.31170884e-01, 2.79386137e-01, 8.77745206e-01,
+    2.45200576e-01, 1.28896951e-01, 3.15713052e-01, 5.27874291e-01, 2.16444335e-01, 7.03883817e-01,
+    7.74738919e-02, 8.42422142e-01, 3.75598924e-01, 3.51002411e-01, 6.22752776e-01, 4.82407943e-01,
+    7.43107867e-01, 9.46182666e-01, 9.44344819e-01, 3.28124763e-01, 1.06147431e-01, 1.65102684e-01,
+    3.84060507e-01, 2.91057722e-01, 7.68173662e-02, 1.03543651e-01, 6.76698940e-01, 1.43141994e-01,
+    7.21342202e-01, 6.69471294e-03, 9.07298311e-01, 5.57080171e-01, 8.10954489e-01, 4.11120526e-01,
+    2.06407453e-01, 2.59590556e-01, 7.58512718e-01, 5.79873897e-01, 2.92875650e-01, 2.83686529e-01,
+    2.42829343e-01, 9.19323719e-01, 3.46832864e-01, 3.58238858e-01, 7.42827585e-01, 2.05760059e-01,
+    9.58438860e-01, 5.66326411e-01, 6.60292846e-01, 5.61095078e-02, 6.79465531e-01, 7.05118513e-01,
+    4.44713264e-01, 2.09732933e-01, 5.22732436e-01, 1.74396512e-01, 5.29356748e-01, 4.38475687e-01,
+    4.94036404e-01, 4.09785794e-01, 6.40025507e-01, 5.79371821e-01, 1.57726118e-01, 6.04572263e-01,
+    5.41072639e-01, 5.18847173e-01, 1.97093284e-01, 8.91767002e-01, 4.29050835e-01, 8.25490570e-01,
+    3.87699807e-01, 4.50705808e-01, 2.49371643e-01, 3.36074898e-01, 9.29925118e-01, 6.65393649e-01,
+    9.07275994e-01, 3.73075859e-01, 4.14044139e-03, 2.37463702e-01, 2.25893784e-01, 2.46900245e-01,
+    4.50350196e-01, 3.48618117e-01, 5.07193932e-01, 5.23435142e-01, 8.13611417e-01, 8.92715622e-01,
+    1.02623450e-01, 3.06088345e-01, 7.80461650e-01, 2.21453645e-01, 2.01419652e-01, 2.84254457e-01,
+    3.68286735e-01, 7.39358243e-01, 8.97879394e-01, 9.81599566e-01, 7.56526442e-01, 7.37645545e-01,
+    4.23976657e-02, 8.25922012e-01, 2.60956996e-01, 2.90702065e-01, 8.98388344e-01, 3.03733299e-01,
+    8.49071471e-01, 3.45835425e-01, 7.65458276e-01, 5.68094872e-01, 8.93770930e-01, 9.93161641e-01,
+    5.63368667e-02, 4.26548945e-01, 5.46745780e-01, 5.75674571e-01, 7.94599487e-01, 7.18935553e-02,
+    4.46492976e-01, 6.40240123e-01, 2.73246969e-01, 2.00465968e-01, 1.30718835e-01, 1.92492005e-01,
+    1.96617189e-01, 6.61271644e-01, 8.12687657e-01, 8.66342445e-01
 
    },
    {0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -589,6 +422,5 @@ const std::vector<LinkageInputs<float, int>> linkage_inputsf2 = {
 typedef LinkageTest<float, int> LinkageTestF_Int;
 TEST_P(LinkageTestF_Int, Result) { EXPECT_TRUE(score == 1.0); }
 
-INSTANTIATE_TEST_CASE_P(LinkageTest, LinkageTestF_Int,
-                        ::testing::ValuesIn(linkage_inputsf2));
+INSTANTIATE_TEST_CASE_P(LinkageTest, LinkageTestF_Int, ::testing::ValuesIn(linkage_inputsf2));
 }  // end namespace raft
diff --git a/cpp/test/sparse/norm.cu b/cpp/test/sparse/norm.cu
index 4900b3ff2b..3cf465e032 100644
--- a/cpp/test/sparse/norm.cu
+++ b/cpp/test/sparse/norm.cu
@@ -39,24 +39,25 @@ struct CSRRowNormalizeInputs {
 };
 
 template <typename Type_f, typename Index_>
-class CSRRowNormalizeTest
-  : public ::testing::TestWithParam<CSRRowNormalizeInputs<Type_f, Index_>> {
+class CSRRowNormalizeTest : public ::testing::TestWithParam<CSRRowNormalizeInputs<Type_f, Index_>> {
  public:
   CSRRowNormalizeTest()
-    : params(::testing::TestWithParam<
-             CSRRowNormalizeInputs<Type_f, Index_>>::GetParam()),
+    : params(::testing::TestWithParam<CSRRowNormalizeInputs<Type_f, Index_>>::GetParam()),
       stream(handle.get_stream()),
       in_vals(params.in_vals.size(), stream),
       verify(params.verify.size(), stream),
       ex_scan(params.ex_scan.size(), stream),
-      result(params.verify.size(), stream) {}
+      result(params.verify.size(), stream)
+  {
+  }
 
  protected:
   void SetUp() override {}
 
-  void Run() {
+  void Run()
+  {
     Index_ n_rows = params.ex_scan.size();
-    Index_ nnz = params.in_vals.size();
+    Index_ nnz    = params.in_vals.size();
 
     raft::update_device(ex_scan.data(), params.ex_scan.data(), n_rows, stream);
     raft::update_device(in_vals.data(), params.in_vals.data(), nnz, stream);
@@ -73,8 +74,8 @@ class CSRRowNormalizeTest
         break;
     }
 
-    ASSERT_TRUE(raft::devArrMatch<Type_f>(verify.data(), result.data(), nnz,
-                                          raft::Compare<Type_f>()));
+    ASSERT_TRUE(
+      raft::devArrMatch<Type_f>(verify.data(), result.data(), nnz, raft::Compare<Type_f>()));
   }
 
  protected:
@@ -113,9 +114,11 @@ const std::vector<CSRRowNormalizeInputs<double, int>> csrnormalize_inputs_d = {
    {0.5, 0.5, 0.0, 0.0, 0.5, 0.5, 0.0, 0.0, 1, 0.0}},
 };
 
-INSTANTIATE_TEST_CASE_P(SparseNormTest, CSRRowNormalizeTestF,
+INSTANTIATE_TEST_CASE_P(SparseNormTest,
+                        CSRRowNormalizeTestF,
                         ::testing::ValuesIn(csrnormalize_inputs_f));
-INSTANTIATE_TEST_CASE_P(SparseNormTest, CSRRowNormalizeTestD,
+INSTANTIATE_TEST_CASE_P(SparseNormTest,
+                        CSRRowNormalizeTestD,
                         ::testing::ValuesIn(csrnormalize_inputs_d));
 
 }  // namespace sparse
diff --git a/cpp/test/sparse/reduce.cu b/cpp/test/sparse/reduce.cu
index 8ff4a600bc..9a27ae5134 100644
--- a/cpp/test/sparse/reduce.cu
+++ b/cpp/test/sparse/reduce.cu
@@ -42,15 +42,15 @@ struct SparseReduceInputs {
 };
 
 template <typename value_t, typename value_idx>
-class SparseReduceTest
-  : public ::testing::TestWithParam<SparseReduceInputs<value_t, value_idx>> {
+class SparseReduceTest : public ::testing::TestWithParam<SparseReduceInputs<value_t, value_idx>> {
  protected:
-  void SetUp() override {
-    params = ::testing::TestWithParam<
-      SparseReduceInputs<value_t, value_idx>>::GetParam();
+  void SetUp() override
+  {
+    params = ::testing::TestWithParam<SparseReduceInputs<value_t, value_idx>>::GetParam();
   }
 
-  void Run() {
+  void Run()
+  {
     raft::handle_t handle;
 
     auto stream = handle.get_stream();
@@ -62,30 +62,29 @@ class SparseReduceTest
     rmm::device_uvector<value_idx> out_cols(params.out_cols.size(), stream);
     rmm::device_uvector<value_t> out_vals(params.out_vals.size(), stream);
 
-    raft::update_device(in_rows.data(), params.in_rows.data(),
-                        params.in_rows.size(), stream);
-    raft::update_device(in_cols.data(), params.in_cols.data(),
-                        params.in_cols.size(), stream);
-    raft::update_device(in_vals.data(), params.in_vals.data(),
-                        params.in_vals.size(), stream);
-    raft::update_device(out_rows.data(), params.out_rows.data(),
-                        params.out_rows.size(), stream);
-    raft::update_device(out_cols.data(), params.out_cols.data(),
-                        params.out_cols.size(), stream);
-    raft::update_device(out_vals.data(), params.out_vals.data(),
-                        params.out_vals.size(), stream);
+    raft::update_device(in_rows.data(), params.in_rows.data(), params.in_rows.size(), stream);
+    raft::update_device(in_cols.data(), params.in_cols.data(), params.in_cols.size(), stream);
+    raft::update_device(in_vals.data(), params.in_vals.data(), params.in_vals.size(), stream);
+    raft::update_device(out_rows.data(), params.out_rows.data(), params.out_rows.size(), stream);
+    raft::update_device(out_cols.data(), params.out_cols.data(), params.out_cols.size(), stream);
+    raft::update_device(out_vals.data(), params.out_vals.data(), params.out_vals.size(), stream);
 
     raft::sparse::COO<value_t, value_idx> out(stream);
-    raft::sparse::op::max_duplicates(handle, out, in_rows.data(),
-                                     in_cols.data(), in_vals.data(),
-                                     params.in_rows.size(), params.m, params.n);
+    raft::sparse::op::max_duplicates(handle,
+                                     out,
+                                     in_rows.data(),
+                                     in_cols.data(),
+                                     in_vals.data(),
+                                     params.in_rows.size(),
+                                     params.m,
+                                     params.n);
 
     ASSERT_TRUE(raft::devArrMatch<value_idx>(
       out_rows.data(), out.rows(), out.nnz, raft::Compare<value_idx>()));
     ASSERT_TRUE(raft::devArrMatch<value_idx>(
       out_cols.data(), out.cols(), out.nnz, raft::Compare<value_idx>()));
-    ASSERT_TRUE(raft::devArrMatch<value_t>(out_vals.data(), out.vals(), out.nnz,
-                                           raft::Compare<value_t>()));
+    ASSERT_TRUE(
+      raft::devArrMatch<value_t>(out_vals.data(), out.vals(), out.nnz, raft::Compare<value_t>()));
   }
 
   void TearDown() override {}
@@ -114,7 +113,8 @@ const std::vector<SparseReduceInputs<float, int>> max_reduce_inputs_f = {
    4},
 };
 
-INSTANTIATE_TEST_CASE_P(SparseReduceTest, SparseReduceTestF,
+INSTANTIATE_TEST_CASE_P(SparseReduceTest,
+                        SparseReduceTestF,
                         ::testing::ValuesIn(max_reduce_inputs_f));
 
 }  // namespace sparse
diff --git a/cpp/test/sparse/row_op.cu b/cpp/test/sparse/row_op.cu
index d527e7323e..d73288b9f6 100644
--- a/cpp/test/sparse/row_op.cu
+++ b/cpp/test/sparse/row_op.cu
@@ -38,43 +38,48 @@ struct CSRRowOpInputs {
 /** Wrapper to call csr_row_op because the enclosing function of a __device__
  *  lambda cannot have private ot protected access within the class. */
 template <typename Type_f, typename Index_>
-void csr_row_op_wrapper(const Index_ *row_ind, Index_ n_rows, Index_ nnz,
-                        Type_f *result, cudaStream_t stream) {
+void csr_row_op_wrapper(
+  const Index_* row_ind, Index_ n_rows, Index_ nnz, Type_f* result, cudaStream_t stream)
+{
   op::csr_row_op<Index_, 32>(
-    row_ind, n_rows, nnz,
+    row_ind,
+    n_rows,
+    nnz,
     [result] __device__(Index_ row, Index_ start_idx, Index_ stop_idx) {
-      for (Index_ i = start_idx; i < stop_idx; i++) result[i] = row;
+      for (Index_ i = start_idx; i < stop_idx; i++)
+        result[i] = row;
     },
     stream);
 }
 
 template <typename Type_f, typename Index_>
-class CSRRowOpTest
-  : public ::testing::TestWithParam<CSRRowOpInputs<Type_f, Index_>> {
+class CSRRowOpTest : public ::testing::TestWithParam<CSRRowOpInputs<Type_f, Index_>> {
  public:
   CSRRowOpTest()
-    : params(
-        ::testing::TestWithParam<CSRRowOpInputs<Type_f, Index_>>::GetParam()),
+    : params(::testing::TestWithParam<CSRRowOpInputs<Type_f, Index_>>::GetParam()),
       stream(handle.get_stream()),
       verify(params.verify.size(), stream),
       ex_scan(params.ex_scan.size(), stream),
-      result(params.verify.size(), stream) {}
+      result(params.verify.size(), stream)
+  {
+  }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     n_rows = params.ex_scan.size();
-    nnz = params.verify.size();
+    nnz    = params.verify.size();
   }
 
-  void Run() {
+  void Run()
+  {
     raft::update_device(ex_scan.data(), params.ex_scan.data(), n_rows, stream);
     raft::update_device(verify.data(), params.verify.data(), nnz, stream);
 
-    csr_row_op_wrapper<Type_f, Index_>(ex_scan.data(), n_rows, nnz,
-                                       result.data(), stream);
+    csr_row_op_wrapper<Type_f, Index_>(ex_scan.data(), n_rows, nnz, result.data(), stream);
 
-    ASSERT_TRUE(raft::devArrMatch<Type_f>(verify.data(), result.data(), nnz,
-                                          raft::Compare<Type_f>()));
+    ASSERT_TRUE(
+      raft::devArrMatch<Type_f>(verify.data(), result.data(), nnz, raft::Compare<Type_f>()));
   }
 
  protected:
@@ -100,10 +105,8 @@ const std::vector<CSRRowOpInputs<double, int>> csrrowop_inputs_d = {
   {{0, 4, 8, 9}, {0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 2.0, 3.0}},
 };
 
-INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestF,
-                        ::testing::ValuesIn(csrrowop_inputs_f));
-INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestD,
-                        ::testing::ValuesIn(csrrowop_inputs_d));
+INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestF, ::testing::ValuesIn(csrrowop_inputs_f));
+INSTANTIATE_TEST_CASE_P(SparseRowOpTest, CSRRowOpTestD, ::testing::ValuesIn(csrrowop_inputs_d));
 
 }  // namespace sparse
 }  // namespace raft
diff --git a/cpp/test/sparse/sort.cu b/cpp/test/sparse/sort.cu
index 7d43780cfd..c7cd03b485 100644
--- a/cpp/test/sparse/sort.cu
+++ b/cpp/test/sparse/sort.cu
@@ -46,7 +46,8 @@ class SparseSortTest : public ::testing::TestWithParam<SparseSortInput<T>> {
 const std::vector<SparseSortInput<float>> inputsf = {{5, 10, 5, 1234ULL}};
 
 typedef SparseSortTest<float> COOSort;
-TEST_P(COOSort, Result) {
+TEST_P(COOSort, Result)
+{
   params = ::testing::TestWithParam<SparseSortInput<float>>::GetParam();
   raft::random::Rng r(params.seed);
   cudaStream_t stream;
@@ -59,13 +60,13 @@ TEST_P(COOSort, Result) {
 
   r.uniform(in_vals.data(), params.nnz, float(-1.0), float(1.0), stream);
 
-  int *in_rows_h = (int *)malloc(params.nnz * sizeof(int));
-  int *in_cols_h = (int *)malloc(params.nnz * sizeof(int));
-  int *verify_h = (int *)malloc(params.nnz * sizeof(int));
+  int* in_rows_h = (int*)malloc(params.nnz * sizeof(int));
+  int* in_cols_h = (int*)malloc(params.nnz * sizeof(int));
+  int* verify_h  = (int*)malloc(params.nnz * sizeof(int));
 
   for (int i = 0; i < params.nnz; i++) {
     in_rows_h[i] = params.nnz - i - 1;
-    verify_h[i] = i;
+    verify_h[i]  = i;
     in_cols_h[i] = i;
   }
 
@@ -74,11 +75,11 @@ TEST_P(COOSort, Result) {
   raft::update_device(in_cols.data(), in_cols_h, params.nnz, stream);
   raft::update_device(verify.data(), verify_h, params.nnz, stream);
 
-  op::coo_sort(params.m, params.n, params.nnz, in_rows.data(), in_cols.data(),
-               in_vals.data(), stream);
+  op::coo_sort(
+    params.m, params.n, params.nnz, in_rows.data(), in_cols.data(), in_vals.data(), stream);
 
-  ASSERT_TRUE(raft::devArrMatch<int>(verify.data(), in_rows.data(), params.nnz,
-                                     raft::Compare<int>()));
+  ASSERT_TRUE(
+    raft::devArrMatch<int>(verify.data(), in_rows.data(), params.nnz, raft::Compare<int>()));
 
   delete[] in_rows_h;
   delete[] in_cols_h;
diff --git a/cpp/test/sparse/symmetrize.cu b/cpp/test/sparse/symmetrize.cu
index 77d9d3d822..53bea0ddc0 100644
--- a/cpp/test/sparse/symmetrize.cu
+++ b/cpp/test/sparse/symmetrize.cu
@@ -31,8 +31,9 @@ namespace raft {
 namespace sparse {
 
 template <typename value_idx, typename value_t>
-__global__ void assert_symmetry(value_idx *rows, value_idx *cols, value_t *vals,
-                                value_idx nnz, value_idx *sum) {
+__global__ void assert_symmetry(
+  value_idx* rows, value_idx* cols, value_t* vals, value_idx nnz, value_idx* sum)
+{
   int tid = blockDim.x * blockIdx.x + threadIdx.x;
 
   if (tid >= nnz) return;
@@ -51,28 +52,31 @@ struct SparseSymmetrizeInputs {
 };
 
 template <typename value_idx, typename value_t>
-::std::ostream &operator<<(
-  ::std::ostream &os, const SparseSymmetrizeInputs<value_idx, value_t> &dims) {
+::std::ostream& operator<<(::std::ostream& os,
+                           const SparseSymmetrizeInputs<value_idx, value_t>& dims)
+{
   return os;
 }
 
 template <typename value_idx, typename value_t>
-class SparseSymmetrizeTest : public ::testing::TestWithParam<
-                               SparseSymmetrizeInputs<value_idx, value_t>> {
+class SparseSymmetrizeTest
+  : public ::testing::TestWithParam<SparseSymmetrizeInputs<value_idx, value_t>> {
  public:
   SparseSymmetrizeTest()
-    : params(::testing::TestWithParam<
-             SparseSymmetrizeInputs<value_idx, value_t>>::GetParam()),
+    : params(::testing::TestWithParam<SparseSymmetrizeInputs<value_idx, value_t>>::GetParam()),
       stream(handle.get_stream()),
       indptr(0, stream),
       indices(0, stream),
-      data(0, stream) {}
+      data(0, stream)
+  {
+  }
 
  protected:
-  void make_data() {
-    std::vector<value_idx> indptr_h = params.indptr_h;
+  void make_data()
+  {
+    std::vector<value_idx> indptr_h  = params.indptr_h;
     std::vector<value_idx> indices_h = params.indices_h;
-    std::vector<value_t> data_h = params.data_h;
+    std::vector<value_t> data_h      = params.data_h;
 
     indptr.resize(indptr_h.size(), stream);
     indices.resize(indices_h.size(), stream);
@@ -83,22 +87,22 @@ class SparseSymmetrizeTest : public ::testing::TestWithParam<
     update_device(data.data(), data_h.data(), data_h.size(), stream);
   }
 
-  void SetUp() override {
+  void SetUp() override
+  {
     make_data();
 
-    value_idx m = params.indptr_h.size() - 1;
-    value_idx n = params.n_cols;
+    value_idx m   = params.indptr_h.size() - 1;
+    value_idx n   = params.n_cols;
     value_idx nnz = params.indices_h.size();
 
     rmm::device_uvector<value_idx> coo_rows(nnz, stream);
 
-    raft::sparse::convert::csr_to_coo(indptr.data(), m, coo_rows.data(), nnz,
-                                      stream);
+    raft::sparse::convert::csr_to_coo(indptr.data(), m, coo_rows.data(), nnz, stream);
 
     raft::sparse::COO<value_t, value_idx> out(stream);
 
-    raft::sparse::linalg::symmetrize(handle, coo_rows.data(), indices.data(),
-                                     data.data(), m, n, coo_rows.size(), out);
+    raft::sparse::linalg::symmetrize(
+      handle, coo_rows.data(), indices.data(), data.data(), m, n, coo_rows.size(), out);
 
     rmm::device_scalar<value_idx> sum(stream);
     sum.set_value_to_zero_async(stream);
@@ -130,8 +134,7 @@ struct COOSymmetrizeInputs {
 };
 
 template <typename T>
-class COOSymmetrizeTest
-  : public ::testing::TestWithParam<COOSymmetrizeInputs<T>> {
+class COOSymmetrizeTest : public ::testing::TestWithParam<COOSymmetrizeInputs<T>> {
  protected:
   void SetUp() override {}
 
@@ -141,22 +144,21 @@ class COOSymmetrizeTest
 const std::vector<COOSymmetrizeInputs<float>> inputsf = {{5, 10, 5, 1234ULL}};
 
 typedef COOSymmetrizeTest<float> COOSymmetrize;
-TEST_P(COOSymmetrize, Result) {
+TEST_P(COOSymmetrize, Result)
+{
   cudaStream_t stream;
   cudaStreamCreate(&stream);
 
   int nnz = 8;
 
-  int *in_rows_h = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3};
-  int *in_cols_h = new int[nnz]{1, 3, 2, 3, 0, 1, 0, 2};
-  float *in_vals_h = new float[nnz]{0.5, 1.0, 0.5, 0.5, 0.5, 0.0, 0.5, 0.5};
+  int* in_rows_h   = new int[nnz]{0, 0, 1, 1, 2, 2, 3, 3};
+  int* in_cols_h   = new int[nnz]{1, 3, 2, 3, 0, 1, 0, 2};
+  float* in_vals_h = new float[nnz]{0.5, 1.0, 0.5, 0.5, 0.5, 0.0, 0.5, 0.5};
 
-  int *exp_rows_h =
-    new int[nnz * 2]{1, 0, 0, 0, 1, 3, 1, 0, 0, 2, 2, 0, 3, 2, 3, 0};
-  int *exp_cols_h =
-    new int[nnz * 2]{0, 1, 3, 0, 2, 1, 3, 0, 2, 0, 1, 0, 0, 3, 2, 0};
-  float *exp_vals_h = new float[nnz * 2]{0.5, 0.5, 1.5, 0, 0.5, 0.5, 0.5, 0,
-                                         0.5, 0.5, 0.5, 0, 1.5, 0.5, 0.5, 0.0};
+  int* exp_rows_h = new int[nnz * 2]{1, 0, 0, 0, 1, 3, 1, 0, 0, 2, 2, 0, 3, 2, 3, 0};
+  int* exp_cols_h = new int[nnz * 2]{0, 1, 3, 0, 2, 1, 3, 0, 2, 0, 1, 0, 0, 3, 2, 0};
+  float* exp_vals_h =
+    new float[nnz * 2]{0.5, 0.5, 1.5, 0, 0.5, 0.5, 0.5, 0, 0.5, 0.5, 0.5, 0, 1.5, 0.5, 0.5, 0.0};
 
   COO<float> in(stream, nnz, 4, 4);
   raft::update_device(in.rows(), *&in_rows_h, nnz, stream);
@@ -166,22 +168,18 @@ TEST_P(COOSymmetrize, Result) {
   COO<float> out(stream);
 
   linalg::coo_symmetrize<32, float>(
-    &in, &out,
-    [] __device__(int row, int col, float val, float trans) {
-      return val + trans;
-    },
+    &in,
+    &out,
+    [] __device__(int row, int col, float val, float trans) { return val + trans; },
     stream);
 
   CUDA_CHECK(cudaStreamSynchronize(stream));
   std::cout << out << std::endl;
 
   ASSERT_TRUE(out.nnz == nnz * 2);
-  ASSERT_TRUE(raft::devArrMatch<int>(out.rows(), exp_rows_h, out.nnz,
-                                     raft::Compare<int>()));
-  ASSERT_TRUE(raft::devArrMatch<int>(out.cols(), exp_cols_h, out.nnz,
-                                     raft::Compare<int>()));
-  ASSERT_TRUE(raft::devArrMatch<float>(out.vals(), exp_vals_h, out.nnz,
-                                       raft::Compare<float>()));
+  ASSERT_TRUE(raft::devArrMatch<int>(out.rows(), exp_rows_h, out.nnz, raft::Compare<int>()));
+  ASSERT_TRUE(raft::devArrMatch<int>(out.cols(), exp_cols_h, out.nnz, raft::Compare<int>()));
+  ASSERT_TRUE(raft::devArrMatch<float>(out.vals(), exp_vals_h, out.nnz, raft::Compare<float>()));
 
   cudaStreamDestroy(stream);
 
@@ -194,8 +192,7 @@ TEST_P(COOSymmetrize, Result) {
   delete[] exp_vals_h;
 }
 
-INSTANTIATE_TEST_CASE_P(COOSymmetrizeTest, COOSymmetrize,
-                        ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_CASE_P(COOSymmetrizeTest, COOSymmetrize, ::testing::ValuesIn(inputsf));
 
 const std::vector<SparseSymmetrizeInputs<int, float>> symm_inputs_fint = {
   // Test n_clusters == n_points
@@ -215,7 +212,8 @@ const std::vector<SparseSymmetrizeInputs<int, float>> symm_inputs_fint = {
 typedef SparseSymmetrizeTest<int, float> SparseSymmetrizeTestF_int;
 TEST_P(SparseSymmetrizeTestF_int, Result) { ASSERT_TRUE(sum_h == 0); }
 
-INSTANTIATE_TEST_CASE_P(SparseSymmetrizeTest, SparseSymmetrizeTestF_int,
+INSTANTIATE_TEST_CASE_P(SparseSymmetrizeTest,
+                        SparseSymmetrizeTestF_int,
                         ::testing::ValuesIn(symm_inputs_fint));
 
 }  // namespace sparse
diff --git a/cpp/test/spatial/ball_cover.cu b/cpp/test/spatial/ball_cover.cu
index ca30506df0..ab85e7fe8f 100644
--- a/cpp/test/spatial/ball_cover.cu
+++ b/cpp/test/spatial/ball_cover.cu
@@ -37,21 +37,26 @@ namespace knn {
 using namespace std;
 
 template <typename value_idx, typename value_t>
-__global__ void count_discrepancies_kernel(value_idx *actual_idx,
-                                           value_idx *expected_idx,
-                                           value_t *actual, value_t *expected,
-                                           uint32_t m, uint32_t n,
-                                           uint32_t *out, float thres = 1e-3) {
+__global__ void count_discrepancies_kernel(value_idx* actual_idx,
+                                           value_idx* expected_idx,
+                                           value_t* actual,
+                                           value_t* expected,
+                                           uint32_t m,
+                                           uint32_t n,
+                                           uint32_t* out,
+                                           float thres = 1e-3)
+{
   uint32_t row = blockDim.x * blockIdx.x + threadIdx.x;
 
   int n_diffs = 0;
   if (row < m) {
     for (uint32_t i = 0; i < n; i++) {
-      value_t d = actual[row * n + i] - expected[row * n + i];
+      value_t d    = actual[row * n + i] - expected[row * n + i];
       bool matches = fabsf(d) <= thres;
       if (!matches) {
         //          printf("row=%d, actual_idx=%ld, actual=%f, expected_id=%ld, expected=%f\n",
-        //                 row, actual_idx[row*n+i], actual[row*n+i], expected_idx[row*n+i], expected[row*n+i]);
+        //                 row, actual_idx[row*n+i], actual[row*n+i], expected_idx[row*n+i],
+        //                 expected[row*n+i]);
       }
 
       n_diffs += !matches;
@@ -61,13 +66,19 @@ __global__ void count_discrepancies_kernel(value_idx *actual_idx,
 }
 
 struct is_nonzero {
-  __host__ __device__ bool operator()(uint32_t &i) { return i > 0; }
+  __host__ __device__ bool operator()(uint32_t& i) { return i > 0; }
 };
 
 template <typename value_idx, typename value_t>
-uint32_t count_discrepancies(value_idx *actual_idx, value_idx *expected_idx,
-                             value_t *actual, value_t *expected, uint32_t m,
-                             uint32_t n, uint32_t *out, cudaStream_t stream) {
+uint32_t count_discrepancies(value_idx* actual_idx,
+                             value_idx* expected_idx,
+                             value_t* actual,
+                             value_t* expected,
+                             uint32_t m,
+                             uint32_t n,
+                             uint32_t* out,
+                             cudaStream_t stream)
+{
   uint32_t tpb = 256;
   count_discrepancies_kernel<<<raft::ceildiv(m, tpb), tpb, 0, stream>>>(
     actual_idx, expected_idx, actual, expected, m, n, out);
@@ -79,25 +90,41 @@ uint32_t count_discrepancies(value_idx *actual_idx, value_idx *expected_idx,
 }
 
 template <typename value_t>
-void compute_bfknn(const raft::handle_t &handle, const value_t *X1,
-                   const value_t *X2, uint32_t n, uint32_t d, uint32_t k,
-                   const raft::distance::DistanceType metric, value_t *dists,
-                   int64_t *inds) {
-  std::vector<value_t *> input_vec = {const_cast<value_t *>(X1)};
+void compute_bfknn(const raft::handle_t& handle,
+                   const value_t* X1,
+                   const value_t* X2,
+                   uint32_t n,
+                   uint32_t d,
+                   uint32_t k,
+                   const raft::distance::DistanceType metric,
+                   value_t* dists,
+                   int64_t* inds)
+{
+  std::vector<value_t*> input_vec = {const_cast<value_t*>(X1)};
   std::vector<uint32_t> sizes_vec = {n};
 
-  cudaStream_t *int_streams = nullptr;
-  std::vector<int64_t> *translations = nullptr;
-
-  raft::spatial::knn::detail::brute_force_knn_impl<uint32_t, int64_t>(
-    input_vec, sizes_vec, d, const_cast<value_t *>(X2), n, inds, dists, k,
-    handle.get_stream(), int_streams, 0, true, true, translations, metric);
+  cudaStream_t* int_streams          = nullptr;
+  std::vector<int64_t>* translations = nullptr;
+
+  raft::spatial::knn::detail::brute_force_knn_impl<uint32_t, int64_t>(input_vec,
+                                                                      sizes_vec,
+                                                                      d,
+                                                                      const_cast<value_t*>(X2),
+                                                                      n,
+                                                                      inds,
+                                                                      dists,
+                                                                      k,
+                                                                      handle.get_stream(),
+                                                                      int_streams,
+                                                                      0,
+                                                                      true,
+                                                                      true,
+                                                                      translations,
+                                                                      metric);
 }
 
 struct ToRadians {
-  __device__ __host__ float operator()(float a) {
-    return a * (CUDART_PI_F / 180.0);
-  }
+  __device__ __host__ float operator()(float a) { return a * (CUDART_PI_F / 180.0); }
 };
 
 struct BallCoverInputs {
@@ -109,13 +136,14 @@ struct BallCoverInputs {
 template <typename value_idx, typename value_t>
 class BallCoverKNNQueryTest : public ::testing::TestWithParam<BallCoverInputs> {
  protected:
-  void basicTest() {
+  void basicTest()
+  {
     params = ::testing::TestWithParam<BallCoverInputs>::GetParam();
     raft::handle_t handle;
 
-    uint32_t k = params.k;
+    uint32_t k   = params.k;
     float weight = params.weight;
-    auto metric = params.metric;
+    auto metric  = params.metric;
 
     std::vector<value_t> h_train_inputs = spatial_data;
 
@@ -126,17 +154,25 @@ class BallCoverKNNQueryTest : public ::testing::TestWithParam<BallCoverInputs> {
 
     // Allocate input
     rmm::device_uvector<value_t> d_train_inputs(n * d, handle.get_stream());
-    raft::update_device(d_train_inputs.data(), h_train_inputs.data(), n * d,
-                        handle.get_stream());
+    raft::update_device(d_train_inputs.data(), h_train_inputs.data(), n * d, handle.get_stream());
 
     if (metric == raft::distance::DistanceType::Haversine) {
-      thrust::transform(handle.get_thrust_policy(), d_train_inputs.data(),
+      thrust::transform(handle.get_thrust_policy(),
+                        d_train_inputs.data(),
                         d_train_inputs.data() + d_train_inputs.size(),
-                        d_train_inputs.data(), ToRadians());
+                        d_train_inputs.data(),
+                        ToRadians());
     }
 
-    compute_bfknn(handle, d_train_inputs.data(), d_train_inputs.data(), n, d, k,
-                  metric, d_ref_D.data(), d_ref_I.data());
+    compute_bfknn(handle,
+                  d_train_inputs.data(),
+                  d_train_inputs.data(),
+                  n,
+                  d,
+                  k,
+                  metric,
+                  d_ref_D.data(),
+                  d_ref_I.data());
 
     CUDA_CHECK(cudaStreamSynchronize(handle.get_stream()));
 
@@ -144,13 +180,11 @@ class BallCoverKNNQueryTest : public ::testing::TestWithParam<BallCoverInputs> {
     rmm::device_uvector<value_idx> d_pred_I(n * k, handle.get_stream());
     rmm::device_uvector<value_t> d_pred_D(n * k, handle.get_stream());
 
-    BallCoverIndex<value_idx, value_t> index(handle, d_train_inputs.data(), n,
-                                             d, metric);
+    BallCoverIndex<value_idx, value_t> index(handle, d_train_inputs.data(), n, d, metric);
 
     raft::spatial::knn::rbc_build_index(handle, index);
-    raft::spatial::knn::rbc_knn_query(handle, index, k, d_train_inputs.data(),
-                                      n, d_pred_I.data(), d_pred_D.data(), true,
-                                      weight);
+    raft::spatial::knn::rbc_knn_query(
+      handle, index, k, d_train_inputs.data(), n, d_pred_I.data(), d_pred_D.data(), true, weight);
 
     CUDA_CHECK(cudaStreamSynchronize(handle.get_stream()));
     // What we really want are for the distances to match exactly. The
@@ -158,12 +192,19 @@ class BallCoverKNNQueryTest : public ::testing::TestWithParam<BallCoverInputs> {
     // can be nondeterministic.
 
     rmm::device_uvector<uint32_t> discrepancies(n, handle.get_stream());
-    thrust::fill(handle.get_thrust_policy(), discrepancies.data(),
-                 discrepancies.data() + discrepancies.size(), 0);
+    thrust::fill(handle.get_thrust_policy(),
+                 discrepancies.data(),
+                 discrepancies.data() + discrepancies.size(),
+                 0);
     //
-    int res = count_discrepancies(d_ref_I.data(), d_pred_I.data(),
-                                  d_ref_D.data(), d_pred_D.data(), n, k,
-                                  discrepancies.data(), handle.get_stream());
+    int res = count_discrepancies(d_ref_I.data(),
+                                  d_pred_I.data(),
+                                  d_ref_D.data(),
+                                  d_pred_D.data(),
+                                  n,
+                                  k,
+                                  discrepancies.data(),
+                                  handle.get_stream());
 
     ASSERT_TRUE(res == 0);
   }
@@ -180,13 +221,14 @@ class BallCoverKNNQueryTest : public ::testing::TestWithParam<BallCoverInputs> {
 template <typename value_idx, typename value_t>
 class BallCoverAllKNNTest : public ::testing::TestWithParam<BallCoverInputs> {
  protected:
-  void basicTest() {
+  void basicTest()
+  {
     params = ::testing::TestWithParam<BallCoverInputs>::GetParam();
     raft::handle_t handle;
 
-    uint32_t k = params.k;
+    uint32_t k   = params.k;
     float weight = params.weight;
-    auto metric = params.metric;
+    auto metric  = params.metric;
 
     std::vector<value_t> h_train_inputs = spatial_data;
 
@@ -197,25 +239,37 @@ class BallCoverAllKNNTest : public ::testing::TestWithParam<BallCoverInputs> {
 
     // Allocate input
     rmm::device_uvector<value_t> d_train_inputs(n * d, handle.get_stream());
-    raft::update_device(d_train_inputs.data(), h_train_inputs.data(), n * d,
-                        handle.get_stream());
+    raft::update_device(d_train_inputs.data(), h_train_inputs.data(), n * d, handle.get_stream());
 
     if (metric == raft::distance::DistanceType::Haversine) {
-      thrust::transform(handle.get_thrust_policy(), d_train_inputs.data(),
+      thrust::transform(handle.get_thrust_policy(),
+                        d_train_inputs.data(),
                         d_train_inputs.data() + d_train_inputs.size(),
-                        d_train_inputs.data(), ToRadians());
+                        d_train_inputs.data(),
+                        ToRadians());
     }
 
-    cudaStream_t *int_streams = nullptr;
-    std::vector<int64_t> *translations = nullptr;
+    cudaStream_t* int_streams          = nullptr;
+    std::vector<int64_t>* translations = nullptr;
 
-    std::vector<float *> input_vec = {d_train_inputs.data()};
+    std::vector<float*> input_vec   = {d_train_inputs.data()};
     std::vector<uint32_t> sizes_vec = {n};
 
-    raft::spatial::knn::detail::brute_force_knn_impl<uint32_t, int64_t>(
-      input_vec, sizes_vec, d, d_train_inputs.data(), n, d_ref_I.data(),
-      d_ref_D.data(), k, handle.get_stream(), int_streams, 0, true, true,
-      translations, metric);
+    raft::spatial::knn::detail::brute_force_knn_impl<uint32_t, int64_t>(input_vec,
+                                                                        sizes_vec,
+                                                                        d,
+                                                                        d_train_inputs.data(),
+                                                                        n,
+                                                                        d_ref_I.data(),
+                                                                        d_ref_D.data(),
+                                                                        k,
+                                                                        handle.get_stream(),
+                                                                        int_streams,
+                                                                        0,
+                                                                        true,
+                                                                        true,
+                                                                        translations,
+                                                                        metric);
 
     CUDA_CHECK(cudaStreamSynchronize(handle.get_stream()));
 
@@ -223,11 +277,10 @@ class BallCoverAllKNNTest : public ::testing::TestWithParam<BallCoverInputs> {
     rmm::device_uvector<value_idx> d_pred_I(n * k, handle.get_stream());
     rmm::device_uvector<value_t> d_pred_D(n * k, handle.get_stream());
 
-    BallCoverIndex<value_idx, value_t> index(handle, d_train_inputs.data(), n,
-                                             d, metric);
+    BallCoverIndex<value_idx, value_t> index(handle, d_train_inputs.data(), n, d, metric);
 
-    raft::spatial::knn::rbc_all_knn_query(handle, index, k, d_pred_I.data(),
-                                          d_pred_D.data(), true, weight);
+    raft::spatial::knn::rbc_all_knn_query(
+      handle, index, k, d_pred_I.data(), d_pred_D.data(), true, weight);
 
     CUDA_CHECK(cudaStreamSynchronize(handle.get_stream()));
     // What we really want are for the distances to match exactly. The
@@ -235,12 +288,19 @@ class BallCoverAllKNNTest : public ::testing::TestWithParam<BallCoverInputs> {
     // can be nondeterministic.
 
     rmm::device_uvector<uint32_t> discrepancies(n, handle.get_stream());
-    thrust::fill(handle.get_thrust_policy(), discrepancies.data(),
-                 discrepancies.data() + discrepancies.size(), 0);
+    thrust::fill(handle.get_thrust_policy(),
+                 discrepancies.data(),
+                 discrepancies.data() + discrepancies.size(),
+                 0);
     //
-    uint32_t res = count_discrepancies(
-      d_ref_I.data(), d_pred_I.data(), d_ref_D.data(), d_pred_D.data(), n, k,
-      discrepancies.data(), handle.get_stream());
+    uint32_t res = count_discrepancies(d_ref_I.data(),
+                                       d_pred_I.data(),
+                                       d_ref_D.data(),
+                                       d_pred_D.data(),
+                                       n,
+                                       k,
+                                       discrepancies.data(),
+                                       handle.get_stream());
     ASSERT_TRUE(res == 0);
   }
 
@@ -265,9 +325,11 @@ const std::vector<BallCoverInputs> ballcover_inputs = {
   {7, 1.0, raft::distance::DistanceType::L2SqrtUnexpanded},
 };
 
-INSTANTIATE_TEST_CASE_P(BallCoverAllKNNTest, BallCoverAllKNNTestF,
+INSTANTIATE_TEST_CASE_P(BallCoverAllKNNTest,
+                        BallCoverAllKNNTestF,
                         ::testing::ValuesIn(ballcover_inputs));
-INSTANTIATE_TEST_CASE_P(BallCoverKNNQueryTest, BallCoverKNNQueryTestF,
+INSTANTIATE_TEST_CASE_P(BallCoverKNNQueryTest,
+                        BallCoverKNNQueryTestF,
                         ::testing::ValuesIn(ballcover_inputs));
 
 TEST_P(BallCoverAllKNNTestF, Fit) { basicTest(); }
diff --git a/cpp/test/spatial/fused_l2_knn.cu b/cpp/test/spatial/fused_l2_knn.cu
index 4930b47e0c..e48a3c6657 100644
--- a/cpp/test/spatial/fused_l2_knn.cu
+++ b/cpp/test/spatial/fused_l2_knn.cu
@@ -49,20 +49,25 @@ struct idx_dist_pair {
   IdxT idx;
   DistT dist;
   compareDist eq_compare;
-  bool operator==(const idx_dist_pair<IdxT, DistT, compareDist> &a) const {
+  bool operator==(const idx_dist_pair<IdxT, DistT, compareDist>& a) const
+  {
     if (idx == a.idx) return true;
     if (eq_compare(dist, a.dist)) return true;
     return false;
   }
-  idx_dist_pair(IdxT x, DistT y, compareDist op)
-    : idx(x), dist(y), eq_compare(op) {}
+  idx_dist_pair(IdxT x, DistT y, compareDist op) : idx(x), dist(y), eq_compare(op) {}
 };
 
 template <typename T, typename DistT>
-testing::AssertionResult devArrMatchKnnPair(
-  const T *expected_idx, const T *actual_idx, const DistT *expected_dist,
-  const DistT *actual_dist, size_t rows, size_t cols, const DistT eps,
-  cudaStream_t stream = 0) {
+testing::AssertionResult devArrMatchKnnPair(const T* expected_idx,
+                                            const T* actual_idx,
+                                            const DistT* expected_dist,
+                                            const DistT* actual_dist,
+                                            size_t rows,
+                                            size_t cols,
+                                            const DistT eps,
+                                            cudaStream_t stream = 0)
+{
   size_t size = rows * cols;
   std::unique_ptr<T[]> exp_idx_h(new T[size]);
   std::unique_ptr<T[]> act_idx_h(new T[size]);
@@ -75,9 +80,9 @@ testing::AssertionResult devArrMatchKnnPair(
   CUDA_CHECK(cudaStreamSynchronize(stream));
   for (size_t i(0); i < rows; ++i) {
     for (size_t j(0); j < cols; ++j) {
-      auto idx = i * cols + j;  // row major assumption!
-      auto exp_idx = exp_idx_h.get()[idx];
-      auto act_idx = act_idx_h.get()[idx];
+      auto idx      = i * cols + j;  // row major assumption!
+      auto exp_idx  = exp_idx_h.get()[idx];
+      auto act_idx  = act_idx_h.get()[idx];
       auto exp_dist = exp_dist_h.get()[idx];
       auto act_dist = act_dist_h.get()[idx];
       idx_dist_pair exp_kvp(exp_idx, exp_dist, raft::CompareApprox<DistT>(eps));
@@ -85,8 +90,7 @@ testing::AssertionResult devArrMatchKnnPair(
       if (!(exp_kvp == act_kvp)) {
         return testing::AssertionFailure()
                << "actual=" << act_kvp.idx << "," << act_kvp.dist << "!="
-               << "expected" << exp_kvp.idx << "," << exp_kvp.dist << " @" << i
-               << "," << j;
+               << "expected" << exp_kvp.idx << "," << exp_kvp.dist << " @" << i << "," << j;
       }
     }
   }
@@ -96,26 +100,43 @@ testing::AssertionResult devArrMatchKnnPair(
 template <typename T>
 class FusedL2KNNTest : public ::testing::TestWithParam<FusedL2KNNInputs> {
  protected:
-  void testBruteForce() {
+  void testBruteForce()
+  {
     cudaStream_t stream = handle_.get_stream();
 
     launchFaissBfknn();
-    detail::fusedL2Knn(dim, raft_indices_, raft_distances_, database,
-                       search_queries, num_db_vecs, num_queries, k_, true, true,
-                       stream, metric);
+    detail::fusedL2Knn(dim,
+                       raft_indices_,
+                       raft_distances_,
+                       database,
+                       search_queries,
+                       num_db_vecs,
+                       num_queries,
+                       k_,
+                       true,
+                       true,
+                       stream,
+                       metric);
 
     // verify.
-    devArrMatchKnnPair(faiss_indices_, raft_indices_, faiss_distances_,
-                       raft_distances_, num_queries, k_, float(0.001), stream);
+    devArrMatchKnnPair(faiss_indices_,
+                       raft_indices_,
+                       faiss_distances_,
+                       raft_distances_,
+                       num_queries,
+                       k_,
+                       float(0.001),
+                       stream);
   }
 
-  void SetUp() override {
-    params_ = ::testing::TestWithParam<FusedL2KNNInputs>::GetParam();
+  void SetUp() override
+  {
+    params_     = ::testing::TestWithParam<FusedL2KNNInputs>::GetParam();
     num_queries = params_.num_queries;
     num_db_vecs = params_.num_db_vecs;
-    dim = params_.dim;
-    k_ = params_.k;
-    metric = params_.metric_;
+    dim         = params_.dim;
+    k_          = params_.k;
+    metric      = params_.metric_;
 
     cudaStream_t stream = handle_.get_stream();
 
@@ -133,12 +154,14 @@ class FusedL2KNNTest : public ::testing::TestWithParam<FusedL2KNNInputs> {
     raft::allocate(faiss_distances_, num_queries * k_, stream, true);
   }
 
-  void TearDown() override {
+  void TearDown() override
+  {
     cudaStream_t stream = handle_.get_stream();
     raft::deallocate_all(stream);
   }
 
-  void launchFaissBfknn() {
+  void launchFaissBfknn()
+  {
     faiss::MetricType m = detail::build_faiss_metric(metric);
 
     faiss::gpu::StandardGpuResources gpu_res;
@@ -149,18 +172,18 @@ class FusedL2KNNTest : public ::testing::TestWithParam<FusedL2KNNInputs> {
     gpu_res.setDefaultStream(device, handle_.get_stream());
 
     faiss::gpu::GpuDistanceParams args;
-    args.metric = m;
-    args.metricArg = 0;
-    args.k = k_;
-    args.dims = dim;
-    args.vectors = database;
+    args.metric          = m;
+    args.metricArg       = 0;
+    args.k               = k_;
+    args.dims            = dim;
+    args.vectors         = database;
     args.vectorsRowMajor = true;
-    args.numVectors = num_db_vecs;
-    args.queries = search_queries;
+    args.numVectors      = num_db_vecs;
+    args.queries         = search_queries;
     args.queriesRowMajor = true;
-    args.numQueries = num_queries;
-    args.outDistances = faiss_distances_;
-    args.outIndices = faiss_indices_;
+    args.numQueries      = num_queries;
+    args.outDistances    = faiss_distances_;
+    args.outIndices      = faiss_indices_;
 
     bfKnn(&gpu_res, args);
   }
@@ -171,12 +194,12 @@ class FusedL2KNNTest : public ::testing::TestWithParam<FusedL2KNNInputs> {
   int num_queries;
   int num_db_vecs;
   int dim;
-  T *database;
-  T *search_queries;
-  int64_t *raft_indices_;
-  T *raft_distances_;
-  int64_t *faiss_indices_;
-  T *faiss_distances_;
+  T* database;
+  T* search_queries;
+  int64_t* raft_indices_;
+  T* raft_distances_;
+  int64_t* faiss_indices_;
+  T* faiss_distances_;
   int k_;
   raft::distance::DistanceType metric;
 };
@@ -201,8 +224,7 @@ const std::vector<FusedL2KNNInputs> inputs = {
 typedef FusedL2KNNTest<float> FusedL2KNNTestF;
 TEST_P(FusedL2KNNTestF, FusedBruteForce) { this->testBruteForce(); }
 
-INSTANTIATE_TEST_CASE_P(FusedL2KNNTest, FusedL2KNNTestF,
-                        ::testing::ValuesIn(inputs));
+INSTANTIATE_TEST_CASE_P(FusedL2KNNTest, FusedL2KNNTestF, ::testing::ValuesIn(inputs));
 
 }  // namespace knn
 }  // namespace spatial
diff --git a/cpp/test/spatial/haversine.cu b/cpp/test/spatial/haversine.cu
index 5a45c45bff..bff7665f83 100644
--- a/cpp/test/spatial/haversine.cu
+++ b/cpp/test/spatial/haversine.cu
@@ -35,10 +35,13 @@ class HaversineKNNTest : public ::testing::Test {
       d_ref_I(0, stream),
       d_ref_D(0, stream),
       d_pred_I(0, stream),
-      d_pred_D(0, stream) {}
+      d_pred_D(0, stream)
+  {
+  }
 
  protected:
-  void basicTest() {
+  void basicTest()
+  {
     // Allocate input
     d_train_inputs.resize(n * d, stream);
 
@@ -51,35 +54,45 @@ class HaversineKNNTest : public ::testing::Test {
     d_pred_D.resize(n * n, stream);
 
     // make testdata on host
-    std::vector<value_t> h_train_inputs = {
-      0.71113885, -1.29215058, 0.59613176, -2.08048115,
-      0.74932804, -1.33634042, 0.51486728, -1.65962873,
-      0.53154002, -1.47049808, 0.72891737, -1.54095137};
+    std::vector<value_t> h_train_inputs = {0.71113885,
+                                           -1.29215058,
+                                           0.59613176,
+                                           -2.08048115,
+                                           0.74932804,
+                                           -1.33634042,
+                                           0.51486728,
+                                           -1.65962873,
+                                           0.53154002,
+                                           -1.47049808,
+                                           0.72891737,
+                                           -1.54095137};
 
     h_train_inputs.resize(d_train_inputs.size());
-    raft::update_device(d_train_inputs.data(), h_train_inputs.data(),
-                        d_train_inputs.size(), stream);
-
-    std::vector<value_t> h_res_D = {
-      0., 0.05041587, 0.18767063, 0.23048252, 0.35749438, 0.62925595,
-      0., 0.36575755, 0.44288665, 0.5170737,  0.59501296, 0.62925595,
-      0., 0.05041587, 0.152463,   0.2426416,  0.34925285, 0.59501296,
-      0., 0.16461092, 0.2345792,  0.34925285, 0.35749438, 0.36575755,
-      0., 0.16461092, 0.20535265, 0.23048252, 0.2426416,  0.5170737,
-      0., 0.152463,   0.18767063, 0.20535265, 0.2345792,  0.44288665};
+    raft::update_device(
+      d_train_inputs.data(), h_train_inputs.data(), d_train_inputs.size(), stream);
+
+    std::vector<value_t> h_res_D = {0., 0.05041587, 0.18767063, 0.23048252, 0.35749438, 0.62925595,
+                                    0., 0.36575755, 0.44288665, 0.5170737,  0.59501296, 0.62925595,
+                                    0., 0.05041587, 0.152463,   0.2426416,  0.34925285, 0.59501296,
+                                    0., 0.16461092, 0.2345792,  0.34925285, 0.35749438, 0.36575755,
+                                    0., 0.16461092, 0.20535265, 0.23048252, 0.2426416,  0.5170737,
+                                    0., 0.152463,   0.18767063, 0.20535265, 0.2345792,  0.44288665};
     h_res_D.resize(n * n);
     raft::update_device(d_ref_D.data(), h_res_D.data(), n * n, stream);
 
-    std::vector<value_idx> h_res_I = {0, 2, 5, 4, 3, 1, 1, 3, 5, 4, 2, 0,
-                                      2, 0, 5, 4, 3, 1, 3, 4, 5, 2, 0, 1,
-                                      4, 3, 5, 0, 2, 1, 5, 2, 0, 4, 3, 1};
+    std::vector<value_idx> h_res_I = {0, 2, 5, 4, 3, 1, 1, 3, 5, 4, 2, 0, 2, 0, 5, 4, 3, 1,
+                                      3, 4, 5, 2, 0, 1, 4, 3, 5, 0, 2, 1, 5, 2, 0, 4, 3, 1};
     h_res_I.resize(n * n);
-    raft::update_device<value_idx>(d_ref_I.data(), h_res_I.data(), n * n,
-                                   stream);
+    raft::update_device<value_idx>(d_ref_I.data(), h_res_I.data(), n * n, stream);
 
-    raft::spatial::knn::detail::haversine_knn(
-      d_pred_I.data(), d_pred_D.data(), d_train_inputs.data(),
-      d_train_inputs.data(), n, n, k, stream);
+    raft::spatial::knn::detail::haversine_knn(d_pred_I.data(),
+                                              d_pred_D.data(),
+                                              d_train_inputs.data(),
+                                              d_train_inputs.data(),
+                                              n,
+                                              n,
+                                              k,
+                                              stream);
 
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
@@ -106,11 +119,11 @@ class HaversineKNNTest : public ::testing::Test {
 
 typedef HaversineKNNTest<int, float> HaversineKNNTestF;
 
-TEST_F(HaversineKNNTestF, Fit) {
-  ASSERT_TRUE(raft::devArrMatch(d_ref_D.data(), d_pred_D.data(), n * n,
-                                raft::CompareApprox<float>(1e-3)));
-  ASSERT_TRUE(raft::devArrMatch(d_ref_I.data(), d_pred_I.data(), n * n,
-                                raft::Compare<int>()));
+TEST_F(HaversineKNNTestF, Fit)
+{
+  ASSERT_TRUE(
+    raft::devArrMatch(d_ref_D.data(), d_pred_D.data(), n * n, raft::CompareApprox<float>(1e-3)));
+  ASSERT_TRUE(raft::devArrMatch(d_ref_I.data(), d_pred_I.data(), n * n, raft::Compare<int>()));
 }
 
 }  // namespace knn
diff --git a/cpp/test/spatial/knn.cu b/cpp/test/spatial/knn.cu
index 35a82b1e53..49e5aaab4b 100644
--- a/cpp/test/spatial/knn.cu
+++ b/cpp/test/spatial/knn.cu
@@ -36,17 +36,17 @@ struct KNNInputs {
   std::vector<int> labels;
 };
 
-__global__ void build_actual_output(int *output, int n_rows, int k,
-                                    const int *idx_labels,
-                                    const int64_t *indices) {
+__global__ void build_actual_output(
+  int* output, int n_rows, int k, const int* idx_labels, const int64_t* indices)
+{
   int element = threadIdx.x + blockDim.x * blockIdx.x;
   if (element >= n_rows * k) return;
 
   output[element] = idx_labels[indices[element]];
 }
 
-__global__ void build_expected_output(int *output, int n_rows, int k,
-                                      const int *labels) {
+__global__ void build_expected_output(int* output, int n_rows, int k, const int* labels)
+{
   int row = threadIdx.x + blockDim.x * blockIdx.x;
   if (row >= n_rows) return;
 
@@ -68,23 +68,33 @@ class KNNTest : public ::testing::TestWithParam<KNNInputs> {
       search_data_(0, stream),
       indices_(0, stream),
       distances_(0, stream),
-      search_labels_(0, stream) {}
+      search_labels_(0, stream)
+  {
+  }
 
  protected:
-  void testBruteForce() {
-    raft::print_device_vector("Input array: ", input_.data(), rows_ * cols_,
-                              std::cout);
+  void testBruteForce()
+  {
+    raft::print_device_vector("Input array: ", input_.data(), rows_ * cols_, std::cout);
     std::cout << "K: " << k_ << "\n";
-    raft::print_device_vector("Labels array: ", search_labels_.data(), rows_,
-                              std::cout);
+    raft::print_device_vector("Labels array: ", search_labels_.data(), rows_, std::cout);
 
-    std::vector<float *> input_vec;
+    std::vector<float*> input_vec;
     std::vector<int> sizes_vec;
     input_vec.push_back(input_.data());
     sizes_vec.push_back(rows_);
 
-    brute_force_knn(handle, input_vec, sizes_vec, cols_, search_data_.data(),
-                    rows_, indices_.data(), distances_.data(), k_, true, true);
+    brute_force_knn(handle,
+                    input_vec,
+                    sizes_vec,
+                    cols_,
+                    search_data_.data(),
+                    rows_,
+                    indices_.data(),
+                    distances_.data(),
+                    k_,
+                    true,
+                    true);
 
     build_actual_output<<<raft::ceildiv(rows_ * k_, 32), 32, 0, stream>>>(
       actual_labels_.data(), rows_, k_, search_labels_.data(), indices_.data());
@@ -92,14 +102,15 @@ class KNNTest : public ::testing::TestWithParam<KNNInputs> {
     build_expected_output<<<raft::ceildiv(rows_ * k_, 32), 32, 0, stream>>>(
       expected_labels_.data(), rows_, k_, search_labels_.data());
 
-    ASSERT_TRUE(devArrMatch(expected_labels_.data(), actual_labels_.data(),
-                            rows_ * k_, raft::Compare<int>()));
+    ASSERT_TRUE(devArrMatch(
+      expected_labels_.data(), actual_labels_.data(), rows_ * k_, raft::Compare<int>()));
   }
 
-  void SetUp() override {
+  void SetUp() override
+  {
     rows_ = params_.input.size();
     cols_ = params_.input[0].size();
-    k_ = params_.k;
+    k_    = params_.k;
 
     actual_labels_.resize(rows_ * k_, stream);
     expected_labels_.resize(rows_ * k_, stream);
@@ -109,20 +120,17 @@ class KNNTest : public ::testing::TestWithParam<KNNInputs> {
     distances_.resize(rows_ * k_, stream);
     search_labels_.resize(rows_, stream);
 
-    CUDA_CHECK(cudaMemsetAsync(actual_labels_.data(), 0,
-                               actual_labels_.size() * sizeof(int), stream));
-    CUDA_CHECK(cudaMemsetAsync(expected_labels_.data(), 0,
-                               expected_labels_.size() * sizeof(int), stream));
     CUDA_CHECK(
-      cudaMemsetAsync(input_.data(), 0, input_.size() * sizeof(float), stream));
-    CUDA_CHECK(cudaMemsetAsync(search_data_.data(), 0,
-                               search_data_.size() * sizeof(float), stream));
-    CUDA_CHECK(cudaMemsetAsync(indices_.data(), 0,
-                               indices_.size() * sizeof(int64_t), stream));
-    CUDA_CHECK(cudaMemsetAsync(distances_.data(), 0,
-                               distances_.size() * sizeof(float), stream));
-    CUDA_CHECK(cudaMemsetAsync(search_labels_.data(), 0,
-                               search_labels_.size() * sizeof(int), stream));
+      cudaMemsetAsync(actual_labels_.data(), 0, actual_labels_.size() * sizeof(int), stream));
+    CUDA_CHECK(
+      cudaMemsetAsync(expected_labels_.data(), 0, expected_labels_.size() * sizeof(int), stream));
+    CUDA_CHECK(cudaMemsetAsync(input_.data(), 0, input_.size() * sizeof(float), stream));
+    CUDA_CHECK(
+      cudaMemsetAsync(search_data_.data(), 0, search_data_.size() * sizeof(float), stream));
+    CUDA_CHECK(cudaMemsetAsync(indices_.data(), 0, indices_.size() * sizeof(int64_t), stream));
+    CUDA_CHECK(cudaMemsetAsync(distances_.data(), 0, distances_.size() * sizeof(float), stream));
+    CUDA_CHECK(
+      cudaMemsetAsync(search_labels_.data(), 0, search_labels_.size() * sizeof(int), stream));
 
     std::vector<float> row_major_input;
     for (std::size_t i = 0; i < params_.input.size(); ++i) {
@@ -130,13 +138,13 @@ class KNNTest : public ::testing::TestWithParam<KNNInputs> {
         row_major_input.push_back(params_.input[i][j]);
       }
     }
-    rmm::device_buffer input_d = rmm::device_buffer(
-      row_major_input.data(), row_major_input.size() * sizeof(float), stream);
-    float *input_ptr = static_cast<float *>(input_d.data());
+    rmm::device_buffer input_d =
+      rmm::device_buffer(row_major_input.data(), row_major_input.size() * sizeof(float), stream);
+    float* input_ptr = static_cast<float*>(input_d.data());
 
-    rmm::device_buffer labels_d = rmm::device_buffer(
-      params_.labels.data(), params_.labels.size() * sizeof(int), stream);
-    int *labels_ptr = static_cast<int *>(labels_d.data());
+    rmm::device_buffer labels_d =
+      rmm::device_buffer(params_.labels.data(), params_.labels.size() * sizeof(int), stream);
+    int* labels_ptr = static_cast<int*>(labels_d.data());
 
     raft::copy(input_.data(), input_ptr, rows_ * cols_, stream);
     raft::copy(search_data_.data(), input_ptr, rows_ * cols_, stream);
diff --git a/cpp/test/spatial/selection.cu b/cpp/test/spatial/selection.cu
index 7742b9bd30..ad6d1e58d1 100644
--- a/cpp/test/spatial/selection.cu
+++ b/cpp/test/spatial/selection.cu
@@ -45,8 +45,9 @@ struct SparseSelectionInputs {
 };
 
 template <typename value_idx, typename value_t>
-::std::ostream &operator<<(
-  ::std::ostream &os, const SparseSelectionInputs<value_idx, value_t> &dims) {
+::std::ostream& operator<<(::std::ostream& os,
+                           const SparseSelectionInputs<value_idx, value_t>& dims)
+{
   return os;
 }
 
@@ -55,18 +56,20 @@ class SparseSelectionTest
   : public ::testing::TestWithParam<SparseSelectionInputs<value_idx, value_t>> {
  public:
   SparseSelectionTest()
-    : params(::testing::TestWithParam<
-             SparseSelectionInputs<value_idx, value_t>>::GetParam()),
+    : params(::testing::TestWithParam<SparseSelectionInputs<value_idx, value_t>>::GetParam()),
       stream(handle.get_stream()),
       dists(0, stream),
       inds(0, stream),
       out_indices_ref(0, stream),
       out_dists_ref(0, stream),
       out_dists(0, stream),
-      out_indices(0, stream) {}
+      out_indices(0, stream)
+  {
+  }
 
  protected:
-  void make_data() {
+  void make_data()
+  {
     std::vector<value_t> dists_h = params.dists_h;
 
     dists.resize(n_rows * n_cols, stream);
@@ -77,36 +80,43 @@ class SparseSelectionTest
     update_device(dists.data(), dists_h.data(), dists_h.size(), stream);
     iota_fill(inds.data(), n_rows, n_cols, stream);
 
-    std::vector<value_t> out_dists_ref_h = params.out_dists_ref_h;
+    std::vector<value_t> out_dists_ref_h     = params.out_dists_ref_h;
     std::vector<value_idx> out_indices_ref_h = params.out_indices_ref_h;
     out_indices_ref.resize(out_indices_ref_h.size(), stream);
     out_dists_ref.resize(out_dists_ref_h.size(), stream);
 
-    update_device(out_indices_ref.data(), out_indices_ref_h.data(),
-                  out_indices_ref_h.size(), stream);
-    update_device(out_dists_ref.data(), out_dists_ref_h.data(),
-                  out_dists_ref_h.size(), stream);
+    update_device(
+      out_indices_ref.data(), out_indices_ref_h.data(), out_indices_ref_h.size(), stream);
+    update_device(out_dists_ref.data(), out_dists_ref_h.data(), out_dists_ref_h.size(), stream);
   }
 
-  void SetUp() override {
+  void SetUp() override
+  {
     n_rows = params.n_rows;
     n_cols = params.n_cols;
-    k = params.k;
+    k      = params.k;
 
     make_data();
 
-    raft::spatial::knn::select_k(dists.data(), inds.data(), n_rows, n_cols,
-                                 out_dists.data(), out_indices.data(),
-                                 params.select_min, k, stream);
+    raft::spatial::knn::select_k(dists.data(),
+                                 inds.data(),
+                                 n_rows,
+                                 n_cols,
+                                 out_dists.data(),
+                                 out_indices.data(),
+                                 params.select_min,
+                                 k,
+                                 stream);
 
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
-  void compare() {
-    ASSERT_TRUE(devArrMatch(out_dists_ref.data(), out_dists.data(), n_rows * k,
-                            Compare<value_t>()));
-    ASSERT_TRUE(devArrMatch(out_indices_ref.data(), out_indices.data(),
-                            n_rows * k, Compare<value_idx>()));
+  void compare()
+  {
+    ASSERT_TRUE(
+      devArrMatch(out_dists_ref.data(), out_dists.data(), n_rows * k, Compare<value_t>()));
+    ASSERT_TRUE(
+      devArrMatch(out_indices_ref.data(), out_indices.data(), n_rows * k, Compare<value_idx>()));
   }
 
  protected:
@@ -141,7 +151,8 @@ const std::vector<SparseSelectionInputs<int, float>> inputs_i32_f = {
    true}};
 typedef SparseSelectionTest<int, float> SparseSelectionTestF;
 TEST_P(SparseSelectionTestF, Result) { compare(); }
-INSTANTIATE_TEST_CASE_P(SparseSelectionTest, SparseSelectionTestF,
+INSTANTIATE_TEST_CASE_P(SparseSelectionTest,
+                        SparseSelectionTestF,
                         ::testing::ValuesIn(inputs_i32_f));
 
 };  // end namespace selection
diff --git a/cpp/test/spatial/spatial_data.h b/cpp/test/spatial/spatial_data.h
index 87891164fc..dbb32c4546 100644
--- a/cpp/test/spatial/spatial_data.h
+++ b/cpp/test/spatial/spatial_data.h
@@ -5,23 +5,18 @@ namespace spatial {
 
 // Latitude and longitude coordinates of 51 US states / territories
 std::vector<float> spatial_data = {
-  63.588753, -154.493062, 32.318231, -86.902298,  35.20105,  -91.831833,
-  34.048928, -111.093731, 36.778261, -119.417932, 39.550051, -105.782067,
-  41.603221, -73.087749,  38.905985, -77.033418,  38.910832, -75.52767,
-  27.664827, -81.515754,  32.157435, -82.907123,  19.898682, -155.665857,
-  41.878003, -93.097702,  44.068202, -114.742041, 40.633125, -89.398528,
-  40.551217, -85.602364,  39.011902, -98.484246,  37.839333, -84.270018,
-  31.244823, -92.145024,  42.407211, -71.382437,  39.045755, -76.641271,
-  45.253783, -69.445469,  44.314844, -85.602364,  46.729553, -94.6859,
-  37.964253, -91.831833,  32.354668, -89.398528,  46.879682, -110.362566,
-  35.759573, -79.0193,    47.551493, -101.002012, 41.492537, -99.901813,
-  43.193852, -71.572395,  40.058324, -74.405661,  34.97273,  -105.032363,
-  38.80261,  -116.419389, 43.299428, -74.217933,  40.417287, -82.907123,
-  35.007752, -97.092877,  43.804133, -120.554201, 41.203322, -77.194525,
-  18.220833, -66.590149,  41.580095, -71.477429,  33.836081, -81.163725,
-  43.969515, -99.901813,  35.517491, -86.580447,  31.968599, -99.901813,
-  39.32098,  -111.093731, 37.431573, -78.656894,  44.558803, -72.577841,
-  47.751074, -120.740139, 43.78444,  -88.787868,  38.597626, -80.454903,
-  43.075968, -107.290284};
+  63.588753, -154.493062, 32.318231, -86.902298,  35.20105,  -91.831833,  34.048928, -111.093731,
+  36.778261, -119.417932, 39.550051, -105.782067, 41.603221, -73.087749,  38.905985, -77.033418,
+  38.910832, -75.52767,   27.664827, -81.515754,  32.157435, -82.907123,  19.898682, -155.665857,
+  41.878003, -93.097702,  44.068202, -114.742041, 40.633125, -89.398528,  40.551217, -85.602364,
+  39.011902, -98.484246,  37.839333, -84.270018,  31.244823, -92.145024,  42.407211, -71.382437,
+  39.045755, -76.641271,  45.253783, -69.445469,  44.314844, -85.602364,  46.729553, -94.6859,
+  37.964253, -91.831833,  32.354668, -89.398528,  46.879682, -110.362566, 35.759573, -79.0193,
+  47.551493, -101.002012, 41.492537, -99.901813,  43.193852, -71.572395,  40.058324, -74.405661,
+  34.97273,  -105.032363, 38.80261,  -116.419389, 43.299428, -74.217933,  40.417287, -82.907123,
+  35.007752, -97.092877,  43.804133, -120.554201, 41.203322, -77.194525,  18.220833, -66.590149,
+  41.580095, -71.477429,  33.836081, -81.163725,  43.969515, -99.901813,  35.517491, -86.580447,
+  31.968599, -99.901813,  39.32098,  -111.093731, 37.431573, -78.656894,  44.558803, -72.577841,
+  47.751074, -120.740139, 43.78444,  -88.787868,  38.597626, -80.454903,  43.075968, -107.290284};
 };  // namespace spatial
 };  // namespace raft
\ No newline at end of file
diff --git a/cpp/test/spectral_matrix.cu b/cpp/test/spectral_matrix.cu
index 388ad56f2d..fa54b04cda 100644
--- a/cpp/test/spectral_matrix.cu
+++ b/cpp/test/spectral_matrix.cu
@@ -32,7 +32,8 @@ struct csr_view_t {
   index_type number_of_edges;
 };
 }  // namespace
-TEST(Raft, SpectralMatrices) {
+TEST(Raft, SpectralMatrices)
+{
   using namespace matrix;
   using index_type = int;
   using value_type = double;
@@ -48,7 +49,7 @@ TEST(Raft, SpectralMatrices) {
   index_type* ro{nullptr};
   index_type* ci{nullptr};
   value_type* vs{nullptr};
-  index_type nnz = 0;
+  index_type nnz   = 0;
   index_type nrows = 0;
   sparse_matrix_t<index_type, value_type> sm1{h, ro, ci, vs, nrows, nnz};
   sparse_matrix_t<index_type, value_type> sm2{h, csr_v};
@@ -62,9 +63,7 @@ TEST(Raft, SpectralMatrices) {
   };
   EXPECT_ANY_THROW(cnstr_lm1());  // because of nullptr ptr args
 
-  auto cnstr_lm2 = [&h, &sm2](void) {
-    laplacian_matrix_t<index_type, value_type> lm2{h, sm2};
-  };
+  auto cnstr_lm2 = [&h, &sm2](void) { laplacian_matrix_t<index_type, value_type> lm2{h, sm2}; };
   EXPECT_ANY_THROW(cnstr_lm2());  // because of nullptr ptr args
 
   auto cnstr_mm1 = [&h, ro, ci, vs, nrows, nnz](void) {
@@ -72,9 +71,7 @@ TEST(Raft, SpectralMatrices) {
   };
   EXPECT_ANY_THROW(cnstr_mm1());  // because of nullptr ptr args
 
-  auto cnstr_mm2 = [&h, &sm2](void) {
-    modularity_matrix_t<index_type, value_type> mm2{h, sm2};
-  };
+  auto cnstr_mm2 = [&h, &sm2](void) { modularity_matrix_t<index_type, value_type> mm2{h, sm2}; };
   EXPECT_ANY_THROW(cnstr_mm2());  // because of nullptr ptr args
 }
 
diff --git a/cpp/test/stats/mean.cu b/cpp/test/stats/mean.cu
index cf866a5663..b8ea2cb799 100644
--- a/cpp/test/stats/mean.cu
+++ b/cpp/test/stats/mean.cu
@@ -35,7 +35,8 @@ struct MeanInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os, const MeanInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const MeanInputs<T>& dims)
+{
   return os;
 }
 
@@ -48,20 +49,23 @@ class MeanTest : public ::testing::TestWithParam<MeanInputs<T>> {
       rows(params.rows),
       cols(params.cols),
       data(rows * cols, stream),
-      mean_act(rows * cols, stream) {}
+      mean_act(rows * cols, stream)
+  {
+  }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     raft::random::Rng r(params.seed);
     int len = rows * cols;
     r.normal(data.data(), len, params.mean, (T)1.0, stream);
     meanSGtest(data.data(), stream);
   }
 
-  void meanSGtest(T *data, cudaStream_t stream) {
+  void meanSGtest(T* data, cudaStream_t stream)
+  {
     int rows = params.rows, cols = params.cols;
-    mean(mean_act.data(), data, cols, rows, params.sample, params.rowMajor,
-         stream);
+    mean(mean_act.data(), data, cols, rows, params.sample, params.rowMajor, stream);
   }
 
  protected:
@@ -76,52 +80,52 @@ class MeanTest : public ::testing::TestWithParam<MeanInputs<T>> {
 // Note: For 1024 samples, 256 experiments, a mean of 1.0 with stddev=1.0, the
 // measured mean (of a normal distribution) will fall outside of an epsilon of
 // 0.15 only 4/10000 times. (epsilon of 0.1 will fail 30/100 times)
-const std::vector<MeanInputs<float>> inputsf = {
-  {0.15f, 1.f, 1024, 32, true, false, 1234ULL},
-  {0.15f, 1.f, 1024, 64, true, false, 1234ULL},
-  {0.15f, 1.f, 1024, 128, true, false, 1234ULL},
-  {0.15f, 1.f, 1024, 256, true, false, 1234ULL},
-  {0.15f, -1.f, 1024, 32, false, false, 1234ULL},
-  {0.15f, -1.f, 1024, 64, false, false, 1234ULL},
-  {0.15f, -1.f, 1024, 128, false, false, 1234ULL},
-  {0.15f, -1.f, 1024, 256, false, false, 1234ULL},
-  {0.15f, 1.f, 1024, 32, true, true, 1234ULL},
-  {0.15f, 1.f, 1024, 64, true, true, 1234ULL},
-  {0.15f, 1.f, 1024, 128, true, true, 1234ULL},
-  {0.15f, 1.f, 1024, 256, true, true, 1234ULL},
-  {0.15f, -1.f, 1024, 32, false, true, 1234ULL},
-  {0.15f, -1.f, 1024, 64, false, true, 1234ULL},
-  {0.15f, -1.f, 1024, 128, false, true, 1234ULL},
-  {0.15f, -1.f, 1024, 256, false, true, 1234ULL}};
-
-const std::vector<MeanInputs<double>> inputsd = {
-  {0.15, 1.0, 1024, 32, true, false, 1234ULL},
-  {0.15, 1.0, 1024, 64, true, false, 1234ULL},
-  {0.15, 1.0, 1024, 128, true, false, 1234ULL},
-  {0.15, 1.0, 1024, 256, true, false, 1234ULL},
-  {0.15, -1.0, 1024, 32, false, false, 1234ULL},
-  {0.15, -1.0, 1024, 64, false, false, 1234ULL},
-  {0.15, -1.0, 1024, 128, false, false, 1234ULL},
-  {0.15, -1.0, 1024, 256, false, false, 1234ULL},
-  {0.15, 1.0, 1024, 32, true, true, 1234ULL},
-  {0.15, 1.0, 1024, 64, true, true, 1234ULL},
-  {0.15, 1.0, 1024, 128, true, true, 1234ULL},
-  {0.15, 1.0, 1024, 256, true, true, 1234ULL},
-  {0.15, -1.0, 1024, 32, false, true, 1234ULL},
-  {0.15, -1.0, 1024, 64, false, true, 1234ULL},
-  {0.15, -1.0, 1024, 128, false, true, 1234ULL},
-  {0.15, -1.0, 1024, 256, false, true, 1234ULL}};
+const std::vector<MeanInputs<float>> inputsf = {{0.15f, 1.f, 1024, 32, true, false, 1234ULL},
+                                                {0.15f, 1.f, 1024, 64, true, false, 1234ULL},
+                                                {0.15f, 1.f, 1024, 128, true, false, 1234ULL},
+                                                {0.15f, 1.f, 1024, 256, true, false, 1234ULL},
+                                                {0.15f, -1.f, 1024, 32, false, false, 1234ULL},
+                                                {0.15f, -1.f, 1024, 64, false, false, 1234ULL},
+                                                {0.15f, -1.f, 1024, 128, false, false, 1234ULL},
+                                                {0.15f, -1.f, 1024, 256, false, false, 1234ULL},
+                                                {0.15f, 1.f, 1024, 32, true, true, 1234ULL},
+                                                {0.15f, 1.f, 1024, 64, true, true, 1234ULL},
+                                                {0.15f, 1.f, 1024, 128, true, true, 1234ULL},
+                                                {0.15f, 1.f, 1024, 256, true, true, 1234ULL},
+                                                {0.15f, -1.f, 1024, 32, false, true, 1234ULL},
+                                                {0.15f, -1.f, 1024, 64, false, true, 1234ULL},
+                                                {0.15f, -1.f, 1024, 128, false, true, 1234ULL},
+                                                {0.15f, -1.f, 1024, 256, false, true, 1234ULL}};
+
+const std::vector<MeanInputs<double>> inputsd = {{0.15, 1.0, 1024, 32, true, false, 1234ULL},
+                                                 {0.15, 1.0, 1024, 64, true, false, 1234ULL},
+                                                 {0.15, 1.0, 1024, 128, true, false, 1234ULL},
+                                                 {0.15, 1.0, 1024, 256, true, false, 1234ULL},
+                                                 {0.15, -1.0, 1024, 32, false, false, 1234ULL},
+                                                 {0.15, -1.0, 1024, 64, false, false, 1234ULL},
+                                                 {0.15, -1.0, 1024, 128, false, false, 1234ULL},
+                                                 {0.15, -1.0, 1024, 256, false, false, 1234ULL},
+                                                 {0.15, 1.0, 1024, 32, true, true, 1234ULL},
+                                                 {0.15, 1.0, 1024, 64, true, true, 1234ULL},
+                                                 {0.15, 1.0, 1024, 128, true, true, 1234ULL},
+                                                 {0.15, 1.0, 1024, 256, true, true, 1234ULL},
+                                                 {0.15, -1.0, 1024, 32, false, true, 1234ULL},
+                                                 {0.15, -1.0, 1024, 64, false, true, 1234ULL},
+                                                 {0.15, -1.0, 1024, 128, false, true, 1234ULL},
+                                                 {0.15, -1.0, 1024, 256, false, true, 1234ULL}};
 
 typedef MeanTest<float> MeanTestF;
-TEST_P(MeanTestF, Result) {
-  ASSERT_TRUE(devArrMatch(params.mean, mean_act.data(), params.cols,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(MeanTestF, Result)
+{
+  ASSERT_TRUE(
+    devArrMatch(params.mean, mean_act.data(), params.cols, CompareApprox<float>(params.tolerance)));
 }
 
 typedef MeanTest<double> MeanTestD;
-TEST_P(MeanTestD, Result) {
-  ASSERT_TRUE(devArrMatch(params.mean, mean_act.data(), params.cols,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(MeanTestD, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    params.mean, mean_act.data(), params.cols, CompareApprox<double>(params.tolerance)));
 }
 
 INSTANTIATE_TEST_SUITE_P(MeanTests, MeanTestF, ::testing::ValuesIn(inputsf));
diff --git a/cpp/test/stats/mean_center.cu b/cpp/test/stats/mean_center.cu
index dcc4b4e551..6a76a289d7 100644
--- a/cpp/test/stats/mean_center.cu
+++ b/cpp/test/stats/mean_center.cu
@@ -34,37 +34,49 @@ struct MeanCenterInputs {
 };
 
 template <typename T, typename IdxType>
-::std::ostream &operator<<(::std::ostream &os,
-                           const MeanCenterInputs<T, IdxType> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const MeanCenterInputs<T, IdxType>& dims)
+{
   return os;
 }
 
 template <typename T, typename IdxType>
-class MeanCenterTest
-  : public ::testing::TestWithParam<MeanCenterInputs<T, IdxType>> {
+class MeanCenterTest : public ::testing::TestWithParam<MeanCenterInputs<T, IdxType>> {
  public:
   MeanCenterTest()
-    : params(
-        ::testing::TestWithParam<MeanCenterInputs<T, IdxType>>::GetParam()),
+    : params(::testing::TestWithParam<MeanCenterInputs<T, IdxType>>::GetParam()),
       stream(handle.get_stream()),
       rows(params.rows),
       cols(params.cols),
       out(rows * cols, stream),
       out_ref(rows * cols, stream),
       data(rows * cols, stream),
-      meanVec(params.bcastAlongRows ? cols : rows, stream) {}
+      meanVec(params.bcastAlongRows ? cols : rows, stream)
+  {
+  }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     raft::random::Rng r(params.seed);
     auto len = rows * cols;
     r.normal(data.data(), len, params.mean, (T)1.0, stream);
-    raft::stats::mean(meanVec.data(), data.data(), cols, rows, params.sample,
-                      params.rowMajor, stream);
-    meanCenter(out.data(), data.data(), meanVec.data(), cols, rows,
-               params.rowMajor, params.bcastAlongRows, stream);
-    raft::linalg::naiveMatVec(out_ref.data(), data.data(), meanVec.data(), cols,
-                              rows, params.rowMajor, params.bcastAlongRows,
+    raft::stats::mean(
+      meanVec.data(), data.data(), cols, rows, params.sample, params.rowMajor, stream);
+    meanCenter(out.data(),
+               data.data(),
+               meanVec.data(),
+               cols,
+               rows,
+               params.rowMajor,
+               params.bcastAlongRows,
+               stream);
+    raft::linalg::naiveMatVec(out_ref.data(),
+                              data.data(),
+                              meanVec.data(),
+                              cols,
+                              rows,
+                              params.rowMajor,
+                              params.bcastAlongRows,
                               (T)-1.0);
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
@@ -104,12 +116,12 @@ const std::vector<MeanCenterInputs<float, int>> inputsf_i32 = {
   {0.05f, -1.f, 1024, 64, false, true, false, 1234ULL},
   {0.05f, -1.f, 1024, 128, false, true, false, 1234ULL}};
 typedef MeanCenterTest<float, int> MeanCenterTestF_i32;
-TEST_P(MeanCenterTestF_i32, Result) {
-  ASSERT_TRUE(devArrMatch(out.data(), out_ref.data(), params.cols,
-                          raft::CompareApprox<float>(params.tolerance)));
+TEST_P(MeanCenterTestF_i32, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    out.data(), out_ref.data(), params.cols, raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i32,
-                         ::testing::ValuesIn(inputsf_i32));
+INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i32, ::testing::ValuesIn(inputsf_i32));
 
 const std::vector<MeanCenterInputs<float, size_t>> inputsf_i64 = {
   {0.05f, 1.f, 1024, 32, true, false, true, 1234ULL},
@@ -137,12 +149,12 @@ const std::vector<MeanCenterInputs<float, size_t>> inputsf_i64 = {
   {0.05f, -1.f, 1024, 64, false, true, false, 1234ULL},
   {0.05f, -1.f, 1024, 128, false, true, false, 1234ULL}};
 typedef MeanCenterTest<float, size_t> MeanCenterTestF_i64;
-TEST_P(MeanCenterTestF_i64, Result) {
-  ASSERT_TRUE(devArrMatch(out.data(), out_ref.data(), params.cols,
-                          raft::CompareApprox<float>(params.tolerance)));
+TEST_P(MeanCenterTestF_i64, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    out.data(), out_ref.data(), params.cols, raft::CompareApprox<float>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i64,
-                         ::testing::ValuesIn(inputsf_i64));
+INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestF_i64, ::testing::ValuesIn(inputsf_i64));
 
 const std::vector<MeanCenterInputs<double, int>> inputsd_i32 = {
   {0.05, 1.0, 1024, 32, true, false, true, 1234ULL},
@@ -170,12 +182,12 @@ const std::vector<MeanCenterInputs<double, int>> inputsd_i32 = {
   {0.05, -1.0, 1024, 64, false, true, false, 1234ULL},
   {0.05, -1.0, 1024, 128, false, true, false, 1234ULL}};
 typedef MeanCenterTest<double, int> MeanCenterTestD_i32;
-TEST_P(MeanCenterTestD_i32, Result) {
-  ASSERT_TRUE(devArrMatch(out.data(), out_ref.data(), params.cols,
-                          raft::CompareApprox<double>(params.tolerance)));
+TEST_P(MeanCenterTestD_i32, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    out.data(), out_ref.data(), params.cols, raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i32,
-                         ::testing::ValuesIn(inputsd_i32));
+INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i32, ::testing::ValuesIn(inputsd_i32));
 
 const std::vector<MeanCenterInputs<double, size_t>> inputsd_i64 = {
   {0.05, 1.0, 1024, 32, true, false, true, 1234ULL},
@@ -203,12 +215,12 @@ const std::vector<MeanCenterInputs<double, size_t>> inputsd_i64 = {
   {0.05, -1.0, 1024, 64, false, true, false, 1234ULL},
   {0.05, -1.0, 1024, 128, false, true, false, 1234ULL}};
 typedef MeanCenterTest<double, size_t> MeanCenterTestD_i64;
-TEST_P(MeanCenterTestD_i64, Result) {
-  ASSERT_TRUE(devArrMatch(out.data(), out_ref.data(), params.cols,
-                          raft::CompareApprox<double>(params.tolerance)));
+TEST_P(MeanCenterTestD_i64, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    out.data(), out_ref.data(), params.cols, raft::CompareApprox<double>(params.tolerance)));
 }
-INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i64,
-                         ::testing::ValuesIn(inputsd_i64));
+INSTANTIATE_TEST_SUITE_P(MeanCenterTests, MeanCenterTestD_i64, ::testing::ValuesIn(inputsd_i64));
 
 }  // end namespace stats
 }  // end namespace raft
diff --git a/cpp/test/stats/stddev.cu b/cpp/test/stats/stddev.cu
index 53f392aaf3..3efc54264e 100644
--- a/cpp/test/stats/stddev.cu
+++ b/cpp/test/stats/stddev.cu
@@ -34,7 +34,8 @@ struct StdDevInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os, const StdDevInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const StdDevInputs<T>& dims)
+{
   return os;
 }
 
@@ -49,10 +50,13 @@ class StdDevTest : public ::testing::TestWithParam<StdDevInputs<T>> {
       data(rows * cols, stream),
       mean_act(cols, stream),
       stddev_act(cols, stream),
-      vars_act(cols, stream) {}
+      vars_act(cols, stream)
+  {
+  }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     random::Rng r(params.seed);
     int len = rows * cols;
 
@@ -65,17 +69,17 @@ class StdDevTest : public ::testing::TestWithParam<StdDevInputs<T>> {
     CUDA_CHECK(cudaStreamSynchronize(stream));
   }
 
-  void stdVarSGtest(T *data, cudaStream_t stream) {
+  void stdVarSGtest(T* data, cudaStream_t stream)
+  {
     int rows = params.rows, cols = params.cols;
 
-    mean(mean_act.data(), data, cols, rows, params.sample, params.rowMajor,
-         stream);
+    mean(mean_act.data(), data, cols, rows, params.sample, params.rowMajor, stream);
 
-    stddev(stddev_act.data(), data, mean_act.data(), cols, rows, params.sample,
-           params.rowMajor, stream);
+    stddev(
+      stddev_act.data(), data, mean_act.data(), cols, rows, params.sample, params.rowMajor, stream);
 
-    vars(vars_act.data(), data, mean_act.data(), cols, rows, params.sample,
-         params.rowMajor, stream);
+    vars(
+      vars_act.data(), data, mean_act.data(), cols, rows, params.sample, params.rowMajor, stream);
 
     raft::matrix::seqRoot(vars_act.data(), T(1), cols, stream);
   }
@@ -126,28 +130,28 @@ const std::vector<StdDevInputs<double>> inputsd = {
   {0.1, -1.0, 2.0, 1024, 256, false, true, 1234ULL}};
 
 typedef StdDevTest<float> StdDevTestF;
-TEST_P(StdDevTestF, Result) {
-  ASSERT_TRUE(devArrMatch(params.stddev, stddev_act.data(), params.cols,
-                          CompareApprox<float>(params.tolerance)));
+TEST_P(StdDevTestF, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    params.stddev, stddev_act.data(), params.cols, CompareApprox<float>(params.tolerance)));
 
-  ASSERT_TRUE(devArrMatch(stddev_act.data(), vars_act.data(), params.cols,
-                          CompareApprox<float>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(
+    stddev_act.data(), vars_act.data(), params.cols, CompareApprox<float>(params.tolerance)));
 }
 
 typedef StdDevTest<double> StdDevTestD;
-TEST_P(StdDevTestD, Result) {
-  ASSERT_TRUE(devArrMatch(params.stddev, stddev_act.data(), params.cols,
-                          CompareApprox<double>(params.tolerance)));
+TEST_P(StdDevTestD, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    params.stddev, stddev_act.data(), params.cols, CompareApprox<double>(params.tolerance)));
 
-  ASSERT_TRUE(devArrMatch(stddev_act.data(), vars_act.data(), params.cols,
-                          CompareApprox<double>(params.tolerance)));
+  ASSERT_TRUE(devArrMatch(
+    stddev_act.data(), vars_act.data(), params.cols, CompareApprox<double>(params.tolerance)));
 }
 
-INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestF,
-                         ::testing::ValuesIn(inputsf));
+INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestF, ::testing::ValuesIn(inputsf));
 
-INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestD,
-                         ::testing::ValuesIn(inputsd));
+INSTANTIATE_TEST_SUITE_P(StdDevTests, StdDevTestD, ::testing::ValuesIn(inputsd));
 
 }  // end namespace stats
 }  // end namespace raft
diff --git a/cpp/test/stats/sum.cu b/cpp/test/stats/sum.cu
index ac4d642c8e..ecb1171ea5 100644
--- a/cpp/test/stats/sum.cu
+++ b/cpp/test/stats/sum.cu
@@ -32,7 +32,8 @@ struct SumInputs {
 };
 
 template <typename T>
-::std::ostream &operator<<(::std::ostream &os, const SumInputs<T> &dims) {
+::std::ostream& operator<<(::std::ostream& os, const SumInputs<T>& dims)
+{
   return os;
 }
 
@@ -45,10 +46,13 @@ class SumTest : public ::testing::TestWithParam<SumInputs<T>> {
       rows(params.rows),
       cols(params.cols),
       data(rows * cols, stream),
-      sum_act(cols, stream) {}
+      sum_act(cols, stream)
+  {
+  }
 
  protected:
-  void SetUp() override {
+  void SetUp() override
+  {
     int len = rows * cols;
 
     T data_h[len];
@@ -77,14 +81,17 @@ const std::vector<SumInputs<double>> inputsd = {{0.05, 1024, 32, 1234ULL},
                                                 {0.05, 1024, 256, 1234ULL}};
 
 typedef SumTest<float> SumTestF;
-TEST_P(SumTestF, Result) {
-  ASSERT_TRUE(raft::devArrMatch(float(params.rows), sum_act.data(), params.cols,
-                                raft::CompareApprox<float>(params.tolerance)));
+TEST_P(SumTestF, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(
+    float(params.rows), sum_act.data(), params.cols, raft::CompareApprox<float>(params.tolerance)));
 }
 
 typedef SumTest<double> SumTestD;
-TEST_P(SumTestD, Result) {
-  ASSERT_TRUE(raft::devArrMatch(double(params.rows), sum_act.data(),
+TEST_P(SumTestD, Result)
+{
+  ASSERT_TRUE(raft::devArrMatch(double(params.rows),
+                                sum_act.data(),
                                 params.cols,
                                 raft::CompareApprox<double>(params.tolerance)));
 }
diff --git a/cpp/test/test_utils.h b/cpp/test/test_utils.h
index 0f135c0121..58b9ae42ae 100644
--- a/cpp/test/test_utils.h
+++ b/cpp/test/test_utils.h
@@ -32,15 +32,16 @@ namespace raft {
 
 template <typename T>
 struct Compare {
-  bool operator()(const T &a, const T &b) const { return a == b; }
+  bool operator()(const T& a, const T& b) const { return a == b; }
 };
 
 template <typename T>
 struct CompareApprox {
   CompareApprox(T eps_) : eps(eps_) {}
-  bool operator()(const T &a, const T &b) const {
-    T diff = abs(a - b);
-    T m = std::max(abs(a), abs(b));
+  bool operator()(const T& a, const T& b) const
+  {
+    T diff  = abs(a - b);
+    T m     = std::max(abs(a), abs(b));
     T ratio = diff >= eps ? diff / m : diff;
 
     return (ratio <= eps);
@@ -53,9 +54,10 @@ struct CompareApprox {
 template <typename T>
 struct CompareApproxAbs {
   CompareApproxAbs(T eps_) : eps(eps_) {}
-  bool operator()(const T &a, const T &b) const {
-    T diff = abs(abs(a) - abs(b));
-    T m = std::max(abs(a), abs(b));
+  bool operator()(const T& a, const T& b) const
+  {
+    T diff  = abs(abs(a) - abs(b));
+    T m     = std::max(abs(a), abs(b));
     T ratio = diff >= eps ? diff / m : diff;
     return (ratio <= eps);
   }
@@ -65,25 +67,26 @@ struct CompareApproxAbs {
 };
 
 template <typename T>
-T abs(const T &a) {
+T abs(const T& a)
+{
   return a > T(0) ? a : -a;
 }
 
 /*
-     * @brief Helper function to compare 2 device n-D arrays with custom comparison
-     * @tparam T the data type of the arrays
-     * @tparam L the comparator lambda or object function
-     * @param expected expected value(s)
-     * @param actual actual values
-     * @param eq_compare the comparator
-     * @param stream cuda stream
-     * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
-     * @{
-     */
+ * @brief Helper function to compare 2 device n-D arrays with custom comparison
+ * @tparam T the data type of the arrays
+ * @tparam L the comparator lambda or object function
+ * @param expected expected value(s)
+ * @param actual actual values
+ * @param eq_compare the comparator
+ * @param stream cuda stream
+ * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
+ * @{
+ */
 template <typename T, typename L>
-testing::AssertionResult devArrMatch(const T *expected, const T *actual,
-                                     size_t size, L eq_compare,
-                                     cudaStream_t stream = 0) {
+testing::AssertionResult devArrMatch(
+  const T* expected, const T* actual, size_t size, L eq_compare, cudaStream_t stream = 0)
+{
   std::unique_ptr<T[]> exp_h(new T[size]);
   std::unique_ptr<T[]> act_h(new T[size]);
   raft::update_host<T>(exp_h.get(), expected, size, stream);
@@ -93,16 +96,16 @@ testing::AssertionResult devArrMatch(const T *expected, const T *actual,
     auto exp = exp_h.get()[i];
     auto act = act_h.get()[i];
     if (!eq_compare(exp, act)) {
-      return testing::AssertionFailure()
-             << "actual=" << act << " != expected=" << exp << " @" << i;
+      return testing::AssertionFailure() << "actual=" << act << " != expected=" << exp << " @" << i;
     }
   }
   return testing::AssertionSuccess();
 }
 
 template <typename T, typename L>
-testing::AssertionResult devArrMatch(T expected, const T *actual, size_t size,
-                                     L eq_compare, cudaStream_t stream = 0) {
+testing::AssertionResult devArrMatch(
+  T expected, const T* actual, size_t size, L eq_compare, cudaStream_t stream = 0)
+{
   std::unique_ptr<T[]> act_h(new T[size]);
   raft::update_host<T>(act_h.get(), actual, size, stream);
   CUDA_CHECK(cudaStreamSynchronize(stream));
@@ -117,9 +120,13 @@ testing::AssertionResult devArrMatch(T expected, const T *actual, size_t size,
 }
 
 template <typename T, typename L>
-testing::AssertionResult devArrMatch(const T *expected, const T *actual,
-                                     size_t rows, size_t cols, L eq_compare,
-                                     cudaStream_t stream = 0) {
+testing::AssertionResult devArrMatch(const T* expected,
+                                     const T* actual,
+                                     size_t rows,
+                                     size_t cols,
+                                     L eq_compare,
+                                     cudaStream_t stream = 0)
+{
   size_t size = rows * cols;
   std::unique_ptr<T[]> exp_h(new T[size]);
   std::unique_ptr<T[]> act_h(new T[size]);
@@ -133,8 +140,7 @@ testing::AssertionResult devArrMatch(const T *expected, const T *actual,
       auto act = act_h.get()[idx];
       if (!eq_compare(exp, act)) {
         return testing::AssertionFailure()
-               << "actual=" << act << " != expected=" << exp << " @" << i << ","
-               << j;
+               << "actual=" << act << " != expected=" << exp << " @" << i << "," << j;
       }
     }
   }
@@ -142,9 +148,9 @@ testing::AssertionResult devArrMatch(const T *expected, const T *actual,
 }
 
 template <typename T, typename L>
-testing::AssertionResult devArrMatch(T expected, const T *actual, size_t rows,
-                                     size_t cols, L eq_compare,
-                                     cudaStream_t stream = 0) {
+testing::AssertionResult devArrMatch(
+  T expected, const T* actual, size_t rows, size_t cols, L eq_compare, cudaStream_t stream = 0)
+{
   size_t size = rows * cols;
   std::unique_ptr<T[]> act_h(new T[size]);
   raft::update_host<T>(act_h.get(), actual, size, stream);
@@ -155,8 +161,7 @@ testing::AssertionResult devArrMatch(T expected, const T *actual, size_t rows,
       auto act = act_h.get()[idx];
       if (!eq_compare(expected, act)) {
         return testing::AssertionFailure()
-               << "actual=" << act << " != expected=" << expected << " @" << i
-               << "," << j;
+               << "actual=" << act << " != expected=" << expected << " @" << i << "," << j;
       }
     }
   }
@@ -164,24 +169,24 @@ testing::AssertionResult devArrMatch(T expected, const T *actual, size_t rows,
 }
 
 /*
-     * @brief Helper function to compare a device n-D arrays with an expected array
-     * on the host, using a custom comparison
-     * @tparam T the data type of the arrays
-     * @tparam L the comparator lambda or object function
-     * @param expected_h host array of expected value(s)
-     * @param actual_d device array actual values
-     * @param eq_compare the comparator
-     * @param stream cuda stream
-     * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
-     */
+ * @brief Helper function to compare a device n-D arrays with an expected array
+ * on the host, using a custom comparison
+ * @tparam T the data type of the arrays
+ * @tparam L the comparator lambda or object function
+ * @param expected_h host array of expected value(s)
+ * @param actual_d device array actual values
+ * @param eq_compare the comparator
+ * @param stream cuda stream
+ * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
+ */
 template <typename T, typename L>
-testing::AssertionResult devArrMatchHost(const T *expected_h, const T *actual_d,
-                                         size_t size, L eq_compare,
-                                         cudaStream_t stream = 0) {
+testing::AssertionResult devArrMatchHost(
+  const T* expected_h, const T* actual_d, size_t size, L eq_compare, cudaStream_t stream = 0)
+{
   std::unique_ptr<T[]> act_h(new T[size]);
   raft::update_host<T>(act_h.get(), actual_d, size, stream);
   CUDA_CHECK(cudaStreamSynchronize(stream));
-  bool ok = true;
+  bool ok   = true;
   auto fail = testing::AssertionFailure();
   for (size_t i(0); i < size; ++i) {
     auto exp = expected_h[i];
@@ -196,19 +201,19 @@ testing::AssertionResult devArrMatchHost(const T *expected_h, const T *actual_d,
 }
 
 /*
-     * @brief Helper function to compare diagonal values of a 2D matrix
-     * @tparam T the data type of the arrays
-     * @tparam L the comparator lambda or object function
-     * @param expected expected value along diagonal
-     * @param actual actual matrix
-     * @param eq_compare the comparator
-     * @param stream cuda stream
-     * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
-     */
+ * @brief Helper function to compare diagonal values of a 2D matrix
+ * @tparam T the data type of the arrays
+ * @tparam L the comparator lambda or object function
+ * @param expected expected value along diagonal
+ * @param actual actual matrix
+ * @param eq_compare the comparator
+ * @param stream cuda stream
+ * @return the testing assertion to be later used by ASSERT_TRUE/EXPECT_TRUE
+ */
 template <typename T, typename L>
-testing::AssertionResult diagonalMatch(T expected, const T *actual, size_t rows,
-                                       size_t cols, L eq_compare,
-                                       cudaStream_t stream = 0) {
+testing::AssertionResult diagonalMatch(
+  T expected, const T* actual, size_t rows, size_t cols, L eq_compare, cudaStream_t stream = 0)
+{
   size_t size = rows * cols;
   std::unique_ptr<T[]> act_h(new T[size]);
   raft::update_host<T>(act_h.get(), actual, size, stream);
@@ -220,8 +225,7 @@ testing::AssertionResult diagonalMatch(T expected, const T *actual, size_t rows,
       auto act = act_h.get()[idx];
       if (!eq_compare(expected, act)) {
         return testing::AssertionFailure()
-               << "actual=" << act << " != expected=" << expected << " @" << i
-               << "," << j;
+               << "actual=" << act << " != expected=" << expected << " @" << i << "," << j;
       }
     }
   }
@@ -229,10 +233,10 @@ testing::AssertionResult diagonalMatch(T expected, const T *actual, size_t rows,
 }
 
 template <typename T, typename L>
-testing::AssertionResult match(const T expected, T actual, L eq_compare) {
+testing::AssertionResult match(const T expected, T actual, L eq_compare)
+{
   if (!eq_compare(expected, actual)) {
-    return testing::AssertionFailure()
-           << "actual=" << actual << " != expected=" << expected;
+    return testing::AssertionFailure() << "actual=" << actual << " != expected=" << expected;
   }
   return testing::AssertionSuccess();
 }
@@ -256,8 +260,8 @@ testing::AssertionResult match(const T expected, T actual, L eq_compare) {
     ms /= args.runs;                                    \
   } while (0)
 
-inline std::vector<float> read_csv(std::string filename,
-                                   bool skip_first_n_columns = 1) {
+inline std::vector<float> read_csv(std::string filename, bool skip_first_n_columns = 1)
+{
   std::vector<float> result;
   std::ifstream myFile(filename);
   if (!myFile.is_open()) throw std::runtime_error("Could not open file");
@@ -268,8 +272,7 @@ inline std::vector<float> read_csv(std::string filename,
   if (myFile.good()) {
     std::getline(myFile, line);
     std::stringstream ss(line);
-    while (std::getline(ss, colname, ',')) {
-    }
+    while (std::getline(ss, colname, ',')) {}
   }
 
   int n_lines = 0;

From d7b4f0adf2a4f13ddcc7b7072ad0123137182f96 Mon Sep 17 00:00:00 2001
From: Conor Hoekstra <codereport@outlook.com>
Date: Wed, 24 Nov 2021 18:09:39 -0500
Subject: [PATCH 5/5] Missed change

---
 cpp/include/raft/spectral/partition.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/raft/spectral/partition.hpp b/cpp/include/raft/spectral/partition.hpp
index 88cc8aa8f0..b52bfcc0d6 100644
--- a/cpp/include/raft/spectral/partition.hpp
+++ b/cpp/include/raft/spectral/partition.hpp
@@ -79,7 +79,7 @@ std::tuple<vertex_t, weight_t, vertex_t> partition(handle_t const& handle,
 
   std::tuple<vertex_t, weight_t, vertex_t>
     stats;  //{iters_eig_solver,residual_cluster,iters_cluster_solver} // # iters eigen solver,
-            //cluster solver residual, # iters cluster solver
+            // cluster solver residual, # iters cluster solver
 
   vertex_t n = csr_m.nrows_;