From 94166bc4f3974012c538395ae94f9f85c1a4f1d9 Mon Sep 17 00:00:00 2001
From: Divye Gala <divyegala@gmail.com>
Date: Tue, 8 Feb 2022 04:19:53 -0500
Subject: [PATCH] Updating RAFT linalg headers (#4515)

Depends on https://github.com/rapidsai/raft/pull/383

Authors:
  - Divye Gala (https://github.com/divyegala)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/cuml/pull/4515
---
 cpp/bench/prims/add.cu                        |   4 +-
 cpp/bench/prims/fused_l2_nn.cu                |   2 +-
 cpp/bench/prims/gram_matrix.cu                |   3 +-
 cpp/bench/prims/map_then_reduce.cu            |   4 +-
 cpp/bench/prims/matrix_vector_op.cu           |   4 +-
 cpp/bench/prims/reduce.cu                     |   4 +-
 cpp/bench/sg/dataset.cuh                      |   4 +-
 cpp/bench/sg/linkage.cu                       |   2 +-
 cpp/include/cuml/cluster/dbscan.hpp           |   2 +-
 cpp/include/cuml/cluster/hdbscan.hpp          |   2 +-
 cpp/include/cuml/cluster/linkage.hpp          |   2 +-
 cpp/include/cuml/metrics/metrics.hpp          |   4 +-
 cpp/include/cuml/neighbors/knn.hpp            |   4 +-
 cpp/include/cuml/neighbors/knn_sparse.hpp     |   2 +-
 cpp/src/arima/batched_arima.cu                |   2 +-
 cpp/src/arima/batched_kalman.cu               |  83 +++---
 cpp/src/common/cumlHandle.cpp                 |   6 +-
 cpp/src/dbscan/vertexdeg/precomputed.cuh      |   4 +-
 .../decisiontree/batched-levelalgo/split.cuh  |   2 +-
 cpp/src/genetic/fitness.cuh                   |  10 +-
 cpp/src/genetic/genetic.cu                    |   4 +-
 cpp/src/genetic/program.cu                    |   2 +-
 cpp/src/glm/ols.cuh                           |   8 +-
 cpp/src/glm/ols_mg.cu                         |   4 +-
 cpp/src/glm/preprocess.cuh                    |   6 +-
 cpp/src/glm/preprocess_mg.cu                  |   4 +-
 cpp/src/glm/qn/glm_base.cuh                   |  10 +-
 cpp/src/glm/qn/glm_linear.cuh                 |   2 +-
 cpp/src/glm/qn/glm_logistic.cuh               |   2 +-
 cpp/src/glm/qn/glm_regularizer.cuh            |   4 +-
 cpp/src/glm/qn/glm_softmax.cuh                |   2 +-
 cpp/src/glm/qn/glm_svm.cuh                    |   2 +-
 cpp/src/glm/qn/simple_mat/dense.hpp           |  42 +--
 cpp/src/glm/qn/simple_mat/sparse.hpp          |   9 +-
 cpp/src/glm/ridge.cuh                         |  10 +-
 cpp/src/glm/ridge_mg.cu                       |   4 +-
 cpp/src/hdbscan/detail/reachability.cuh       |   2 +-
 cpp/src/hierarchy/pw_dist_graph.cuh           |   2 +-
 cpp/src/holtwinters/internal/hw_decompose.cuh | 114 +++++----
 cpp/src/holtwinters/internal/hw_utils.cuh     |   4 +-
 cpp/src/holtwinters/runner.cuh                |   9 +-
 cpp/src/kmeans/common.cuh                     |   8 +-
 cpp/src/metrics/silhouette_score.cu           |   2 +-
 cpp/src/pca/pca.cuh                           |   7 +-
 cpp/src/pca/pca_mg.cu                         |   2 +-
 cpp/src/random_projection/rproj.cuh           |  34 +--
 cpp/src/solver/cd.cuh                         |  15 +-
 cpp/src/solver/cd_mg.cu                       |  10 +-
 cpp/src/solver/lars_impl.cuh                  | 242 +++++++++---------
 cpp/src/solver/sgd.cuh                        |  13 +-
 cpp/src/svm/kernelcache.cuh                   |   2 +-
 cpp/src/svm/linear.cu                         |  13 +-
 cpp/src/svm/results.cuh                       |   7 +-
 cpp/src/svm/smosolver.cuh                     |  66 ++---
 cpp/src/svm/svc.cu                            |   3 +-
 cpp/src/svm/svc_impl.cuh                      |  32 +--
 cpp/src/svm/svr.cu                            |   3 +-
 cpp/src/svm/svr_impl.cuh                      |   3 +-
 cpp/src/svm/workingset.cuh                    |   4 +-
 cpp/src/tsne/barnes_hut_tsne.cuh              |   2 +-
 cpp/src/tsne/distances.cuh                    |   4 +-
 cpp/src/tsne/exact_kernels.cuh                |   2 +-
 cpp/src/tsne/fft_tsne.cuh                     |   2 +-
 cpp/src/tsne/utils.cuh                        |   4 +-
 cpp/src/tsvd/tsvd.cuh                         |  11 +-
 cpp/src/tsvd/tsvd_mg.cu                       |   2 +-
 cpp/src/umap/init_embed/spectral_algo.cuh     |   4 +-
 cpp/src/umap/knn_graph/algo.cuh               |   4 +-
 cpp/src/umap/optimize.cuh                     |   9 +-
 cpp/src/umap/simpl_set_embed/algo.cuh         |   2 +-
 .../distance/epsilon_neighborhood.cuh         |   2 +-
 cpp/src_prims/functions/hinge.cuh             |  15 +-
 cpp/src_prims/functions/linearReg.cuh         |  11 +-
 cpp/src_prims/functions/log.cuh               |   4 +-
 cpp/src_prims/functions/logisticReg.cuh       |  12 +-
 cpp/src_prims/functions/penalty.cuh           |   6 +-
 cpp/src_prims/functions/sigmoid.cuh           |   4 +-
 cpp/src_prims/functions/sign.cuh              |   4 +-
 cpp/src_prims/functions/softThres.cuh         |   4 +-
 cpp/src_prims/label/classlabels.cuh           |   2 +-
 cpp/src_prims/linalg/batched/matrix.cuh       |  79 +++---
 cpp/src_prims/linalg/lstsq.cuh                | 214 ++++++++--------
 cpp/src_prims/linalg/power.cuh                |   6 +-
 cpp/src_prims/linalg/rsvd.cuh                 |  12 +-
 cpp/src_prims/linalg/sqrt.cuh                 |   4 +-
 cpp/src_prims/matrix/grammatrix.cuh           |  67 ++---
 cpp/src_prims/matrix/kernelmatrices.cuh       |   2 +-
 cpp/src_prims/metrics/adjusted_rand_index.cuh |   4 +-
 .../metrics/batched/information_criterion.cuh |   4 +-
 cpp/src_prims/metrics/dispersion.cuh          |   2 +-
 cpp/src_prims/metrics/entropy.cuh             |   4 +-
 cpp/src_prims/metrics/kl_divergence.cuh       |   2 +-
 cpp/src_prims/metrics/mutual_info_score.cuh   |   2 +-
 cpp/src_prims/metrics/scores.cuh              |   4 +-
 cpp/src_prims/metrics/silhouette_score.cuh    |  12 +-
 cpp/src_prims/random/make_blobs.cuh           |   2 +-
 cpp/src_prims/random/make_regression.cuh      | 102 ++++----
 cpp/src_prims/random/mvg.cuh                  | 116 +++++----
 cpp/src_prims/selection/knn.cuh               |   2 +-
 cpp/src_prims/selection/processing.cuh        |   8 +-
 cpp/src_prims/sparse/batched/csr.cuh          |   1 -
 cpp/src_prims/stats/cov.cuh                   |  34 +--
 cpp/src_prims/stats/weighted_mean.cuh         |   6 +-
 cpp/src_prims/timeSeries/arima_helpers.cuh    |   4 +-
 cpp/src_prims/timeSeries/fillna.cuh           |   4 +-
 cpp/src_prims/timeSeries/jones_transform.cuh  |   2 +-
 cpp/src_prims/timeSeries/stationarity.cuh     |   5 +-
 cpp/test/mg/pca.cu                            |   1 -
 cpp/test/prims/add_sub_dev_scalar.cu          |   6 +-
 cpp/test/prims/batched/matrix.cu              |   2 +-
 cpp/test/prims/knn_regression.cu              |   3 +-
 cpp/test/prims/make_regression.cu             |  38 +--
 cpp/test/prims/mvg.cu                         |  33 +--
 cpp/test/prims/silhouette_score.cu            |   2 +-
 cpp/test/sg/cd_test.cu                        |   1 -
 cpp/test/sg/dbscan_test.cu                    |   5 +-
 cpp/test/sg/hdbscan_test.cu                   |   4 +-
 cpp/test/sg/lars_test.cu                      |  25 +-
 cpp/test/sg/linear_svm_test.cu                |   8 +-
 cpp/test/sg/linkage_test.cu                   |   4 +-
 cpp/test/sg/pca_test.cu                       |   1 -
 cpp/test/sg/quasi_newton.cu                   |   2 +-
 cpp/test/sg/rf_test.cu                        |   2 +-
 cpp/test/sg/rproj_test.cu                     |   2 +-
 cpp/test/sg/sgd.cu                            |   1 -
 cpp/test/sg/svc_test.cu                       |   6 +-
 cpp/test/sg/tsne_test.cu                      |   2 +-
 python/cuml/metrics/distance_type.pxd         |   4 +-
 python/cuml/metrics/trustworthiness.pyx       |   4 +-
 129 files changed, 944 insertions(+), 888 deletions(-)

diff --git a/cpp/bench/prims/add.cu b/cpp/bench/prims/add.cu
index 1665ad7656..5a9340cd2f 100644
--- a/cpp/bench/prims/add.cu
+++ b/cpp/bench/prims/add.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@
  */
 
 #include <common/ml_benchmark.hpp>
-#include <raft/linalg/add.cuh>
+#include <raft/linalg/add.hpp>
 
 namespace MLCommon {
 namespace Bench {
diff --git a/cpp/bench/prims/fused_l2_nn.cu b/cpp/bench/prims/fused_l2_nn.cu
index 174618c857..c949e119d3 100644
--- a/cpp/bench/prims/fused_l2_nn.cu
+++ b/cpp/bench/prims/fused_l2_nn.cu
@@ -19,7 +19,7 @@
 #include <raft/cudart_utils.h>
 #include <raft/distance/fused_l2_nn.hpp>
 #include <raft/handle.hpp>
-#include <raft/linalg/norm.cuh>
+#include <raft/linalg/norm.hpp>
 #include <raft/random/rng.hpp>
 #include <raft/spatial/knn/specializations.hpp>
 
diff --git a/cpp/bench/prims/gram_matrix.cu b/cpp/bench/prims/gram_matrix.cu
index 1e3ad2b5ca..d4a83a8e31 100644
--- a/cpp/bench/prims/gram_matrix.cu
+++ b/cpp/bench/prims/gram_matrix.cu
@@ -19,7 +19,8 @@
 #include <matrix/grammatrix.cuh>
 #include <matrix/kernelfactory.cuh>
 #include <memory>
-#include <raft/linalg/cublas_wrappers.h>
+// #TODO: Replace with public header when ready
+#include <raft/linalg/detail/cublas_wrappers.hpp>
 #include <raft/random/rng.hpp>
 #include <sstream>
 #include <string>
diff --git a/cpp/bench/prims/map_then_reduce.cu b/cpp/bench/prims/map_then_reduce.cu
index 6f451672ba..0520562f7b 100644
--- a/cpp/bench/prims/map_then_reduce.cu
+++ b/cpp/bench/prims/map_then_reduce.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@
  */
 
 #include <common/ml_benchmark.hpp>
-#include <raft/linalg/map_then_reduce.cuh>
+#include <raft/linalg/map_then_reduce.hpp>
 
 namespace MLCommon {
 namespace Bench {
diff --git a/cpp/bench/prims/matrix_vector_op.cu b/cpp/bench/prims/matrix_vector_op.cu
index 35cc0122d5..e117d96bb2 100644
--- a/cpp/bench/prims/matrix_vector_op.cu
+++ b/cpp/bench/prims/matrix_vector_op.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@
  */
 
 #include <common/ml_benchmark.hpp>
-#include <raft/linalg/matrix_vector_op.cuh>
+#include <raft/linalg/matrix_vector_op.hpp>
 
 namespace MLCommon {
 namespace Bench {
diff --git a/cpp/bench/prims/reduce.cu b/cpp/bench/prims/reduce.cu
index cb593c2a3d..bdfe17c62d 100644
--- a/cpp/bench/prims/reduce.cu
+++ b/cpp/bench/prims/reduce.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@
  */
 
 #include <common/ml_benchmark.hpp>
-#include <raft/linalg/reduce.cuh>
+#include <raft/linalg/reduce.hpp>
 
 namespace MLCommon {
 namespace Bench {
diff --git a/cpp/bench/sg/dataset.cuh b/cpp/bench/sg/dataset.cuh
index 133529c19e..de5bd470fa 100644
--- a/cpp/bench/sg/dataset.cuh
+++ b/cpp/bench/sg/dataset.cuh
@@ -22,8 +22,8 @@
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/handle.hpp>
-#include <raft/linalg/transpose.h>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/transpose.hpp>
+#include <raft/linalg/unary_op.hpp>
 #include <random/make_regression.cuh>
 #include <sstream>
 #include <string>
diff --git a/cpp/bench/sg/linkage.cu b/cpp/bench/sg/linkage.cu
index a6dc8305e9..1003b6cfdb 100644
--- a/cpp/bench/sg/linkage.cu
+++ b/cpp/bench/sg/linkage.cu
@@ -17,7 +17,7 @@
 #include "benchmark.cuh"
 #include <cuml/cluster/linkage.hpp>
 #include <cuml/common/logger.hpp>
-#include <raft/linalg/distance_type.h>
+#include <raft/distance/distance_type.hpp>
 #include <raft/sparse/hierarchy/common.h>
 #include <utility>
 
diff --git a/cpp/include/cuml/cluster/dbscan.hpp b/cpp/include/cuml/cluster/dbscan.hpp
index d5fe70e992..c71d8539f6 100644
--- a/cpp/include/cuml/cluster/dbscan.hpp
+++ b/cpp/include/cuml/cluster/dbscan.hpp
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <raft/linalg/distance_type.h>
+#include <raft/distance/distance_type.hpp>
 
 #include <cuml/common/log_levels.hpp>
 
diff --git a/cpp/include/cuml/cluster/hdbscan.hpp b/cpp/include/cuml/cluster/hdbscan.hpp
index 3fb6708312..6162799e7d 100644
--- a/cpp/include/cuml/cluster/hdbscan.hpp
+++ b/cpp/include/cuml/cluster/hdbscan.hpp
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <raft/linalg/distance_type.h>
+#include <raft/distance/distance_type.hpp>
 
 #include <raft/handle.hpp>
 
diff --git a/cpp/include/cuml/cluster/linkage.hpp b/cpp/include/cuml/cluster/linkage.hpp
index bac0b9218b..eb6e88ff81 100644
--- a/cpp/include/cuml/cluster/linkage.hpp
+++ b/cpp/include/cuml/cluster/linkage.hpp
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <raft/linalg/distance_type.h>
+#include <raft/distance/distance_type.hpp>
 #include <raft/sparse/hierarchy/common.h>
 
 #include <raft/handle.hpp>
diff --git a/cpp/include/cuml/metrics/metrics.hpp b/cpp/include/cuml/metrics/metrics.hpp
index 66d2459aaa..f1f9a3d218 100644
--- a/cpp/include/cuml/metrics/metrics.hpp
+++ b/cpp/include/cuml/metrics/metrics.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <raft/linalg/distance_type.h>
+#include <raft/distance/distance_type.hpp>
 
 #include <cstdint>
 
diff --git a/cpp/include/cuml/neighbors/knn.hpp b/cpp/include/cuml/neighbors/knn.hpp
index 08f726c6af..47af2ffaa0 100644
--- a/cpp/include/cuml/neighbors/knn.hpp
+++ b/cpp/include/cuml/neighbors/knn.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <raft/linalg/distance_type.h>
+#include <raft/distance/distance_type.hpp>
 #include <raft/spatial/knn/ann_common.h>
 #include <raft/spatial/knn/ball_cover_common.h>
 
diff --git a/cpp/include/cuml/neighbors/knn_sparse.hpp b/cpp/include/cuml/neighbors/knn_sparse.hpp
index 916d89567e..0d0e359eb0 100644
--- a/cpp/include/cuml/neighbors/knn_sparse.hpp
+++ b/cpp/include/cuml/neighbors/knn_sparse.hpp
@@ -19,7 +19,7 @@
 #include <cusparse_v2.h>
 
 #include <cuml/neighbors/knn.hpp>
-#include <raft/linalg/distance_type.h>
+#include <raft/distance/distance_type.hpp>
 
 namespace raft {
 class handle_t;
diff --git a/cpp/src/arima/batched_arima.cu b/cpp/src/arima/batched_arima.cu
index 86dffaabce..9bf5cf3225 100644
--- a/cpp/src/arima/batched_arima.cu
+++ b/cpp/src/arima/batched_arima.cu
@@ -35,7 +35,7 @@
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/handle.hpp>
-#include <raft/linalg/matrix_vector_op.cuh>
+#include <raft/linalg/matrix_vector_op.hpp>
 #include <rmm/device_uvector.hpp>
 #include <timeSeries/arima_helpers.cuh>
 #include <timeSeries/fillna.cuh>
diff --git a/cpp/src/arima/batched_kalman.cu b/cpp/src/arima/batched_kalman.cu
index 2dc76a7b8a..f0c042f9c4 100644
--- a/cpp/src/arima/batched_kalman.cu
+++ b/cpp/src/arima/batched_kalman.cu
@@ -26,8 +26,9 @@
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/handle.hpp>
-#include <raft/linalg/binary_op.cuh>
-#include <raft/linalg/cublas_wrappers.h>
+#include <raft/linalg/add.hpp>
+// #TODO: Replace with public header when ready
+#include <raft/linalg/detail/cublas_wrappers.hpp>
 #include <rmm/device_uvector.hpp>
 
 #include <linalg/batched/matrix.cuh>
@@ -1222,48 +1223,50 @@ void _batched_kalman_filter(raft::handle_t& handle,
 
     double alpha = 1.0;
     double beta  = 0.0;
-    RAFT_CUBLAS_TRY(raft::linalg::cublasgemmStridedBatched(cublasHandle,
-                                                           CUBLAS_OP_N,
-                                                           CUBLAS_OP_N,
-                                                           nobs,
-                                                           1,
-                                                           order.n_exog,
-                                                           &alpha,
-                                                           d_exog,
-                                                           nobs,
-                                                           nobs * order.n_exog,
-                                                           d_beta,
-                                                           order.n_exog,
-                                                           order.n_exog,
-                                                           &beta,
-                                                           obs_intercept.data(),
-                                                           nobs,
-                                                           nobs,
-                                                           batch_size,
-                                                           stream));
+    // #TODO: Call from public API when ready
+    RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemmStridedBatched(cublasHandle,
+                                                                   CUBLAS_OP_N,
+                                                                   CUBLAS_OP_N,
+                                                                   nobs,
+                                                                   1,
+                                                                   order.n_exog,
+                                                                   &alpha,
+                                                                   d_exog,
+                                                                   nobs,
+                                                                   nobs * order.n_exog,
+                                                                   d_beta,
+                                                                   order.n_exog,
+                                                                   order.n_exog,
+                                                                   &beta,
+                                                                   obs_intercept.data(),
+                                                                   nobs,
+                                                                   nobs,
+                                                                   batch_size,
+                                                                   stream));
 
     if (fc_steps > 0) {
       obs_intercept_fut.resize(fc_steps * batch_size, stream);
 
-      RAFT_CUBLAS_TRY(raft::linalg::cublasgemmStridedBatched(cublasHandle,
-                                                             CUBLAS_OP_N,
-                                                             CUBLAS_OP_N,
-                                                             fc_steps,
-                                                             1,
-                                                             order.n_exog,
-                                                             &alpha,
-                                                             d_exog_fut,
-                                                             fc_steps,
-                                                             fc_steps * order.n_exog,
-                                                             d_beta,
-                                                             order.n_exog,
-                                                             order.n_exog,
-                                                             &beta,
-                                                             obs_intercept_fut.data(),
-                                                             fc_steps,
-                                                             fc_steps,
-                                                             batch_size,
-                                                             stream));
+      // #TODO: Call from public API when ready
+      RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemmStridedBatched(cublasHandle,
+                                                                     CUBLAS_OP_N,
+                                                                     CUBLAS_OP_N,
+                                                                     fc_steps,
+                                                                     1,
+                                                                     order.n_exog,
+                                                                     &alpha,
+                                                                     d_exog_fut,
+                                                                     fc_steps,
+                                                                     fc_steps * order.n_exog,
+                                                                     d_beta,
+                                                                     order.n_exog,
+                                                                     order.n_exog,
+                                                                     &beta,
+                                                                     obs_intercept_fut.data(),
+                                                                     fc_steps,
+                                                                     fc_steps,
+                                                                     batch_size,
+                                                                     stream));
     }
   }
 
diff --git a/cpp/src/common/cumlHandle.cpp b/cpp/src/common/cumlHandle.cpp
index 6192b24a22..5133876a14 100644
--- a/cpp/src/common/cumlHandle.cpp
+++ b/cpp/src/common/cumlHandle.cpp
@@ -18,8 +18,10 @@
 
 #include <cuml/common/logger.hpp>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/cusolver_wrappers.h>
+// #TODO: Replace with public header when ready
+#include <raft/linalg/detail/cublas_wrappers.hpp>
+// #TODO: Replace with public header when ready
+#include <raft/linalg/detail/cusolver_wrappers.hpp>
 #include <raft/mr/device/allocator.hpp>
 #include <raft/mr/host/allocator.hpp>
 #include <raft/sparse/cusparse_wrappers.h>
diff --git a/cpp/src/dbscan/vertexdeg/precomputed.cuh b/cpp/src/dbscan/vertexdeg/precomputed.cuh
index 75e0886642..3cead4bac8 100644
--- a/cpp/src/dbscan/vertexdeg/precomputed.cuh
+++ b/cpp/src/dbscan/vertexdeg/precomputed.cuh
@@ -22,8 +22,8 @@
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/device_atomics.cuh>
-#include <raft/linalg/coalesced_reduction.cuh>
-#include <raft/linalg/reduce.cuh>
+#include <raft/linalg/coalesced_reduction.hpp>
+#include <raft/linalg/reduce.hpp>
 
 #include "pack.h"
 
diff --git a/cpp/src/decisiontree/batched-levelalgo/split.cuh b/cpp/src/decisiontree/batched-levelalgo/split.cuh
index ea80412cfc..e69fb81489 100644
--- a/cpp/src/decisiontree/batched-levelalgo/split.cuh
+++ b/cpp/src/decisiontree/batched-levelalgo/split.cuh
@@ -17,7 +17,7 @@
 #pragma once
 
 #include <raft/cuda_utils.cuh>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/unary_op.hpp>
 
 namespace ML {
 namespace DT {
diff --git a/cpp/src/genetic/fitness.cuh b/cpp/src/genetic/fitness.cuh
index fa32e198c1..a15fb96b54 100644
--- a/cpp/src/genetic/fitness.cuh
+++ b/cpp/src/genetic/fitness.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,10 +15,10 @@
  */
 
 #include <raft/cuda_utils.cuh>
-#include <raft/linalg/eltwise.cuh>
-#include <raft/linalg/matrix_vector_op.cuh>
-#include <raft/linalg/strided_reduction.cuh>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/eltwise.hpp>
+#include <raft/linalg/matrix_vector_op.hpp>
+#include <raft/linalg/strided_reduction.hpp>
+#include <raft/linalg/unary_op.hpp>
 #include <raft/matrix/math.hpp>
 #include <raft/stats/mean.hpp>
 #include <raft/stats/mean_center.hpp>
diff --git a/cpp/src/genetic/genetic.cu b/cpp/src/genetic/genetic.cu
index c4aa018f7f..ece6c6d81c 100644
--- a/cpp/src/genetic/genetic.cu
+++ b/cpp/src/genetic/genetic.cu
@@ -23,8 +23,8 @@
 
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/binary_op.cuh>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/add.hpp>
+#include <raft/linalg/unary_op.hpp>
 #include <raft/random/rng.hpp>
 
 #include <algorithm>
diff --git a/cpp/src/genetic/program.cu b/cpp/src/genetic/program.cu
index 0f62cedb96..cd69056ca9 100644
--- a/cpp/src/genetic/program.cu
+++ b/cpp/src/genetic/program.cu
@@ -18,7 +18,7 @@
 #include <cuml/genetic/node.h>
 #include <cuml/genetic/program.h>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/unary_op.hpp>
 #include <rmm/device_uvector.hpp>
 
 #include <algorithm>
diff --git a/cpp/src/glm/ols.cuh b/cpp/src/glm/ols.cuh
index 7bb3a10594..0cb009bd3b 100644
--- a/cpp/src/glm/ols.cuh
+++ b/cpp/src/glm/ols.cuh
@@ -17,10 +17,10 @@
 #pragma once
 
 #include <linalg/lstsq.cuh>
-#include <raft/linalg/add.cuh>
-#include <raft/linalg/gemv.h>
-#include <raft/linalg/norm.cuh>
-#include <raft/linalg/subtract.cuh>
+#include <raft/linalg/add.hpp>
+#include <raft/linalg/gemv.hpp>
+#include <raft/linalg/norm.hpp>
+#include <raft/linalg/subtract.hpp>
 #include <raft/matrix/math.hpp>
 #include <raft/matrix/matrix.hpp>
 #include <raft/stats/mean.hpp>
diff --git a/cpp/src/glm/ols_mg.cu b/cpp/src/glm/ols_mg.cu
index 7e0a1404b1..325566908e 100644
--- a/cpp/src/glm/ols_mg.cu
+++ b/cpp/src/glm/ols_mg.cu
@@ -22,8 +22,8 @@
 
 #include <raft/comms/comms.hpp>
 #include <raft/cuda_utils.cuh>
-#include <raft/linalg/add.cuh>
-#include <raft/linalg/gemm.cuh>
+#include <raft/linalg/add.hpp>
+#include <raft/linalg/gemm.hpp>
 #include <raft/matrix/math.hpp>
 #include <raft/matrix/matrix.hpp>
 #include <raft/mr/device/allocator.hpp>
diff --git a/cpp/src/glm/preprocess.cuh b/cpp/src/glm/preprocess.cuh
index 8ee77966c2..07e1a8cee5 100644
--- a/cpp/src/glm/preprocess.cuh
+++ b/cpp/src/glm/preprocess.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,8 +17,8 @@
 #pragma once
 
 #include <raft/cudart_utils.h>
-#include <raft/linalg/gemm.cuh>
-#include <raft/linalg/norm.cuh>
+#include <raft/linalg/gemm.hpp>
+#include <raft/linalg/norm.hpp>
 #include <raft/matrix/math.hpp>
 #include <raft/matrix/matrix.hpp>
 #include <raft/stats/mean.hpp>
diff --git a/cpp/src/glm/preprocess_mg.cu b/cpp/src/glm/preprocess_mg.cu
index 3769bedd78..655af3b9be 100644
--- a/cpp/src/glm/preprocess_mg.cu
+++ b/cpp/src/glm/preprocess_mg.cu
@@ -23,8 +23,8 @@
 #include <raft/comms/comms.hpp>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/gemm.cuh>
-#include <raft/linalg/subtract.cuh>
+#include <raft/linalg/gemm.hpp>
+#include <raft/linalg/subtract.hpp>
 #include <raft/matrix/math.hpp>
 #include <rmm/device_uvector.hpp>
 
diff --git a/cpp/src/glm/qn/glm_base.cuh b/cpp/src/glm/qn/glm_base.cuh
index 597d4ad2b0..126855637e 100644
--- a/cpp/src/glm/qn/glm_base.cuh
+++ b/cpp/src/glm/qn/glm_base.cuh
@@ -19,12 +19,10 @@
 #include "simple_mat.cuh"
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/add.cuh>
-#include <raft/linalg/binary_op.cuh>
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/map.cuh>
-#include <raft/linalg/map_then_reduce.cuh>
-#include <raft/linalg/matrix_vector_op.cuh>
+#include <raft/linalg/add.hpp>
+#include <raft/linalg/map.hpp>
+#include <raft/linalg/map_then_reduce.hpp>
+#include <raft/linalg/matrix_vector_op.hpp>
 #include <raft/stats/mean.hpp>
 #include <vector>
 
diff --git a/cpp/src/glm/qn/glm_linear.cuh b/cpp/src/glm/qn/glm_linear.cuh
index 664f25b2b2..11df1d5833 100644
--- a/cpp/src/glm/qn/glm_linear.cuh
+++ b/cpp/src/glm/qn/glm_linear.cuh
@@ -19,7 +19,7 @@
 #include "glm_base.cuh"
 #include "simple_mat.cuh"
 #include <raft/cuda_utils.cuh>
-#include <raft/linalg/binary_op.cuh>
+#include <raft/linalg/add.hpp>
 
 namespace ML {
 namespace GLM {
diff --git a/cpp/src/glm/qn/glm_logistic.cuh b/cpp/src/glm/qn/glm_logistic.cuh
index 01f732df05..5e76da4843 100644
--- a/cpp/src/glm/qn/glm_logistic.cuh
+++ b/cpp/src/glm/qn/glm_logistic.cuh
@@ -19,7 +19,7 @@
 #include "glm_base.cuh"
 #include "simple_mat.cuh"
 #include <raft/cuda_utils.cuh>
-#include <raft/linalg/binary_op.cuh>
+#include <raft/linalg/add.hpp>
 
 namespace ML {
 namespace GLM {
diff --git a/cpp/src/glm/qn/glm_regularizer.cuh b/cpp/src/glm/qn/glm_regularizer.cuh
index 60958d2a9f..9e4aa7067b 100644
--- a/cpp/src/glm/qn/glm_regularizer.cuh
+++ b/cpp/src/glm/qn/glm_regularizer.cuh
@@ -19,8 +19,8 @@
 #include "simple_mat.cuh"
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/binary_op.cuh>
-#include <raft/linalg/map_then_reduce.cuh>
+#include <raft/linalg/add.hpp>
+#include <raft/linalg/map_then_reduce.hpp>
 #include <raft/stats/mean.hpp>
 
 namespace ML {
diff --git a/cpp/src/glm/qn/glm_softmax.cuh b/cpp/src/glm/qn/glm_softmax.cuh
index 7b80ae61af..91a18f15b5 100644
--- a/cpp/src/glm/qn/glm_softmax.cuh
+++ b/cpp/src/glm/qn/glm_softmax.cuh
@@ -19,7 +19,7 @@
 #include "glm_base.cuh"
 #include "simple_mat.cuh"
 #include <raft/cuda_utils.cuh>
-#include <raft/linalg/binary_op.cuh>
+#include <raft/linalg/add.hpp>
 
 namespace ML {
 namespace GLM {
diff --git a/cpp/src/glm/qn/glm_svm.cuh b/cpp/src/glm/qn/glm_svm.cuh
index 04d41b8c3e..fa71377760 100644
--- a/cpp/src/glm/qn/glm_svm.cuh
+++ b/cpp/src/glm/qn/glm_svm.cuh
@@ -19,7 +19,7 @@
 #include "glm_base.cuh"
 #include "simple_mat.cuh"
 #include <raft/cuda_utils.cuh>
-#include <raft/linalg/binary_op.cuh>
+#include <raft/linalg/add.hpp>
 
 namespace ML {
 namespace GLM {
diff --git a/cpp/src/glm/qn/simple_mat/dense.hpp b/cpp/src/glm/qn/simple_mat/dense.hpp
index d87a8765cf..efd6de68a5 100644
--- a/cpp/src/glm/qn/simple_mat/dense.hpp
+++ b/cpp/src/glm/qn/simple_mat/dense.hpp
@@ -23,11 +23,12 @@
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/handle.hpp>
-#include <raft/linalg/binary_op.cuh>
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/map_then_reduce.cuh>
-#include <raft/linalg/norm.cuh>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/add.hpp>
+// #TODO: Replace with public header when ready
+#include <raft/linalg/detail/cublas_wrappers.hpp>
+#include <raft/linalg/map_then_reduce.hpp>
+#include <raft/linalg/norm.hpp>
+#include <raft/linalg/unary_op.hpp>
 #include <rmm/device_uvector.hpp>
 
 namespace ML {
@@ -89,21 +90,22 @@ struct SimpleDenseMat : SimpleMat<T> {
     ASSERT(kA == kB, "GEMM invalid dims: k");
 
     if (A.ord == COL_MAJOR && B.ord == COL_MAJOR && C.ord == COL_MAJOR) {
-      raft::linalg::cublasgemm(handle.get_cublas_handle(),          // handle
-                               transA ? CUBLAS_OP_T : CUBLAS_OP_N,  // transA
-                               transB ? CUBLAS_OP_T : CUBLAS_OP_N,  // transB
-                               C.m,
-                               C.n,
-                               kA,  // dimensions m,n,k
-                               &alpha,
-                               A.data,
-                               A.m,  // lda
-                               B.data,
-                               B.m,  // ldb
-                               &beta,
-                               C.data,
-                               C.m,  // ldc,
-                               stream);
+      // #TODO: Call from public API when ready
+      raft::linalg::detail::cublasgemm(handle.get_cublas_handle(),          // handle
+                                       transA ? CUBLAS_OP_T : CUBLAS_OP_N,  // transA
+                                       transB ? CUBLAS_OP_T : CUBLAS_OP_N,  // transB
+                                       C.m,
+                                       C.n,
+                                       kA,  // dimensions m,n,k
+                                       &alpha,
+                                       A.data,
+                                       A.m,  // lda
+                                       B.data,
+                                       B.m,  // ldb
+                                       &beta,
+                                       C.data,
+                                       C.m,  // ldc
+                                       stream);
       return;
     }
     if (A.ord == ROW_MAJOR) {
diff --git a/cpp/src/glm/qn/simple_mat/sparse.hpp b/cpp/src/glm/qn/simple_mat/sparse.hpp
index ccc46b3e6d..0cfa750338 100644
--- a/cpp/src/glm/qn/simple_mat/sparse.hpp
+++ b/cpp/src/glm/qn/simple_mat/sparse.hpp
@@ -23,11 +23,10 @@
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/handle.hpp>
-#include <raft/linalg/binary_op.cuh>
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/map_then_reduce.cuh>
-#include <raft/linalg/norm.cuh>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/add.hpp>
+#include <raft/linalg/map_then_reduce.hpp>
+#include <raft/linalg/norm.hpp>
+#include <raft/linalg/unary_op.hpp>
 #include <raft/mr/device/allocator.hpp>
 #include <rmm/device_uvector.hpp>
 
diff --git a/cpp/src/glm/ridge.cuh b/cpp/src/glm/ridge.cuh
index b6d855f571..1ebeabfe10 100644
--- a/cpp/src/glm/ridge.cuh
+++ b/cpp/src/glm/ridge.cuh
@@ -17,11 +17,11 @@
 #pragma once
 
 #include <raft/cudart_utils.h>
-#include <raft/linalg/add.cuh>
-#include <raft/linalg/gemm.cuh>
-#include <raft/linalg/norm.cuh>
-#include <raft/linalg/subtract.cuh>
-#include <raft/linalg/svd.cuh>
+#include <raft/linalg/add.hpp>
+#include <raft/linalg/gemm.hpp>
+#include <raft/linalg/norm.hpp>
+#include <raft/linalg/subtract.hpp>
+#include <raft/linalg/svd.hpp>
 #include <raft/matrix/math.hpp>
 #include <raft/matrix/matrix.hpp>
 #include <raft/stats/mean.hpp>
diff --git a/cpp/src/glm/ridge_mg.cu b/cpp/src/glm/ridge_mg.cu
index b73137e4cc..3710eef28b 100644
--- a/cpp/src/glm/ridge_mg.cu
+++ b/cpp/src/glm/ridge_mg.cu
@@ -24,8 +24,8 @@
 #include <raft/comms/comms.hpp>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/add.cuh>
-#include <raft/linalg/gemm.cuh>
+#include <raft/linalg/add.hpp>
+#include <raft/linalg/gemm.hpp>
 #include <raft/matrix/math.hpp>
 #include <raft/matrix/matrix.hpp>
 
diff --git a/cpp/src/hdbscan/detail/reachability.cuh b/cpp/src/hdbscan/detail/reachability.cuh
index 26080dee8f..dff60121ba 100644
--- a/cpp/src/hdbscan/detail/reachability.cuh
+++ b/cpp/src/hdbscan/detail/reachability.cuh
@@ -23,7 +23,7 @@
 
 #include <raft/mr/device/buffer.hpp>
 
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/unary_op.hpp>
 
 #include <raft/sparse/convert/csr.hpp>
 #include <raft/sparse/hierarchy/detail/connectivities.cuh>
diff --git a/cpp/src/hierarchy/pw_dist_graph.cuh b/cpp/src/hierarchy/pw_dist_graph.cuh
index dca5f100ba..1c45a66af8 100644
--- a/cpp/src/hierarchy/pw_dist_graph.cuh
+++ b/cpp/src/hierarchy/pw_dist_graph.cuh
@@ -24,7 +24,7 @@
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
-#include <raft/linalg/distance_type.h>
+#include <raft/distance/distance_type.hpp>
 #include <raft/mr/device/buffer.hpp>
 
 // TODO: Not a good strategy for pluggability but will be
diff --git a/cpp/src/holtwinters/internal/hw_decompose.cuh b/cpp/src/holtwinters/internal/hw_decompose.cuh
index 5ffbeba9a5..d94cf30ec3 100644
--- a/cpp/src/holtwinters/internal/hw_decompose.cuh
+++ b/cpp/src/holtwinters/internal/hw_decompose.cuh
@@ -18,6 +18,10 @@
 
 #include <raft/cudart_utils.h>
 #include <raft/handle.hpp>
+// #TODO: Replace with public header when ready
+#include <raft/linalg/detail/cublas_wrappers.hpp>
+// #TODO: Replace with public header when ready
+#include <raft/linalg/detail/cusolver_wrappers.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 
@@ -180,58 +184,63 @@ void batched_ls(const raft::handle_t& handle,
   }
   raft::update_device(A_d.data(), A_h.data(), 2 * trend_len, stream);
 
-  RAFT_CUSOLVER_TRY(raft::linalg::cusolverDngeqrf_bufferSize<Dtype>(
+  // #TODO: Call from public API when ready
+  RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngeqrf_bufferSize<Dtype>(
     cusolver_h, trend_len, 2, A_d.data(), 2, &geqrf_buffer));
 
-  RAFT_CUSOLVER_TRY(raft::linalg::cusolverDnorgqr_bufferSize<Dtype>(
+  // #TODO: Call from public API when ready
+  RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnorgqr_bufferSize<Dtype>(
     cusolver_h, trend_len, 2, 2, A_d.data(), 2, tau_d.data(), &orgqr_buffer));
 
   lwork_size = geqrf_buffer > orgqr_buffer ? geqrf_buffer : orgqr_buffer;
   rmm::device_uvector<Dtype> lwork_d(lwork_size, stream);
 
   // QR decomposition of A
-  RAFT_CUSOLVER_TRY(raft::linalg::cusolverDngeqrf<Dtype>(cusolver_h,
-                                                         trend_len,
-                                                         2,
-                                                         A_d.data(),
-                                                         trend_len,
-                                                         tau_d.data(),
-                                                         lwork_d.data(),
-                                                         lwork_size,
-                                                         dev_info_d.data(),
-                                                         stream));
+  // #TODO: Call from public API when ready
+  RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngeqrf<Dtype>(cusolver_h,
+                                                                 trend_len,
+                                                                 2,
+                                                                 A_d.data(),
+                                                                 trend_len,
+                                                                 tau_d.data(),
+                                                                 lwork_d.data(),
+                                                                 lwork_size,
+                                                                 dev_info_d.data(),
+                                                                 stream));
 
   // Single thread kenrel to inverse R
   RinvKernel<Dtype><<<1, 1, 0, stream>>>(A_d.data(), Rinv_d.data(), trend_len);
 
   // R1QT = inv(R)*transpose(Q)
-  RAFT_CUSOLVER_TRY(raft::linalg::cusolverDnorgqr<Dtype>(cusolver_h,
-                                                         trend_len,
-                                                         2,
-                                                         2,
-                                                         A_d.data(),
-                                                         trend_len,
-                                                         tau_d.data(),
-                                                         lwork_d.data(),
-                                                         lwork_size,
-                                                         dev_info_d.data(),
-                                                         stream));
+  // #TODO: Call from public API when ready
+  RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnorgqr<Dtype>(cusolver_h,
+                                                                 trend_len,
+                                                                 2,
+                                                                 2,
+                                                                 A_d.data(),
+                                                                 trend_len,
+                                                                 tau_d.data(),
+                                                                 lwork_d.data(),
+                                                                 lwork_size,
+                                                                 dev_info_d.data(),
+                                                                 stream));
 
-  RAFT_CUBLAS_TRY(raft::linalg::cublasgemm<Dtype>(cublas_h,
-                                                  CUBLAS_OP_N,
-                                                  CUBLAS_OP_T,
-                                                  2,
-                                                  trend_len,
-                                                  2,
-                                                  &one,
-                                                  Rinv_d.data(),
-                                                  2,
-                                                  A_d.data(),
-                                                  trend_len,
-                                                  &zero,
-                                                  R1Qt_d.data(),
-                                                  2,
-                                                  stream));
+  // #TODO: Call from public API when ready
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm<Dtype>(cublas_h,
+                                                          CUBLAS_OP_N,
+                                                          CUBLAS_OP_T,
+                                                          2,
+                                                          trend_len,
+                                                          2,
+                                                          &one,
+                                                          Rinv_d.data(),
+                                                          2,
+                                                          A_d.data(),
+                                                          trend_len,
+                                                          &zero,
+                                                          R1Qt_d.data(),
+                                                          2,
+                                                          stream));
 
   batched_ls_solver_kernel<Dtype>
     <<<GET_NUM_BLOCKS(batch_size), GET_THREADS_PER_BLOCK(batch_size), 0, stream>>>(
@@ -277,20 +286,21 @@ void stl_decomposition_gpu(const raft::handle_t& handle,
   if (seasonal == ML::SeasonalType::ADDITIVE) {
     const Dtype one       = 1.;
     const Dtype minus_one = -1.;
-    RAFT_CUBLAS_TRY(raft::linalg::cublasgeam<Dtype>(cublas_h,
-                                                    CUBLAS_OP_N,
-                                                    CUBLAS_OP_N,
-                                                    trend_len,
-                                                    batch_size,
-                                                    &one,
-                                                    ts + ts_offset,
-                                                    trend_len,
-                                                    &minus_one,
-                                                    trend_d.data(),
-                                                    trend_len,
-                                                    season_d.data(),
-                                                    trend_len,
-                                                    stream));
+    // #TODO: Call from public API when ready
+    RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgeam<Dtype>(cublas_h,
+                                                            CUBLAS_OP_N,
+                                                            CUBLAS_OP_N,
+                                                            trend_len,
+                                                            batch_size,
+                                                            &one,
+                                                            ts + ts_offset,
+                                                            trend_len,
+                                                            &minus_one,
+                                                            trend_d.data(),
+                                                            trend_len,
+                                                            season_d.data(),
+                                                            trend_len,
+                                                            stream));
   } else {
     rmm::device_uvector<Dtype> aligned_ts(batch_size * trend_len, stream);
     raft::copy(aligned_ts.data(), ts + ts_offset, batch_size * trend_len, stream);
diff --git a/cpp/src/holtwinters/internal/hw_utils.cuh b/cpp/src/holtwinters/internal/hw_utils.cuh
index a32f3d7591..ca2c578ad1 100644
--- a/cpp/src/holtwinters/internal/hw_utils.cuh
+++ b/cpp/src/holtwinters/internal/hw_utils.cuh
@@ -19,9 +19,7 @@
 #include <cuml/tsa/holtwinters_params.h>
 #include <iostream>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/cusolver_wrappers.h>
-#include <raft/linalg/eltwise.cuh>
+#include <raft/linalg/eltwise.hpp>
 #include <raft/mr/device/allocator.hpp>
 #include <vector>
 
diff --git a/cpp/src/holtwinters/runner.cuh b/cpp/src/holtwinters/runner.cuh
index 59f6059e16..e06bd50543 100644
--- a/cpp/src/holtwinters/runner.cuh
+++ b/cpp/src/holtwinters/runner.cuh
@@ -22,7 +22,9 @@
 #include "internal/hw_optim.cuh"
 #include <cuml/tsa/holtwinters_params.h>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/transpose.h>
+// #TODO: Replace with public header when ready
+#include <raft/linalg/detail/cublas_wrappers.hpp>
+#include <raft/linalg/transpose.hpp>
 #include <rmm/device_uvector.hpp>
 
 namespace ML {
@@ -89,8 +91,9 @@ void HoltWintersDecompose(const raft::handle_t& handle,
     raft::copy(start_level, ts + batch_size, batch_size, stream);
     raft::copy(start_trend, ts + batch_size, batch_size, stream);
     const Dtype alpha = -1.;
-    RAFT_CUBLAS_TRY(
-      raft::linalg::cublasaxpy(cublas_h, batch_size, &alpha, ts, 1, start_trend, 1, stream));
+    // #TODO: Call from public API when ready
+    RAFT_CUBLAS_TRY(raft::linalg::detail::cublasaxpy(
+      cublas_h, batch_size, &alpha, ts, 1, start_trend, 1, stream));
     // cublas::axpy(batch_size, (Dtype)-1., ts, start_trend);
   } else if (start_level != nullptr && start_trend != nullptr && start_season != nullptr) {
     stl_decomposition_gpu(handle_impl,
diff --git a/cpp/src/kmeans/common.cuh b/cpp/src/kmeans/common.cuh
index 125a1a82b5..811e155439 100644
--- a/cpp/src/kmeans/common.cuh
+++ b/cpp/src/kmeans/common.cuh
@@ -32,10 +32,10 @@
 #include <raft/comms/comms.hpp>
 #include <raft/cudart_utils.h>
 #include <raft/distance/fused_l2_nn.hpp>
-#include <raft/linalg/binary_op.cuh>
-#include <raft/linalg/matrix_vector_op.cuh>
-#include <raft/linalg/mean_squared_error.cuh>
-#include <raft/linalg/reduce.cuh>
+#include <raft/linalg/add.hpp>
+#include <raft/linalg/matrix_vector_op.hpp>
+#include <raft/linalg/mean_squared_error.hpp>
+#include <raft/linalg/reduce.hpp>
 #include <raft/random/rng.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
diff --git a/cpp/src/metrics/silhouette_score.cu b/cpp/src/metrics/silhouette_score.cu
index 4c3e7ccc87..c80fe099f1 100644
--- a/cpp/src/metrics/silhouette_score.cu
+++ b/cpp/src/metrics/silhouette_score.cu
@@ -18,7 +18,7 @@
 #include <cuml/metrics/metrics.hpp>
 #include <metrics/batched/silhouette_score.cuh>
 #include <metrics/silhouette_score.cuh>
-#include <raft/linalg/distance_type.h>
+#include <raft/distance/distance_type.hpp>
 
 namespace ML {
 
diff --git a/cpp/src/pca/pca.cuh b/cpp/src/pca/pca.cuh
index 0e886a1ef6..9261ea127c 100644
--- a/cpp/src/pca/pca.cuh
+++ b/cpp/src/pca/pca.cuh
@@ -19,10 +19,9 @@
 #include <cuml/decomposition/params.hpp>
 #include <raft/cuda_utils.cuh>
 #include <raft/handle.hpp>
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/eig.cuh>
-#include <raft/linalg/eltwise.cuh>
-#include <raft/linalg/transpose.h>
+#include <raft/linalg/eig.hpp>
+#include <raft/linalg/eltwise.hpp>
+#include <raft/linalg/transpose.hpp>
 #include <raft/matrix/math.hpp>
 #include <raft/matrix/matrix.hpp>
 #include <raft/stats/mean.hpp>
diff --git a/cpp/src/pca/pca_mg.cu b/cpp/src/pca/pca_mg.cu
index 29fd0a1722..87b7fee68d 100644
--- a/cpp/src/pca/pca_mg.cu
+++ b/cpp/src/pca/pca_mg.cu
@@ -29,7 +29,7 @@
 #include <raft/comms/comms.hpp>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/transpose.h>
+#include <raft/linalg/transpose.hpp>
 #include <raft/matrix/math.hpp>
 #include <raft/stats/mean_center.hpp>
 
diff --git a/cpp/src/random_projection/rproj.cuh b/cpp/src/random_projection/rproj.cuh
index f266e24664..83f96105c6 100644
--- a/cpp/src/random_projection/rproj.cuh
+++ b/cpp/src/random_projection/rproj.cuh
@@ -22,7 +22,8 @@
 
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/cublas_wrappers.h>
+// #TODO: Replace with public header when ready
+#include <raft/linalg/detail/cublas_wrappers.hpp>
 #include <raft/sparse/cusparse_wrappers.h>
 
 #include <cstddef>
@@ -162,21 +163,22 @@ void RPROJtransform(const raft::handle_t& handle,
     auto& ldb = k;
     auto& ldc = m;
 
-    RAFT_CUBLAS_TRY(raft::linalg::cublasgemm(cublas_handle,
-                                             CUBLAS_OP_N,
-                                             CUBLAS_OP_N,
-                                             params->n_samples,
-                                             n,
-                                             k,
-                                             &alfa,
-                                             input,
-                                             lda,
-                                             random_matrix->dense_data.data(),
-                                             ldb,
-                                             &beta,
-                                             output,
-                                             ldc,
-                                             stream));
+    // #TODO: Call from public API when ready
+    RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle,
+                                                     CUBLAS_OP_N,
+                                                     CUBLAS_OP_N,
+                                                     params->n_samples,
+                                                     n,
+                                                     k,
+                                                     &alfa,
+                                                     input,
+                                                     lda,
+                                                     random_matrix->dense_data.data(),
+                                                     ldb,
+                                                     &beta,
+                                                     output,
+                                                     ldc,
+                                                     stream));
 
   } else if (random_matrix->type == sparse) {
     cusparseHandle_t cusparse_handle = handle.get_cusparse_handle();
diff --git a/cpp/src/solver/cd.cuh b/cpp/src/solver/cd.cuh
index 142a1fbc9f..b534358273 100644
--- a/cpp/src/solver/cd.cuh
+++ b/cpp/src/solver/cd.cuh
@@ -24,13 +24,14 @@
 #include <glm/preprocess.cuh>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/add.cuh>
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/eltwise.cuh>
-#include <raft/linalg/gemm.cuh>
-#include <raft/linalg/multiply.cuh>
-#include <raft/linalg/subtract.cuh>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/add.hpp>
+// #TODO: Replace with public header when ready
+#include <raft/linalg/detail/cublas_wrappers.hpp>
+#include <raft/linalg/eltwise.hpp>
+#include <raft/linalg/gemm.hpp>
+#include <raft/linalg/multiply.hpp>
+#include <raft/linalg/subtract.hpp>
+#include <raft/linalg/unary_op.hpp>
 #include <raft/matrix/math.hpp>
 #include <raft/matrix/matrix.hpp>
 
diff --git a/cpp/src/solver/cd_mg.cu b/cpp/src/solver/cd_mg.cu
index cc9e409c67..837d0d83ba 100644
--- a/cpp/src/solver/cd_mg.cu
+++ b/cpp/src/solver/cd_mg.cu
@@ -28,11 +28,11 @@
 #include <raft/comms/comms.hpp>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/add.cuh>
-#include <raft/linalg/eltwise.cuh>
-#include <raft/linalg/gemm.cuh>
-#include <raft/linalg/multiply.cuh>
-#include <raft/linalg/subtract.cuh>
+#include <raft/linalg/add.hpp>
+#include <raft/linalg/eltwise.hpp>
+#include <raft/linalg/gemm.hpp>
+#include <raft/linalg/multiply.hpp>
+#include <raft/linalg/subtract.hpp>
 #include <raft/matrix/math.hpp>
 #include <raft/matrix/matrix.hpp>
 
diff --git a/cpp/src/solver/lars_impl.cuh b/cpp/src/solver/lars_impl.cuh
index 9e5ccf91e3..09646c64a8 100644
--- a/cpp/src/solver/lars_impl.cuh
+++ b/cpp/src/solver/lars_impl.cuh
@@ -26,12 +26,13 @@
 #include <cuml/common/logger.hpp>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/binary_op.cuh>
-#include <raft/linalg/cholesky_r1_update.cuh>
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/gemv.h>
-#include <raft/linalg/map_then_reduce.cuh>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/add.hpp>
+#include <raft/linalg/cholesky_r1_update.hpp>
+// #TODO: Replace with public header when ready
+#include <raft/linalg/detail/cublas_wrappers.hpp>
+#include <raft/linalg/gemv.hpp>
+#include <raft/linalg/map_then_reduce.hpp>
+#include <raft/linalg/unary_op.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 #include <thrust/copy.h>
@@ -150,17 +151,22 @@ void swapFeatures(cublasHandle_t handle,
 {
   std::swap(indices[j], indices[k]);
   if (G) {
+    // #TODO: Call from public API when ready
     RAFT_CUBLAS_TRY(
-      raft::linalg::cublasSwap(handle, n_cols, G + ld_G * j, 1, G + ld_G * k, 1, stream));
-    RAFT_CUBLAS_TRY(raft::linalg::cublasSwap(handle, n_cols, G + j, ld_G, G + k, ld_G, stream));
+      raft::linalg::detail::cublasSwap(handle, n_cols, G + ld_G * j, 1, G + ld_G * k, 1, stream));
+    // #TODO: Call from public API when ready
+    RAFT_CUBLAS_TRY(
+      raft::linalg::detail::cublasSwap(handle, n_cols, G + j, ld_G, G + k, ld_G, stream));
   } else {
     // Only swap X if G is nullptr. Only in that case will we use the feature
     // columns, otherwise all the necessary information is already there in G.
+    // #TODO: Call from public API when ready
     RAFT_CUBLAS_TRY(
-      raft::linalg::cublasSwap(handle, n_rows, X + ld_X * j, 1, X + ld_X * k, 1, stream));
+      raft::linalg::detail::cublasSwap(handle, n_rows, X + ld_X * j, 1, X + ld_X * k, 1, stream));
   }
   // swap (c[j], c[k])
-  RAFT_CUBLAS_TRY(raft::linalg::cublasSwap(handle, 1, cor + j, 1, cor + k, 1, stream));
+  // #TODO: Call from public API when ready
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublasSwap(handle, 1, cor + j, 1, cor + k, 1, stream));
 }
 
 /**
@@ -280,19 +286,20 @@ void updateCholesky(const raft::handle_t& handle,
     const math_t* X_row = X + (n_active - 1) * ld_X;
     math_t one          = 1;
     math_t zero         = 0;
-    RAFT_CUBLAS_TRY(raft::linalg::cublasgemv(handle.get_cublas_handle(),
-                                             CUBLAS_OP_T,
-                                             n_rows,
-                                             n_cols,
-                                             &one,
-                                             X,
-                                             n_rows,
-                                             X_row,
-                                             1,
-                                             &zero,
-                                             G_row,
-                                             1,
-                                             stream));
+    // #TODO: Call from public API when ready
+    RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv(handle.get_cublas_handle(),
+                                                     CUBLAS_OP_T,
+                                                     n_rows,
+                                                     n_cols,
+                                                     &one,
+                                                     X,
+                                                     n_rows,
+                                                     X_row,
+                                                     1,
+                                                     &zero,
+                                                     G_row,
+                                                     1,
+                                                     stream));
   } else if (G0 != U) {
     // Copy the new column of G0 into U, because the factorization works in
     // place.
@@ -342,34 +349,36 @@ void calcW0(const raft::handle_t& handle,
   // First we calculate x by solving equation U.T x = sign_A.
   raft::copy(ws, sign, n_active, stream);
   math_t alpha = 1;
-  RAFT_CUBLAS_TRY(raft::linalg::cublastrsm(handle.get_cublas_handle(),
-                                           CUBLAS_SIDE_LEFT,
-                                           fillmode,
-                                           CUBLAS_OP_T,
-                                           CUBLAS_DIAG_NON_UNIT,
-                                           n_active,
-                                           1,
-                                           &alpha,
-                                           U,
-                                           ld_U,
-                                           ws,
-                                           ld_U,
-                                           stream));
+  // #TODO: Call from public API when ready
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublastrsm(handle.get_cublas_handle(),
+                                                   CUBLAS_SIDE_LEFT,
+                                                   fillmode,
+                                                   CUBLAS_OP_T,
+                                                   CUBLAS_DIAG_NON_UNIT,
+                                                   n_active,
+                                                   1,
+                                                   &alpha,
+                                                   U,
+                                                   ld_U,
+                                                   ws,
+                                                   ld_U,
+                                                   stream));
 
   // ws stores x, the solution of U.T x = sign_A. Now we solve U * ws = x
-  RAFT_CUBLAS_TRY(raft::linalg::cublastrsm(handle.get_cublas_handle(),
-                                           CUBLAS_SIDE_LEFT,
-                                           fillmode,
-                                           CUBLAS_OP_N,
-                                           CUBLAS_DIAG_NON_UNIT,
-                                           n_active,
-                                           1,
-                                           &alpha,
-                                           U,
-                                           ld_U,
-                                           ws,
-                                           ld_U,
-                                           stream));
+  // #TODO: Call from public API when ready
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublastrsm(handle.get_cublas_handle(),
+                                                   CUBLAS_SIDE_LEFT,
+                                                   fillmode,
+                                                   CUBLAS_OP_N,
+                                                   CUBLAS_DIAG_NON_UNIT,
+                                                   n_active,
+                                                   1,
+                                                   &alpha,
+                                                   U,
+                                                   ld_U,
+                                                   ws,
+                                                   ld_U,
+                                                   stream));
   // Now ws = G0^(-1) sign_A = S GA^{-1} 1_A.
 }
 
@@ -513,19 +522,20 @@ LarsFitStatus calcEquiangularVec(const raft::handle_t& handle,
     // Calculate u_eq only in the case if the Gram matrix is not stored.
     math_t one  = 1;
     math_t zero = 0;
-    RAFT_CUBLAS_TRY(raft::linalg::cublasgemv(handle.get_cublas_handle(),
-                                             CUBLAS_OP_N,
-                                             n_rows,
-                                             n_active,
-                                             &one,
-                                             X,
-                                             ld_X,
-                                             ws,
-                                             1,
-                                             &zero,
-                                             u_eq,
-                                             1,
-                                             stream));
+    // #TODO: Call from public API when ready
+    RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv(handle.get_cublas_handle(),
+                                                     CUBLAS_OP_N,
+                                                     n_rows,
+                                                     n_active,
+                                                     &one,
+                                                     X,
+                                                     ld_X,
+                                                     ws,
+                                                     1,
+                                                     &zero,
+                                                     u_eq,
+                                                     1,
+                                                     stream));
   }
   return LarsFitStatus::kOk;
 }
@@ -601,37 +611,39 @@ void calcMaxStep(const raft::handle_t& handle,
       // Calculate a = X.T[:,n_active:] * u                              (2.11)
       math_t one  = 1;
       math_t zero = 0;
-      RAFT_CUBLAS_TRY(raft::linalg::cublasgemv(handle.get_cublas_handle(),
-                                               CUBLAS_OP_T,
-                                               n_rows,
-                                               n_inactive,
-                                               &one,
-                                               X + n_active * ld_X,
-                                               ld_X,
-                                               u,
-                                               1,
-                                               &zero,
-                                               a_vec,
-                                               1,
-                                               stream));
+      // #TODO: Call from public API when ready
+      RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv(handle.get_cublas_handle(),
+                                                       CUBLAS_OP_T,
+                                                       n_rows,
+                                                       n_inactive,
+                                                       &one,
+                                                       X + n_active * ld_X,
+                                                       ld_X,
+                                                       u,
+                                                       1,
+                                                       &zero,
+                                                       a_vec,
+                                                       1,
+                                                       stream));
     } else {
       // Calculate a = X.T[:,n_A:] * u = X.T[:, n_A:] * X[:,:n_A] * ws
       //             = G[n_A:,:n_A] * ws                                 (2.11)
       math_t one  = 1;
       math_t zero = 0;
-      RAFT_CUBLAS_TRY(raft::linalg::cublasgemv(handle.get_cublas_handle(),
-                                               CUBLAS_OP_N,
-                                               n_inactive,
-                                               n_active,
-                                               &one,
-                                               G + n_active,
-                                               ld_G,
-                                               ws,
-                                               1,
-                                               &zero,
-                                               a_vec,
-                                               1,
-                                               stream));
+      // #TODO: Call from public API when ready
+      RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv(handle.get_cublas_handle(),
+                                                       CUBLAS_OP_N,
+                                                       n_inactive,
+                                                       n_active,
+                                                       &one,
+                                                       G + n_active,
+                                                       ld_G,
+                                                       ws,
+                                                       1,
+                                                       &zero,
+                                                       a_vec,
+                                                       1,
+                                                       stream));
     }
     const math_t tiny = std::numeric_limits<math_t>::min();
     const math_t huge = std::numeric_limits<math_t>::max();
@@ -719,19 +731,20 @@ void larsInit(const raft::handle_t& handle,
   math_t one  = 1;
   math_t zero = 0;
   // Set initial correlation to X.T * y
-  RAFT_CUBLAS_TRY(raft::linalg::cublasgemv(handle.get_cublas_handle(),
-                                           CUBLAS_OP_T,
-                                           n_rows,
-                                           n_cols,
-                                           &one,
-                                           X,
-                                           ld_X,
-                                           y,
-                                           1,
-                                           &zero,
-                                           cor.data(),
-                                           1,
-                                           stream));
+  // #TODO: Call from public API when ready
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv(handle.get_cublas_handle(),
+                                                   CUBLAS_OP_T,
+                                                   n_rows,
+                                                   n_cols,
+                                                   &one,
+                                                   X,
+                                                   ld_X,
+                                                   y,
+                                                   1,
+                                                   &zero,
+                                                   cor.data(),
+                                                   1,
+                                                   stream));
   if (coef_path) {
     RAFT_CUDA_TRY(
       cudaMemsetAsync(coef_path, 0, sizeof(math_t) * (*max_iter + 1) * (*max_iter), stream));
@@ -1110,19 +1123,20 @@ void larsPredict(const raft::handle_t& handle,
   thrust::device_ptr<math_t> pred_ptr(preds);
   thrust::fill(execution_policy, pred_ptr, pred_ptr + n_rows, intercept);
   math_t one = 1;
-  RAFT_CUBLAS_TRY(raft::linalg::cublasgemv(handle.get_cublas_handle(),
-                                           CUBLAS_OP_N,
-                                           n_rows,
-                                           n_active,
-                                           &one,
-                                           X,
-                                           ld_X,
-                                           beta,
-                                           1,
-                                           &one,
-                                           preds,
-                                           1,
-                                           stream));
+  // #TODO: Call from public API when ready
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv(handle.get_cublas_handle(),
+                                                   CUBLAS_OP_N,
+                                                   n_rows,
+                                                   n_active,
+                                                   &one,
+                                                   X,
+                                                   ld_X,
+                                                   beta,
+                                                   1,
+                                                   &one,
+                                                   preds,
+                                                   1,
+                                                   stream));
 }
 };  // namespace Lars
 };  // namespace Solver
diff --git a/cpp/src/solver/sgd.cuh b/cpp/src/solver/sgd.cuh
index 7dc5f9d278..a3b77fbdd4 100644
--- a/cpp/src/solver/sgd.cuh
+++ b/cpp/src/solver/sgd.cuh
@@ -25,13 +25,12 @@
 #include <glm/preprocess.cuh>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/add.cuh>
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/eltwise.cuh>
-#include <raft/linalg/gemv.h>
-#include <raft/linalg/norm.cuh>
-#include <raft/linalg/subtract.cuh>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/add.hpp>
+#include <raft/linalg/eltwise.hpp>
+#include <raft/linalg/gemv.hpp>
+#include <raft/linalg/norm.hpp>
+#include <raft/linalg/subtract.hpp>
+#include <raft/linalg/unary_op.hpp>
 #include <raft/matrix/math.hpp>
 #include <raft/matrix/matrix.hpp>
 #include <raft/stats/mean.hpp>
diff --git a/cpp/src/svm/kernelcache.cuh b/cpp/src/svm/kernelcache.cuh
index f99efa223c..d1a011df49 100644
--- a/cpp/src/svm/kernelcache.cuh
+++ b/cpp/src/svm/kernelcache.cuh
@@ -25,7 +25,7 @@
 
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/gemm.cuh>
+#include <raft/linalg/gemm.hpp>
 #include <raft/matrix/matrix.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
diff --git a/cpp/src/svm/linear.cu b/cpp/src/svm/linear.cu
index eab6141af5..e06d87a2fc 100644
--- a/cpp/src/svm/linear.cu
+++ b/cpp/src/svm/linear.cu
@@ -26,13 +26,12 @@
 #include <omp.h>
 #include <raft/common/nvtx.hpp>
 #include <raft/cuda_utils.cuh>
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/gemm.cuh>
-#include <raft/linalg/gemv.h>
-#include <raft/linalg/map.cuh>
-#include <raft/linalg/matrix_vector_op.cuh>
-#include <raft/linalg/transpose.h>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/gemm.hpp>
+#include <raft/linalg/gemv.hpp>
+#include <raft/linalg/map.hpp>
+#include <raft/linalg/matrix_vector_op.hpp>
+#include <raft/linalg/transpose.hpp>
+#include <raft/linalg/unary_op.hpp>
 #include <raft/matrix/matrix.hpp>
 #include <rmm/device_uvector.hpp>
 #include <thrust/copy.h>
diff --git a/cpp/src/svm/results.cuh b/cpp/src/svm/results.cuh
index 7c280bb224..85123d224c 100644
--- a/cpp/src/svm/results.cuh
+++ b/cpp/src/svm/results.cuh
@@ -26,10 +26,9 @@
 #include <cub/device/device_select.cuh>
 #include <linalg/init.h>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/add.cuh>
-#include <raft/linalg/binary_op.cuh>
-#include <raft/linalg/map_then_reduce.cuh>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/add.hpp>
+#include <raft/linalg/map_then_reduce.hpp>
+#include <raft/linalg/unary_op.hpp>
 #include <raft/matrix/matrix.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
diff --git a/cpp/src/svm/smosolver.cuh b/cpp/src/svm/smosolver.cuh
index 6e5367a8bf..f8b52c73a2 100644
--- a/cpp/src/svm/smosolver.cuh
+++ b/cpp/src/svm/smosolver.cuh
@@ -22,9 +22,10 @@
 #include <matrix/grammatrix.cuh>
 #include <matrix/kernelfactory.cuh>
 
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/gemv.h>
-#include <raft/linalg/unary_op.cuh>
+// #TODO: Replace with public header when ready
+#include <raft/linalg/detail/cublas_wrappers.hpp>
+#include <raft/linalg/gemv.hpp>
+#include <raft/linalg/unary_op.hpp>
 
 #include <iostream>
 #include <limits>
@@ -44,9 +45,8 @@
 #include <cuml/matrix/kernelparams.h>
 #include <matrix/grammatrix.cuh>
 #include <matrix/kernelfactory.cuh>
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/gemv.h>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/gemv.hpp>
+#include <raft/linalg/unary_op.hpp>
 
 #include "results.cuh"
 
@@ -218,35 +218,37 @@ class SmoSolver {
   {
     // multipliers used in the equation : f = 1*cachtile * delta_alpha + 1*f
     math_t one = 1;
-    RAFT_CUBLAS_TRY(raft::linalg::cublasgemv(handle.get_cublas_handle(),
-                                             CUBLAS_OP_N,
-                                             n_rows,
-                                             n_ws,
-                                             &one,
-                                             cacheTile,
-                                             n_rows,
-                                             delta_alpha,
-                                             1,
-                                             &one,
-                                             f,
-                                             1,
-                                             stream));
+    // #TODO: Call from public API when ready
+    RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv(handle.get_cublas_handle(),
+                                                     CUBLAS_OP_N,
+                                                     n_rows,
+                                                     n_ws,
+                                                     &one,
+                                                     cacheTile,
+                                                     n_rows,
+                                                     delta_alpha,
+                                                     1,
+                                                     &one,
+                                                     f,
+                                                     1,
+                                                     stream));
     if (svmType == EPSILON_SVR) {
       // SVR has doubled the number of trainig vectors and we need to update
       // alpha for both batches individually
-      RAFT_CUBLAS_TRY(raft::linalg::cublasgemv(handle.get_cublas_handle(),
-                                               CUBLAS_OP_N,
-                                               n_rows,
-                                               n_ws,
-                                               &one,
-                                               cacheTile,
-                                               n_rows,
-                                               delta_alpha,
-                                               1,
-                                               &one,
-                                               f + n_rows,
-                                               1,
-                                               stream));
+      // #TODO: Call from public API when ready
+      RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv(handle.get_cublas_handle(),
+                                                       CUBLAS_OP_N,
+                                                       n_rows,
+                                                       n_ws,
+                                                       &one,
+                                                       cacheTile,
+                                                       n_rows,
+                                                       delta_alpha,
+                                                       1,
+                                                       &one,
+                                                       f + n_rows,
+                                                       1,
+                                                       stream));
     }
   }
 
diff --git a/cpp/src/svm/svc.cu b/cpp/src/svm/svc.cu
index 4c2c18951f..ea3d7032f6 100644
--- a/cpp/src/svm/svc.cu
+++ b/cpp/src/svm/svc.cu
@@ -22,8 +22,7 @@
 #include <cuml/svm/svc.hpp>
 #include <label/classlabels.cuh>
 #include <matrix/kernelfactory.cuh>
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/unary_op.hpp>
 
 namespace ML {
 namespace SVM {
diff --git a/cpp/src/svm/svc_impl.cuh b/cpp/src/svm/svc_impl.cuh
index 9a55a95325..548f860727 100644
--- a/cpp/src/svm/svc_impl.cuh
+++ b/cpp/src/svm/svc_impl.cuh
@@ -31,8 +31,9 @@
 #include <label/classlabels.cuh>
 #include <matrix/kernelfactory.cuh>
 #include <raft/label/classlabels.cuh>
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/unary_op.cuh>
+// #TODO: Replace with public header when ready
+#include <raft/linalg/detail/cublas_wrappers.hpp>
+#include <raft/linalg/unary_op.hpp>
 #include <raft/matrix/matrix.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
@@ -176,19 +177,20 @@ void svcPredict(const raft::handle_t& handle,
                      n_batch);
     math_t one  = 1;
     math_t null = 0;
-    RAFT_CUBLAS_TRY(raft::linalg::cublasgemv(cublas_handle,
-                                             CUBLAS_OP_N,
-                                             n_batch,
-                                             model.n_support,
-                                             &one,
-                                             K.data(),
-                                             n_batch,
-                                             model.dual_coefs,
-                                             1,
-                                             &null,
-                                             y.data() + i,
-                                             1,
-                                             stream));
+    // #TODO: Call from public API when ready
+    RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemv(cublas_handle,
+                                                     CUBLAS_OP_N,
+                                                     n_batch,
+                                                     model.n_support,
+                                                     &one,
+                                                     K.data(),
+                                                     n_batch,
+                                                     model.dual_coefs,
+                                                     1,
+                                                     &null,
+                                                     y.data() + i,
+                                                     1,
+                                                     stream));
   }
   math_t* labels = model.unique_labels;
   math_t b       = model.b;
diff --git a/cpp/src/svm/svr.cu b/cpp/src/svm/svr.cu
index 55186eaa28..3aabe68b1f 100644
--- a/cpp/src/svm/svr.cu
+++ b/cpp/src/svm/svr.cu
@@ -22,8 +22,7 @@
 #include <cuml/svm/svc.hpp>
 #include <label/classlabels.cuh>
 #include <matrix/kernelfactory.cuh>
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/unary_op.hpp>
 
 namespace ML {
 namespace SVM {
diff --git a/cpp/src/svm/svr_impl.cuh b/cpp/src/svm/svr_impl.cuh
index 1bf996bb55..f8ae2406c4 100644
--- a/cpp/src/svm/svr_impl.cuh
+++ b/cpp/src/svm/svr_impl.cuh
@@ -30,8 +30,7 @@
 #include <cuml/svm/svm_parameter.h>
 #include <label/classlabels.cuh>
 #include <matrix/kernelfactory.cuh>
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/unary_op.hpp>
 #include <raft/matrix/matrix.hpp>
 #include <thrust/copy.h>
 #include <thrust/device_ptr.h>
diff --git a/cpp/src/svm/workingset.cuh b/cpp/src/svm/workingset.cuh
index 34471538b4..ba71677952 100644
--- a/cpp/src/svm/workingset.cuh
+++ b/cpp/src/svm/workingset.cuh
@@ -29,8 +29,8 @@
 #include <cub/cub.cuh>
 #include <raft/cuda_utils.cuh>
 #include <raft/handle.hpp>
-#include <raft/linalg/add.cuh>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/add.hpp>
+#include <raft/linalg/unary_op.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 #include <thrust/device_ptr.h>
diff --git a/cpp/src/tsne/barnes_hut_tsne.cuh b/cpp/src/tsne/barnes_hut_tsne.cuh
index 2e9cfd6c1b..6f3435eceb 100644
--- a/cpp/src/tsne/barnes_hut_tsne.cuh
+++ b/cpp/src/tsne/barnes_hut_tsne.cuh
@@ -20,7 +20,7 @@
 #include <cuml/common/logger.hpp>
 #include <cuml/manifold/tsne.h>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/eltwise.cuh>
+#include <raft/linalg/eltwise.hpp>
 
 namespace ML {
 namespace TSNE {
diff --git a/cpp/src/tsne/distances.cuh b/cpp/src/tsne/distances.cuh
index 315921bea6..828e93f3f7 100644
--- a/cpp/src/tsne/distances.cuh
+++ b/cpp/src/tsne/distances.cuh
@@ -18,8 +18,8 @@
 
 #include <cuml/neighbors/knn_sparse.hpp>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/distance_type.h>
-#include <raft/linalg/eltwise.cuh>
+#include <raft/distance/distance_type.hpp>
+#include <raft/linalg/eltwise.hpp>
 #include <raft/sparse/coo.hpp>
 #include <raft/sparse/linalg/symmetrize.hpp>
 #include <raft/sparse/selection/knn.hpp>
diff --git a/cpp/src/tsne/exact_kernels.cuh b/cpp/src/tsne/exact_kernels.cuh
index d0971587d8..d744c4a060 100644
--- a/cpp/src/tsne/exact_kernels.cuh
+++ b/cpp/src/tsne/exact_kernels.cuh
@@ -19,7 +19,7 @@
 #include <float.h>
 #include <math.h>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/eltwise.cuh>
+#include <raft/linalg/eltwise.hpp>
 
 #define restrict __restrict__
 
diff --git a/cpp/src/tsne/fft_tsne.cuh b/cpp/src/tsne/fft_tsne.cuh
index 05c00b6d50..c9ed35d78a 100644
--- a/cpp/src/tsne/fft_tsne.cuh
+++ b/cpp/src/tsne/fft_tsne.cuh
@@ -29,7 +29,7 @@
 #include <common/device_utils.cuh>
 #include <cufft_utils.h>
 #include <linalg/init.h>
-#include <raft/linalg/eltwise.cuh>
+#include <raft/linalg/eltwise.hpp>
 #include <raft/mr/device/buffer.hpp>
 #include <raft/stats/sum.hpp>
 #include <rmm/device_scalar.hpp>
diff --git a/cpp/src/tsne/utils.cuh b/cpp/src/tsne/utils.cuh
index 80b92c6de1..bcf1f1bcea 100644
--- a/cpp/src/tsne/utils.cuh
+++ b/cpp/src/tsne/utils.cuh
@@ -22,8 +22,8 @@
 #include <stdio.h>
 
 #include <cuml/common/logger.hpp>
-#include <raft/linalg/eltwise.cuh>
-#include <raft/linalg/norm.cuh>
+#include <raft/linalg/eltwise.hpp>
+#include <raft/linalg/norm.hpp>
 
 #include <cuda_runtime.h>
 
diff --git a/cpp/src/tsvd/tsvd.cuh b/cpp/src/tsvd/tsvd.cuh
index 75d7d727c7..f452fd613f 100644
--- a/cpp/src/tsvd/tsvd.cuh
+++ b/cpp/src/tsvd/tsvd.cuh
@@ -20,12 +20,11 @@
 #include <linalg/rsvd.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/handle.hpp>
-#include <raft/linalg/binary_op.cuh>
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/eig.cuh>
-#include <raft/linalg/eltwise.cuh>
-#include <raft/linalg/gemm.cuh>
-#include <raft/linalg/transpose.h>
+#include <raft/linalg/add.hpp>
+#include <raft/linalg/eig.hpp>
+#include <raft/linalg/eltwise.hpp>
+#include <raft/linalg/gemm.hpp>
+#include <raft/linalg/transpose.hpp>
 #include <raft/matrix/math.hpp>
 #include <raft/matrix/matrix.hpp>
 #include <raft/stats/mean.hpp>
diff --git a/cpp/src/tsvd/tsvd_mg.cu b/cpp/src/tsvd/tsvd_mg.cu
index 6f9d5762e1..56c02d4d27 100644
--- a/cpp/src/tsvd/tsvd_mg.cu
+++ b/cpp/src/tsvd/tsvd_mg.cu
@@ -28,7 +28,7 @@
 #include <raft/comms/comms.hpp>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/eltwise.cuh>
+#include <raft/linalg/eltwise.hpp>
 #include <raft/matrix/math.hpp>
 #include <raft/stats/mean_center.hpp>
 
diff --git a/cpp/src/umap/init_embed/spectral_algo.cuh b/cpp/src/umap/init_embed/spectral_algo.cuh
index 031dce893a..ef09165173 100644
--- a/cpp/src/umap/init_embed/spectral_algo.cuh
+++ b/cpp/src/umap/init_embed/spectral_algo.cuh
@@ -18,10 +18,10 @@
 
 #include <cuml/manifold/umapparams.h>
 
-#include <raft/linalg/add.cuh>
+#include <raft/linalg/add.hpp>
 #include <raft/sparse/coo.hpp>
 
-#include <raft/linalg/transpose.h>
+#include <raft/linalg/transpose.hpp>
 #include <raft/random/rng.hpp>
 
 #include <cuml/cluster/spectral.hpp>
diff --git a/cpp/src/umap/knn_graph/algo.cuh b/cpp/src/umap/knn_graph/algo.cuh
index 1004fafd5d..94f107b668 100644
--- a/cpp/src/umap/knn_graph/algo.cuh
+++ b/cpp/src/umap/knn_graph/algo.cuh
@@ -20,8 +20,8 @@
 #include <cuml/manifold/umapparams.h>
 #include <cuml/neighbors/knn_sparse.hpp>
 #include <iostream>
-#include <raft/linalg/distance_type.h>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/distance/distance_type.hpp>
+#include <raft/linalg/unary_op.hpp>
 #include <raft/sparse/selection/knn.hpp>
 #include <selection/knn.cuh>
 
diff --git a/cpp/src/umap/optimize.cuh b/cpp/src/umap/optimize.cuh
index dd6323b381..685d95d32b 100644
--- a/cpp/src/umap/optimize.cuh
+++ b/cpp/src/umap/optimize.cuh
@@ -21,11 +21,10 @@
 
 #include <linalg/power.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/add.cuh>
-#include <raft/linalg/binary_op.cuh>
-#include <raft/linalg/eltwise.cuh>
-#include <raft/linalg/multiply.cuh>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/add.hpp>
+#include <raft/linalg/eltwise.hpp>
+#include <raft/linalg/multiply.hpp>
+#include <raft/linalg/unary_op.hpp>
 #include <raft/matrix/math.hpp>
 #include <raft/stats/mean.hpp>
 #include <rmm/device_uvector.hpp>
diff --git a/cpp/src/umap/simpl_set_embed/algo.cuh b/cpp/src/umap/simpl_set_embed/algo.cuh
index d64b9d3674..1f358f177c 100644
--- a/cpp/src/umap/simpl_set_embed/algo.cuh
+++ b/cpp/src/umap/simpl_set_embed/algo.cuh
@@ -34,7 +34,7 @@
 #include <cstdlib>
 
 #include <raft/cudart_utils.h>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/unary_op.hpp>
 #include <raft/sparse/coo.hpp>
 #include <rmm/device_uvector.hpp>
 
diff --git a/cpp/src_prims/distance/epsilon_neighborhood.cuh b/cpp/src_prims/distance/epsilon_neighborhood.cuh
index b92e1fd8a0..628b3b70a5 100644
--- a/cpp/src_prims/distance/epsilon_neighborhood.cuh
+++ b/cpp/src_prims/distance/epsilon_neighborhood.cuh
@@ -17,7 +17,7 @@
 #pragma once
 
 #include <common/device_utils.cuh>
-#include <raft/linalg/contractions.cuh>
+#include <raft/linalg/contractions.hpp>
 
 namespace MLCommon {
 namespace Distance {
diff --git a/cpp/src_prims/functions/hinge.cuh b/cpp/src_prims/functions/hinge.cuh
index e7faf31ad4..a7d2952937 100644
--- a/cpp/src_prims/functions/hinge.cuh
+++ b/cpp/src_prims/functions/hinge.cuh
@@ -18,14 +18,13 @@
 
 #include "penalty.cuh"
 #include <raft/cuda_utils.cuh>
-#include <raft/linalg/add.cuh>
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/eltwise.cuh>
-#include <raft/linalg/gemm.cuh>
-#include <raft/linalg/matrix_vector_op.cuh>
-#include <raft/linalg/subtract.cuh>
-#include <raft/linalg/transpose.h>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/add.hpp>
+#include <raft/linalg/eltwise.hpp>
+#include <raft/linalg/gemm.hpp>
+#include <raft/linalg/matrix_vector_op.hpp>
+#include <raft/linalg/subtract.hpp>
+#include <raft/linalg/transpose.hpp>
+#include <raft/linalg/unary_op.hpp>
 #include <raft/matrix/math.hpp>
 #include <raft/matrix/matrix.hpp>
 #include <raft/mr/device/buffer.hpp>
diff --git a/cpp/src_prims/functions/linearReg.cuh b/cpp/src_prims/functions/linearReg.cuh
index 02653cb3b3..b79e1996dd 100644
--- a/cpp/src_prims/functions/linearReg.cuh
+++ b/cpp/src_prims/functions/linearReg.cuh
@@ -18,12 +18,11 @@
 
 #include "penalty.cuh"
 #include <raft/cuda_utils.cuh>
-#include <raft/linalg/add.cuh>
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/eltwise.cuh>
-#include <raft/linalg/gemm.cuh>
-#include <raft/linalg/subtract.cuh>
-#include <raft/linalg/transpose.h>
+#include <raft/linalg/add.hpp>
+#include <raft/linalg/eltwise.hpp>
+#include <raft/linalg/gemm.hpp>
+#include <raft/linalg/subtract.hpp>
+#include <raft/linalg/transpose.hpp>
 #include <raft/matrix/math.hpp>
 #include <raft/matrix/matrix.hpp>
 #include <raft/mr/device/buffer.hpp>
diff --git a/cpp/src_prims/functions/log.cuh b/cpp/src_prims/functions/log.cuh
index ec1b6ff073..328276bc32 100644
--- a/cpp/src_prims/functions/log.cuh
+++ b/cpp/src_prims/functions/log.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/unary_op.hpp>
 
 namespace MLCommon {
 namespace Functions {
diff --git a/cpp/src_prims/functions/logisticReg.cuh b/cpp/src_prims/functions/logisticReg.cuh
index 0ff1686289..af5489d017 100644
--- a/cpp/src_prims/functions/logisticReg.cuh
+++ b/cpp/src_prims/functions/logisticReg.cuh
@@ -19,13 +19,11 @@
 #include "penalty.cuh"
 #include "sigmoid.cuh"
 #include <raft/cuda_utils.cuh>
-#include <raft/linalg/add.cuh>
-#include <raft/linalg/binary_op.cuh>
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/eltwise.cuh>
-#include <raft/linalg/gemm.cuh>
-#include <raft/linalg/subtract.cuh>
-#include <raft/linalg/transpose.h>
+#include <raft/linalg/add.hpp>
+#include <raft/linalg/eltwise.hpp>
+#include <raft/linalg/gemm.hpp>
+#include <raft/linalg/subtract.hpp>
+#include <raft/linalg/transpose.hpp>
 #include <raft/matrix/math.hpp>
 #include <raft/matrix/matrix.hpp>
 #include <raft/stats/mean.hpp>
diff --git a/cpp/src_prims/functions/penalty.cuh b/cpp/src_prims/functions/penalty.cuh
index c4f8f96511..247bae10ba 100644
--- a/cpp/src_prims/functions/penalty.cuh
+++ b/cpp/src_prims/functions/penalty.cuh
@@ -19,9 +19,9 @@
 #include "sign.cuh"
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/add.cuh>
-#include <raft/linalg/eltwise.cuh>
-#include <raft/linalg/norm.cuh>
+#include <raft/linalg/add.hpp>
+#include <raft/linalg/eltwise.hpp>
+#include <raft/linalg/norm.hpp>
 #include <raft/matrix/math.hpp>
 #include <raft/matrix/matrix.hpp>
 #include <rmm/device_scalar.hpp>
diff --git a/cpp/src_prims/functions/sigmoid.cuh b/cpp/src_prims/functions/sigmoid.cuh
index a192104f3f..537d171b18 100644
--- a/cpp/src_prims/functions/sigmoid.cuh
+++ b/cpp/src_prims/functions/sigmoid.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@
 #pragma once
 
 #include <raft/cuda_utils.cuh>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/unary_op.hpp>
 
 namespace MLCommon {
 namespace Functions {
diff --git a/cpp/src_prims/functions/sign.cuh b/cpp/src_prims/functions/sign.cuh
index 486ca889c9..85d18787da 100644
--- a/cpp/src_prims/functions/sign.cuh
+++ b/cpp/src_prims/functions/sign.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/unary_op.hpp>
 
 namespace MLCommon {
 namespace Functions {
diff --git a/cpp/src_prims/functions/softThres.cuh b/cpp/src_prims/functions/softThres.cuh
index 63dd045739..05bf651a26 100644
--- a/cpp/src_prims/functions/softThres.cuh
+++ b/cpp/src_prims/functions/softThres.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/unary_op.hpp>
 
 namespace MLCommon {
 namespace Functions {
diff --git a/cpp/src_prims/label/classlabels.cuh b/cpp/src_prims/label/classlabels.cuh
index bd4bd2d7b6..17b9aa030a 100644
--- a/cpp/src_prims/label/classlabels.cuh
+++ b/cpp/src_prims/label/classlabels.cuh
@@ -22,7 +22,7 @@
 #include <raft/cudart_utils.h>
 #include <raft/handle.hpp>
 #include <raft/label/classlabels.cuh>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/unary_op.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 
diff --git a/cpp/src_prims/linalg/batched/matrix.cuh b/cpp/src_prims/linalg/batched/matrix.cuh
index ac95f1c78e..f3d5627a12 100644
--- a/cpp/src_prims/linalg/batched/matrix.cuh
+++ b/cpp/src_prims/linalg/batched/matrix.cuh
@@ -22,9 +22,10 @@
 
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/binary_op.cuh>
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/add.hpp>
+// #TODO: Replace with public header when ready
+#include <raft/linalg/detail/cublas_wrappers.hpp>
+#include <raft/linalg/unary_op.hpp>
 #include <rmm/device_uvector.hpp>
 
 #include <thrust/execution_policy.h>
@@ -389,9 +390,11 @@ class Matrix {
   {
     int n = A.m_shape.first;
 
-    RAFT_CUBLAS_TRY(raft::linalg::cublasgetrfBatched(
+    // #TODO: Call from public API when ready
+    RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgetrfBatched(
       A.m_cublasHandle, n, A.data(), n, d_P, d_info, A.m_batch_size, A.m_stream));
-    RAFT_CUBLAS_TRY(raft::linalg::cublasgetriBatched(
+    // #TODO: Call from public API when ready
+    RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgetriBatched(
       A.m_cublasHandle, n, A.data(), n, d_P, Ainv.data(), n, d_info, A.m_batch_size, A.m_stream));
   }
 
@@ -579,25 +582,26 @@ void b_gemm(bool aT,
   cublasOperation_t opB = bT ? CUBLAS_OP_T : CUBLAS_OP_N;
 
   // Call cuBLAS
-  RAFT_CUBLAS_TRY(raft::linalg::cublasgemmStridedBatched(A.cublasHandle(),
-                                                         opA,
-                                                         opB,
-                                                         m,
-                                                         n,
-                                                         k,
-                                                         &alpha,
-                                                         A.raw_data(),
-                                                         A.shape().first,
-                                                         A.shape().first * A.shape().second,
-                                                         B.raw_data(),
-                                                         B.shape().first,
-                                                         B.shape().first * B.shape().second,
-                                                         &beta,
-                                                         C.raw_data(),
-                                                         C.shape().first,
-                                                         C.shape().first * C.shape().second,
-                                                         A.batches(),
-                                                         A.stream()));
+  // #TODO: Call from public API when ready
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemmStridedBatched(A.cublasHandle(),
+                                                                 opA,
+                                                                 opB,
+                                                                 m,
+                                                                 n,
+                                                                 k,
+                                                                 &alpha,
+                                                                 A.raw_data(),
+                                                                 A.shape().first,
+                                                                 A.shape().first * A.shape().second,
+                                                                 B.raw_data(),
+                                                                 B.shape().first,
+                                                                 B.shape().first * B.shape().second,
+                                                                 &beta,
+                                                                 C.raw_data(),
+                                                                 C.shape().first,
+                                                                 C.shape().first * C.shape().second,
+                                                                 A.batches(),
+                                                                 A.stream()));
 }
 
 /**
@@ -658,19 +662,20 @@ void b_gels(const Matrix<T>& A, Matrix<T>& C, int* devInfoArray = nullptr)
   Matrix<T> Acopy(A);
 
   int info;
-  RAFT_CUBLAS_TRY(raft::linalg::cublasgelsBatched(A.cublasHandle(),
-                                                  CUBLAS_OP_N,
-                                                  m,
-                                                  n,
-                                                  nrhs,
-                                                  Acopy.data(),
-                                                  m,
-                                                  C.data(),
-                                                  m,
-                                                  &info,
-                                                  devInfoArray,
-                                                  A.batches(),
-                                                  A.stream()));
+  // #TODO: Call from public API when ready
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgelsBatched(A.cublasHandle(),
+                                                          CUBLAS_OP_N,
+                                                          m,
+                                                          n,
+                                                          nrhs,
+                                                          Acopy.data(),
+                                                          m,
+                                                          C.data(),
+                                                          m,
+                                                          &info,
+                                                          devInfoArray,
+                                                          A.batches(),
+                                                          A.stream()));
 }
 
 /**
diff --git a/cpp/src_prims/linalg/lstsq.cuh b/cpp/src_prims/linalg/lstsq.cuh
index d4810f4ae7..890775b274 100644
--- a/cpp/src_prims/linalg/lstsq.cuh
+++ b/cpp/src_prims/linalg/lstsq.cuh
@@ -20,15 +20,17 @@
 #include <raft/common/nvtx.hpp>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/cusolver_wrappers.h>
-#include <raft/linalg/eig.cuh>
-#include <raft/linalg/eltwise.cuh>
-#include <raft/linalg/gemm.cuh>
-#include <raft/linalg/gemv.h>
-#include <raft/linalg/qr.cuh>
-#include <raft/linalg/svd.cuh>
-#include <raft/linalg/transpose.h>
+// #TODO: Replace with public header when ready
+#include <raft/linalg/detail/cublas_wrappers.hpp>
+// #TODO: Replace with public header when ready
+#include <raft/linalg/detail/cusolver_wrappers.hpp>
+#include <raft/linalg/eig.hpp>
+#include <raft/linalg/eltwise.hpp>
+#include <raft/linalg/gemm.hpp>
+#include <raft/linalg/gemv.hpp>
+#include <raft/linalg/qr.hpp>
+#include <raft/linalg/svd.hpp>
+#include <raft/linalg/transpose.hpp>
 #include <raft/matrix/math.hpp>
 #include <raft/matrix/matrix.hpp>
 #include <raft/mr/device/buffer.hpp>
@@ -129,7 +131,8 @@ void lstsqSvdQR(const raft::handle_t& handle,
   const int minmn              = min(n_rows, n_cols);
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
   int cusolverWorkSetSize      = 0;
-  RAFT_CUSOLVER_TRY(raft::linalg::cusolverDngesvd_bufferSize<math_t>(
+  // #TODO: Call from public API when ready
+  RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngesvd_bufferSize<math_t>(
     cusolverH, n_rows, n_cols, &cusolverWorkSetSize));
 
   rmm::device_uvector<math_t> workset(cusolverWorkSetSize  // cuSolver
@@ -147,23 +150,24 @@ void lstsqSvdQR(const raft::handle_t& handle,
   math_t* Ub              = S + minmn;
   int* devInfo            = reinterpret_cast<int*>(Ub + minmn);
 
-  RAFT_CUSOLVER_TRY(raft::linalg::cusolverDngesvd<math_t>(cusolverH,
-                                                          'S',
-                                                          'S',
-                                                          n_rows,
-                                                          n_cols,
-                                                          A,
-                                                          n_rows,
-                                                          S,
-                                                          U,
-                                                          n_rows,
-                                                          Vt,
-                                                          n_cols,
-                                                          cusolverWorkSet,
-                                                          cusolverWorkSetSize,
-                                                          nullptr,
-                                                          devInfo,
-                                                          stream));
+  // #TODO: Call from public API when ready
+  RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngesvd<math_t>(cusolverH,
+                                                                  'S',
+                                                                  'S',
+                                                                  n_rows,
+                                                                  n_cols,
+                                                                  A,
+                                                                  n_rows,
+                                                                  S,
+                                                                  U,
+                                                                  n_rows,
+                                                                  Vt,
+                                                                  n_cols,
+                                                                  cusolverWorkSet,
+                                                                  cusolverWorkSetSize,
+                                                                  nullptr,
+                                                                  devInfo,
+                                                                  stream));
   raft::linalg::gemv(handle, U, n_rows, minmn, b, Ub, true, stream);
   raft::linalg::binaryOp(Ub, Ub, S, minmn, DivideByNonZero<math_t>(), stream);
   raft::linalg::gemv(handle, Vt, minmn, n_cols, n_cols, Ub, w, true, stream);
@@ -189,20 +193,22 @@ void lstsqSvdJacobi(const raft::handle_t& handle,
   RAFT_CUSOLVER_TRY(cusolverDnCreateGesvdjInfo(&gesvdj_params));
   int cusolverWorkSetSize      = 0;
   cusolverDnHandle_t cusolverH = handle.get_cusolver_dn_handle();
-  RAFT_CUSOLVER_TRY(raft::linalg::cusolverDngesvdj_bufferSize<math_t>(cusolverH,
-                                                                      CUSOLVER_EIG_MODE_VECTOR,
-                                                                      1,
-                                                                      n_rows,
-                                                                      n_cols,
-                                                                      A,
-                                                                      n_rows,
-                                                                      nullptr,
-                                                                      nullptr,
-                                                                      n_rows,
-                                                                      nullptr,
-                                                                      n_cols,
-                                                                      &cusolverWorkSetSize,
-                                                                      gesvdj_params));
+  // #TODO: Call from public API when ready
+  RAFT_CUSOLVER_TRY(
+    raft::linalg::detail::cusolverDngesvdj_bufferSize<math_t>(cusolverH,
+                                                              CUSOLVER_EIG_MODE_VECTOR,
+                                                              1,
+                                                              n_rows,
+                                                              n_cols,
+                                                              A,
+                                                              n_rows,
+                                                              nullptr,
+                                                              nullptr,
+                                                              n_rows,
+                                                              nullptr,
+                                                              n_cols,
+                                                              &cusolverWorkSetSize,
+                                                              gesvdj_params));
   rmm::device_uvector<math_t> workset(cusolverWorkSetSize  // cuSolver
                                         + n_rows * minmn   // U
                                         + n_cols * minmn   // V
@@ -217,23 +223,24 @@ void lstsqSvdJacobi(const raft::handle_t& handle,
   math_t* S               = V + n_cols * minmn;
   math_t* Ub              = S + minmn;
   int* devInfo            = reinterpret_cast<int*>(Ub + minmn);
-  RAFT_CUSOLVER_TRY(raft::linalg::cusolverDngesvdj<math_t>(cusolverH,
-                                                           CUSOLVER_EIG_MODE_VECTOR,
-                                                           1,
-                                                           n_rows,
-                                                           n_cols,
-                                                           A,
-                                                           n_rows,
-                                                           S,
-                                                           U,
-                                                           n_rows,
-                                                           V,
-                                                           n_cols,
-                                                           cusolverWorkSet,
-                                                           cusolverWorkSetSize,
-                                                           devInfo,
-                                                           gesvdj_params,
-                                                           stream));
+  // #TODO: Call from public API when ready
+  RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngesvdj<math_t>(cusolverH,
+                                                                   CUSOLVER_EIG_MODE_VECTOR,
+                                                                   1,
+                                                                   n_rows,
+                                                                   n_cols,
+                                                                   A,
+                                                                   n_rows,
+                                                                   S,
+                                                                   U,
+                                                                   n_rows,
+                                                                   V,
+                                                                   n_cols,
+                                                                   cusolverWorkSet,
+                                                                   cusolverWorkSetSize,
+                                                                   devInfo,
+                                                                   gesvdj_params,
+                                                                   stream));
   raft::linalg::gemv(handle, U, n_rows, minmn, b, Ub, true, stream);
   raft::linalg::binaryOp(Ub, Ub, S, minmn, DivideByNonZero<math_t>(), stream);
   raft::linalg::gemv(handle, V, n_cols, minmn, Ub, w, false, stream);
@@ -366,48 +373,52 @@ void lstsqQR(const raft::handle_t& handle,
   const int lda = m;
   const int ldb = m;
 
+  // #TODO: Call from public API when ready
   RAFT_CUSOLVER_TRY(
-    raft::linalg::cusolverDngeqrf_bufferSize(cusolverH, m, n, A, lda, &lwork_geqrf));
-
-  RAFT_CUSOLVER_TRY(raft::linalg::cusolverDnormqr_bufferSize(cusolverH,
-                                                             side,
-                                                             trans,
-                                                             m,
-                                                             1,
-                                                             n,
-                                                             A,
-                                                             lda,
-                                                             d_tau.data(),
-                                                             b,    // C,
-                                                             lda,  // ldc,
-                                                             &lwork_ormqr));
+    raft::linalg::detail::cusolverDngeqrf_bufferSize(cusolverH, m, n, A, lda, &lwork_geqrf));
+
+  // #TODO: Call from public API when ready
+  RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnormqr_bufferSize(cusolverH,
+                                                                     side,
+                                                                     trans,
+                                                                     m,
+                                                                     1,
+                                                                     n,
+                                                                     A,
+                                                                     lda,
+                                                                     d_tau.data(),
+                                                                     b,    // C,
+                                                                     lda,  // ldc,
+                                                                     &lwork_ormqr));
 
   lwork = (lwork_geqrf > lwork_ormqr) ? lwork_geqrf : lwork_ormqr;
 
   rmm::device_uvector<math_t> d_work(lwork, stream);
 
-  RAFT_CUSOLVER_TRY(raft::linalg::cusolverDngeqrf(
+  // #TODO: Call from public API when ready
+  RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDngeqrf(
     cusolverH, m, n, A, lda, d_tau.data(), d_work.data(), lwork, d_info.data(), stream));
 
   RAFT_CUDA_TRY(cudaMemcpyAsync(&info, d_info.data(), sizeof(int), cudaMemcpyDeviceToHost, stream));
   RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
   ASSERT(0 == info, "lstsq.h: QR wasn't successful");
 
-  RAFT_CUSOLVER_TRY(raft::linalg::cusolverDnormqr(cusolverH,
-                                                  side,
-                                                  trans,
-                                                  m,
-                                                  1,
-                                                  n,
-                                                  A,
-                                                  lda,
-                                                  d_tau.data(),
-                                                  b,
-                                                  ldb,
-                                                  d_work.data(),
-                                                  lwork,
-                                                  d_info.data(),
-                                                  stream));
+  // #TODO: Call from public API when ready
+  RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnormqr(cusolverH,
+                                                          side,
+                                                          trans,
+                                                          m,
+                                                          1,
+                                                          n,
+                                                          A,
+                                                          lda,
+                                                          d_tau.data(),
+                                                          b,
+                                                          ldb,
+                                                          d_work.data(),
+                                                          lwork,
+                                                          d_info.data(),
+                                                          stream));
 
   RAFT_CUDA_TRY(cudaMemcpyAsync(&info, d_info.data(), sizeof(int), cudaMemcpyDeviceToHost, stream));
   RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
@@ -415,19 +426,20 @@ void lstsqQR(const raft::handle_t& handle,
 
   const math_t one = 1;
 
-  RAFT_CUBLAS_TRY(raft::linalg::cublastrsm(cublasH,
-                                           side,
-                                           CUBLAS_FILL_MODE_UPPER,
-                                           CUBLAS_OP_N,
-                                           CUBLAS_DIAG_NON_UNIT,
-                                           n,
-                                           1,
-                                           &one,
-                                           A,
-                                           lda,
-                                           b,
-                                           ldb,
-                                           stream));
+  // #TODO: Call from public API when ready
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublastrsm(cublasH,
+                                                   side,
+                                                   CUBLAS_FILL_MODE_UPPER,
+                                                   CUBLAS_OP_N,
+                                                   CUBLAS_DIAG_NON_UNIT,
+                                                   n,
+                                                   1,
+                                                   &one,
+                                                   A,
+                                                   lda,
+                                                   b,
+                                                   ldb,
+                                                   stream));
 
   RAFT_CUDA_TRY(cudaMemcpyAsync(w, b, sizeof(math_t) * n, cudaMemcpyDeviceToDevice, stream));
 }
diff --git a/cpp/src_prims/linalg/power.cuh b/cpp/src_prims/linalg/power.cuh
index 563e86040a..bf0ad989ee 100644
--- a/cpp/src_prims/linalg/power.cuh
+++ b/cpp/src_prims/linalg/power.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,8 +17,8 @@
 #pragma once
 
 #include <raft/cuda_utils.cuh>
-#include <raft/linalg/binary_op.cuh>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/add.hpp>
+#include <raft/linalg/unary_op.hpp>
 
 namespace MLCommon {
 namespace LinAlg {
diff --git a/cpp/src_prims/linalg/rsvd.cuh b/cpp/src_prims/linalg/rsvd.cuh
index 55a3ecfa86..463582622c 100644
--- a/cpp/src_prims/linalg/rsvd.cuh
+++ b/cpp/src_prims/linalg/rsvd.cuh
@@ -17,13 +17,11 @@
 #pragma once
 
 #include <raft/cuda_utils.cuh>
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/cusolver_wrappers.h>
-#include <raft/linalg/eig.cuh>
-#include <raft/linalg/gemm.cuh>
-#include <raft/linalg/qr.cuh>
-#include <raft/linalg/svd.cuh>
-#include <raft/linalg/transpose.h>
+#include <raft/linalg/eig.hpp>
+#include <raft/linalg/gemm.hpp>
+#include <raft/linalg/qr.hpp>
+#include <raft/linalg/svd.hpp>
+#include <raft/linalg/transpose.hpp>
 #include <raft/matrix/math.hpp>
 #include <raft/matrix/matrix.hpp>
 #include <raft/mr/device/buffer.hpp>
diff --git a/cpp/src_prims/linalg/sqrt.cuh b/cpp/src_prims/linalg/sqrt.cuh
index 638bd32823..703814d023 100644
--- a/cpp/src_prims/linalg/sqrt.cuh
+++ b/cpp/src_prims/linalg/sqrt.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@
 #pragma once
 
 #include <raft/cuda_utils.cuh>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/unary_op.hpp>
 
 namespace MLCommon {
 namespace LinAlg {
diff --git a/cpp/src_prims/matrix/grammatrix.cuh b/cpp/src_prims/matrix/grammatrix.cuh
index 4c8f0d1267..bd66f815e2 100644
--- a/cpp/src_prims/matrix/grammatrix.cuh
+++ b/cpp/src_prims/matrix/grammatrix.cuh
@@ -17,8 +17,9 @@
 #pragma once
 
 #include <raft/distance/distance.hpp>
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/gemm.cuh>
+// #TODO: Replace with public header when ready
+#include <raft/linalg/detail/cublas_wrappers.hpp>
+#include <raft/linalg/gemm.hpp>
 
 namespace MLCommon {
 namespace Matrix {
@@ -146,37 +147,39 @@ class GramMatrixBase {
     math_t alpha = 1.0;
     math_t beta  = 0.0;
     if (is_row_major) {
-      RAFT_CUBLAS_TRY(raft::linalg::cublasgemm(cublas_handle,
-                                               CUBLAS_OP_T,
-                                               CUBLAS_OP_N,
-                                               n2,
-                                               n1,
-                                               n_cols,
-                                               &alpha,
-                                               x2,
-                                               ld2,
-                                               x1,
-                                               ld1,
-                                               &beta,
-                                               out,
-                                               ld_out,
-                                               stream));
+      // #TODO: Call from public API when ready
+      RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle,
+                                                       CUBLAS_OP_T,
+                                                       CUBLAS_OP_N,
+                                                       n2,
+                                                       n1,
+                                                       n_cols,
+                                                       &alpha,
+                                                       x2,
+                                                       ld2,
+                                                       x1,
+                                                       ld1,
+                                                       &beta,
+                                                       out,
+                                                       ld_out,
+                                                       stream));
     } else {
-      RAFT_CUBLAS_TRY(raft::linalg::cublasgemm(cublas_handle,
-                                               CUBLAS_OP_N,
-                                               CUBLAS_OP_T,
-                                               n1,
-                                               n2,
-                                               n_cols,
-                                               &alpha,
-                                               x1,
-                                               ld1,
-                                               x2,
-                                               ld2,
-                                               &beta,
-                                               out,
-                                               ld_out,
-                                               stream));
+      // #TODO: Call from public API when ready
+      RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle,
+                                                       CUBLAS_OP_N,
+                                                       CUBLAS_OP_T,
+                                                       n1,
+                                                       n2,
+                                                       n_cols,
+                                                       &alpha,
+                                                       x1,
+                                                       ld1,
+                                                       x2,
+                                                       ld2,
+                                                       &beta,
+                                                       out,
+                                                       ld_out,
+                                                       stream));
     }
   }
 
diff --git a/cpp/src_prims/matrix/kernelmatrices.cuh b/cpp/src_prims/matrix/kernelmatrices.cuh
index 70cf931c55..6e40dc243a 100644
--- a/cpp/src_prims/matrix/kernelmatrices.cuh
+++ b/cpp/src_prims/matrix/kernelmatrices.cuh
@@ -19,7 +19,7 @@
 #include "grammatrix.cuh"
 #include <raft/cuda_utils.cuh>
 #include <raft/distance/distance.hpp>
-#include <raft/linalg/gemm.cuh>
+#include <raft/linalg/gemm.hpp>
 
 namespace MLCommon {
 namespace Matrix {
diff --git a/cpp/src_prims/metrics/adjusted_rand_index.cuh b/cpp/src_prims/metrics/adjusted_rand_index.cuh
index 4f08f01aab..4bb5ef44c7 100644
--- a/cpp/src_prims/metrics/adjusted_rand_index.cuh
+++ b/cpp/src_prims/metrics/adjusted_rand_index.cuh
@@ -27,8 +27,8 @@
 #include <math.h>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/map_then_reduce.cuh>
-#include <raft/linalg/reduce.cuh>
+#include <raft/linalg/map_then_reduce.hpp>
+#include <raft/linalg/reduce.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 #include <stats/histogram.cuh>
diff --git a/cpp/src_prims/metrics/batched/information_criterion.cuh b/cpp/src_prims/metrics/batched/information_criterion.cuh
index a4a61ce7a1..8770246f07 100644
--- a/cpp/src_prims/metrics/batched/information_criterion.cuh
+++ b/cpp/src_prims/metrics/batched/information_criterion.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,7 +25,7 @@
  *  - BIC: https://en.wikipedia.org/wiki/Bayesian_information_criterion
  */
 
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/unary_op.hpp>
 
 #include <cmath>
 
diff --git a/cpp/src_prims/metrics/dispersion.cuh b/cpp/src_prims/metrics/dispersion.cuh
index e018bf254c..0af5b93dca 100644
--- a/cpp/src_prims/metrics/dispersion.cuh
+++ b/cpp/src_prims/metrics/dispersion.cuh
@@ -20,7 +20,7 @@
 #include <memory>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/eltwise.cuh>
+#include <raft/linalg/eltwise.hpp>
 #include <rmm/device_uvector.hpp>
 
 namespace MLCommon {
diff --git a/cpp/src_prims/metrics/entropy.cuh b/cpp/src_prims/metrics/entropy.cuh
index 76737d6aa5..e6cd6a21e2 100644
--- a/cpp/src_prims/metrics/entropy.cuh
+++ b/cpp/src_prims/metrics/entropy.cuh
@@ -23,8 +23,8 @@
 #include <math.h>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/divide.cuh>
-#include <raft/linalg/map_then_reduce.cuh>
+#include <raft/linalg/divide.hpp>
+#include <raft/linalg/map_then_reduce.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 
diff --git a/cpp/src_prims/metrics/kl_divergence.cuh b/cpp/src_prims/metrics/kl_divergence.cuh
index 2b637a88ac..cb6e69d951 100644
--- a/cpp/src_prims/metrics/kl_divergence.cuh
+++ b/cpp/src_prims/metrics/kl_divergence.cuh
@@ -24,7 +24,7 @@
 #include <math.h>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/map_then_reduce.cuh>
+#include <raft/linalg/map_then_reduce.hpp>
 #include <rmm/device_scalar.hpp>
 
 namespace MLCommon {
diff --git a/cpp/src_prims/metrics/mutual_info_score.cuh b/cpp/src_prims/metrics/mutual_info_score.cuh
index 552d0bf01b..b08b5a309d 100644
--- a/cpp/src_prims/metrics/mutual_info_score.cuh
+++ b/cpp/src_prims/metrics/mutual_info_score.cuh
@@ -29,7 +29,7 @@
 #include <math.h>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/reduce.cuh>
+#include <raft/linalg/reduce.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 
diff --git a/cpp/src_prims/metrics/scores.cuh b/cpp/src_prims/metrics/scores.cuh
index 3132ad3de0..5fde1df62f 100644
--- a/cpp/src_prims/metrics/scores.cuh
+++ b/cpp/src_prims/metrics/scores.cuh
@@ -20,8 +20,8 @@
 #include <memory>
 #include <raft/cudart_utils.h>
 #include <raft/distance/distance.hpp>
-#include <raft/linalg/eltwise.cuh>
-#include <raft/linalg/subtract.cuh>
+#include <raft/linalg/eltwise.hpp>
+#include <raft/linalg/subtract.hpp>
 #include <raft/spatial/knn/knn.hpp>
 #include <raft/stats/mean.hpp>
 #include <rmm/device_scalar.hpp>
diff --git a/cpp/src_prims/metrics/silhouette_score.cuh b/cpp/src_prims/metrics/silhouette_score.cuh
index 859faf8963..bd212924a2 100644
--- a/cpp/src_prims/metrics/silhouette_score.cuh
+++ b/cpp/src_prims/metrics/silhouette_score.cuh
@@ -25,12 +25,12 @@
 #include <numeric>
 #include <raft/cuda_utils.cuh>
 #include <raft/distance/distance.hpp>
-#include <raft/linalg/binary_op.cuh>
-#include <raft/linalg/distance_type.h>
-#include <raft/linalg/eltwise.cuh>
-#include <raft/linalg/map_then_reduce.cuh>
-#include <raft/linalg/matrix_vector_op.cuh>
-#include <raft/linalg/reduce.cuh>
+#include <raft/distance/distance_type.hpp>
+#include <raft/linalg/add.hpp>
+#include <raft/linalg/eltwise.hpp>
+#include <raft/linalg/map_then_reduce.hpp>
+#include <raft/linalg/matrix_vector_op.hpp>
+#include <raft/linalg/reduce.hpp>
 #include <rmm/device_scalar.hpp>
 
 namespace MLCommon {
diff --git a/cpp/src_prims/random/make_blobs.cuh b/cpp/src_prims/random/make_blobs.cuh
index c33125d105..1e0f3eeb9f 100644
--- a/cpp/src_prims/random/make_blobs.cuh
+++ b/cpp/src_prims/random/make_blobs.cuh
@@ -19,7 +19,7 @@
 #include "permute.cuh"
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/unary_op.hpp>
 #include <raft/random/rng.hpp>
 #include <rmm/device_uvector.hpp>
 #include <vector>
diff --git a/cpp/src_prims/random/make_regression.cuh b/cpp/src_prims/random/make_regression.cuh
index 21f70170b9..b9ce3e552d 100644
--- a/cpp/src_prims/random/make_regression.cuh
+++ b/cpp/src_prims/random/make_regression.cuh
@@ -26,10 +26,11 @@
 #include <linalg/init.h>
 #include <raft/cudart_utils.h>
 #include <raft/handle.hpp>
-#include <raft/linalg/add.cuh>
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/qr.cuh>
-#include <raft/linalg/transpose.h>
+#include <raft/linalg/add.hpp>
+// #TODO: Replace with public header when ready
+#include <raft/linalg/detail/cublas_wrappers.hpp>
+#include <raft/linalg/qr.hpp>
+#include <raft/linalg/transpose.hpp>
 #include <raft/matrix/matrix.hpp>
 #include <raft/mr/device/buffer.hpp>
 #include <raft/random/rng.hpp>
@@ -90,36 +91,38 @@ static void _make_low_rank_matrix(const raft::handle_t& handle,
   rmm::device_uvector<DataT> temp_q0s(n_rows * n, stream);
   rmm::device_uvector<DataT> temp_out(n_rows * n_cols, stream);
   DataT alpha = 1.0, beta = 0.0;
-  raft::linalg::cublasgemm(cublas_handle,
-                           CUBLAS_OP_N,
-                           CUBLAS_OP_N,
-                           n_rows,
-                           n,
-                           n,
-                           &alpha,
-                           q0.data(),
-                           n_rows,
-                           singular_mat.data(),
-                           n,
-                           &beta,
-                           temp_q0s.data(),
-                           n_rows,
-                           stream);
-  raft::linalg::cublasgemm(cublas_handle,
-                           CUBLAS_OP_N,
-                           CUBLAS_OP_T,
-                           n_rows,
-                           n_cols,
-                           n,
-                           &alpha,
-                           temp_q0s.data(),
-                           n_rows,
-                           q1.data(),
-                           n_cols,
-                           &beta,
-                           temp_out.data(),
-                           n_rows,
-                           stream);
+  // #TODO: Call from public API when ready
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle,
+                                                   CUBLAS_OP_N,
+                                                   CUBLAS_OP_N,
+                                                   n_rows,
+                                                   n,
+                                                   n,
+                                                   &alpha,
+                                                   q0.data(),
+                                                   n_rows,
+                                                   singular_mat.data(),
+                                                   n,
+                                                   &beta,
+                                                   temp_q0s.data(),
+                                                   n_rows,
+                                                   stream));
+  // #TODO: Call from public API when ready
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle,
+                                                   CUBLAS_OP_N,
+                                                   CUBLAS_OP_T,
+                                                   n_rows,
+                                                   n_cols,
+                                                   n,
+                                                   &alpha,
+                                                   temp_q0s.data(),
+                                                   n_rows,
+                                                   q1.data(),
+                                                   n_cols,
+                                                   &beta,
+                                                   temp_out.data(),
+                                                   n_rows,
+                                                   stream));
 
   // Transpose from column-major to row-major
   raft::linalg::transpose(handle, temp_out.data(), out, n_rows, n_cols, stream);
@@ -254,21 +257,22 @@ void make_regression(const raft::handle_t& handle,
 
   // Compute the output values
   DataT alpha = (DataT)1.0, beta = (DataT)0.0;
-  RAFT_CUBLAS_TRY(raft::linalg::cublasgemm(cublas_handle,
-                                           CUBLAS_OP_T,
-                                           CUBLAS_OP_T,
-                                           n_rows,
-                                           n_targets,
-                                           n_informative,
-                                           &alpha,
-                                           out,
-                                           n_cols,
-                                           _coef,
-                                           n_targets,
-                                           &beta,
-                                           _values_col,
-                                           n_rows,
-                                           stream));
+  // #TODO: Call from public API when ready
+  RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle,
+                                                   CUBLAS_OP_T,
+                                                   CUBLAS_OP_T,
+                                                   n_rows,
+                                                   n_targets,
+                                                   n_informative,
+                                                   &alpha,
+                                                   out,
+                                                   n_cols,
+                                                   _coef,
+                                                   n_targets,
+                                                   &beta,
+                                                   _values_col,
+                                                   n_rows,
+                                                   stream));
 
   // Transpose the values from column-major to row-major if needed
   if (n_targets > 1) {
diff --git a/cpp/src_prims/random/mvg.cuh b/cpp/src_prims/random/mvg.cuh
index 1153dbc834..6c8465d856 100644
--- a/cpp/src_prims/random/mvg.cuh
+++ b/cpp/src_prims/random/mvg.cuh
@@ -19,10 +19,12 @@
 #include <cmath>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/cusolver_wrappers.h>
-#include <raft/linalg/matrix_vector_op.cuh>
-#include <raft/linalg/unary_op.cuh>
+// #TODO: Replace with public header when ready
+#include <raft/linalg/detail/cublas_wrappers.hpp>
+// #TODO: Replace with public header when ready
+#include <raft/linalg/detail/cusolver_wrappers.hpp>
+#include <raft/linalg/matrix_vector_op.hpp>
+#include <raft/linalg/unary_op.hpp>
 #include <stdio.h>
 
 // mvg.cuh takes in matrices that are colomn major (as in fortan)
@@ -163,16 +165,19 @@ class MultiVarGaussian {
     CURAND_CHECK(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
     CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(gen, 28));  // SEED
     if (method == chol_decomp) {
-      RAFT_CUSOLVER_TRY(
-        raft::linalg::cusolverDnpotrf_bufferSize(cusolverHandle, uplo, dim, P, dim, &Lwork));
+      // #TODO: Call from public API when ready
+      RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnpotrf_bufferSize(
+        cusolverHandle, uplo, dim, P, dim, &Lwork));
     } else if (method == jacobi) {  // jacobi init
       RAFT_CUSOLVER_TRY(cusolverDnCreateSyevjInfo(&syevj_params));
       RAFT_CUSOLVER_TRY(cusolverDnXsyevjSetTolerance(syevj_params, tol));
       RAFT_CUSOLVER_TRY(cusolverDnXsyevjSetMaxSweeps(syevj_params, max_sweeps));
-      RAFT_CUSOLVER_TRY(raft::linalg::cusolverDnsyevj_bufferSize(
+      // #TODO: Call from public API when ready
+      RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnsyevj_bufferSize(
         cusolverHandle, jobz, uplo, dim, P, dim, eig, &Lwork, syevj_params));
     } else {  // method == qr
-      RAFT_CUSOLVER_TRY(raft::linalg::cusolverDnsyevd_bufferSize(
+      // #TODO: Call from public API when ready
+      RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnsyevd_bufferSize(
         cusolverHandle, jobz, uplo, dim, P, dim, eig, &Lwork));
     }
     return give_buffer_size();
@@ -189,24 +194,27 @@ class MultiVarGaussian {
   {
     if (method == chol_decomp) {
       // lower part will contains chol_decomp
-      RAFT_CUSOLVER_TRY(raft::linalg::cusolverDnpotrf(
+      // #TODO: Call from public API when ready
+      RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnpotrf(
         cusolverHandle, uplo, dim, P, dim, workspace_decomp, Lwork, info, cudaStream));
     } else if (method == jacobi) {
+      // #TODO: Call from public API when ready
       RAFT_CUSOLVER_TRY(
-        raft::linalg::cusolverDnsyevj(cusolverHandle,
-                                      jobz,
-                                      uplo,
-                                      dim,
-                                      P,
-                                      dim,
-                                      eig,
-                                      workspace_decomp,
-                                      Lwork,
-                                      info,
-                                      syevj_params,
-                                      cudaStream));  // vectors stored as cols. & col major
-    } else {                                         // qr
-      RAFT_CUSOLVER_TRY(raft::linalg::cusolverDnsyevd(
+        raft::linalg::detail::cusolverDnsyevj(cusolverHandle,
+                                              jobz,
+                                              uplo,
+                                              dim,
+                                              P,
+                                              dim,
+                                              eig,
+                                              workspace_decomp,
+                                              Lwork,
+                                              info,
+                                              syevj_params,
+                                              cudaStream));  // vectors stored as cols. & col major
+    } else {                                                 // qr
+      // #TODO: Call from public API when ready
+      RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnsyevd(
         cusolverHandle, jobz, uplo, dim, P, dim, eig, workspace_decomp, Lwork, info, cudaStream));
     }
     raft::update_host(&info_h, info, 1, cudaStream);
@@ -224,21 +232,22 @@ class MultiVarGaussian {
       RAFT_CUDA_TRY(cudaPeekAtLastError());
 
       // P is lower triangular chol decomp mtrx
-      RAFT_CUBLAS_TRY(raft::linalg::cublasgemm(cublasHandle,
-                                               CUBLAS_OP_N,
-                                               CUBLAS_OP_N,
-                                               dim,
-                                               nPoints,
-                                               dim,
-                                               &alfa,
-                                               P,
-                                               dim,
-                                               X,
-                                               dim,
-                                               &beta,
-                                               X,
-                                               dim,
-                                               cudaStream));
+      // #TODO: Call from public API when ready
+      RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublasHandle,
+                                                       CUBLAS_OP_N,
+                                                       CUBLAS_OP_N,
+                                                       dim,
+                                                       nPoints,
+                                                       dim,
+                                                       &alfa,
+                                                       P,
+                                                       dim,
+                                                       X,
+                                                       dim,
+                                                       &beta,
+                                                       X,
+                                                       dim,
+                                                       cudaStream));
     } else {
       epsilonToZero(eig, epsilon, dim, cudaStream);
       dim3 block(64);
@@ -254,21 +263,22 @@ class MultiVarGaussian {
       ASSERT(info_h == 0, "mvg: Cov matrix has %dth Eigenval negative", info_h);
 
       // Got Q = eigvect*eigvals.sqrt in P, Q*X in X below
-      RAFT_CUBLAS_TRY(raft::linalg::cublasgemm(cublasHandle,
-                                               CUBLAS_OP_N,
-                                               CUBLAS_OP_N,
-                                               dim,
-                                               nPoints,
-                                               dim,
-                                               &alfa,
-                                               P,
-                                               dim,
-                                               X,
-                                               dim,
-                                               &beta,
-                                               X,
-                                               dim,
-                                               cudaStream));
+      // #TODO: Call from public API when ready
+      RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublasHandle,
+                                                       CUBLAS_OP_N,
+                                                       CUBLAS_OP_N,
+                                                       dim,
+                                                       nPoints,
+                                                       dim,
+                                                       &alfa,
+                                                       P,
+                                                       dim,
+                                                       X,
+                                                       dim,
+                                                       &beta,
+                                                       X,
+                                                       dim,
+                                                       cudaStream));
     }
     // working to make mean not 0
     // since we are working with column-major, nPoints and dim are swapped
diff --git a/cpp/src_prims/selection/knn.cuh b/cpp/src_prims/selection/knn.cuh
index 5be591a968..31d5284c35 100644
--- a/cpp/src_prims/selection/knn.cuh
+++ b/cpp/src_prims/selection/knn.cuh
@@ -26,7 +26,7 @@
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/distance/distance.hpp>
-#include <raft/linalg/distance_type.h>
+#include <raft/distance/distance_type.hpp>
 #include <raft/mr/device/allocator.hpp>
 
 #include <faiss/gpu/GpuDistance.h>
diff --git a/cpp/src_prims/selection/processing.cuh b/cpp/src_prims/selection/processing.cuh
index b9a4f79e86..b559efda45 100644
--- a/cpp/src_prims/selection/processing.cuh
+++ b/cpp/src_prims/selection/processing.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,9 +17,9 @@
 
 #include <cuml/neighbors/knn.hpp>
 
-#include <raft/linalg/matrix_vector_op.cuh>
-#include <raft/linalg/norm.cuh>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/matrix_vector_op.hpp>
+#include <raft/linalg/norm.hpp>
+#include <raft/linalg/unary_op.hpp>
 
 #include <raft/stats/mean.hpp>
 #include <raft/stats/mean_center.hpp>
diff --git a/cpp/src_prims/sparse/batched/csr.cuh b/cpp/src_prims/sparse/batched/csr.cuh
index a46efc7ab8..1cdcf40e9f 100644
--- a/cpp/src_prims/sparse/batched/csr.cuh
+++ b/cpp/src_prims/sparse/batched/csr.cuh
@@ -30,7 +30,6 @@
 
 #include <linalg/batched/matrix.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/cusolver_wrappers.h>
 #include <raft/matrix/matrix.hpp>
 #include <rmm/device_uvector.hpp>
 
diff --git a/cpp/src_prims/stats/cov.cuh b/cpp/src_prims/stats/cov.cuh
index e57d9c02ee..5b33f46fb6 100644
--- a/cpp/src_prims/stats/cov.cuh
+++ b/cpp/src_prims/stats/cov.cuh
@@ -16,8 +16,7 @@
 
 #pragma once
 
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/gemm.cuh>
+#include <raft/linalg/gemm.hpp>
 #include <raft/stats/mean_center.hpp>
 
 namespace MLCommon {
@@ -65,21 +64,22 @@ void cov(const raft::handle_t& handle,
     Type alpha = Type(1) / (sample ? Type(N - 1) : Type(N));
     Type beta  = Type(0);
     if (rowMajor) {
-      RAFT_CUBLAS_TRY(raft::linalg::cublasgemm(cublas_h,
-                                               CUBLAS_OP_N,
-                                               CUBLAS_OP_T,
-                                               D,
-                                               D,
-                                               N,
-                                               &alpha,
-                                               data,
-                                               D,
-                                               data,
-                                               D,
-                                               &beta,
-                                               covar,
-                                               D,
-                                               stream));
+      // #TODO: Call from public API when ready
+      RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_h,
+                                                       CUBLAS_OP_N,
+                                                       CUBLAS_OP_T,
+                                                       D,
+                                                       D,
+                                                       N,
+                                                       &alpha,
+                                                       data,
+                                                       D,
+                                                       data,
+                                                       D,
+                                                       &beta,
+                                                       covar,
+                                                       D,
+                                                       stream));
     } else {
       raft::linalg::gemm(
         handle, data, N, D, data, covar, D, D, CUBLAS_OP_T, CUBLAS_OP_N, alpha, beta, stream);
diff --git a/cpp/src_prims/stats/weighted_mean.cuh b/cpp/src_prims/stats/weighted_mean.cuh
index da1969fdb7..cc148d0b1f 100644
--- a/cpp/src_prims/stats/weighted_mean.cuh
+++ b/cpp/src_prims/stats/weighted_mean.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,8 +17,8 @@
 #pragma once
 
 #include <raft/cudart_utils.h>
-#include <raft/linalg/coalesced_reduction.cuh>
-#include <raft/linalg/strided_reduction.cuh>
+#include <raft/linalg/coalesced_reduction.hpp>
+#include <raft/linalg/strided_reduction.hpp>
 
 namespace MLCommon {
 namespace Stats {
diff --git a/cpp/src_prims/timeSeries/arima_helpers.cuh b/cpp/src_prims/timeSeries/arima_helpers.cuh
index 78bd068289..e07a13f019 100644
--- a/cpp/src_prims/timeSeries/arima_helpers.cuh
+++ b/cpp/src_prims/timeSeries/arima_helpers.cuh
@@ -23,8 +23,8 @@
 #include <linalg/batched/matrix.cuh>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/matrix_vector_op.cuh>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/matrix_vector_op.hpp>
+#include <raft/linalg/unary_op.hpp>
 #include <raft/mr/device/allocator.hpp>
 
 // Private helper functions and kernels in the anonymous namespace
diff --git a/cpp/src_prims/timeSeries/fillna.cuh b/cpp/src_prims/timeSeries/fillna.cuh
index e82924734e..a017d86941 100644
--- a/cpp/src_prims/timeSeries/fillna.cuh
+++ b/cpp/src_prims/timeSeries/fillna.cuh
@@ -24,8 +24,8 @@
 #include <linalg/batched/matrix.cuh>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/matrix_vector_op.cuh>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/matrix_vector_op.hpp>
+#include <raft/linalg/unary_op.hpp>
 #include <raft/mr/device/allocator.hpp>
 #include <rmm/device_uvector.hpp>
 
diff --git a/cpp/src_prims/timeSeries/jones_transform.cuh b/cpp/src_prims/timeSeries/jones_transform.cuh
index 6888340e67..2b50852d40 100644
--- a/cpp/src_prims/timeSeries/jones_transform.cuh
+++ b/cpp/src_prims/timeSeries/jones_transform.cuh
@@ -24,7 +24,7 @@
 #include <math.h>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/unary_op.hpp>
 
 namespace MLCommon {
 
diff --git a/cpp/src_prims/timeSeries/stationarity.cuh b/cpp/src_prims/timeSeries/stationarity.cuh
index 3fd6c102b1..519dface1a 100644
--- a/cpp/src_prims/timeSeries/stationarity.cuh
+++ b/cpp/src_prims/timeSeries/stationarity.cuh
@@ -35,9 +35,8 @@
 
 #include "arima_helpers.cuh"
 #include <raft/cudart_utils.h>
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/matrix_vector_op.cuh>
-#include <raft/linalg/reduce.cuh>
+#include <raft/linalg/matrix_vector_op.hpp>
+#include <raft/linalg/reduce.hpp>
 #include <raft/stats/mean.hpp>
 #include <rmm/device_uvector.hpp>
 
diff --git a/cpp/test/mg/pca.cu b/cpp/test/mg/pca.cu
index 2c3c5d28a1..7ddcaa8388 100644
--- a/cpp/test/mg/pca.cu
+++ b/cpp/test/mg/pca.cu
@@ -22,7 +22,6 @@
 #include <opg/matrix/matrix_utils.hpp>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/cublas_wrappers.h>
 #include <raft/matrix/matrix.hpp>
 #include <test_utils.h>
 
diff --git a/cpp/test/prims/add_sub_dev_scalar.cu b/cpp/test/prims/add_sub_dev_scalar.cu
index 99ff84e3b4..c2fd3b7730 100644
--- a/cpp/test/prims/add_sub_dev_scalar.cu
+++ b/cpp/test/prims/add_sub_dev_scalar.cu
@@ -17,9 +17,9 @@
 #include "test_utils.h"
 #include <gtest/gtest.h>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/add.cuh>
-#include <raft/linalg/subtract.cuh>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/add.hpp>
+#include <raft/linalg/subtract.hpp>
+#include <raft/linalg/unary_op.hpp>
 #include <raft/random/rng.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
diff --git a/cpp/test/prims/batched/matrix.cu b/cpp/test/prims/batched/matrix.cu
index b19368c85c..cc949e8ba4 100644
--- a/cpp/test/prims/batched/matrix.cu
+++ b/cpp/test/prims/batched/matrix.cu
@@ -20,7 +20,7 @@
 #include <linalg/batched/matrix.cuh>
 
 #include <raft/cudart_utils.h>
-#include <raft/linalg/add.cuh>
+#include <raft/linalg/add.hpp>
 #include <raft/mr/device/allocator.hpp>
 
 #include <gtest/gtest.h>
diff --git a/cpp/test/prims/knn_regression.cu b/cpp/test/prims/knn_regression.cu
index dd41530ddc..5e2fddb006 100644
--- a/cpp/test/prims/knn_regression.cu
+++ b/cpp/test/prims/knn_regression.cu
@@ -20,8 +20,7 @@
 #include <label/classlabels.cuh>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/cusolver_wrappers.h>
-#include <raft/linalg/reduce.cuh>
+#include <raft/linalg/reduce.hpp>
 #include <raft/random/rng.hpp>
 #include <raft/spatial/knn/knn.hpp>
 #include <rmm/device_uvector.hpp>
diff --git a/cpp/test/prims/make_regression.cu b/cpp/test/prims/make_regression.cu
index a85d92ef0c..444832b3c5 100644
--- a/cpp/test/prims/make_regression.cu
+++ b/cpp/test/prims/make_regression.cu
@@ -21,9 +21,10 @@
 #include "test_utils.h"
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/subtract.cuh>
-#include <raft/linalg/transpose.h>
+// #TODO: Replace with public header when ready
+#include <raft/linalg/detail/cublas_wrappers.hpp>
+#include <raft/linalg/subtract.hpp>
+#include <raft/linalg/transpose.hpp>
 #include <random/make_regression.cuh>
 
 namespace MLCommon {
@@ -80,21 +81,22 @@ class MakeRegressionTest : public ::testing::TestWithParam<MakeRegressionInputs<
 
     // Calculate the values from the data and coefficients (column-major)
     T alpha = (T)1.0, beta = (T)0.0;
-    RAFT_CUBLAS_TRY(raft::linalg::cublasgemm(handle.get_cublas_handle(),
-                                             CUBLAS_OP_T,
-                                             CUBLAS_OP_T,
-                                             params.n_samples,
-                                             params.n_targets,
-                                             params.n_features,
-                                             &alpha,
-                                             data.data(),
-                                             params.n_features,
-                                             coef.data(),
-                                             params.n_targets,
-                                             &beta,
-                                             values_cm.data(),
-                                             params.n_samples,
-                                             stream));
+    // #TODO: Call from public API when ready
+    RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(handle.get_cublas_handle(),
+                                                     CUBLAS_OP_T,
+                                                     CUBLAS_OP_T,
+                                                     params.n_samples,
+                                                     params.n_targets,
+                                                     params.n_features,
+                                                     &alpha,
+                                                     data.data(),
+                                                     params.n_features,
+                                                     coef.data(),
+                                                     params.n_targets,
+                                                     &beta,
+                                                     values_cm.data(),
+                                                     params.n_samples,
+                                                     stream));
 
     // Transpose the values to row-major
     raft::linalg::transpose(
diff --git a/cpp/test/prims/mvg.cu b/cpp/test/prims/mvg.cu
index c74cf3ca2d..fe5d1ea190 100644
--- a/cpp/test/prims/mvg.cu
+++ b/cpp/test/prims/mvg.cu
@@ -19,6 +19,8 @@
 #include <gtest/gtest.h>
 #include <iostream>
 #include <raft/cudart_utils.h>
+// #TODO: Replace with public header when ready
+#include <raft/linalg/detail/cublas_wrappers.hpp>
 #include <random/mvg.cuh>
 #include <random>
 #include <rmm/device_uvector.hpp>
@@ -170,21 +172,22 @@ class MVGTest : public ::testing::TestWithParam<MVGInputs<T>> {
     T alfa = 1.0 / (nPoints - 1), beta = 0.0;
     cublasHandle_t handle;
     RAFT_CUBLAS_TRY(cublasCreate(&handle));
-    RAFT_CUBLAS_TRY(raft::linalg::cublasgemm(handle,
-                                             CUBLAS_OP_N,
-                                             CUBLAS_OP_T,
-                                             dim,
-                                             dim,
-                                             nPoints,
-                                             &alfa,
-                                             X_d.data(),
-                                             dim,
-                                             X_d.data(),
-                                             dim,
-                                             &beta,
-                                             Rand_cov.data(),
-                                             dim,
-                                             stream));
+    // #TODO: Call from public API when ready
+    RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(handle,
+                                                     CUBLAS_OP_N,
+                                                     CUBLAS_OP_T,
+                                                     dim,
+                                                     dim,
+                                                     nPoints,
+                                                     &alfa,
+                                                     X_d.data(),
+                                                     dim,
+                                                     X_d.data(),
+                                                     dim,
+                                                     &beta,
+                                                     Rand_cov.data(),
+                                                     dim,
+                                                     stream));
 
     // restoring cov provided into P_d
     raft::update_device(P_d.data(), P.data(), dim * dim, stream);
diff --git a/cpp/test/prims/silhouette_score.cu b/cpp/test/prims/silhouette_score.cu
index 2f0f09caac..c28d4c3a10 100644
--- a/cpp/test/prims/silhouette_score.cu
+++ b/cpp/test/prims/silhouette_score.cu
@@ -21,7 +21,7 @@
 #include <metrics/batched/silhouette_score.cuh>
 #include <metrics/silhouette_score.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/distance_type.h>
+#include <raft/distance/distance_type.hpp>
 #include <random>
 #include <rmm/device_uvector.hpp>
 
diff --git a/cpp/test/sg/cd_test.cu b/cpp/test/sg/cd_test.cu
index 4b06543367..a639eab613 100644
--- a/cpp/test/sg/cd_test.cu
+++ b/cpp/test/sg/cd_test.cu
@@ -16,7 +16,6 @@
 
 #include <gtest/gtest.h>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/cusolver_wrappers.h>
 #include <raft/matrix/matrix.hpp>
 #include <rmm/device_uvector.hpp>
 #include <solver/cd.cuh>
diff --git a/cpp/test/sg/dbscan_test.cu b/cpp/test/sg/dbscan_test.cu
index ae0b400eaf..9afd801f43 100644
--- a/cpp/test/sg/dbscan_test.cu
+++ b/cpp/test/sg/dbscan_test.cu
@@ -23,11 +23,10 @@
 #include <cuml/datasets/make_blobs.hpp>
 #include <cuml/metrics/metrics.hpp>
 #include <raft/distance/distance.hpp>
-#include <raft/linalg/distance_type.h>
+#include <raft/distance/distance_type.hpp>
 
 #include <raft/handle.hpp>
-#include <raft/linalg/cublas_wrappers.h>
-#include <raft/linalg/transpose.h>
+#include <raft/linalg/transpose.hpp>
 
 #include <test_utils.h>
 
diff --git a/cpp/test/sg/hdbscan_test.cu b/cpp/test/sg/hdbscan_test.cu
index 525ec3b4e9..b0e6ecb703 100644
--- a/cpp/test/sg/hdbscan_test.cu
+++ b/cpp/test/sg/hdbscan_test.cu
@@ -30,8 +30,8 @@
 
 #include <raft/sparse/hierarchy/detail/agglomerative.cuh>
 
-#include <raft/linalg/distance_type.h>
-#include <raft/linalg/transpose.h>
+#include <raft/distance/distance_type.hpp>
+#include <raft/linalg/transpose.hpp>
 #include <raft/mr/device/allocator.hpp>
 #include <raft/sparse/coo.hpp>
 #include <raft/sparse/op/sort.hpp>
diff --git a/cpp/test/sg/lars_test.cu b/cpp/test/sg/lars_test.cu
index 0164f110bf..6baa991c79 100644
--- a/cpp/test/sg/lars_test.cu
+++ b/cpp/test/sg/lars_test.cu
@@ -18,7 +18,8 @@
 #include <iomanip>
 #include <raft/cudart_utils.h>
 #include <raft/handle.hpp>
-#include <raft/linalg/cusolver_wrappers.h>
+// #TODO: Replace with public header when ready
+#include <raft/linalg/detail/cusolver_wrappers.hpp>
 #include <raft/random/rng.hpp>
 #include <rmm/device_uvector.hpp>
 #include <solver/lars_impl.cuh>
@@ -111,20 +112,22 @@ class LarsTest : public ::testing::Test {
     rmm::device_uvector<math_t> workspace(0, stream);
     int n_work;
     const int ld_U = n_cols;
-    RAFT_CUSOLVER_TRY(raft::linalg::cusolverDnpotrf_bufferSize(
+    // #TODO: Call from public API when ready
+    RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnpotrf_bufferSize(
       handle.get_cusolver_dn_handle(), CUBLAS_FILL_MODE_UPPER, n_cols, U_dev_exp, ld_U, &n_work));
     workspace.resize(n_work, stream);
     // Expected solution using Cholesky factorization from scratch
     raft::copy(U_dev_exp, G, n_cols * ld_U, stream);
-    RAFT_CUSOLVER_TRY(raft::linalg::cusolverDnpotrf(handle.get_cusolver_dn_handle(),
-                                                    CUBLAS_FILL_MODE_UPPER,
-                                                    n_cols,
-                                                    U_dev_exp,
-                                                    ld_U,
-                                                    workspace.data(),
-                                                    n_work,
-                                                    devInfo.data(),
-                                                    stream));
+    // #TODO: Call from public API when ready
+    RAFT_CUSOLVER_TRY(raft::linalg::detail::cusolverDnpotrf(handle.get_cusolver_dn_handle(),
+                                                            CUBLAS_FILL_MODE_UPPER,
+                                                            n_cols,
+                                                            U_dev_exp,
+                                                            ld_U,
+                                                            workspace.data(),
+                                                            n_work,
+                                                            devInfo.data(),
+                                                            stream));
   }
 
   // Initialize a mix of G and U matrices to test updateCholesky
diff --git a/cpp/test/sg/linear_svm_test.cu b/cpp/test/sg/linear_svm_test.cu
index 48a380f574..9da7926c71 100644
--- a/cpp/test/sg/linear_svm_test.cu
+++ b/cpp/test/sg/linear_svm_test.cu
@@ -18,10 +18,10 @@
 #include <cuml/datasets/make_regression.hpp>
 #include <cuml/svm/linear.hpp>
 #include <gtest/gtest.h>
-#include <raft/linalg/map_then_reduce.cuh>
-#include <raft/linalg/reduce.cuh>
-#include <raft/linalg/transpose.h>
-#include <raft/linalg/unary_op.cuh>
+#include <raft/linalg/map_then_reduce.hpp>
+#include <raft/linalg/reduce.hpp>
+#include <raft/linalg/transpose.hpp>
+#include <raft/linalg/unary_op.hpp>
 #include <raft/random/rng.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
diff --git a/cpp/test/sg/linkage_test.cu b/cpp/test/sg/linkage_test.cu
index ff360a1c47..1a9bd7f7f9 100644
--- a/cpp/test/sg/linkage_test.cu
+++ b/cpp/test/sg/linkage_test.cu
@@ -24,8 +24,8 @@
 #include <hierarchy/pw_dist_graph.cuh>
 #include <raft/mr/device/allocator.hpp>
 
-#include <raft/linalg/distance_type.h>
-#include <raft/linalg/transpose.h>
+#include <raft/distance/distance_type.hpp>
+#include <raft/linalg/transpose.hpp>
 #include <raft/sparse/coo.hpp>
 
 #include <cuml/common/logger.hpp>
diff --git a/cpp/test/sg/pca_test.cu b/cpp/test/sg/pca_test.cu
index d164ab6d3a..09664b279c 100644
--- a/cpp/test/sg/pca_test.cu
+++ b/cpp/test/sg/pca_test.cu
@@ -19,7 +19,6 @@
 #include <pca/pca.cuh>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/cublas_wrappers.h>
 #include <raft/random/rng.hpp>
 #include <test_utils.h>
 #include <vector>
diff --git a/cpp/test/sg/quasi_newton.cu b/cpp/test/sg/quasi_newton.cu
index 37c5078449..8a39a3c81a 100644
--- a/cpp/test/sg/quasi_newton.cu
+++ b/cpp/test/sg/quasi_newton.cu
@@ -22,7 +22,7 @@
 #include <gtest/gtest.h>
 #include <raft/cudart_utils.h>
 #include <raft/handle.hpp>
-#include <raft/linalg/transpose.h>
+#include <raft/linalg/transpose.hpp>
 #include <test_utils.h>
 #include <vector>
 
diff --git a/cpp/test/sg/rf_test.cu b/cpp/test/sg/rf_test.cu
index 3a1a850bff..b70e3f3ff0 100644
--- a/cpp/test/sg/rf_test.cu
+++ b/cpp/test/sg/rf_test.cu
@@ -29,7 +29,7 @@
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/handle.hpp>
-#include <raft/linalg/transpose.h>
+#include <raft/linalg/transpose.hpp>
 
 #include <thrust/binary_search.h>
 #include <thrust/device_vector.h>
diff --git a/cpp/test/sg/rproj_test.cu b/cpp/test/sg/rproj_test.cu
index 4f7a294a79..967223f765 100644
--- a/cpp/test/sg/rproj_test.cu
+++ b/cpp/test/sg/rproj_test.cu
@@ -21,7 +21,7 @@
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/distance/distance.hpp>
-#include <raft/linalg/transpose.h>
+#include <raft/linalg/transpose.hpp>
 #include <random>
 #include <test_utils.h>
 #include <vector>
diff --git a/cpp/test/sg/sgd.cu b/cpp/test/sg/sgd.cu
index de62055f0e..4255f89242 100644
--- a/cpp/test/sg/sgd.cu
+++ b/cpp/test/sg/sgd.cu
@@ -16,7 +16,6 @@
 
 #include <gtest/gtest.h>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/cusolver_wrappers.h>
 #include <raft/matrix/matrix.hpp>
 #include <rmm/device_uvector.hpp>
 #include <solver/sgd.cuh>
diff --git a/cpp/test/sg/svc_test.cu b/cpp/test/sg/svc_test.cu
index 22e562c976..3b4ce98194 100644
--- a/cpp/test/sg/svc_test.cu
+++ b/cpp/test/sg/svc_test.cu
@@ -27,9 +27,9 @@
 #include <matrix/kernelmatrices.cuh>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
-#include <raft/linalg/binary_op.cuh>
-#include <raft/linalg/map_then_reduce.cuh>
-#include <raft/linalg/transpose.h>
+#include <raft/linalg/add.hpp>
+#include <raft/linalg/map_then_reduce.hpp>
+#include <raft/linalg/transpose.hpp>
 #include <raft/random/rng.hpp>
 #include <random/make_blobs.cuh>
 #include <rmm/device_uvector.hpp>
diff --git a/cpp/test/sg/tsne_test.cu b/cpp/test/sg/tsne_test.cu
index d460581ea9..35595b3f52 100644
--- a/cpp/test/sg/tsne_test.cu
+++ b/cpp/test/sg/tsne_test.cu
@@ -16,7 +16,7 @@
 
 #include <cuml/manifold/tsne.h>
 #include <cuml/metrics/metrics.hpp>
-#include <raft/linalg/map.cuh>
+#include <raft/linalg/map.hpp>
 
 #include <cuml/common/logger.hpp>
 #include <datasets/boston.h>
diff --git a/python/cuml/metrics/distance_type.pxd b/python/cuml/metrics/distance_type.pxd
index 4286ea1c9d..7f779c85f4 100644
--- a/python/cuml/metrics/distance_type.pxd
+++ b/python/cuml/metrics/distance_type.pxd
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
 # limitations under the License.
 #
 
-cdef extern from "raft/linalg/distance_type.h" namespace "raft::distance":
+cdef extern from "raft/distance/distance_type.hpp" namespace "raft::distance":
 
     ctypedef enum DistanceType:
         L2Expanded "raft::distance::DistanceType::L2Expanded"
diff --git a/python/cuml/metrics/trustworthiness.pyx b/python/cuml/metrics/trustworthiness.pyx
index d1d474cd40..cf7a5e16db 100644
--- a/python/cuml/metrics/trustworthiness.pyx
+++ b/python/cuml/metrics/trustworthiness.pyx
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2018-2021, NVIDIA CORPORATION.
+# Copyright (c) 2018-2022, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -28,7 +28,7 @@ from cuml.common.input_utils import input_to_cuml_array
 from cuml.raft.common.handle import Handle
 from cuml.raft.common.handle cimport handle_t
 
-cdef extern from "raft/linalg/distance_type.h" namespace "raft::distance":
+cdef extern from "raft/distance/distance_type.hpp" namespace "raft::distance":
 
     ctypedef int DistanceType
     ctypedef DistanceType euclidean "(raft::distance::DistanceType)5"