diff --git a/cpp/include/raft/core/device_mdarray.hpp b/cpp/include/raft/core/device_mdarray.hpp
index 1c17b5bcb9..693e50a506 100644
--- a/cpp/include/raft/core/device_mdarray.hpp
+++ b/cpp/include/raft/core/device_mdarray.hpp
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <cstdint>
 #include <raft/core/detail/device_mdarray.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/mdarray.hpp>
diff --git a/cpp/include/raft/core/device_mdspan.hpp b/cpp/include/raft/core/device_mdspan.hpp
index 7257b65f58..394ea228b4 100644
--- a/cpp/include/raft/core/device_mdspan.hpp
+++ b/cpp/include/raft/core/device_mdspan.hpp
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <cstdint>
 #include <raft/core/host_device_accessor.hpp>
 #include <raft/core/mdspan.hpp>
 
diff --git a/cpp/include/raft/core/host_mdarray.hpp b/cpp/include/raft/core/host_mdarray.hpp
index 6221ca59f0..20cb5c1446 100644
--- a/cpp/include/raft/core/host_mdarray.hpp
+++ b/cpp/include/raft/core/host_mdarray.hpp
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <cstdint>
 #include <raft/core/host_mdspan.hpp>
 
 #include <raft/core/detail/host_mdarray.hpp>
diff --git a/cpp/include/raft/core/host_mdspan.hpp b/cpp/include/raft/core/host_mdspan.hpp
index 961a7a7ccb..0b49ca9945 100644
--- a/cpp/include/raft/core/host_mdspan.hpp
+++ b/cpp/include/raft/core/host_mdspan.hpp
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <cstdint>
 #include <raft/core/mdspan.hpp>
 
 #include <raft/core/host_device_accessor.hpp>
diff --git a/cpp/include/raft/core/mdspan_types.hpp b/cpp/include/raft/core/mdspan_types.hpp
index bc2ba314a3..07c69f472c 100644
--- a/cpp/include/raft/core/mdspan_types.hpp
+++ b/cpp/include/raft/core/mdspan_types.hpp
@@ -47,7 +47,7 @@ using vector_extent = std::experimental::extents<IndexType, dynamic_extent>;
 template <typename IndexType>
 using matrix_extent = std::experimental::extents<IndexType, dynamic_extent, dynamic_extent>;
 
-template <typename IndexType = std::uint32_t>
+template <typename IndexType>
 using scalar_extent = std::experimental::extents<IndexType, 1>;
 
 /**
diff --git a/cpp/include/raft/linalg/matrix_vector.cuh b/cpp/include/raft/linalg/matrix_vector.cuh
new file mode 100644
index 0000000000..57bc0cf21f
--- /dev/null
+++ b/cpp/include/raft/linalg/matrix_vector.cuh
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/linalg/linalg_types.hpp>
+#include <raft/matrix/detail/math.cuh>
+#include <raft/util/input_validation.hpp>
+
+namespace raft::linalg {
+
+/**
+ * @brief multiply each row or column of matrix with vector, skipping zeros in vector
+ * @param [in] handle: raft handle for managing library resources
+ * @param[inout] data: input matrix, results are in-place
+ * @param[in] vec: input vector
+ * @param[in] apply whether the broadcast of vector needs to happen along
+ * the rows of the matrix or columns using enum class raft::linalg::Apply
+ */
+template <typename math_t, typename idx_t, typename layout_t>
+void binary_mult_skip_zero(const raft::handle_t& handle,
+                           raft::device_matrix_view<math_t, idx_t, layout_t> data,
+                           raft::device_vector_view<const math_t, idx_t> vec,
+                           Apply apply)
+{
+  bool row_major        = raft::is_row_major(data);
+  auto bcast_along_rows = apply == Apply::ALONG_ROWS;
+
+  idx_t vec_size = bcast_along_rows ? data.extent(1) : data.extent(0);
+
+  RAFT_EXPECTS(
+    vec.extent(0) == vec_size,
+    "If `bcast_along_rows==true`, vector size must equal number of columns in the matrix."
+    "If `bcast_along_rows==false`, vector size must equal number of rows in the matrix.");
+
+  matrix::detail::matrixVectorBinaryMultSkipZero(data.data_handle(),
+                                                 vec.data_handle(),
+                                                 data.extent(0),
+                                                 data.extent(1),
+                                                 row_major,
+                                                 bcast_along_rows,
+                                                 handle.get_stream());
+}
+
+/**
+ * @brief divide each row or column of matrix with vector
+ * @param[in] handle: raft handle for managing library resources
+ * @param[inout] data: input matrix, results are in-place
+ * @param[in] vec: input vector
+ * @param[in] apply whether the broadcast of vector needs to happen along
+ * the rows of the matrix or columns using enum class raft::linalg::Apply
+ */
+template <typename math_t, typename idx_t, typename layout_t>
+void binary_div(const raft::handle_t& handle,
+                raft::device_matrix_view<math_t, idx_t, layout_t> data,
+                raft::device_vector_view<const math_t, idx_t> vec,
+                Apply apply)
+{
+  bool row_major        = raft::is_row_major(data);
+  auto bcast_along_rows = apply == Apply::ALONG_ROWS;
+
+  idx_t vec_size = bcast_along_rows ? data.extent(1) : data.extent(0);
+
+  RAFT_EXPECTS(
+    vec.extent(0) == vec_size,
+    "If `bcast_along_rows==true`, vector size must equal number of columns in the matrix."
+    "If `bcast_along_rows==false`, vector size must equal number of rows in the matrix.");
+
+  matrix::detail::matrixVectorBinaryDiv(data.data_handle(),
+                                        vec.data_handle(),
+                                        data.extent(0),
+                                        data.extent(1),
+                                        row_major,
+                                        bcast_along_rows,
+                                        handle.get_stream());
+}
+
+/**
+ * @brief divide each row or column of matrix with vector, skipping zeros in vector
+ * @param[in] handle: raft handle for managing library resources
+ * @param[inout] data: input matrix, results are in-place
+ * @param[in] vec: input vector
+ * @param[in] apply whether the broadcast of vector needs to happen along
+ * the rows of the matrix or columns using enum class raft::linalg::Apply
+ * @param[in] return_zero: result is zero if true and vector value is below threshold, original
+ * value if false
+ */
+template <typename math_t, typename idx_t, typename layout_t>
+void binary_div_skip_zero(const raft::handle_t& handle,
+                          raft::device_matrix_view<math_t, idx_t, layout_t> data,
+                          raft::device_vector_view<const math_t, idx_t> vec,
+                          Apply apply,
+                          bool return_zero = false)
+{
+  bool row_major        = raft::is_row_major(data);
+  auto bcast_along_rows = apply == Apply::ALONG_ROWS;
+
+  idx_t vec_size = bcast_along_rows ? data.extent(1) : data.extent(0);
+
+  RAFT_EXPECTS(
+    vec.extent(0) == vec_size,
+    "If `bcast_along_rows==true`, vector size must equal number of columns in the matrix."
+    "If `bcast_along_rows==false`, vector size must equal number of rows in the matrix.");
+
+  matrix::detail::matrixVectorBinaryDivSkipZero(data.data_handle(),
+                                                vec.data_handle(),
+                                                data.extent(0),
+                                                data.extent(1),
+                                                row_major,
+                                                bcast_along_rows,
+                                                handle.get_stream(),
+                                                return_zero);
+}
+
+/**
+ * @brief add each row or column of matrix with vector
+ * @param[in] handle: raft handle for managing library resources
+ * @param[inout] data: input matrix, results are in-place
+ * @param[in] vec: input vector
+ * @param[in] apply whether the broadcast of vector needs to happen along
+ * the rows of the matrix or columns using enum class raft::linalg::Apply
+ */
+template <typename math_t, typename idx_t, typename layout_t>
+void binary_add(const raft::handle_t& handle,
+                raft::device_matrix_view<math_t, idx_t, layout_t> data,
+                raft::device_vector_view<const math_t, idx_t> vec,
+                Apply apply)
+{
+  bool row_major        = raft::is_row_major(data);
+  auto bcast_along_rows = apply == Apply::ALONG_ROWS;
+
+  idx_t vec_size = bcast_along_rows ? data.extent(1) : data.extent(0);
+
+  RAFT_EXPECTS(
+    vec.extent(0) == vec_size,
+    "If `bcast_along_rows==true`, vector size must equal number of columns in the matrix."
+    "If `bcast_along_rows==false`, vector size must equal number of rows in the matrix.");
+
+  matrix::detail::matrixVectorBinaryAdd(data.data_handle(),
+                                        vec.data_handle(),
+                                        data.extent(0),
+                                        data.extent(1),
+                                        row_major,
+                                        bcast_along_rows,
+                                        handle.get_stream());
+}
+
+/**
+ * @brief subtract each row or column of matrix with vector
+ * @param[in] handle: raft handle for managing library resources
+ * @param[inout] data: input matrix, results are in-place
+ * @param[in] vec: input vector
+ * @param[in] apply whether the broadcast of vector needs to happen along
+ * the rows of the matrix or columns using enum class raft::linalg::Apply
+ */
+template <typename math_t, typename idx_t, typename layout_t>
+void binary_sub(const raft::handle_t& handle,
+                raft::device_matrix_view<math_t, idx_t, layout_t> data,
+                raft::device_vector_view<const math_t, idx_t> vec,
+                Apply apply)
+{
+  bool row_major        = raft::is_row_major(data);
+  auto bcast_along_rows = apply == Apply::ALONG_ROWS;
+
+  idx_t vec_size = bcast_along_rows ? data.extent(1) : data.extent(0);
+
+  RAFT_EXPECTS(
+    vec.extent(0) == vec_size,
+    "If `bcast_along_rows==true`, vector size must equal number of columns in the matrix."
+    "If `bcast_along_rows==false`, vector size must equal number of rows in the matrix.");
+
+  matrix::detail::matrixVectorBinarySub(data.data_handle(),
+                                        vec.data_handle(),
+                                        data.extent(0),
+                                        data.extent(1),
+                                        row_major,
+                                        bcast_along_rows,
+                                        handle.get_stream());
+}
+}  // namespace raft::linalg
\ No newline at end of file
diff --git a/cpp/include/raft/matrix/argmax.cuh b/cpp/include/raft/matrix/argmax.cuh
new file mode 100644
index 0000000000..b3face1012
--- /dev/null
+++ b/cpp/include/raft/matrix/argmax.cuh
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/matrix/detail/math.cuh>
+
+namespace raft::matrix {
+
+/**
+ * @brief Argmax: find the row idx with maximum value for each column
+ * @param[in] handle: raft handle
+ * @param[in] in: input matrix of size (n_rows, n_cols)
+ * @param[out] out: output vector of size n_cols
+ */
+template <typename math_t, typename idx_t, typename matrix_idx_t>
+void argmax(const raft::handle_t& handle,
+            raft::device_matrix_view<const math_t, matrix_idx_t, row_major> in,
+            raft::device_vector_view<idx_t, matrix_idx_t> out)
+{
+  RAFT_EXPECTS(out.extent(0) == in.extent(0),
+               "Size of output vector must equal number of rows in input matrix.");
+  detail::argmax(
+    in.data_handle(), in.extent(0), in.extent(1), out.data_handle(), handle.get_stream());
+}
+}  // namespace raft::matrix
diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh
index c425aad79b..17a40be5d6 100644
--- a/cpp/include/raft/matrix/detail/matrix.cuh
+++ b/cpp/include/raft/matrix/detail/matrix.cuh
@@ -116,15 +116,14 @@ void rowReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
 
   thrust::for_each(
     rmm::exec_policy(stream), counting, counting + (size / 2), [=] __device__(idx_t idx) {
-      idx_t dest_row = idx % m;
-      idx_t dest_col = idx / m;
+      idx_t dest_row = idx % (m / 2);
+      idx_t dest_col = idx / (m / 2);
       idx_t src_row  = (m - dest_row) - 1;
-      ;
-      idx_t src_col = dest_col;
+      idx_t src_col  = dest_col;
 
-      m_t temp                   = (m_t)d_q_reversed[idx];
-      d_q_reversed[idx]          = d_q[src_col * m + src_row];
-      d_q[src_col * m + src_row] = temp;
+      m_t temp                              = (m_t)d_q_reversed[dest_col * m + dest_row];
+      d_q_reversed[dest_col * m + dest_row] = d_q[src_col * m + src_row];
+      d_q[src_col * m + src_row]            = temp;
     });
 }
 
@@ -170,7 +169,7 @@ void printHost(const m_t* in, idx_t n_rows, idx_t n_cols)
  */
 template <typename m_t, typename idx_t = int>
 __global__ void slice(
-  m_t* src_d, idx_t m, idx_t n, m_t* dst_d, idx_t x1, idx_t y1, idx_t x2, idx_t y2)
+  const m_t* src_d, idx_t m, idx_t n, m_t* dst_d, idx_t x1, idx_t y1, idx_t x2, idx_t y2)
 {
   idx_t idx = threadIdx.x + blockDim.x * blockIdx.x;
   idx_t dm = x2 - x1, dn = y2 - y1;
@@ -182,7 +181,7 @@ __global__ void slice(
 }
 
 template <typename m_t, typename idx_t = int>
-void sliceMatrix(m_t* in,
+void sliceMatrix(const m_t* in,
                  idx_t n_rows,
                  idx_t n_cols,
                  m_t* out,
@@ -207,7 +206,7 @@ void sliceMatrix(m_t* in,
  * @param k: min(n_rows, n_cols)
  */
 template <typename m_t, typename idx_t = int>
-__global__ void getUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, idx_t k)
+__global__ void getUpperTriangular(const m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, idx_t k)
 {
   idx_t idx = threadIdx.x + blockDim.x * blockIdx.x;
   idx_t m = n_rows, n = n_cols;
@@ -218,7 +217,7 @@ __global__ void getUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_col
 }
 
 template <typename m_t, typename idx_t = int>
-void copyUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
+void copyUpperTriangular(const m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
 {
   idx_t m = n_rows, n = n_cols;
   idx_t k = std::min(m, n);
@@ -236,16 +235,32 @@ void copyUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, cudaStr
  * @param k: dimensionality
  */
 template <typename m_t, typename idx_t = int>
-__global__ void copyVectorToMatrixDiagonal(m_t* vec, m_t* matrix, idx_t m, idx_t n, idx_t k)
+__global__ void copyVectorToMatrixDiagonal(const m_t* vec, m_t* matrix, idx_t m, idx_t n, idx_t k)
 {
   idx_t idx = threadIdx.x + blockDim.x * blockIdx.x;
 
   if (idx < k) { matrix[idx + idx * m] = vec[idx]; }
 }
 
+/**
+ * @brief Copy matrix diagonal to vector
+ * @param vec: vector of length k = min(n_rows, n_cols)
+ * @param matrix: matrix of size n_rows x n_cols
+ * @param m: number of rows of the matrix
+ * @param n: number of columns of the matrix
+ * @param k: dimensionality
+ */
+template <typename m_t, typename idx_t = int>
+__global__ void copyVectorFromMatrixDiagonal(m_t* vec, const m_t* matrix, idx_t m, idx_t n, idx_t k)
+{
+  idx_t idx = threadIdx.x + blockDim.x * blockIdx.x;
+
+  if (idx < k) { vec[idx] = matrix[idx + idx * m]; }
+}
+
 template <typename m_t, typename idx_t = int>
 void initializeDiagonalMatrix(
-  m_t* vec, m_t* matrix, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
+  const m_t* vec, m_t* matrix, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
 {
   idx_t k = std::min(n_rows, n_cols);
   dim3 block(64);
@@ -253,6 +268,15 @@ void initializeDiagonalMatrix(
   copyVectorToMatrixDiagonal<<<grid, block, 0, stream>>>(vec, matrix, n_rows, n_cols, k);
 }
 
+template <typename m_t, typename idx_t = int>
+void getDiagonalMatrix(m_t* vec, const m_t* matrix, idx_t n_rows, idx_t n_cols, cudaStream_t stream)
+{
+  idx_t k = std::min(n_rows, n_cols);
+  dim3 block(64);
+  dim3 grid((k + block.x - 1) / block.x);
+  copyVectorFromMatrixDiagonal<<<grid, block, 0, stream>>>(vec, matrix, n_rows, n_cols, k);
+}
+
 /**
  * @brief Calculate the inverse of the diagonal of a square matrix
  * element-wise and in place
@@ -275,11 +299,15 @@ void getDiagonalInverseMatrix(m_t* in, idx_t len, cudaStream_t stream)
 }
 
 template <typename m_t, typename idx_t = int>
-m_t getL2Norm(const raft::handle_t& handle, m_t* in, idx_t size, cudaStream_t stream)
+m_t getL2Norm(const raft::handle_t& handle, const m_t* in, idx_t size, cudaStream_t stream)
 {
   cublasHandle_t cublasH = handle.get_cublas_handle();
   m_t normval            = 0;
-  RAFT_CUBLAS_TRY(raft::linalg::detail::cublasnrm2(cublasH, size, in, 1, &normval, stream));
+  RAFT_EXPECTS(
+    std::is_integral_v<idx_t> && (std::size_t)size <= (std::size_t)std::numeric_limits<int>::max(),
+    "Index type not supported");
+  RAFT_CUBLAS_TRY(
+    raft::linalg::detail::cublasnrm2(cublasH, static_cast<int>(size), in, 1, &normval, stream));
   return normval;
 }
 
diff --git a/cpp/include/raft/matrix/diagonal.cuh b/cpp/include/raft/matrix/diagonal.cuh
new file mode 100644
index 0000000000..d83c932fcd
--- /dev/null
+++ b/cpp/include/raft/matrix/diagonal.cuh
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/matrix/detail/matrix.cuh>
+#include <raft/matrix/matrix.cuh>
+
+namespace raft::matrix {
+
+/**
+ * @brief Initialize a diagonal matrix with a vector
+ * @param[in] handle: raft handle
+ * @param[in] vec: vector of length k = min(n_rows, n_cols)
+ * @param[out] matrix: matrix of size n_rows x n_cols
+ */
+template <typename m_t, typename idx_t, typename layout>
+void set_diagonal(const raft::handle_t& handle,
+                  raft::device_vector_view<const m_t, idx_t> vec,
+                  raft::device_matrix_view<m_t, idx_t, layout> matrix)
+{
+  RAFT_EXPECTS(vec.extent(0) == std::min(matrix.extent(0), matrix.extent(1)),
+               "Diagonal vector must be min(matrix.n_rows, matrix.n_cols)");
+
+  detail::initializeDiagonalMatrix(vec.data_handle(),
+                                   matrix.data_handle(),
+                                   matrix.extent(0),
+                                   matrix.extent(1),
+                                   handle.get_stream());
+}
+
+/**
+ * @brief Initialize a diagonal matrix with a vector
+ * @param handle: raft handle
+ * @param[in] matrix: matrix of size n_rows x n_cols
+ * @param[out] vec: vector of length k = min(n_rows, n_cols)
+ */
+template <typename m_t, typename idx_t, typename layout>
+void get_diagonal(const raft::handle_t& handle,
+                  raft::device_matrix_view<const m_t, idx_t, layout> matrix,
+                  raft::device_vector_view<m_t, idx_t> vec)
+{
+  RAFT_EXPECTS(vec.extent(0) == std::min(matrix.extent(0), matrix.extent(1)),
+               "Diagonal vector must be min(matrix.n_rows, matrix.n_cols)");
+  detail::getDiagonalMatrix(vec.data_handle(),
+                            matrix.data_handle(),
+                            matrix.extent(0),
+                            matrix.extent(1),
+                            handle.get_stream());
+}
+
+/**
+ * @brief Take reciprocal of elements on diagonal of square matrix (in-place)
+ * @param handle raft handle
+ * @param[inout] inout: square input matrix with size len x len
+ */
+template <typename m_t, typename idx_t, typename layout>
+void invert_diagonal(const raft::handle_t& handle,
+                     raft::device_matrix_view<m_t, idx_t, layout> inout)
+{
+  // TODO: Use get_diagonal for this to support rectangular
+  RAFT_EXPECTS(inout.extent(0) == inout.extent(1), "Matrix must be square.");
+  detail::getDiagonalInverseMatrix(inout.data_handle(), inout.extent(0), handle.get_stream());
+}
+}  // namespace raft::matrix
\ No newline at end of file
diff --git a/cpp/include/raft/matrix/gather.cuh b/cpp/include/raft/matrix/gather.cuh
index fa6e73de49..12b0b94fa5 100644
--- a/cpp/include/raft/matrix/gather.cuh
+++ b/cpp/include/raft/matrix/gather.cuh
@@ -221,8 +221,8 @@ template <typename matrix_t,
 void gather_if(const raft::handle_t& handle,
                raft::device_matrix_view<const matrix_t, idx_t, row_major> in,
                raft::device_matrix_view<matrix_t, idx_t, row_major> out,
-               raft::device_vector_view<const map_t> map,
-               raft::device_vector_view<const stencil_t> stencil,
+               raft::device_vector_view<const map_t, idx_t> map,
+               raft::device_vector_view<const stencil_t, idx_t> stencil,
                unary_pred_t pred_op)
 {
   RAFT_EXPECTS(out.extent(0) == map.extent(0),
diff --git a/cpp/include/raft/matrix/init.cuh b/cpp/include/raft/matrix/init.cuh
index e3a6c09fe6..caee2555a9 100644
--- a/cpp/include/raft/matrix/init.cuh
+++ b/cpp/include/raft/matrix/init.cuh
@@ -18,28 +18,49 @@
 
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/host_mdspan.hpp>
-#include <raft/matrix/detail/matrix.cuh>
+#include <raft/matrix/detail/math.cuh>
 #include <raft/matrix/matrix.cuh>
 
 namespace raft::matrix {
 /**
  * @brief set values to scalar in matrix
  * @tparam math_t data-type upon which the math operation will be performed
- * @tparam idx_t integer type used for indexing
+ * @tparam extents dimension and indexing type used for the input
  * @tparam layout layout of the matrix data (must be row or col major)
  * @param[in] handle: raft handle
  * @param[in] in input matrix
  * @param[out] out output matrix. The result is stored in the out matrix
  * @param[in] scalar scalar value to fill matrix elements
  */
-template <typename math_t, typename idx_t, typename layout>
+template <typename math_t, typename extents, typename layout>
 void fill(const raft::handle_t& handle,
-          raft::device_matrix_view<const math_t, idx_t, layout> in,
-          raft::device_matrix_view<math_t, idx_t, layout> out,
+          raft::device_mdspan<const math_t, extents, layout> in,
+          raft::device_mdspan<math_t, extents, layout> out,
           raft::host_scalar_view<math_t> scalar)
 {
+  RAFT_EXPECTS(raft::is_row_or_column_major(out), "Data layout not supported");
   RAFT_EXPECTS(in.size() == out.size(), "Input and output matrices must be the same size.");
+  RAFT_EXPECTS(scalar.data_handle() != nullptr, "Empty scalar");
   detail::setValue(
     out.data_handle(), in.data_handle(), *(scalar.data_handle()), in.size(), handle.get_stream());
 }
+
+/**
+ * @brief set values to scalar in matrix
+ * @tparam math_t data-type upon which the math operation will be performed
+ * @tparam extents dimension and indexing type used for the input
+ * @tparam layout_t layout of the matrix data (must be row or col major)
+ * @param[in] handle: raft handle
+ * @param[inout] inout input matrix
+ * @param[in] scalar scalar value to fill matrix elements
+ */
+template <typename math_t, typename extents, typename layout>
+void fill(const raft::handle_t& handle,
+          raft::device_mdspan<math_t, extents, layout> inout,
+          math_t scalar)
+{
+  RAFT_EXPECTS(raft::is_row_or_column_major(inout), "Data layout not supported");
+  detail::setValue(
+    inout.data_handle(), inout.data_handle(), scalar, inout.size(), handle.get_stream());
+}
 }  // namespace raft::matrix
diff --git a/cpp/include/raft/matrix/norm.cuh b/cpp/include/raft/matrix/norm.cuh
new file mode 100644
index 0000000000..deb3657905
--- /dev/null
+++ b/cpp/include/raft/matrix/norm.cuh
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/matrix/detail/matrix.cuh>
+
+namespace raft::matrix {
+
+/**
+ * @brief Get the L2/F-norm of a matrix
+ * @param[in] handle: raft handle
+ * @param[in] in: input matrix/vector with totally size elements
+ * @returns matrix l2 norm
+ */
+template <typename m_t, typename idx_t>
+m_t l2_norm(const raft::handle_t& handle, raft::device_mdspan<const m_t, idx_t> in)
+{
+  return detail::getL2Norm(handle, in.data_handle(), in.size(), handle.get_stream());
+}
+}  // namespace raft::matrix
\ No newline at end of file
diff --git a/cpp/include/raft/matrix/reverse.cuh b/cpp/include/raft/matrix/reverse.cuh
new file mode 100644
index 0000000000..e00a240577
--- /dev/null
+++ b/cpp/include/raft/matrix/reverse.cuh
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/matrix/detail/matrix.cuh>
+#include <raft/util/input_validation.hpp>
+
+namespace raft::matrix {
+
+/**
+ * @brief Reverse the columns of a matrix in place (i.e. first column and
+ * last column are swapped)
+ * @param[in] handle: raft handle
+ * @param[inout] inout: input and output matrix
+ */
+template <typename m_t, typename idx_t, typename layout_t>
+void col_reverse(const raft::handle_t& handle, raft::device_matrix_view<m_t, idx_t, layout_t> inout)
+{
+  RAFT_EXPECTS(raft::is_row_or_column_major(inout), "Unsupported matrix layout");
+  if (raft::is_col_major(inout)) {
+    detail::colReverse(inout.data_handle(), inout.extent(0), inout.extent(1), handle.get_stream());
+  } else {
+    detail::rowReverse(inout.data_handle(), inout.extent(1), inout.extent(0), handle.get_stream());
+  }
+}
+
+/**
+ * @brief Reverse the rows of a matrix in place (i.e. first row and last
+ * row are swapped)
+ * @param[in] handle: raft handle
+ * @param[inout] inout: input and output matrix
+ */
+template <typename m_t, typename idx_t, typename layout_t>
+void row_reverse(const raft::handle_t& handle, raft::device_matrix_view<m_t, idx_t, layout_t> inout)
+{
+  RAFT_EXPECTS(raft::is_row_or_column_major(inout), "Unsupported matrix layout");
+  if (raft::is_col_major(inout)) {
+    detail::rowReverse(inout.data_handle(), inout.extent(0), inout.extent(1), handle.get_stream());
+  } else {
+    detail::colReverse(inout.data_handle(), inout.extent(1), inout.extent(0), handle.get_stream());
+  }
+}
+}  // namespace raft::matrix
\ No newline at end of file
diff --git a/cpp/include/raft/matrix/slice.cuh b/cpp/include/raft/matrix/slice.cuh
new file mode 100644
index 0000000000..eda2853c78
--- /dev/null
+++ b/cpp/include/raft/matrix/slice.cuh
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/matrix/detail/matrix.cuh>
+
+namespace raft::matrix {
+
+template <typename idx_t>
+struct slice_coordinates {
+  idx_t row1;  ///< row coordinate of the top-left point of the wanted area (0-based)
+  idx_t col1;  ///< column coordinate of the top-left point of the wanted area (0-based)
+  idx_t row2;  ///< row coordinate of the bottom-right point of the wanted area (1-based)
+  idx_t col2;  ///< column coordinate of the bottom-right point of the wanted area (1-based)
+
+  slice_coordinates(idx_t row1_, idx_t col1_, idx_t row2_, idx_t col2_)
+    : row1(row1_), col1(col1_), row2(row2_), col2(col2_)
+  {
+  }
+};
+
+/**
+ * @brief Slice a matrix (in-place)
+ * @tparam m_t type of matrix elements
+ * @tparam idx_t integer type used for indexing
+ * @param[in] handle: raft handle
+ * @param[in] in: input matrix (column-major)
+ * @param[out] out: output matrix (column-major)
+ * @param[in] coords: coordinates of the wanted slice
+ * example: Slice the 2nd and 3rd columns of a 4x3 matrix: slice(handle, in, out, {0, 1, 4, 3});
+ */
+template <typename m_t, typename idx_t>
+void slice(const raft::handle_t& handle,
+           raft::device_matrix_view<const m_t, idx_t, col_major> in,
+           raft::device_matrix_view<m_t, idx_t, col_major> out,
+           slice_coordinates<idx_t> coords)
+{
+  RAFT_EXPECTS(coords.row2 > coords.row1, "row2 must be > row1");
+  RAFT_EXPECTS(coords.col2 > coords.col1, "col2 must be > col1");
+  RAFT_EXPECTS(coords.row1 >= 0, "row1 must be >= 0");
+  RAFT_EXPECTS(coords.row2 <= in.extent(0), "row2 must be <= number of rows in the input matrix");
+  RAFT_EXPECTS(coords.col1 >= 0, "col1 must be >= 0");
+  RAFT_EXPECTS(coords.col2 <= in.extent(1),
+               "col2 must be <= number of columns in the input matrix");
+
+  detail::sliceMatrix(in.data_handle(),
+                      in.extent(0),
+                      in.extent(1),
+                      out.data_handle(),
+                      coords.row1,
+                      coords.col1,
+                      coords.row2,
+                      coords.col2,
+                      handle.get_stream());
+}
+}  // namespace raft::matrix
\ No newline at end of file
diff --git a/cpp/include/raft/matrix/triangular.cuh b/cpp/include/raft/matrix/triangular.cuh
new file mode 100644
index 0000000000..fad3dd77af
--- /dev/null
+++ b/cpp/include/raft/matrix/triangular.cuh
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/matrix/detail/matrix.cuh>
+
+namespace raft::matrix {
+
+/**
+ * @brief Copy the upper triangular part of a matrix to another
+ * @param[in] handle: raft handle
+ * @param[in] src: input matrix with a size of n_rows x n_cols
+ * @param[out] dst: output matrix with a size of kxk, k = min(n_rows, n_cols)
+ */
+template <typename m_t, typename idx_t>
+void upper_triangular(const raft::handle_t& handle,
+                      raft::device_matrix_view<const m_t, idx_t, col_major> src,
+                      raft::device_matrix_view<m_t, idx_t, col_major> dst)
+{
+  auto k = std::min(src.extent(0), src.extent(1));
+  RAFT_EXPECTS(k == dst.extent(0) && k == dst.extent(1),
+               "dst should be of size kxk, k = min(n_rows, n_cols)");
+  detail::copyUpperTriangular(
+    src.data_handle(), dst.data_handle(), src.extent(0), src.extent(1), handle.get_stream());
+}
+}  // namespace raft::matrix
\ No newline at end of file
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 3c5d651b56..e03e08d8b9 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -140,6 +140,7 @@ if(BUILD_TESTS)
             test/linalg/gemv.cu
             test/linalg/map.cu
             test/linalg/map_then_reduce.cu
+            test/linalg/matrix_vector.cu
             test/linalg/matrix_vector_op.cu
             test/linalg/multiply.cu
             test/linalg/norm.cu
@@ -159,11 +160,17 @@ if(BUILD_TESTS)
 
     ConfigureTest(NAME MATRIX_TEST
             PATH
+            test/matrix/argmax.cu
+            test/matrix/columnSort.cu
+            test/matrix/diagonal.cu
             test/matrix/gather.cu
+            test/matrix/linewise_op.cu
             test/matrix/math.cu
             test/matrix/matrix.cu
-            test/matrix/columnSort.cu
-            test/matrix/linewise_op.cu
+            test/matrix/norm.cu
+            test/matrix/reverse.cu
+            test/matrix/slice.cu
+            test/matrix/triangular.cu
             test/spectral_matrix.cu
     )
 
diff --git a/cpp/test/linalg/matrix_vector.cu b/cpp/test/linalg/matrix_vector.cu
new file mode 100644
index 0000000000..9062f3be4d
--- /dev/null
+++ b/cpp/test/linalg/matrix_vector.cu
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.h"
+#include "matrix_vector_op.cuh"
+#include <gtest/gtest.h>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/linalg/matrix_vector.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+namespace raft {
+namespace linalg {
+
+template <typename T, typename IdxType = int>
+struct MatrixVectorInputs {
+  T tolerance;
+  IdxType rows, cols;
+  int operation_type;
+  bool row_major, bcast_along_rows;
+  unsigned long long int seed;
+};
+
+template <typename T, typename IdxType>
+::std::ostream& operator<<(::std::ostream& os, const MatrixVectorInputs<T, IdxType>& dims)
+{
+  return os;
+}
+
+// Or else, we get the following compilation error
+// for an extended __device__ lambda cannot have private or protected access
+// within its class
+template <typename T, typename IdxType>
+void matrix_vector_op_launch(const raft::handle_t& handle,
+                             T* in,
+                             const T* vec1,
+                             IdxType D,
+                             IdxType N,
+                             bool row_major,
+                             bool bcast_along_rows,
+                             int operation_type)
+{
+  auto in_row_major = raft::make_device_matrix_view<T, IdxType, raft::row_major>(in, N, D);
+  auto in_col_major = raft::make_device_matrix_view<T, IdxType, raft::col_major>(in, N, D);
+
+  auto apply     = bcast_along_rows ? Apply::ALONG_ROWS : Apply::ALONG_COLUMNS;
+  auto len       = bcast_along_rows ? D : N;
+  auto vec1_view = raft::make_device_vector_view<const T, IdxType>(vec1, len);
+
+  if (operation_type == 0) {
+    if (row_major) {
+      binary_mult_skip_zero(handle, in_row_major, vec1_view, apply);
+    } else {
+      binary_mult_skip_zero(handle, in_col_major, vec1_view, apply);
+    }
+  } else if (operation_type == 1) {
+    if (row_major) {
+      binary_div(handle, in_row_major, vec1_view, apply);
+    } else {
+      binary_div(handle, in_col_major, vec1_view, apply);
+    }
+  } else if (operation_type == 2) {
+    if (row_major) {
+      binary_div_skip_zero(handle, in_row_major, vec1_view, apply);
+    } else {
+      binary_div_skip_zero(handle, in_col_major, vec1_view, apply);
+    }
+  } else if (operation_type == 3) {
+    if (row_major) {
+      binary_add(handle, in_row_major, vec1_view, apply);
+    } else {
+      binary_add(handle, in_col_major, vec1_view, apply);
+    }
+  } else if (operation_type == 4) {
+    if (row_major) {
+      binary_sub(handle, in_row_major, vec1_view, apply);
+    } else {
+      binary_sub(handle, in_col_major, vec1_view, apply);
+    }
+  } else {
+    THROW("Unknown operation type '%d'!", (int)operation_type);
+  }
+}
+
+template <typename T, typename IdxType>
+void naive_matrix_vector_op_launch(const raft::handle_t& handle,
+                                   T* in,
+                                   const T* vec1,
+                                   IdxType D,
+                                   IdxType N,
+                                   bool row_major,
+                                   bool bcast_along_rows,
+                                   int operation_type)
+{
+  auto stream                       = handle.get_stream();
+  auto operation_bin_mult_skip_zero = [] __device__(T mat_element, T vec_element) {
+    if (vec_element != T(0)) {
+      return mat_element * vec_element;
+    } else {
+      return mat_element;
+    }
+  };
+  auto operation_div = [] __device__(T mat_element, T vec_element) {
+    return mat_element / vec_element;
+  };
+  auto operation_bin_div_skip_zero = [] __device__(T mat_element, T vec_element) {
+    if (raft::myAbs(vec_element) < T(1e-10))
+      return T(0);
+    else
+      return mat_element / vec_element;
+  };
+  auto operation_bin_add = [] __device__(T mat_element, T vec_element) {
+    return mat_element + vec_element;
+  };
+  auto operation_bin_sub = [] __device__(T mat_element, T vec_element) {
+    return mat_element - vec_element;
+  };
+
+  if (operation_type == 0) {
+    naiveMatVecOp(
+      in, vec1, D, N, row_major, bcast_along_rows, operation_bin_mult_skip_zero, stream);
+  } else if (operation_type == 1) {
+    naiveMatVecOp(in, vec1, D, N, row_major, bcast_along_rows, operation_div, stream);
+  } else if (operation_type == 2) {
+    naiveMatVecOp(in, vec1, D, N, row_major, bcast_along_rows, operation_bin_div_skip_zero, stream);
+  } else if (operation_type == 3) {
+    naiveMatVecOp(in, vec1, D, N, row_major, bcast_along_rows, operation_bin_add, stream);
+  } else if (operation_type == 4) {
+    naiveMatVecOp(in, vec1, D, N, row_major, bcast_along_rows, operation_bin_sub, stream);
+  } else {
+    THROW("Unknown operation type '%d'!", (int)operation_type);
+  }
+}
+
+template <typename T, typename IdxType>
+class MatrixVectorTest : public ::testing::TestWithParam<MatrixVectorInputs<T, IdxType>> {
+ public:
+  MatrixVectorTest()
+    : params(::testing::TestWithParam<MatrixVectorInputs<T, IdxType>>::GetParam()),
+      stream(handle.get_stream()),
+      in(params.rows * params.cols, stream),
+      out_ref(params.rows * params.cols, stream),
+      out(params.rows * params.cols, stream),
+      vec1(params.bcast_along_rows ? params.cols : params.rows, stream)
+  {
+  }
+
+ protected:
+  void SetUp() override
+  {
+    raft::random::RngState r(params.seed);
+    IdxType N = params.rows, D = params.cols;
+    IdxType len    = N * D;
+    IdxType vecLen = params.bcast_along_rows ? D : N;
+    uniform(handle, r, in.data(), len, (T)-1.0, (T)1.0);
+    uniform(handle, r, vec1.data(), vecLen, (T)-1.0, (T)1.0);
+    raft::copy(out_ref.data(), in.data(), len, handle.get_stream());
+    raft::copy(out.data(), in.data(), len, handle.get_stream());
+    naive_matrix_vector_op_launch(handle,
+                                  out_ref.data(),
+                                  vec1.data(),
+                                  D,
+                                  N,
+                                  params.row_major,
+                                  params.bcast_along_rows,
+                                  params.operation_type);
+    matrix_vector_op_launch(handle,
+                            out.data(),
+                            vec1.data(),
+                            D,
+                            N,
+                            params.row_major,
+                            params.bcast_along_rows,
+                            params.operation_type);
+    handle.sync_stream();
+  }
+
+ protected:
+  raft::handle_t handle;
+  cudaStream_t stream;
+
+  MatrixVectorInputs<T, IdxType> params;
+  rmm::device_uvector<T> in, out, out_ref, vec1;
+};
+
+const std::vector<MatrixVectorInputs<float, int>> inputsf_i32 = {
+  {0.00001f, 1024, 32, 0, true, true, 1234ULL},
+  {0.00001f, 1024, 64, 1, true, true, 1234ULL},
+  {0.00001f, 1024, 32, 2, true, false, 1234ULL},
+  {0.00001f, 1024, 64, 3, true, false, 1234ULL},
+  {0.00001f, 1024, 32, 4, false, true, 1234ULL},
+  {0.00001f, 1024, 64, 0, false, true, 1234ULL},
+  {0.00001f, 1024, 32, 1, false, false, 1234ULL},
+  {0.00001f, 1024, 64, 2, false, false, 1234ULL},
+
+  {0.00001f, 1024, 32, 3, true, true, 1234ULL},
+  {0.00001f, 1024, 64, 4, true, true, 1234ULL},
+  {0.00001f, 1024, 32, 0, true, false, 1234ULL},
+  {0.00001f, 1024, 64, 1, true, false, 1234ULL},
+  {0.00001f, 1024, 32, 2, false, true, 1234ULL},
+  {0.00001f, 1024, 64, 3, false, true, 1234ULL},
+  {0.00001f, 1024, 32, 4, false, false, 1234ULL},
+  {0.00001f, 1024, 64, 0, false, false, 1234ULL}};
+typedef MatrixVectorTest<float, int> MatrixVectorTestF_i32;
+TEST_P(MatrixVectorTestF_i32, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    out_ref.data(), out.data(), params.rows * params.cols, CompareApprox<float>(params.tolerance)));
+}
+INSTANTIATE_TEST_SUITE_P(MatrixVectorTests,
+                         MatrixVectorTestF_i32,
+                         ::testing::ValuesIn(inputsf_i32));
+
+const std::vector<MatrixVectorInputs<float, size_t>> inputsf_i64 = {
+  {0.00001f, 2500, 250, 0, false, false, 1234ULL}, {0.00001f, 2500, 250, 1, false, false, 1234ULL}};
+typedef MatrixVectorTest<float, size_t> MatrixVectorTestF_i64;
+TEST_P(MatrixVectorTestF_i64, Result)
+{
+  ASSERT_TRUE(devArrMatch(
+    out_ref.data(), out.data(), params.rows * params.cols, CompareApprox<float>(params.tolerance)));
+}
+INSTANTIATE_TEST_SUITE_P(MatrixVectorTests,
+                         MatrixVectorTestF_i64,
+                         ::testing::ValuesIn(inputsf_i64));
+
+const std::vector<MatrixVectorInputs<double, int>> inputsd_i32 = {
+  {0.0000001, 1024, 32, 0, true, true, 1234ULL},
+  {0.0000001, 1024, 64, 1, true, true, 1234ULL},
+  {0.0000001, 1024, 32, 2, true, false, 1234ULL},
+  {0.0000001, 1024, 64, 3, true, false, 1234ULL},
+  {0.0000001, 1024, 32, 4, false, true, 1234ULL},
+  {0.0000001, 1024, 64, 0, false, true, 1234ULL},
+  {0.0000001, 1024, 32, 1, false, false, 1234ULL},
+  {0.0000001, 1024, 64, 2, false, false, 1234ULL},
+
+  {0.0000001, 1024, 32, 3, true, true, 1234ULL},
+  {0.0000001, 1024, 64, 4, true, true, 1234ULL},
+  {0.0000001, 1024, 32, 0, true, false, 1234ULL},
+  {0.0000001, 1024, 64, 1, true, false, 1234ULL},
+  {0.0000001, 1024, 32, 2, false, true, 1234ULL},
+  {0.0000001, 1024, 64, 3, false, true, 1234ULL},
+  {0.0000001, 1024, 32, 4, false, false, 1234ULL},
+  {0.0000001, 1024, 64, 0, false, false, 1234ULL}};
+typedef MatrixVectorTest<double, int> MatrixVectorTestD_i32;
+TEST_P(MatrixVectorTestD_i32, Result)
+{
+  ASSERT_TRUE(devArrMatch(out_ref.data(),
+                          out.data(),
+                          params.rows * params.cols,
+                          CompareApprox<double>(params.tolerance)));
+}
+INSTANTIATE_TEST_SUITE_P(MatrixVectorTests,
+                         MatrixVectorTestD_i32,
+                         ::testing::ValuesIn(inputsd_i32));
+
+const std::vector<MatrixVectorInputs<double, size_t>> inputsd_i64 = {
+  {0.0000001, 2500, 250, 0, false, false, 1234ULL},
+  {0.0000001, 2500, 250, 1, false, false, 1234ULL}};
+typedef MatrixVectorTest<double, size_t> MatrixVectorTestD_i64;
+TEST_P(MatrixVectorTestD_i64, Result)
+{
+  ASSERT_TRUE(devArrMatch(out_ref.data(),
+                          out.data(),
+                          params.rows * params.cols,
+                          CompareApprox<double>(params.tolerance)));
+}
+INSTANTIATE_TEST_SUITE_P(MatrixVectorTests,
+                         MatrixVectorTestD_i64,
+                         ::testing::ValuesIn(inputsd_i64));
+
+}  // end namespace linalg
+}  // end namespace raft
diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu
index 2023ce4121..b5a3168a06 100644
--- a/cpp/test/linalg/matrix_vector_op.cu
+++ b/cpp/test/linalg/matrix_vector_op.cu
@@ -17,6 +17,7 @@
 #include "../test_utils.h"
 #include "matrix_vector_op.cuh"
 #include <gtest/gtest.h>
+#include <raft/linalg/matrix_vector_op.cuh>
 #include <raft/random/rng.cuh>
 #include <raft/util/cudart_utils.hpp>
 
diff --git a/cpp/test/linalg/matrix_vector_op.cuh b/cpp/test/linalg/matrix_vector_op.cuh
index f46d70eaa3..934c2f3e0d 100644
--- a/cpp/test/linalg/matrix_vector_op.cuh
+++ b/cpp/test/linalg/matrix_vector_op.cuh
@@ -21,6 +21,48 @@
 namespace raft {
 namespace linalg {
 
+template <typename Type, typename IdxType, typename LambdaOp>
+__global__ void naiveMatVecOpKernel(Type* mat,
+                                    const Type* vec,
+                                    IdxType D,
+                                    IdxType N,
+                                    bool rowMajor,
+                                    bool bcastAlongRows,
+                                    LambdaOp operation)
+{
+  IdxType idx = threadIdx.x + blockIdx.x * blockDim.x;
+  IdxType len = N * D;
+  IdxType col;
+  if (rowMajor && bcastAlongRows) {
+    col = idx % D;
+  } else if (!rowMajor && !bcastAlongRows) {
+    col = idx % N;
+  } else if (rowMajor && !bcastAlongRows) {
+    col = idx / D;
+  } else {
+    col = idx / N;
+  }
+  if (idx < len) { mat[idx] = operation(mat[idx], vec[col]); }
+}
+
+template <typename Type, typename IdxType, typename LambdaOp>
+void naiveMatVecOp(Type* mat,
+                   const Type* vec,
+                   IdxType D,
+                   IdxType N,
+                   bool rowMajor,
+                   bool bcastAlongRows,
+                   LambdaOp operation,
+                   cudaStream_t stream)
+{
+  static const IdxType TPB = 64;
+  IdxType len              = N * D;
+  IdxType nblks            = raft::ceildiv(len, TPB);
+  naiveMatVecOpKernel<Type>
+    <<<nblks, TPB, 0, stream>>>(mat, vec, D, N, rowMajor, bcastAlongRows, operation);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
 template <typename Type, typename IdxType = int>
 __global__ void naiveMatVecKernel(Type* out,
                                   const Type* mat,
diff --git a/cpp/test/matrix/argmax.cu b/cpp/test/matrix/argmax.cu
new file mode 100644
index 0000000000..9568c06d93
--- /dev/null
+++ b/cpp/test/matrix/argmax.cu
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.h"
+#include <cstdint>
+#include <gtest/gtest.h>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/matrix/argmax.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+namespace raft {
+namespace matrix {
+
+template <typename T, typename IdxT>
+struct ArgMaxInputs {
+  std::vector<T> input_matrix;
+  std::vector<IdxT> output_matrix;
+  std::size_t n_cols;
+  std::size_t n_rows;
+};
+
+template <typename T, typename IdxT>
+::std::ostream& operator<<(::std::ostream& os, const ArgMaxInputs<T, IdxT>& dims)
+{
+  return os;
+}
+
+template <typename T, typename IdxT>
+class ArgMaxTest : public ::testing::TestWithParam<ArgMaxInputs<T, IdxT>> {
+ public:
+  ArgMaxTest()
+    : params(::testing::TestWithParam<ArgMaxInputs<T, IdxT>>::GetParam()),
+      input(raft::make_device_matrix<T, std::uint32_t, row_major>(
+        handle, params.n_rows, params.n_cols)),
+      output(raft::make_device_vector<IdxT, std::uint32_t>(handle, params.n_rows)),
+      expected(raft::make_device_vector<IdxT, std::uint32_t>(handle, params.n_rows))
+  {
+    raft::update_device(input.data_handle(),
+                        params.input_matrix.data(),
+                        params.input_matrix.size(),
+                        handle.get_stream());
+    raft::update_device(expected.data_handle(),
+                        params.output_matrix.data(),
+                        params.output_matrix.size(),
+                        handle.get_stream());
+
+    auto input_const_view = raft::make_device_matrix_view<const T, std::uint32_t, row_major>(
+      input.data_handle(), input.extent(0), input.extent(1));
+
+    raft::matrix::argmax(handle, input_const_view, output.view());
+
+    handle.sync_stream();
+  }
+
+ protected:
+  raft::handle_t handle;
+  ArgMaxInputs<T, IdxT> params;
+
+  raft::device_matrix<T, std::uint32_t, row_major> input;
+  raft::device_vector<IdxT, std::uint32_t> output;
+  raft::device_vector<IdxT, std::uint32_t> expected;
+};
+
+const std::vector<ArgMaxInputs<float, int>> inputsf = {
+  {{0.1f, 0.2f, 0.3f, 0.4f, 0.4f, 0.3f, 0.2f, 0.1f, 0.2f, 0.3f, 0.5f, 0.0f}, {3, 0, 2}, 3, 4}};
+
+const std::vector<ArgMaxInputs<double, int>> inputsd = {
+  {{0.1, 0.2, 0.3, 0.4, 0.4, 0.3, 0.2, 0.1, 0.2, 0.3, 0.5, 0.0}, {3, 0, 2}, 3, 4}};
+
+typedef ArgMaxTest<float, int> ArgMaxTestF;
+TEST_P(ArgMaxTestF, Result)
+{
+  ASSERT_TRUE(devArrMatch(expected.data_handle(),
+                          output.data_handle(),
+                          params.n_rows,
+                          Compare<int>(),
+                          handle.get_stream()));
+}
+
+typedef ArgMaxTest<double, int> ArgMaxTestD;
+TEST_P(ArgMaxTestD, Result)
+{
+  ASSERT_TRUE(devArrMatch(expected.data_handle(),
+                          output.data_handle(),
+                          params.n_rows,
+                          Compare<int>(),
+                          handle.get_stream()));
+}
+
+INSTANTIATE_TEST_SUITE_P(ArgMaxTest, ArgMaxTestF, ::testing::ValuesIn(inputsf));
+
+INSTANTIATE_TEST_SUITE_P(ArgMaxTest, ArgMaxTestD, ::testing::ValuesIn(inputsd));
+
+}  // namespace matrix
+}  // namespace raft
\ No newline at end of file
diff --git a/cpp/test/matrix/diagonal.cu b/cpp/test/matrix/diagonal.cu
new file mode 100644
index 0000000000..e1ad9e144b
--- /dev/null
+++ b/cpp/test/matrix/diagonal.cu
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.h"
+#include <gtest/gtest.h>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/matrix/diagonal.cuh>
+#include <raft/matrix/init.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+namespace raft {
+namespace matrix {
+
+template <typename T>
+struct DiagonalInputs {
+  int n_rows;
+  int n_cols;
+};
+
+template <typename T>
+::std::ostream& operator<<(::std::ostream& os, const DiagonalInputs<T>& dims)
+{
+  return os;
+}
+
+template <typename T>
+class DiagonalTest : public ::testing::TestWithParam<DiagonalInputs<T>> {
+ public:
+  DiagonalTest()
+    : params(::testing::TestWithParam<DiagonalInputs<T>>::GetParam()),
+      input(raft::make_device_matrix<T, std::uint32_t>(handle, params.n_rows, params.n_cols)),
+      diag_expected(raft::make_device_vector<T, std::uint32_t>(handle, diag_size)),
+      diag_actual(raft::make_device_vector<T, std::uint32_t>(handle, diag_size)),
+      diag_size(std::min(params.n_rows, params.n_cols))
+  {
+    T mat_fill_scalar  = 1.0;
+    T diag_fill_scalar = 5.0;
+
+    auto input_view = raft::make_device_matrix_view<const T, std::uint32_t>(
+      input.data_handle(), input.extent(0), input.extent(1));
+    auto diag_expected_view =
+      raft::make_device_vector_view<const T, std::uint32_t>(diag_expected.data_handle(), diag_size);
+
+    raft::matrix::fill(
+      handle, input_view, input.view(), raft::make_host_scalar_view<T>(&mat_fill_scalar));
+    raft::matrix::fill(handle,
+                       diag_expected_view,
+                       diag_expected.view(),
+                       raft::make_host_scalar_view<T>(&diag_fill_scalar));
+
+    handle.sync_stream();
+
+    raft::matrix::set_diagonal(handle, diag_expected_view, input.view());
+
+    handle.sync_stream();
+
+    raft::matrix::get_diagonal(handle, input_view, diag_actual.view());
+
+    handle.sync_stream();
+  }
+
+ protected:
+  raft::handle_t handle;
+  DiagonalInputs<T> params;
+
+  int diag_size;
+
+  raft::device_matrix<T, std::uint32_t> input;
+  raft::device_vector<T, std::uint32_t> diag_expected;
+  raft::device_vector<T, std::uint32_t> diag_actual;
+};
+
+const std::vector<DiagonalInputs<float>> inputsf = {{4, 4}, {4, 10}, {10, 4}};
+
+const std::vector<DiagonalInputs<double>> inputsd = {{4, 4}, {4, 10}, {10, 4}};
+
+typedef DiagonalTest<float> DiagonalTestF;
+TEST_P(DiagonalTestF, Result)
+{
+  ASSERT_TRUE(devArrMatch(diag_expected.data_handle(),
+                          diag_actual.data_handle(),
+                          diag_size,
+                          Compare<float>(),
+                          handle.get_stream()));
+}
+
+typedef DiagonalTest<double> DiagonalTestD;
+TEST_P(DiagonalTestD, Result)
+{
+  ASSERT_TRUE(devArrMatch(diag_expected.data_handle(),
+                          diag_actual.data_handle(),
+                          diag_size,
+                          Compare<double>(),
+                          handle.get_stream()));
+}
+
+INSTANTIATE_TEST_SUITE_P(DiagonalTest, DiagonalTestF, ::testing::ValuesIn(inputsf));
+
+INSTANTIATE_TEST_SUITE_P(DiagonalTest, DiagonalTestD, ::testing::ValuesIn(inputsd));
+
+}  // namespace matrix
+}  // namespace raft
\ No newline at end of file
diff --git a/cpp/test/matrix/math.cu b/cpp/test/matrix/math.cu
index ad4a37825c..684b550dfc 100644
--- a/cpp/test/matrix/math.cu
+++ b/cpp/test/matrix/math.cu
@@ -32,7 +32,7 @@ namespace raft {
 namespace matrix {
 
 template <typename Type>
-__global__ void nativePowerKernel(Type* in, Type* out, int len)
+__global__ void naivePowerKernel(Type* in, Type* out, int len)
 {
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < len) { out[idx] = in[idx] * in[idx]; }
@@ -43,12 +43,12 @@ void naivePower(Type* in, Type* out, int len, cudaStream_t stream)
 {
   static const int TPB = 64;
   int nblks            = raft::ceildiv(len, TPB);
-  nativePowerKernel<Type><<<nblks, TPB, 0, stream>>>(in, out, len);
+  naivePowerKernel<Type><<<nblks, TPB, 0, stream>>>(in, out, len);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
 template <typename Type>
-__global__ void nativeSqrtKernel(Type* in, Type* out, int len)
+__global__ void naiveSqrtKernel(Type* in, Type* out, int len)
 {
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < len) { out[idx] = std::sqrt(in[idx]); }
@@ -59,7 +59,7 @@ void naiveSqrt(Type* in, Type* out, int len, cudaStream_t stream)
 {
   static const int TPB = 64;
   int nblks            = raft::ceildiv(len, TPB);
-  nativeSqrtKernel<Type><<<nblks, TPB, 0, stream>>>(in, out, len);
+  naiveSqrtKernel<Type><<<nblks, TPB, 0, stream>>>(in, out, len);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
diff --git a/cpp/test/matrix/norm.cu b/cpp/test/matrix/norm.cu
new file mode 100644
index 0000000000..38fdd409eb
--- /dev/null
+++ b/cpp/test/matrix/norm.cu
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.h"
+#include <gtest/gtest.h>
+#include <raft/matrix/norm.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/util/cudart_utils.hpp>
+#include <rmm/device_scalar.hpp>
+
+namespace raft {
+namespace matrix {
+
+template <typename T>
+struct NormInputs {
+  T tolerance;
+  int rows, cols;
+  unsigned long long int seed;
+};
+
+template <typename T>
+::std::ostream& operator<<(::std::ostream& os, const NormInputs<T>& I)
+{
+  os << "{ " << I.tolerance << ", " << I.rows << ", " << I.cols << ", " << I.seed << '}'
+     << std::endl;
+  return os;
+}
+
+template <typename Type>
+Type naiveNorm(const Type* data, int D, int N)
+{
+  Type out_scalar = 0;
+  for (int i = 0; i < N * D; ++i) {
+    out_scalar += data[i] * data[i];
+  }
+  out_scalar = std::sqrt(out_scalar);
+  return out_scalar;
+}
+
+template <typename T>
+class NormTest : public ::testing::TestWithParam<NormInputs<T>> {
+ public:
+  NormTest()
+    : params(::testing::TestWithParam<NormInputs<T>>::GetParam()),
+      stream(handle.get_stream()),
+      data(params.rows * params.cols, stream)
+  {
+  }
+
+  void SetUp() override
+  {
+    raft::random::RngState r(params.seed);
+    int rows = params.rows, cols = params.cols, len = rows * cols;
+    uniform(handle, r, data.data(), len, T(-10.0), T(10.0));
+    std::vector<T> h_data(rows * cols);
+    raft::update_host(h_data.data(), data.data(), rows * cols, stream);
+    out_scalar_exp = naiveNorm(h_data.data(), cols, rows);
+    auto input = raft::make_device_matrix_view<const T, int>(data.data(), params.rows, params.cols);
+    out_scalar_act = l2_norm(handle, input);
+    handle.sync_stream(stream);
+  }
+
+ protected:
+  raft::handle_t handle;
+  cudaStream_t stream;
+
+  NormInputs<T> params;
+  rmm::device_uvector<T> data;
+  T out_scalar_exp = 0;
+  T out_scalar_act = 0;
+};
+
+///// Row- and column-wise tests
+const std::vector<NormInputs<float>> inputsf = {{0.00001f, 32, 1024, 1234ULL},
+                                                {0.00001f, 64, 1024, 1234ULL},
+                                                {0.00001f, 128, 1024, 1234ULL},
+                                                {0.00001f, 256, 1024, 1234ULL},
+                                                {0.00001f, 512, 512, 1234ULL},
+                                                {0.00001f, 1024, 32, 1234ULL},
+                                                {0.00001f, 1024, 64, 1234ULL},
+                                                {0.00001f, 1024, 128, 1234ULL},
+                                                {0.00001f, 1024, 256, 1234ULL}};
+
+const std::vector<NormInputs<double>> inputsd = {
+  {0.00000001, 32, 1024, 1234ULL},
+  {0.00000001, 64, 1024, 1234ULL},
+  {0.00000001, 128, 1024, 1234ULL},
+  {0.00000001, 256, 1024, 1234ULL},
+  {0.00000001, 512, 512, 1234ULL},
+  {0.00000001, 1024, 32, 1234ULL},
+  {0.00000001, 1024, 64, 1234ULL},
+  {0.00000001, 1024, 128, 1234ULL},
+  {0.00000001, 1024, 256, 1234ULL},
+};
+
+typedef NormTest<float> NormTestF;
+TEST_P(NormTestF, Result)
+{
+  ASSERT_NEAR(out_scalar_exp, out_scalar_act, params.tolerance * params.rows * params.cols);
+}
+
+typedef NormTest<double> NormTestD;
+TEST_P(NormTestD, Result)
+{
+  ASSERT_NEAR(out_scalar_exp, out_scalar_act, params.tolerance * params.rows * params.cols);
+}
+
+INSTANTIATE_TEST_CASE_P(NormTests, NormTestF, ::testing::ValuesIn(inputsf));
+
+INSTANTIATE_TEST_CASE_P(NormTests, NormTestD, ::testing::ValuesIn(inputsd));
+
+}  // end namespace matrix
+}  // end namespace raft
diff --git a/cpp/test/matrix/reverse.cu b/cpp/test/matrix/reverse.cu
new file mode 100644
index 0000000000..c905b8711e
--- /dev/null
+++ b/cpp/test/matrix/reverse.cu
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.h"
+#include <gtest/gtest.h>
+#include <raft/matrix/reverse.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+namespace raft {
+namespace matrix {
+
+template <typename T>
+struct ReverseInputs {
+  bool row_major, row_reverse;
+  int rows, cols;
+  unsigned long long int seed;
+};
+
+template <typename T>
+::std::ostream& operator<<(::std::ostream& os, const ReverseInputs<T>& I)
+{
+  os << "{ " << I.row_major << ", " << I.row_reverse << ", " << I.rows << ", " << I.cols << ", "
+     << I.seed << '}' << std::endl;
+  return os;
+}
+
+// col-reverse reference test
+template <typename Type>
+void naive_col_reverse(std::vector<Type>& data, int rows, int cols, bool row_major)
+{
+  for (int i = 0; i < rows; ++i) {
+    for (int j = 0; j < cols / 2; ++j) {
+      auto index_in   = row_major ? i * cols + j : i + j * rows;
+      auto index_out  = row_major ? i * cols + (cols - j - 1) : i + (cols - j - 1) * rows;
+      auto tmp        = data[index_in];
+      data[index_in]  = data[index_out];
+      data[index_out] = tmp;
+    }
+  }
+}
+
+// row-reverse reference test
+template <typename Type>
+void naive_row_reverse(std::vector<Type>& data, int rows, int cols, bool row_major)
+{
+  for (int i = 0; i < rows / 2; ++i) {
+    for (int j = 0; j < cols; ++j) {
+      auto index_in   = row_major ? i * cols + j : i + j * rows;
+      auto index_out  = row_major ? (rows - i - 1) * cols + j : (rows - i - 1) + j * rows;
+      auto tmp        = data[index_in];
+      data[index_in]  = data[index_out];
+      data[index_out] = tmp;
+    }
+  }
+}
+
+template <typename T>
+class ReverseTest : public ::testing::TestWithParam<ReverseInputs<T>> {
+ public:
+  ReverseTest()
+    : params(::testing::TestWithParam<ReverseInputs<T>>::GetParam()),
+      stream(handle.get_stream()),
+      data(params.rows * params.cols, stream)
+  {
+  }
+
+  void SetUp() override
+  {
+    std::random_device rd;
+    std::default_random_engine dre(rd());
+    raft::random::RngState r(params.seed);
+    int rows = params.rows, cols = params.cols, len = rows * cols;
+
+    act_result.resize(len);
+    exp_result.resize(len);
+
+    uniform(handle, r, data.data(), len, T(-10.0), T(10.0));
+    raft::update_host(exp_result.data(), data.data(), len, stream);
+
+    auto input_col_major =
+      raft::make_device_matrix_view<T, int, raft::col_major>(data.data(), rows, cols);
+    auto input_row_major =
+      raft::make_device_matrix_view<T, int, raft::row_major>(data.data(), rows, cols);
+    if (params.row_major) {
+      if (params.row_reverse) {
+        row_reverse(handle, input_row_major);
+        naive_row_reverse(exp_result, rows, cols, params.row_major);
+      } else {
+        col_reverse(handle, input_row_major);
+        naive_col_reverse(exp_result, rows, cols, params.row_major);
+      }
+    } else {
+      if (params.row_reverse) {
+        row_reverse(handle, input_col_major);
+        naive_row_reverse(exp_result, rows, cols, params.row_major);
+      } else {
+        col_reverse(handle, input_col_major);
+        naive_col_reverse(exp_result, rows, cols, params.row_major);
+      }
+    }
+
+    raft::update_host(act_result.data(), data.data(), len, stream);
+    handle.sync_stream(stream);
+  }
+
+ protected:
+  raft::handle_t handle;
+  cudaStream_t stream;
+
+  ReverseInputs<T> params;
+  rmm::device_uvector<T> data;
+  std::vector<T> exp_result, act_result;
+};
+
+///// Row- and column-wise tests
+const std::vector<ReverseInputs<float>> inputsf = {{true, true, 4, 4, 1234ULL},
+                                                   {true, true, 2, 12, 1234ULL},
+                                                   {true, false, 2, 12, 1234ULL},
+                                                   {true, false, 2, 64, 1234ULL},
+                                                   {true, true, 64, 512, 1234ULL},
+                                                   {true, false, 64, 1024, 1234ULL},
+                                                   {true, true, 128, 1024, 1234ULL},
+                                                   {true, false, 256, 1024, 1234ULL},
+                                                   {false, true, 512, 512, 1234ULL},
+                                                   {false, false, 1024, 32, 1234ULL},
+                                                   {false, true, 1024, 64, 1234ULL},
+                                                   {false, false, 1024, 128, 1234ULL},
+                                                   {false, true, 1024, 256, 1234ULL}};
+
+const std::vector<ReverseInputs<double>> inputsd = {{true, true, 4, 4, 1234ULL},
+                                                    {true, true, 2, 12, 1234ULL},
+                                                    {true, false, 2, 12, 1234ULL},
+                                                    {true, false, 2, 64, 1234ULL},
+                                                    {true, true, 64, 512, 1234ULL},
+                                                    {true, false, 64, 1024, 1234ULL},
+                                                    {true, true, 128, 1024, 1234ULL},
+                                                    {true, false, 256, 1024, 1234ULL},
+                                                    {false, true, 512, 512, 1234ULL},
+                                                    {false, false, 1024, 32, 1234ULL},
+                                                    {false, true, 1024, 64, 1234ULL},
+                                                    {false, false, 1024, 128, 1234ULL},
+                                                    {false, true, 1024, 256, 1234ULL}};
+
+typedef ReverseTest<float> ReverseTestF;
+TEST_P(ReverseTestF, Result)
+{
+  ASSERT_TRUE(hostVecMatch(exp_result, act_result, raft::Compare<float>()));
+}
+
+typedef ReverseTest<double> ReverseTestD;
+TEST_P(ReverseTestD, Result)
+{
+  ASSERT_TRUE(hostVecMatch(exp_result, act_result, raft::Compare<double>()));
+}
+
+INSTANTIATE_TEST_CASE_P(ReverseTests, ReverseTestF, ::testing::ValuesIn(inputsf));
+
+INSTANTIATE_TEST_CASE_P(ReverseTests, ReverseTestD, ::testing::ValuesIn(inputsd));
+
+}  // end namespace matrix
+}  // end namespace raft
diff --git a/cpp/test/matrix/slice.cu b/cpp/test/matrix/slice.cu
new file mode 100644
index 0000000000..9744e3724a
--- /dev/null
+++ b/cpp/test/matrix/slice.cu
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.h"
+#include <gtest/gtest.h>
+#include <raft/matrix/slice.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/util/cudart_utils.hpp>
+#include <rmm/device_scalar.hpp>
+
+namespace raft {
+namespace matrix {
+
+template <typename T>
+struct SliceInputs {
+  int rows, cols;
+  unsigned long long int seed;
+};
+
+template <typename T>
+::std::ostream& operator<<(::std::ostream& os, const SliceInputs<T>& I)
+{
+  os << "{ " << I.rows << ", " << I.cols << ", " << I.seed << '}' << std::endl;
+  return os;
+}
+
+// Col-major slice reference test
+template <typename Type>
+void naiveSlice(const Type* in, Type* out, int rows, int cols, int x1, int y1, int x2, int y2)
+{
+  int out_rows = x2 - x1;
+  // int out_cols = y2 - y1;
+  for (int j = y1; j < y2; ++j) {
+    for (int i = x1; i < x2; ++i) {
+      out[(i - x1) + (j - y1) * out_rows] = in[i + j * rows];
+    }
+  }
+}
+
+template <typename T>
+class SliceTest : public ::testing::TestWithParam<SliceInputs<T>> {
+ public:
+  SliceTest()
+    : params(::testing::TestWithParam<SliceInputs<T>>::GetParam()),
+      stream(handle.get_stream()),
+      data(params.rows * params.cols, stream)
+  {
+  }
+
+  void SetUp() override
+  {
+    std::random_device rd;
+    std::default_random_engine dre(rd());
+    raft::random::RngState r(params.seed);
+    int rows = params.rows, cols = params.cols, len = rows * cols;
+    uniform(handle, r, data.data(), len, T(-10.0), T(10.0));
+
+    std::uniform_int_distribution<int> rowGenerator(0, rows / 2);
+    auto row1 = rowGenerator(dre);
+    auto row2 = rowGenerator(dre) + rows / 2;
+
+    std::uniform_int_distribution<int> colGenerator(0, cols / 2);
+    auto col1 = colGenerator(dre);
+    auto col2 = colGenerator(dre) + cols / 2;
+
+    rmm::device_uvector<T> d_act_result((row2 - row1) * (col2 - col1), stream);
+    act_result.resize((row2 - row1) * (col2 - col1));
+    exp_result.resize((row2 - row1) * (col2 - col1));
+
+    std::vector<T> h_data(rows * cols);
+    raft::update_host(h_data.data(), data.data(), rows * cols, stream);
+    naiveSlice(h_data.data(), exp_result.data(), rows, cols, row1, col1, row2, col2);
+    auto input =
+      raft::make_device_matrix_view<const T, int, raft::col_major>(data.data(), rows, cols);
+    auto output = raft::make_device_matrix_view<T, int, raft::col_major>(
+      d_act_result.data(), row2 - row1, col2 - col1);
+    slice(handle, input, output, slice_coordinates(row1, col1, row2, col2));
+
+    raft::update_host(act_result.data(), d_act_result.data(), d_act_result.size(), stream);
+    handle.sync_stream(stream);
+  }
+
+ protected:
+  raft::handle_t handle;
+  cudaStream_t stream;
+
+  SliceInputs<T> params;
+  rmm::device_uvector<T> data;
+  std::vector<T> exp_result, act_result;
+};
+
+///// Row- and column-wise tests
+const std::vector<SliceInputs<float>> inputsf = {{32, 1024, 1234ULL},
+                                                 {64, 1024, 1234ULL},
+                                                 {128, 1024, 1234ULL},
+                                                 {256, 1024, 1234ULL},
+                                                 {512, 512, 1234ULL},
+                                                 {1024, 32, 1234ULL},
+                                                 {1024, 64, 1234ULL},
+                                                 {1024, 128, 1234ULL},
+                                                 {1024, 256, 1234ULL}};
+
+const std::vector<SliceInputs<double>> inputsd = {
+  {32, 1024, 1234ULL},
+  {64, 1024, 1234ULL},
+  {128, 1024, 1234ULL},
+  {256, 1024, 1234ULL},
+  {512, 512, 1234ULL},
+  {1024, 32, 1234ULL},
+  {1024, 64, 1234ULL},
+  {1024, 128, 1234ULL},
+  {1024, 256, 1234ULL},
+};
+
+typedef SliceTest<float> SliceTestF;
+TEST_P(SliceTestF, Result)
+{
+  ASSERT_TRUE(hostVecMatch(exp_result, act_result, raft::Compare<float>()));
+}
+
+typedef SliceTest<double> SliceTestD;
+TEST_P(SliceTestD, Result)
+{
+  ASSERT_TRUE(hostVecMatch(exp_result, act_result, raft::Compare<double>()));
+}
+
+INSTANTIATE_TEST_CASE_P(SliceTests, SliceTestF, ::testing::ValuesIn(inputsf));
+
+INSTANTIATE_TEST_CASE_P(SliceTests, SliceTestD, ::testing::ValuesIn(inputsd));
+
+}  // end namespace matrix
+}  // end namespace raft
diff --git a/cpp/test/matrix/triangular.cu b/cpp/test/matrix/triangular.cu
new file mode 100644
index 0000000000..9af3defb5d
--- /dev/null
+++ b/cpp/test/matrix/triangular.cu
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.h"
+#include <gtest/gtest.h>
+#include <raft/matrix/init.cuh>
+#include <raft/matrix/triangular.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/util/cudart_utils.hpp>
+
+namespace raft {
+namespace matrix {
+
+template <typename T>
+struct TriangularInputs {
+  int rows, cols;
+  unsigned long long int seed;
+};
+
+template <typename T>
+::std::ostream& operator<<(::std::ostream& os, const TriangularInputs<T>& I)
+{
+  os << "{ " << I.rows << ", " << I.cols << ", " << I.seed << '}' << std::endl;
+  return os;
+}
+
+// triangular reference test
+template <typename Type>
+void naive_triangular(std::vector<Type>& in, std::vector<Type>& out, int rows, int cols)
+{
+  auto k = std::min(rows, cols);
+  for (int i = 0; i < k; ++i) {
+    for (int j = 0; j <= i; ++j) {
+      auto index     = i * rows + j;
+      out[i * k + j] = in[index];
+    }
+  }
+}
+
+template <typename T>
+class TriangularTest : public ::testing::TestWithParam<TriangularInputs<T>> {
+ public:
+  TriangularTest()
+    : params(::testing::TestWithParam<TriangularInputs<T>>::GetParam()),
+      stream(handle.get_stream()),
+      data(params.rows * params.cols, stream)
+  {
+  }
+
+  void SetUp() override
+  {
+    std::random_device rd;
+    std::default_random_engine dre(rd());
+    raft::random::RngState r(params.seed);
+    int rows = params.rows, cols = params.cols, len = rows * cols;
+    auto k = std::min(rows, cols);
+
+    rmm::device_uvector<T> d_act_result(len, stream);
+    std::vector<T> h_data(len);
+    act_result.resize(k * k);
+    exp_result.resize(k * k);
+
+    uniform(handle, r, data.data(), len, T(-10.0), T(10.0));
+    raft::update_host(h_data.data(), data.data(), len, stream);
+    raft::matrix::fill(
+      handle,
+      raft::make_device_matrix_view<T, int, raft::col_major>(d_act_result.data(), k, k),
+      T(0));
+
+    upper_triangular(
+      handle,
+      raft::make_device_matrix_view<const T, int, raft::col_major>(data.data(), rows, cols),
+      raft::make_device_matrix_view<T, int, raft::col_major>(d_act_result.data(), k, k));
+    naive_triangular(h_data, exp_result, rows, cols);
+
+    raft::update_host(act_result.data(), d_act_result.data(), k * k, stream);
+    handle.sync_stream(stream);
+  }
+
+ protected:
+  raft::handle_t handle;
+  cudaStream_t stream;
+
+  TriangularInputs<T> params;
+  rmm::device_uvector<T> data;
+  std::vector<T> exp_result, act_result;
+};
+
+///// Row- and column-wise tests
+const std::vector<TriangularInputs<float>> inputsf = {{4, 4, 1234ULL},
+                                                      {2, 64, 1234ULL},
+                                                      {64, 512, 1234ULL},
+                                                      {64, 1024, 1234ULL},
+                                                      {256, 1024, 1234ULL},
+                                                      {512, 512, 1234ULL},
+                                                      {1024, 32, 1234ULL},
+                                                      {1024, 128, 1234ULL},
+                                                      {1024, 256, 1234ULL}};
+
+const std::vector<TriangularInputs<double>> inputsd = {{4, 4, 1234ULL},
+                                                       {2, 64, 1234ULL},
+                                                       {64, 512, 1234ULL},
+                                                       {64, 1024, 1234ULL},
+                                                       {256, 1024, 1234ULL},
+                                                       {512, 512, 1234ULL},
+                                                       {1024, 32, 1234ULL},
+                                                       {1024, 128, 1234ULL},
+                                                       {1024, 256, 1234ULL}};
+
+typedef TriangularTest<float> TriangularTestF;
+TEST_P(TriangularTestF, Result)
+{
+  ASSERT_TRUE(hostVecMatch(exp_result, act_result, raft::Compare<float>()));
+}
+
+typedef TriangularTest<double> TriangularTestD;
+TEST_P(TriangularTestD, Result)
+{
+  ASSERT_TRUE(hostVecMatch(exp_result, act_result, raft::Compare<double>()));
+}
+
+INSTANTIATE_TEST_CASE_P(TriangularTests, TriangularTestF, ::testing::ValuesIn(inputsf));
+
+INSTANTIATE_TEST_CASE_P(TriangularTests, TriangularTestD, ::testing::ValuesIn(inputsd));
+
+}  // end namespace matrix
+}  // end namespace raft