From d833a8563cba5796390bd95266781959870d1a22 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 3 Oct 2022 19:10:29 -0400 Subject: [PATCH 01/13] Adding remaining matrix files (still need many tests) --- cpp/include/raft/matrix/argmax.cuh | 41 +++++++ cpp/include/raft/matrix/diagonal.cuh | 53 ++++++++ cpp/include/raft/matrix/matrix_vector.cuh | 142 ++++++++++++++++++++++ cpp/include/raft/matrix/norm.cuh | 37 ++++++ cpp/include/raft/matrix/reverse.cuh | 85 +++++++++++++ cpp/include/raft/matrix/slice.cuh | 55 +++++++++ cpp/include/raft/matrix/triangular.cuh | 39 ++++++ cpp/test/matrix/argmax.cu | 96 +++++++++++++++ 8 files changed, 548 insertions(+) create mode 100644 cpp/include/raft/matrix/argmax.cuh create mode 100644 cpp/include/raft/matrix/diagonal.cuh create mode 100644 cpp/include/raft/matrix/matrix_vector.cuh create mode 100644 cpp/include/raft/matrix/norm.cuh create mode 100644 cpp/include/raft/matrix/reverse.cuh create mode 100644 cpp/include/raft/matrix/slice.cuh create mode 100644 cpp/include/raft/matrix/triangular.cuh create mode 100644 cpp/test/matrix/argmax.cu diff --git a/cpp/include/raft/matrix/argmax.cuh b/cpp/include/raft/matrix/argmax.cuh new file mode 100644 index 0000000000..5afd026745 --- /dev/null +++ b/cpp/include/raft/matrix/argmax.cuh @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+#include
+#include
+#include
+
+namespace raft::matrix {
+
+/**
+ * @brief Argmax: find the row idx with maximum value for each column
+ * @param handle: raft handle
+ * @param in: input matrix of size (n_rows, n_cols)
+ * @param out: output vector of size n_cols
+ */
+template
+void argmax(const raft::handle_t& handle,
+            raft::device_matrix_view in,
+            raft::device_vector_view out)
+{
+  RAFT_EXPECTS(out.extent(0) == in.extent(1),
+               "Size of output vector must equal number of columns in input matrix.");
+  detail::argmax(
+    in.data_handle(), in.extent(0), in.extent(1), out.data_handle(), handle.get_stream());
+}
+}  // namespace raft::matrix
diff --git a/cpp/include/raft/matrix/diagonal.cuh b/cpp/include/raft/matrix/diagonal.cuh
new file mode 100644
index 0000000000..e51778d939
--- /dev/null
+++ b/cpp/include/raft/matrix/diagonal.cuh
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#pragma once + +#include +#include +#include + +namespace raft::matrix { + +/** + * @brief Initialize a diagonal matrix with a vector + * @param vec: vector of length k = min(n_rows, n_cols) + * @param matrix: matrix of size n_rows x n_cols + */ +template +void initialize_diagonal(const raft::handle_t& handle, + raft::device_vector_view vec, + raft::device_matrix_view matrix) +{ + detail::initializeDiagonalMatrix(vec.data_handle(), + matrix.data_handle(), + matrix.extent(0), + matrix.extent(1), + handle.get_stream()); +} + +/** + * @brief Take reciprocal of elements on diagonal of square matrix (in-place) + * @param in: square input matrix with size len x len + */ +template +void invert_diagonal(const raft::handle_t& handle, + raft::device_matrix_view in) +{ + RAFT_EXPECTS(in.extent(0) == in.extent(1), "Matrix must be square."); + detail::getDiagonalInverseMatrix(in.data_handle(), in.extent(0), handle.get_stream()); +} +} // namespace raft::matrix \ No newline at end of file diff --git a/cpp/include/raft/matrix/matrix_vector.cuh b/cpp/include/raft/matrix/matrix_vector.cuh new file mode 100644 index 0000000000..5d05d03d2c --- /dev/null +++ b/cpp/include/raft/matrix/matrix_vector.cuh @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "detail/matrix.cuh" +#include + +namespace raft::matrix { + +/** + * @brief multiply each row or column of matrix with vector, skipping zeros in vector + * @param data input matrix, results are in-place + * @param vec input vector + * @param n_row number of rows of input matrix + * @param n_col number of columns of input matrix + * @param rowMajor whether matrix is row major + * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns + * @param stream cuda stream + */ +template +void binary_mult_skip_zero(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ + detail::matrixVectorBinaryMultSkipZero( + data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream); +} + +/** + * @brief divide each row or column of matrix with vector + * @param data input matrix, results are in-place + * @param vec input vector + * @param n_row number of rows of input matrix + * @param n_col number of columns of input matrix + * @param rowMajor whether matrix is row major + * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns + * @param stream cuda stream + */ +template +void binary_div(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ + detail::matrixVectorBinaryDiv( + data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream); +} + +/** + * @brief divide each row or column of matrix with vector, skipping zeros in vector + * @param data input matrix, results are in-place + * @param vec input vector + * @param n_row number of rows of input matrix + * @param n_col number of columns of input matrix + * @param rowMajor whether matrix is row major + * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns + * @param stream cuda stream + * @param return_zero result is zero if true and vector value is below threshold, 
original value if + * false + */ +template +void binary_div_skip_zero(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream, + bool return_zero = false) +{ + detail::matrixVectorBinaryDivSkipZero( + data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream, return_zero); +} + +/** + * @brief add each row or column of matrix with vector + * @param data input matrix, results are in-place + * @param vec input vector + * @param n_row number of rows of input matrix + * @param n_col number of columns of input matrix + * @param rowMajor whether matrix is row major + * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns + * @param stream cuda stream + */ +template +void binary_add(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ + detail::matrixVectorBinaryAdd( + data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream); +} + +/** + * @brief subtract each row or column of matrix with vector + * @param data input matrix, results are in-place + * @param vec input vector + * @param n_row number of rows of input matrix + * @param n_col number of columns of input matrix + * @param rowMajor whether matrix is row major + * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns + * @param stream cuda stream + */ +template +void binary_sub(Type* data, + const Type* vec, + IdxType n_row, + IdxType n_col, + bool rowMajor, + bool bcastAlongRows, + cudaStream_t stream) +{ + detail::matrixVectorBinarySub( + data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream); +} + +} // namespace raft::matrix \ No newline at end of file diff --git a/cpp/include/raft/matrix/norm.cuh b/cpp/include/raft/matrix/norm.cuh new file mode 100644 index 0000000000..c19bde8828 --- /dev/null +++ b/cpp/include/raft/matrix/norm.cuh @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace raft::matrix { + +/** + * @brief Get the L2/F-norm of a matrix + * @param handle + * @param in: input matrix/vector with totally size elements + * @param size: size of the matrix/vector + * @param stream: cuda stream + */ +template +m_t l2_norm(const raft::handle_t& handle, raft::device_mdspan in) +{ + return detail::getL2Norm(handle, in.data_handle(), in.size(), handle.get_stream()); +} +} // namespace raft::matrix \ No newline at end of file diff --git a/cpp/include/raft/matrix/reverse.cuh b/cpp/include/raft/matrix/reverse.cuh new file mode 100644 index 0000000000..6bdc381a34 --- /dev/null +++ b/cpp/include/raft/matrix/reverse.cuh @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+#include
+#include
+#include
+
+namespace raft::matrix {
+
+/**
+ * @brief Columns of a column major matrix are reversed in place (i.e. first column and
+ * last column are swapped)
+ * @param inout: input and output matrix
+ * @param n_rows: number of rows of input matrix
+ * @param n_cols: number of columns of input matrix
+ * @param stream: cuda stream
+ */
+template
+void col_reverse(const raft::handle_t& handle,
+                 raft::device_matrix_view inout)
+{
+  detail::colReverse(inout.data_handle(), inout.extent(0), inout.extent(1), handle.get_stream());
+}
+
+/**
+ * @brief Columns of a row major matrix are reversed in place (i.e. first column and
+ * last column are swapped)
+ * @param inout: input and output matrix
+ * @param n_rows: number of rows of input matrix
+ * @param n_cols: number of columns of input matrix
+ * @param stream: cuda stream
+ */
+template
+void col_reverse(const raft::handle_t& handle,
+                 raft::device_matrix_view inout)
+{
+  detail::rowReverse(inout.data_handle(), inout.extent(0), inout.extent(1), handle.get_stream());
+}
+
+/**
+ * @brief Rows of a column major matrix are reversed in place (i.e. first row and last
+ * row are swapped)
+ * @param inout: input and output matrix
+ * @param n_rows: number of rows of input matrix
+ * @param n_cols: number of columns of input matrix
+ * @param stream: cuda stream
+ */
+template
+void row_reverse(const raft::handle_t& handle,
+                 raft::device_matrix_view inout)
+{
+  detail::rowReverse(inout.data_handle(), inout.extent(0), inout.extent(1), handle.get_stream());
+}
+
+/**
+ * @brief Rows of a row major matrix are reversed in place (i.e.
first row and last + * row are swapped) + * @param inout: input and output matrix + * @param n_rows: number of rows of input matrix + * @param n_cols: number of columns of input matrix + * @param stream: cuda stream + */ +template +void row_reverse(const raft::handle_t& handle, + raft::device_matrix_view inout) +{ + detail::colReverse(inout.data_handle(), inout.extent(0), inout.extent(1), handle.get_stream()); +} + +} // namespace raft::matrix \ No newline at end of file diff --git a/cpp/include/raft/matrix/slice.cuh b/cpp/include/raft/matrix/slice.cuh new file mode 100644 index 0000000000..9e8f840c77 --- /dev/null +++ b/cpp/include/raft/matrix/slice.cuh @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +namespace raft::matrix { + +/** + * @brief Slice a matrix (in-place) + * @param handle: raft handle + * @param in: input matrix (column-major) + * @param out: output matrix (column-major) + * @param x1, y1: coordinate of the top-left point of the wanted area (0-based) + * @param x2, y2: coordinate of the bottom-right point of the wanted area + * (1-based) + * example: Slice the 2nd and 3rd columns of a 4x3 matrix: slice_matrix(M_d, 4, + * 3, 0, 1, 4, 3); + */ +template +void slice(const raft::handle_t& handle, + raft::device_matrix_view in, + raft::device_matrix_view out, + idx_t x1, + idx_t y1, + idx_t x2, + idx_t y2) +{ + detail::sliceMatrix(in.data_handle(), + in.extent(0), + in.extent(1), + out.data_handle(), + x1, + y1, + x2, + y2, + handle.get_stream()); +} +} // namespace raft::matrix \ No newline at end of file diff --git a/cpp/include/raft/matrix/triangular.cuh b/cpp/include/raft/matrix/triangular.cuh new file mode 100644 index 0000000000..a7f68bfeef --- /dev/null +++ b/cpp/include/raft/matrix/triangular.cuh @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +namespace raft::matrix { + +/** + * @brief Copy the upper triangular part of a matrix to another + * @param[in] handle: raft handle + * @param[in] src: input matrix with a size of n_rows x n_cols + * @param[out] dst: output matrix with a size of kxk, k = min(n_rows, n_cols) + */ +template +void upper_triangular(const raft::handle_t& handle, + raft::device_matrix_view src, + raft::device_matrix_view dst) +{ + detail::copyUpperTriangular( + src.data_handle(), dst.data_handle(), src.extent(0), src.extent(1), handle.get_stream()); +} +} // namespace raft::matrix \ No newline at end of file diff --git a/cpp/test/matrix/argmax.cu b/cpp/test/matrix/argmax.cu new file mode 100644 index 0000000000..bd375c60a5 --- /dev/null +++ b/cpp/test/matrix/argmax.cu @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "../test_utils.h"
+#include
+#include
+#include
+#include
+#include
+
+namespace raft {
+namespace matrix {
+
+template
+struct ArgMaxInputs {
+  std::vector input_matrix;
+  std::vector output_matrix;
+  int n_cols;
+  int n_rows;
+};
+
+template
+::std::ostream& operator<<(::std::ostream& os, const ArgMaxInputs& dims)
+{
+  return os;
+}
+
+template
+class ArgMaxTest : public ::testing::TestWithParam> {
+ public:
+  ArgMaxTest()
+    : params(::testing::TestWithParam>::GetParam()),
+      input(std::move(raft::make_device_matrix(handle, params.n_rows, params.n_cols))),
+      output(std::move(raft::make_device_vector(handle, params.n_cols))),
+      expected(std::move(raft::make_device_vector(handle, params.n_cols)))
+  {
+    raft::copy(input.data_handle(),
+               params.input_matrix.data(),
+               params.n_rows * params.n_cols,
+               handle.get_stream());
+    raft::copy(expected.data_handle(),
+               params.output_matrix.data(),
+               params.n_cols,
+               handle.get_stream());
+
+    raft::matrix::argmax(handle, input, output);
+  }
+
+ protected:
+  raft::handle_t handle;
+  ArgMaxInputs params;
+
+  raft::device_matrix input;
+  raft::device_vector output;
+  raft::device_vector expected;
+};
+
+const std::vector> inputsf = {
+  {{0.1f, 0.2f, 0.3f, 0.4f, 0.4f, 0.3f, 0.2f, 0.1f, 0.2f, 0.3f, 0.5f, 0.0f}, {3, 0, 2}, 3, 4}};
+
+const std::vector> inputsd = {
+  {{0.1, 0.2, 0.3, 0.4, 0.4, 0.3, 0.2, 0.1, 0.2, 0.3, 0.5, 0.0}, {3, 0, 2}, 3, 4}};
+
+typedef ArgMaxTest ArgMaxTestF;
+TEST_P(ArgMaxTestF, Result)
+{
+  ASSERT_TRUE(devArrMatch(output.data_handle(),
+                          expected.data_handle(),
+                          params.n_cols,
+                          Compare(),
+                          handle.get_stream()));
+}
+
+typedef ArgMaxTest ArgMaxTestD;
+TEST_P(ArgMaxTestD, Result)
+{
+  ASSERT_TRUE(devArrMatch(output.data_handle(),
+                          expected.data_handle(),
+                          params.n_cols,
+                          Compare(),
+                          handle.get_stream()));
+}
+
+INSTANTIATE_TEST_SUITE_P(ArgMaxTest, ArgMaxTestF, ::testing::ValuesIn(inputsf));
+
+INSTANTIATE_TEST_SUITE_P(ArgMaxTest, ArgMaxTestD, ::testing::ValuesIn(inputsd));
+
+}  // namespace matrix
+}  // namespace
raft \ No newline at end of file From 4d21b8b4d502d5838e5b4ac0b24e49822496946b Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 4 Oct 2022 20:00:56 -0400 Subject: [PATCH 02/13] Adding tests for argmax and diagonal --- cpp/include/raft/core/device_mdarray.hpp | 1 + cpp/include/raft/core/device_mdspan.hpp | 1 + cpp/include/raft/core/host_mdarray.hpp | 1 + cpp/include/raft/core/host_mdspan.hpp | 1 + cpp/include/raft/core/mdspan_types.hpp | 2 +- cpp/include/raft/linalg/matrix_vector.cuh | 184 ++++++++++++++++++++++ cpp/include/raft/matrix/argmax.cuh | 4 +- cpp/include/raft/matrix/detail/matrix.cuh | 29 +++- cpp/include/raft/matrix/diagonal.cuh | 45 ++++-- cpp/include/raft/matrix/gather.cuh | 4 +- cpp/include/raft/matrix/init.cuh | 8 +- cpp/include/raft/matrix/matrix_vector.cuh | 142 ----------------- cpp/include/raft/matrix/slice.cuh | 7 + cpp/include/raft/matrix/triangular.cuh | 2 +- cpp/test/CMakeLists.txt | 2 + cpp/test/matrix/argmax.cu | 47 +++--- cpp/test/matrix/diagonal.cu | 116 ++++++++++++++ 17 files changed, 413 insertions(+), 183 deletions(-) create mode 100644 cpp/include/raft/linalg/matrix_vector.cuh delete mode 100644 cpp/include/raft/matrix/matrix_vector.cuh create mode 100644 cpp/test/matrix/diagonal.cu diff --git a/cpp/include/raft/core/device_mdarray.hpp b/cpp/include/raft/core/device_mdarray.hpp index 1c17b5bcb9..693e50a506 100644 --- a/cpp/include/raft/core/device_mdarray.hpp +++ b/cpp/include/raft/core/device_mdarray.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include diff --git a/cpp/include/raft/core/device_mdspan.hpp b/cpp/include/raft/core/device_mdspan.hpp index 2fc43e2a05..05cf767a53 100644 --- a/cpp/include/raft/core/device_mdspan.hpp +++ b/cpp/include/raft/core/device_mdspan.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include diff --git a/cpp/include/raft/core/host_mdarray.hpp b/cpp/include/raft/core/host_mdarray.hpp index 6221ca59f0..20cb5c1446 100644 --- a/cpp/include/raft/core/host_mdarray.hpp +++ 
b/cpp/include/raft/core/host_mdarray.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include diff --git a/cpp/include/raft/core/host_mdspan.hpp b/cpp/include/raft/core/host_mdspan.hpp index fc2a9bbd6d..3fe9ea2264 100644 --- a/cpp/include/raft/core/host_mdspan.hpp +++ b/cpp/include/raft/core/host_mdspan.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include diff --git a/cpp/include/raft/core/mdspan_types.hpp b/cpp/include/raft/core/mdspan_types.hpp index bc2ba314a3..07c69f472c 100644 --- a/cpp/include/raft/core/mdspan_types.hpp +++ b/cpp/include/raft/core/mdspan_types.hpp @@ -47,7 +47,7 @@ using vector_extent = std::experimental::extents; template using matrix_extent = std::experimental::extents; -template +template using scalar_extent = std::experimental::extents; /** diff --git a/cpp/include/raft/linalg/matrix_vector.cuh b/cpp/include/raft/linalg/matrix_vector.cuh new file mode 100644 index 0000000000..05bff59c6f --- /dev/null +++ b/cpp/include/raft/linalg/matrix_vector.cuh @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+#include
+#include
+#include
+
+namespace raft::linalg {
+
+/**
+ * @brief multiply each row or column of matrix with vector, skipping zeros in vector
+ * @param [in] handle: raft handle for managing library resources
+ * @param[inout] data: input matrix, results are in-place
+ * @param[in] vec: input vector
+ * @param[in] bcast_along_rows: whether to broadcast vector along rows of matrix or columns
+ */
+template
+void binary_mult_skip_zero(const raft::handle_t& handle,
+                           raft::device_matrix_view data,
+                           raft::device_vector_view vec,
+                           bool bcast_along_rows)
+{
+  bool row_major = raft::is_row_major(data);
+
+  IdxType vec_size = bcast_along_rows ? data.extent(1) : data.extent(0);
+
+  RAFT_EXPECTS(
+    vec.extent(0) == vec_size,
+    "If `bcast_along_rows==true`, vector size must equal number of columns in the matrix."
+    "If `bcast_along_rows==false`, vector size must equal number of rows in the matrix.");
+
+  matrix::detail::matrixVectorBinaryMultSkipZero(data.data_handle(),
+                                                 vec.data_handle(),
+                                                 data.extent(0),
+                                                 data.extent(1),
+                                                 row_major,
+                                                 bcast_along_rows,
+                                                 handle.get_stream());
+}
+
+/**
+ * @brief divide each row or column of matrix with vector
+ * @param[in] handle: raft handle for managing library resources
+ * @param[inout] data: input matrix, results are in-place
+ * @param[in] vec: input vector
+ * @param[in] bcast_along_rows: whether to broadcast vector along rows of matrix or columns
+ */
+template
+void binary_div(const raft::handle_t& handle,
+                raft::device_matrix_view data,
+                raft::device_vector_view vec,
+                bool bcast_along_rows)
+{
+  bool row_major = raft::is_row_major(data);
+
+  IdxType vec_size = bcast_along_rows ? data.extent(1) : data.extent(0);
+
+  RAFT_EXPECTS(
+    vec.extent(0) == vec_size,
+    "If `bcast_along_rows==true`, vector size must equal number of columns in the matrix."
+ "If `bcast_along_rows==false`, vector size must equal number of rows in the matrix."); + + matrix::detail::matrixVectorBinaryDiv(data.data_handle(), + vec.data_handle(), + data.extent(0), + data.extent(1), + row_major, + bcast_along_rows, + handle.get_stream()); +} + +/** + * @brief divide each row or column of matrix with vector, skipping zeros in vector + * @param[in] handle: raft handle for managing library resources + * @param[inout] data: input matrix, results are in-place + * @param[in] vec: input vector + * @param[in] bcast_along_rows: whether to broadcast vector along rows of matrix or columns + * @param[in] return_zero: result is zero if true and vector value is below threshold, original + * value if false + */ +template +void binary_div_skip_zero(const raft::handle_t& handle, + raft::device_matrix_view data, + raft::device_vector_view vec, + bool bcast_along_rows, + bool return_zero = false) +{ + bool row_major = raft::is_row_major(data); + + IdxType vec_size = bcast_along_rows ? data.extent(1) : data.extent(0); + + RAFT_EXPECTS( + vec.extent(0) == vec_size, + "If `bcast_along_rows==true`, vector size must equal number of columns in the matrix." 
+ "If `bcast_along_rows==false`, vector size must equal number of rows in the matrix."); + + matrix::detail::matrixVectorBinaryDivSkipZero(data.data_handle(), + vec.data_handle(), + data.extent(0), + data.extent(1), + row_major, + bcast_along_rows, + handle.get_stream(), + return_zero); +} + +/** + * @brief add each row or column of matrix with vector + * @param[in] handle: raft handle for managing library resources + * @param[inout] data: input matrix, results are in-place + * @param[in] vec: input vector + * @param[in] bcast_along_rows: whether to broadcast vector along rows of matrix or columns + */ +template +void binary_add(const raft::handle_t& handle, + raft::device_matrix_view data, + raft::device_vector_view vec, + bool bcast_along_rows) +{ + bool row_major = raft::is_row_major(data); + + IdxType vec_size = bcast_along_rows ? data.extent(1) : data.extent(0); + + RAFT_EXPECTS( + vec.extent(0) == vec_size, + "If `bcast_along_rows==true`, vector size must equal number of columns in the matrix." + "If `bcast_along_rows==false`, vector size must equal number of rows in the matrix."); + + matrix::detail::matrixVectorBinaryAdd(data.data_handle(), + vec.data_handle(), + data.extent(0), + data.extent(1), + row_major, + bcast_along_rows, + handle.get_stream()); +} + +/** + * @brief subtract each row or column of matrix with vector + * @param[in] handle: raft handle for managing library resources + * @param[inout] data: input matrix, results are in-place + * @param[in] vec: input vector + * @param[in] bcast_along_rows: whether to broadcast vector along rows of matrix or columns + */ +template +void binary_sub(const raft::handle_t& handle, + raft::device_matrix_view data, + raft::device_vector_view vec, + bool bcast_along_rows) +{ + bool row_major = raft::is_row_major(data); + + IdxType vec_size = bcast_along_rows ? 
data.extent(1) : data.extent(0); + + RAFT_EXPECTS( + vec.extent(0) == vec_size, + "If `bcast_along_rows==true`, vector size must equal number of columns in the matrix." + "If `bcast_along_rows==false`, vector size must equal number of rows in the matrix."); + + matrix::detail::matrixVectorBinarySub(data.data_handle(), + vec.data_handle(), + data.extent(0), + data.extent(1), + row_major, + bcast_along_rows, + handle.get_stream()); +} + +} // namespace raft::linalg \ No newline at end of file diff --git a/cpp/include/raft/matrix/argmax.cuh b/cpp/include/raft/matrix/argmax.cuh index 5afd026745..2c4908f712 100644 --- a/cpp/include/raft/matrix/argmax.cuh +++ b/cpp/include/raft/matrix/argmax.cuh @@ -17,7 +17,7 @@ #pragma once #include -#include +#include #include namespace raft::matrix { @@ -31,7 +31,7 @@ namespace raft::matrix { template void argmax(const raft::handle_t& handle, raft::device_matrix_view in, - raft::device_vector_view out) + raft::device_vector_view out) { RAFT_EXPECTS(out.extent(1) == in.extent(1), "Size of output vector must equal number of columns in input matrix."); diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh index c425aad79b..3408364b58 100644 --- a/cpp/include/raft/matrix/detail/matrix.cuh +++ b/cpp/include/raft/matrix/detail/matrix.cuh @@ -236,16 +236,32 @@ void copyUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, cudaStr * @param k: dimensionality */ template -__global__ void copyVectorToMatrixDiagonal(m_t* vec, m_t* matrix, idx_t m, idx_t n, idx_t k) +__global__ void copyVectorToMatrixDiagonal(const m_t* vec, m_t* matrix, idx_t m, idx_t n, idx_t k) { idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; if (idx < k) { matrix[idx + idx * m] = vec[idx]; } } +/** + * @brief Copy matrix diagonal to vector + * @param vec: vector of length k = min(n_rows, n_cols) + * @param matrix: matrix of size n_rows x n_cols + * @param m: number of rows of the matrix + * @param n: number of columns 
of the matrix + * @param k: dimensionality + */ +template +__global__ void copyVectorFromMatrixDiagonal(m_t* vec, const m_t* matrix, idx_t m, idx_t n, idx_t k) +{ + idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; + + if (idx < k) { vec[idx] = matrix[idx + idx * m]; } +} + template void initializeDiagonalMatrix( - m_t* vec, m_t* matrix, idx_t n_rows, idx_t n_cols, cudaStream_t stream) + const m_t* vec, m_t* matrix, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { idx_t k = std::min(n_rows, n_cols); dim3 block(64); @@ -253,6 +269,15 @@ void initializeDiagonalMatrix( copyVectorToMatrixDiagonal<<>>(vec, matrix, n_rows, n_cols, k); } +template +void getDiagonalMatrix(m_t* vec, const m_t* matrix, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +{ + idx_t k = std::min(n_rows, n_cols); + dim3 block(64); + dim3 grid((k + block.x - 1) / block.x); + copyVectorFromMatrixDiagonal<<>>(vec, matrix, n_rows, n_cols, k); +} + /** * @brief Calculate the inverse of the diagonal of a square matrix * element-wise and in place diff --git a/cpp/include/raft/matrix/diagonal.cuh b/cpp/include/raft/matrix/diagonal.cuh index e51778d939..e546982fa9 100644 --- a/cpp/include/raft/matrix/diagonal.cuh +++ b/cpp/include/raft/matrix/diagonal.cuh @@ -24,14 +24,17 @@ namespace raft::matrix { /** * @brief Initialize a diagonal matrix with a vector - * @param vec: vector of length k = min(n_rows, n_cols) - * @param matrix: matrix of size n_rows x n_cols + * @param[in] vec: vector of length k = min(n_rows, n_cols) + * @param[out] matrix: matrix of size n_rows x n_cols */ -template -void initialize_diagonal(const raft::handle_t& handle, - raft::device_vector_view vec, - raft::device_matrix_view matrix) +template +void set_diagonal(const raft::handle_t& handle, + raft::device_vector_view vec, + raft::device_matrix_view matrix) { + RAFT_EXPECTS(vec.extent(0) == std::min(matrix.extent(0), matrix.extent(1)), + "Diagonal vector must be min(matrix.n_rows, matrix.n_cols)"); + 
detail::initializeDiagonalMatrix(vec.data_handle(), matrix.data_handle(), matrix.extent(0), @@ -39,15 +42,35 @@ void initialize_diagonal(const raft::handle_t& handle, handle.get_stream()); } +/** + * @brief Initialize a diagonal matrix with a vector + * @param[in] matrix: matrix of size n_rows x n_cols + * @param[out] vec: vector of length k = min(n_rows, n_cols) + */ +template +void get_diagonal(const raft::handle_t& handle, + raft::device_matrix_view matrix, + raft::device_vector_view vec) +{ + RAFT_EXPECTS(vec.extent(0) == std::min(matrix.extent(0), matrix.extent(1)), + "Diagonal vector must be min(matrix.n_rows, matrix.n_cols)"); + detail::getDiagonalMatrix(vec.data_handle(), + matrix.data_handle(), + matrix.extent(0), + matrix.extent(1), + handle.get_stream()); +} + /** * @brief Take reciprocal of elements on diagonal of square matrix (in-place) - * @param in: square input matrix with size len x len + * @param[inout] inout: square input matrix with size len x len */ -template +template void invert_diagonal(const raft::handle_t& handle, - raft::device_matrix_view in) + raft::device_matrix_view inout) { - RAFT_EXPECTS(in.extent(0) == in.extent(1), "Matrix must be square."); - detail::getDiagonalInverseMatrix(in.data_handle(), in.extent(0), handle.get_stream()); + // TODO: Use get_diagonal for this to support rectangular + RAFT_EXPECTS(inout.extent(0) == inout.extent(1), "Matrix must be square."); + detail::getDiagonalInverseMatrix(inout.data_handle(), inout.extent(0), handle.get_stream()); } } // namespace raft::matrix \ No newline at end of file diff --git a/cpp/include/raft/matrix/gather.cuh b/cpp/include/raft/matrix/gather.cuh index fa6e73de49..12b0b94fa5 100644 --- a/cpp/include/raft/matrix/gather.cuh +++ b/cpp/include/raft/matrix/gather.cuh @@ -221,8 +221,8 @@ template in, raft::device_matrix_view out, - raft::device_vector_view map, - raft::device_vector_view stencil, + raft::device_vector_view map, + raft::device_vector_view stencil, unary_pred_t pred_op) 
{ RAFT_EXPECTS(out.extent(0) == map.extent(0), diff --git a/cpp/include/raft/matrix/init.cuh b/cpp/include/raft/matrix/init.cuh index e3a6c09fe6..5810ffb502 100644 --- a/cpp/include/raft/matrix/init.cuh +++ b/cpp/include/raft/matrix/init.cuh @@ -18,7 +18,7 @@ #include #include -#include +#include #include namespace raft::matrix { @@ -32,10 +32,10 @@ namespace raft::matrix { * @param[out] out output matrix. The result is stored in the out matrix * @param[in] scalar scalar value to fill matrix elements */ -template +template void fill(const raft::handle_t& handle, - raft::device_matrix_view in, - raft::device_matrix_view out, + raft::device_mdspan in, + raft::device_mdspan out, raft::host_scalar_view scalar) { RAFT_EXPECTS(in.size() == out.size(), "Input and output matrices must be the same size."); diff --git a/cpp/include/raft/matrix/matrix_vector.cuh b/cpp/include/raft/matrix/matrix_vector.cuh deleted file mode 100644 index 5d05d03d2c..0000000000 --- a/cpp/include/raft/matrix/matrix_vector.cuh +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include "detail/matrix.cuh" -#include - -namespace raft::matrix { - -/** - * @brief multiply each row or column of matrix with vector, skipping zeros in vector - * @param data input matrix, results are in-place - * @param vec input vector - * @param n_row number of rows of input matrix - * @param n_col number of columns of input matrix - * @param rowMajor whether matrix is row major - * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns - * @param stream cuda stream - */ -template -void binary_mult_skip_zero(Type* data, - const Type* vec, - IdxType n_row, - IdxType n_col, - bool rowMajor, - bool bcastAlongRows, - cudaStream_t stream) -{ - detail::matrixVectorBinaryMultSkipZero( - data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream); -} - -/** - * @brief divide each row or column of matrix with vector - * @param data input matrix, results are in-place - * @param vec input vector - * @param n_row number of rows of input matrix - * @param n_col number of columns of input matrix - * @param rowMajor whether matrix is row major - * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns - * @param stream cuda stream - */ -template -void binary_div(Type* data, - const Type* vec, - IdxType n_row, - IdxType n_col, - bool rowMajor, - bool bcastAlongRows, - cudaStream_t stream) -{ - detail::matrixVectorBinaryDiv( - data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream); -} - -/** - * @brief divide each row or column of matrix with vector, skipping zeros in vector - * @param data input matrix, results are in-place - * @param vec input vector - * @param n_row number of rows of input matrix - * @param n_col number of columns of input matrix - * @param rowMajor whether matrix is row major - * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns - * @param stream cuda stream - * @param return_zero result is zero if true and vector value is below threshold, 
original value if - * false - */ -template -void binary_div_skip_zero(Type* data, - const Type* vec, - IdxType n_row, - IdxType n_col, - bool rowMajor, - bool bcastAlongRows, - cudaStream_t stream, - bool return_zero = false) -{ - detail::matrixVectorBinaryDivSkipZero( - data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream, return_zero); -} - -/** - * @brief add each row or column of matrix with vector - * @param data input matrix, results are in-place - * @param vec input vector - * @param n_row number of rows of input matrix - * @param n_col number of columns of input matrix - * @param rowMajor whether matrix is row major - * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns - * @param stream cuda stream - */ -template -void binary_add(Type* data, - const Type* vec, - IdxType n_row, - IdxType n_col, - bool rowMajor, - bool bcastAlongRows, - cudaStream_t stream) -{ - detail::matrixVectorBinaryAdd( - data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream); -} - -/** - * @brief subtract each row or column of matrix with vector - * @param data input matrix, results are in-place - * @param vec input vector - * @param n_row number of rows of input matrix - * @param n_col number of columns of input matrix - * @param rowMajor whether matrix is row major - * @param bcastAlongRows whether to broadcast vector along rows of matrix or columns - * @param stream cuda stream - */ -template -void binary_sub(Type* data, - const Type* vec, - IdxType n_row, - IdxType n_col, - bool rowMajor, - bool bcastAlongRows, - cudaStream_t stream) -{ - detail::matrixVectorBinarySub( - data, vec, n_row, n_col, rowMajor, bcastAlongRows, stream); -} - -} // namespace raft::matrix \ No newline at end of file diff --git a/cpp/include/raft/matrix/slice.cuh b/cpp/include/raft/matrix/slice.cuh index 9e8f840c77..b556b2f903 100644 --- a/cpp/include/raft/matrix/slice.cuh +++ b/cpp/include/raft/matrix/slice.cuh @@ -42,6 +42,13 @@ void slice(const raft::handle_t& 
handle, idx_t x2, idx_t y2) { + RAFT_EXPECTS(x2 > x1, "x2 must be > x1"); + RAFT_EXPECTS(y2 > y1, "y2 must be > y1"); + RAFT_EXPECTS(x1 >= 0, "x1 must be >= 0"); + RAFT_EXPECTS(x2 <= in.extents(0), "x2 must be <= number of rows in the input matrix") + RAFT_EXPECTS(y1 >= 0, "y1 must be >= 0"); + RAFT_EXPECTS(y2 <= in.extents(1), "y2 must be <= number of columns in the input matrix"); + detail::sliceMatrix(in.data_handle(), in.extent(0), in.extent(1), diff --git a/cpp/include/raft/matrix/triangular.cuh b/cpp/include/raft/matrix/triangular.cuh index a7f68bfeef..b385201859 100644 --- a/cpp/include/raft/matrix/triangular.cuh +++ b/cpp/include/raft/matrix/triangular.cuh @@ -28,7 +28,7 @@ namespace raft::matrix { * @param[in] src: input matrix with a size of n_rows x n_cols * @param[out] dst: output matrix with a size of kxk, k = min(n_rows, n_cols) */ -template +template void upper_triangular(const raft::handle_t& handle, raft::device_matrix_view src, raft::device_matrix_view dst) diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 0c9b721294..2d8468d5f7 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -157,6 +157,8 @@ if(BUILD_TESTS) ConfigureTest(NAME MATRIX_TEST PATH + test/matrix/argmax.cu + test/matrix/diagonal.cu test/matrix/gather.cu test/matrix/math.cu test/matrix/matrix.cu diff --git a/cpp/test/matrix/argmax.cu b/cpp/test/matrix/argmax.cu index bd375c60a5..fbd8bc7bb8 100644 --- a/cpp/test/matrix/argmax.cu +++ b/cpp/test/matrix/argmax.cu @@ -15,6 +15,7 @@ */ #include "../test_utils.h" +#include #include #include #include @@ -28,12 +29,12 @@ template struct ArgMaxInputs { std::vector input_matrix; std::vector output_matrix; - int n_cols; - int n_rows; + std::size_t n_cols; + std::size_t n_rows; }; -template -::std::ostream& operator<<(::std::ostream& os, const ArgMaxInputs& dims) +template +::std::ostream& operator<<(::std::ostream& os, const ArgMaxInputs& dims) { return os; } @@ -43,30 +44,40 @@ class ArgMaxTest : public 
::testing::TestWithParam> { public: ArgMaxTest() : params(::testing::TestWithParam>::GetParam()), - input(std::move(raft::make_device_matrix(handle, params.n_rows, params.n_cols))), - output(std::move(raft::make_device_vector(handle, params.n_rows))), - expected(std::move(raft::make_device_vector(handle, params.n_rows))) + input(raft::make_device_matrix( + handle, params.n_rows, params.n_cols)), + output(raft::make_device_vector(handle, params.n_rows)), + expected(raft::make_device_vector(handle, params.n_rows)) { - raft::copy(input.data_handle(), params.input_matrix.data(), params.n_rows * params.n_cols); - raft::copy(expected.data_handle(), params.output_matrix.data(), params.n_rows * params.n_cols); + raft::copy(input.data_handle(), + params.input_matrix.data(), + params.n_rows * params.n_cols, + handle.get_stream()); + raft::copy(expected.data_handle(), + params.output_matrix.data(), + params.n_rows * params.n_cols, + handle.get_stream()); + + auto input_const_view = raft::make_device_matrix_view( + input.data_handle(), input.extent(0), input.extent(1)); - raft::matrix::argmax(handle, input, output); + raft::matrix::argmax(handle, input_const_view, output.view()); } protected: raft::handle_t handle; - ArgMaxInputs params; + ArgMaxInputs params; - raft::device_matrix input; - raft::device_vector output; - raft::device_vector expected; + raft::device_matrix input; + raft::device_vector output; + raft::device_vector expected; }; const std::vector> inputsf = { - {0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, {0.2f, 0.3f, 0.5f, 0.0f}, {3, 0, 2}, 3, 4}; + {{0.1f, 0.4f, 0.2f, 0.2f, 0.3f, 0.3f, 0.3f, 0.2f, 0.5f, 0.4f, 0.1f, 0.0f}, {3, 0, 2}, 3, 4}}; const std::vector> inputsd = { - {0.1, 0.2, 0.3, 0.4}, {0.4, 0.3, 0.2, 0.1}, {0.2, 0.3, 0.5, 0.0}, {3, 0, 2}, 3, 4}; + {{0.1, 0.4, 0.2, 0.2, 0.3, 0.3, 0.3, 0.2, 0.5, 0.4, 0.1, 0.0}, {3, 0, 2}, 3, 4}}; typedef ArgMaxTest ArgMaxTestF; TEST_P(ArgMaxTestF, Result) @@ -88,9 +99,9 @@ TEST_P(ArgMaxTestD, Result) 
handle.get_stream())); } -INSTANTIATE_TEST_SUITE_P(ArgMaxTest, ArgMaxTestTestF, ::testing::ValuesIn(inputsf)); +INSTANTIATE_TEST_SUITE_P(ArgMaxTest, ArgMaxTestF, ::testing::ValuesIn(inputsf)); -INSTANTIATE_TEST_SUITE_P(ArgMaxTest, ArgMaxTestTestD, ::testing::ValuesIn(inputsd)); +INSTANTIATE_TEST_SUITE_P(ArgMaxTest, ArgMaxTestD, ::testing::ValuesIn(inputsd)); } // namespace matrix } // namespace raft \ No newline at end of file diff --git a/cpp/test/matrix/diagonal.cu b/cpp/test/matrix/diagonal.cu new file mode 100644 index 0000000000..e1ad9e144b --- /dev/null +++ b/cpp/test/matrix/diagonal.cu @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../test_utils.h" +#include +#include +#include +#include +#include +#include + +namespace raft { +namespace matrix { + +template +struct DiagonalInputs { + int n_rows; + int n_cols; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const DiagonalInputs& dims) +{ + return os; +} + +template +class DiagonalTest : public ::testing::TestWithParam> { + public: + DiagonalTest() + : params(::testing::TestWithParam>::GetParam()), + input(raft::make_device_matrix(handle, params.n_rows, params.n_cols)), + diag_expected(raft::make_device_vector(handle, diag_size)), + diag_actual(raft::make_device_vector(handle, diag_size)), + diag_size(std::min(params.n_rows, params.n_cols)) + { + T mat_fill_scalar = 1.0; + T diag_fill_scalar = 5.0; + + auto input_view = raft::make_device_matrix_view( + input.data_handle(), input.extent(0), input.extent(1)); + auto diag_expected_view = + raft::make_device_vector_view(diag_expected.data_handle(), diag_size); + + raft::matrix::fill( + handle, input_view, input.view(), raft::make_host_scalar_view(&mat_fill_scalar)); + raft::matrix::fill(handle, + diag_expected_view, + diag_expected.view(), + raft::make_host_scalar_view(&diag_fill_scalar)); + + handle.sync_stream(); + + raft::matrix::set_diagonal(handle, diag_expected_view, input.view()); + + handle.sync_stream(); + + raft::matrix::get_diagonal(handle, input_view, diag_actual.view()); + + handle.sync_stream(); + } + + protected: + raft::handle_t handle; + DiagonalInputs params; + + int diag_size; + + raft::device_matrix input; + raft::device_vector diag_expected; + raft::device_vector diag_actual; +}; + +const std::vector> inputsf = {{4, 4}, {4, 10}, {10, 4}}; + +const std::vector> inputsd = {{4, 4}, {4, 10}, {10, 4}}; + +typedef DiagonalTest DiagonalTestF; +TEST_P(DiagonalTestF, Result) +{ + ASSERT_TRUE(devArrMatch(diag_expected.data_handle(), + diag_actual.data_handle(), + diag_size, + Compare(), + handle.get_stream())); +} + +typedef DiagonalTest DiagonalTestD; 
+TEST_P(DiagonalTestD, Result) +{ + ASSERT_TRUE(devArrMatch(diag_expected.data_handle(), + diag_actual.data_handle(), + diag_size, + Compare(), + handle.get_stream())); +} + +INSTANTIATE_TEST_SUITE_P(DiagonalTest, DiagonalTestF, ::testing::ValuesIn(inputsf)); + +INSTANTIATE_TEST_SUITE_P(DiagonalTest, DiagonalTestD, ::testing::ValuesIn(inputsd)); + +} // namespace matrix +} // namespace raft \ No newline at end of file From 1123262d482ece1764be51489df9b8259b0e40fe Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 4 Oct 2022 21:33:08 -0400 Subject: [PATCH 03/13] iFixing argmax to be row major --- cpp/include/raft/matrix/argmax.cuh | 6 ++--- cpp/test/matrix/argmax.cu | 38 ++++++++++++++++-------------- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/cpp/include/raft/matrix/argmax.cuh b/cpp/include/raft/matrix/argmax.cuh index 2c4908f712..5cfd3eb1bf 100644 --- a/cpp/include/raft/matrix/argmax.cuh +++ b/cpp/include/raft/matrix/argmax.cuh @@ -30,11 +30,11 @@ namespace raft::matrix { */ template void argmax(const raft::handle_t& handle, - raft::device_matrix_view in, + raft::device_matrix_view in, raft::device_vector_view out) { - RAFT_EXPECTS(out.extent(1) == in.extent(1), - "Size of output vector must equal number of columns in input matrix."); + RAFT_EXPECTS(out.extent(0) == in.extent(0), + "Size of output vector must equal number of rows in input matrix."); detail::argmax( in.data_handle(), in.extent(0), in.extent(1), out.data_handle(), handle.get_stream()); } diff --git a/cpp/test/matrix/argmax.cu b/cpp/test/matrix/argmax.cu index fbd8bc7bb8..9568c06d93 100644 --- a/cpp/test/matrix/argmax.cu +++ b/cpp/test/matrix/argmax.cu @@ -44,46 +44,48 @@ class ArgMaxTest : public ::testing::TestWithParam> { public: ArgMaxTest() : params(::testing::TestWithParam>::GetParam()), - input(raft::make_device_matrix( + input(raft::make_device_matrix( handle, params.n_rows, params.n_cols)), output(raft::make_device_vector(handle, params.n_rows)), 
expected(raft::make_device_vector(handle, params.n_rows)) { - raft::copy(input.data_handle(), - params.input_matrix.data(), - params.n_rows * params.n_cols, - handle.get_stream()); - raft::copy(expected.data_handle(), - params.output_matrix.data(), - params.n_rows * params.n_cols, - handle.get_stream()); - - auto input_const_view = raft::make_device_matrix_view( + raft::update_device(input.data_handle(), + params.input_matrix.data(), + params.input_matrix.size(), + handle.get_stream()); + raft::update_device(expected.data_handle(), + params.output_matrix.data(), + params.output_matrix.size(), + handle.get_stream()); + + auto input_const_view = raft::make_device_matrix_view( input.data_handle(), input.extent(0), input.extent(1)); raft::matrix::argmax(handle, input_const_view, output.view()); + + handle.sync_stream(); } protected: raft::handle_t handle; ArgMaxInputs params; - raft::device_matrix input; + raft::device_matrix input; raft::device_vector output; raft::device_vector expected; }; const std::vector> inputsf = { - {{0.1f, 0.4f, 0.2f, 0.2f, 0.3f, 0.3f, 0.3f, 0.2f, 0.5f, 0.4f, 0.1f, 0.0f}, {3, 0, 2}, 3, 4}}; + {{0.1f, 0.2f, 0.3f, 0.4f, 0.4f, 0.3f, 0.2f, 0.1f, 0.2f, 0.3f, 0.5f, 0.0f}, {3, 0, 2}, 3, 4}}; const std::vector> inputsd = { - {{0.1, 0.4, 0.2, 0.2, 0.3, 0.3, 0.3, 0.2, 0.5, 0.4, 0.1, 0.0}, {3, 0, 2}, 3, 4}}; + {{0.1, 0.2, 0.3, 0.4, 0.4, 0.3, 0.2, 0.1, 0.2, 0.3, 0.5, 0.0}, {3, 0, 2}, 3, 4}}; typedef ArgMaxTest ArgMaxTestF; TEST_P(ArgMaxTestF, Result) { - ASSERT_TRUE(devArrMatch(output.data_handle(), - expected.data_handle(), + ASSERT_TRUE(devArrMatch(expected.data_handle(), + output.data_handle(), params.n_rows, Compare(), handle.get_stream())); @@ -92,8 +94,8 @@ TEST_P(ArgMaxTestF, Result) typedef ArgMaxTest ArgMaxTestD; TEST_P(ArgMaxTestD, Result) { - ASSERT_TRUE(devArrMatch(output.data_handle(), - expected.data_handle(), + ASSERT_TRUE(devArrMatch(expected.data_handle(), + output.data_handle(), params.n_rows, Compare(), handle.get_stream())); From 
e24e850ab098d578763494e74850917199e99622 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Sat, 8 Oct 2022 21:18:56 -0400 Subject: [PATCH 04/13] fixing docs --- cpp/include/raft/matrix/diagonal.cuh | 3 +++ cpp/include/raft/matrix/norm.cuh | 5 ++--- cpp/include/raft/matrix/reverse.cuh | 16 ++++------------ 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/cpp/include/raft/matrix/diagonal.cuh b/cpp/include/raft/matrix/diagonal.cuh index e546982fa9..d83c932fcd 100644 --- a/cpp/include/raft/matrix/diagonal.cuh +++ b/cpp/include/raft/matrix/diagonal.cuh @@ -24,6 +24,7 @@ namespace raft::matrix { /** * @brief Initialize a diagonal matrix with a vector + * @param[in] handle: raft handle * @param[in] vec: vector of length k = min(n_rows, n_cols) * @param[out] matrix: matrix of size n_rows x n_cols */ @@ -44,6 +45,7 @@ void set_diagonal(const raft::handle_t& handle, /** * @brief Initialize a diagonal matrix with a vector + * @param handle: raft handle * @param[in] matrix: matrix of size n_rows x n_cols * @param[out] vec: vector of length k = min(n_rows, n_cols) */ @@ -63,6 +65,7 @@ void get_diagonal(const raft::handle_t& handle, /** * @brief Take reciprocal of elements on diagonal of square matrix (in-place) + * @param handle raft handle * @param[inout] inout: square input matrix with size len x len */ template diff --git a/cpp/include/raft/matrix/norm.cuh b/cpp/include/raft/matrix/norm.cuh index c19bde8828..3ad5faccda 100644 --- a/cpp/include/raft/matrix/norm.cuh +++ b/cpp/include/raft/matrix/norm.cuh @@ -24,10 +24,9 @@ namespace raft::matrix { /** * @brief Get the L2/F-norm of a matrix - * @param handle + * @param handle: raft handle * @param in: input matrix/vector with totally size elements - * @param size: size of the matrix/vector - * @param stream: cuda stream + * @returns matrix l2 norm */ template m_t l2_norm(const raft::handle_t& handle, raft::device_mdspan in) diff --git a/cpp/include/raft/matrix/reverse.cuh b/cpp/include/raft/matrix/reverse.cuh 
index 6bdc381a34..c1025577a7 100644 --- a/cpp/include/raft/matrix/reverse.cuh +++ b/cpp/include/raft/matrix/reverse.cuh @@ -25,10 +25,8 @@ namespace raft::matrix { /** * @brief Columns of a column major matrix are reversed in place (i.e. first column and * last column are swapped) + * @param handle: raft handle * @param inout: input and output matrix - * @param n_rows: number of rows of input matrix - * @param n_cols: number of columns of input matrix - * @param stream: cuda stream */ template void col_reverse(const raft::handle_t& handle, @@ -40,10 +38,8 @@ void col_reverse(const raft::handle_t& handle, /** * @brief Columns of a column major matrix are reversed in place (i.e. first column and * last column are swapped) + * @param handle: raft handle * @param inout: input and output matrix - * @param n_rows: number of rows of input matrix - * @param n_cols: number of columns of input matrix - * @param stream: cuda stream */ template void col_reverse(const raft::handle_t& handle, @@ -55,10 +51,8 @@ void col_reverse(const raft::handle_t& handle, /** * @brief Rows of a column major matrix are reversed in place (i.e. first row and last * row are swapped) + * @param handle: raft handle * @param inout: input and output matrix - * @param n_rows: number of rows of input matrix - * @param n_cols: number of columns of input matrix - * @param stream: cuda stream */ template void row_reverse(const raft::handle_t& handle, @@ -70,10 +64,8 @@ void row_reverse(const raft::handle_t& handle, /** * @brief Rows of a column major matrix are reversed in place (i.e. 
first row and last * row are swapped) + * @param handle: raft handle * @param inout: input and output matrix - * @param n_rows: number of rows of input matrix - * @param n_cols: number of columns of input matrix - * @param stream: cuda stream */ template void row_reverse(const raft::handle_t& handle, From bf29eafc5b6324427aadbcb1c3d76b111d3f7369 Mon Sep 17 00:00:00 2001 From: Mickael Ide Date: Wed, 12 Oct 2022 15:43:27 +0200 Subject: [PATCH 05/13] Add `matrix::norm` test --- cpp/include/raft/matrix/detail/matrix.cuh | 6 +- cpp/include/raft/matrix/norm.cuh | 2 +- cpp/test/CMakeLists.txt | 1 + cpp/test/matrix/math.cu | 8 +- cpp/test/matrix/norm.cu | 128 ++++++++++++++++++++++ 5 files changed, 138 insertions(+), 7 deletions(-) create mode 100644 cpp/test/matrix/norm.cu diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh index 3408364b58..ddcbb7ce55 100644 --- a/cpp/include/raft/matrix/detail/matrix.cuh +++ b/cpp/include/raft/matrix/detail/matrix.cuh @@ -300,11 +300,13 @@ void getDiagonalInverseMatrix(m_t* in, idx_t len, cudaStream_t stream) } template -m_t getL2Norm(const raft::handle_t& handle, m_t* in, idx_t size, cudaStream_t stream) +m_t getL2Norm(const raft::handle_t& handle, const m_t* in, idx_t size, cudaStream_t stream) { cublasHandle_t cublasH = handle.get_cublas_handle(); m_t normval = 0; - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasnrm2(cublasH, size, in, 1, &normval, stream)); + RAFT_EXPECTS(std::is_integral_v && size <= std::numeric_limits::max(), + "Index type not supported"); + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasnrm2(cublasH, static_cast(size), in, 1, &normval, stream)); return normval; } diff --git a/cpp/include/raft/matrix/norm.cuh b/cpp/include/raft/matrix/norm.cuh index 3ad5faccda..559ca2e47e 100644 --- a/cpp/include/raft/matrix/norm.cuh +++ b/cpp/include/raft/matrix/norm.cuh @@ -29,7 +29,7 @@ namespace raft::matrix { * @returns matrix l2 norm */ template -m_t l2_norm(const raft::handle_t& 
handle, raft::device_mdspan in) +m_t l2_norm(const raft::handle_t& handle, raft::device_mdspan in) { return detail::getL2Norm(handle, in.data_handle(), in.size(), handle.get_stream()); } diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 2d8468d5f7..cdb2f67822 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -162,6 +162,7 @@ if(BUILD_TESTS) test/matrix/gather.cu test/matrix/math.cu test/matrix/matrix.cu + test/matrix/norm.cu test/matrix/columnSort.cu test/matrix/linewise_op.cu test/spectral_matrix.cu diff --git a/cpp/test/matrix/math.cu b/cpp/test/matrix/math.cu index ad4a37825c..684b550dfc 100644 --- a/cpp/test/matrix/math.cu +++ b/cpp/test/matrix/math.cu @@ -32,7 +32,7 @@ namespace raft { namespace matrix { template -__global__ void nativePowerKernel(Type* in, Type* out, int len) +__global__ void naivePowerKernel(Type* in, Type* out, int len) { int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < len) { out[idx] = in[idx] * in[idx]; } @@ -43,12 +43,12 @@ void naivePower(Type* in, Type* out, int len, cudaStream_t stream) { static const int TPB = 64; int nblks = raft::ceildiv(len, TPB); - nativePowerKernel<<>>(in, out, len); + naivePowerKernel<<>>(in, out, len); RAFT_CUDA_TRY(cudaPeekAtLastError()); } template -__global__ void nativeSqrtKernel(Type* in, Type* out, int len) +__global__ void naiveSqrtKernel(Type* in, Type* out, int len) { int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < len) { out[idx] = std::sqrt(in[idx]); } @@ -59,7 +59,7 @@ void naiveSqrt(Type* in, Type* out, int len, cudaStream_t stream) { static const int TPB = 64; int nblks = raft::ceildiv(len, TPB); - nativeSqrtKernel<<>>(in, out, len); + naiveSqrtKernel<<>>(in, out, len); RAFT_CUDA_TRY(cudaPeekAtLastError()); } diff --git a/cpp/test/matrix/norm.cu b/cpp/test/matrix/norm.cu new file mode 100644 index 0000000000..e84742adbe --- /dev/null +++ b/cpp/test/matrix/norm.cu @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + #include "../test_utils.h" + #include + #include + #include + #include + #include + + namespace raft { + namespace matrix { + + template + struct NormInputs { + T tolerance; + int rows, cols; + unsigned long long int seed; + }; + + template + ::std::ostream& operator<<(::std::ostream& os, const NormInputs& I) + { + os << "{ " << I.tolerance << ", " << I.rows << ", " << I.cols << ", " + << I.seed << '}' << std::endl; + return os; + } + + template + Type naiveNorm( + const Type* data, int D, int N) + { + Type out_scalar = 0; + for (int i = 0; i < N * D; ++i) { + out_scalar += data[i] * data[i]; + } + out_scalar = std::sqrt(out_scalar); + return out_scalar; + } + + template + class NormTest : public ::testing::TestWithParam> { + public: + NormTest() + : params(::testing::TestWithParam>::GetParam()), + stream(handle.get_stream()), + data(params.rows * params.cols, stream) + { + } + + void SetUp() override + { + raft::random::RngState r(params.seed); + int rows = params.rows, cols = params.cols, len = rows * cols; + uniform(handle, r, data.data(), len, T(-1.0), T(1.0)); + std::vector h_data(rows*cols); + raft::update_host(h_data.data(), data.data(), rows*cols, stream); + out_scalar_exp = naiveNorm(h_data.data(), cols, rows); + auto input = raft::make_device_matrix_view( + data.data(), params.rows, params.cols); + out_scalar_act = l2_norm(handle, input); + handle.sync_stream(stream); + } + + protected: + 
raft::handle_t handle; + cudaStream_t stream; + + NormInputs params; + rmm::device_uvector data; + T out_scalar_exp = 0; + T out_scalar_act = 0; + }; + + ///// Row- and column-wise tests + const std::vector> inputsf = {{0.00001f, 32, 1024, 1234ULL}, + {0.00001f, 64, 1024, 1234ULL}, + {0.00001f, 128, 1024, 1234ULL}, + {0.00001f, 256, 1024, 1234ULL}, + {0.00001f, 512, 512, 1234ULL}, + {0.00001f, 1024, 32, 1234ULL}, + {0.00001f, 1024, 64, 1234ULL}, + {0.00001f, 1024, 128, 1234ULL}, + {0.00001f, 1024, 256, 1234ULL}}; + + const std::vector> inputsd = { + {0.00000001, 32, 1024, 1234ULL}, + {0.00000001, 64, 1024, 1234ULL}, + {0.00000001, 128, 1024, 1234ULL}, + {0.00000001, 256, 1024, 1234ULL}, + {0.00000001, 512, 512, 1234ULL}, + {0.00000001, 1024, 32, 1234ULL}, + {0.00000001, 1024, 64, 1234ULL}, + {0.00000001, 1024, 128, 1234ULL}, + {0.00000001, 1024, 256, 1234ULL},}; + + typedef NormTest NormTestF; + TEST_P(NormTestF, Result) + { + ASSERT_NEAR(out_scalar_exp, out_scalar_act, params.tolerance); + } + + typedef NormTest NormTestD; + TEST_P(NormTestD, Result) + { + ASSERT_NEAR(out_scalar_exp, out_scalar_act, params.tolerance); + } + + INSTANTIATE_TEST_CASE_P(NormTests, NormTestF, ::testing::ValuesIn(inputsf)); + + INSTANTIATE_TEST_CASE_P(NormTests, NormTestD, ::testing::ValuesIn(inputsd)); + + } // end namespace matrix + } // end namespace raft + \ No newline at end of file From cae83e75bada09ea9d308539327146f010579adf Mon Sep 17 00:00:00 2001 From: Mickael Ide Date: Wed, 12 Oct 2022 19:24:10 +0200 Subject: [PATCH 06/13] Add slice test --- cpp/test/matrix/norm.cu | 6 +- cpp/test/matrix/slice.cu | 142 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 145 insertions(+), 3 deletions(-) create mode 100644 cpp/test/matrix/slice.cu diff --git a/cpp/test/matrix/norm.cu b/cpp/test/matrix/norm.cu index e84742adbe..64614495d4 100644 --- a/cpp/test/matrix/norm.cu +++ b/cpp/test/matrix/norm.cu @@ -65,7 +65,7 @@ { raft::random::RngState r(params.seed); int rows = params.rows, 
cols = params.cols, len = rows * cols; - uniform(handle, r, data.data(), len, T(-1.0), T(1.0)); + uniform(handle, r, data.data(), len, T(-10.0), T(10.0)); std::vector h_data(rows*cols); raft::update_host(h_data.data(), data.data(), rows*cols, stream); out_scalar_exp = naiveNorm(h_data.data(), cols, rows); @@ -110,13 +110,13 @@ typedef NormTest NormTestF; TEST_P(NormTestF, Result) { - ASSERT_NEAR(out_scalar_exp, out_scalar_act, params.tolerance); + ASSERT_NEAR(out_scalar_exp, out_scalar_act, params.tolerance * params.rows * params.cols); } typedef NormTest NormTestD; TEST_P(NormTestD, Result) { - ASSERT_NEAR(out_scalar_exp, out_scalar_act, params.tolerance); + ASSERT_NEAR(out_scalar_exp, out_scalar_act, params.tolerance * params.rows * params.cols); } INSTANTIATE_TEST_CASE_P(NormTests, NormTestF, ::testing::ValuesIn(inputsf)); diff --git a/cpp/test/matrix/slice.cu b/cpp/test/matrix/slice.cu new file mode 100644 index 0000000000..6ffa0c9045 --- /dev/null +++ b/cpp/test/matrix/slice.cu @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + #include "../test_utils.h" + #include + #include + #include + #include + #include + + namespace raft { + namespace matrix { + + template + struct SliceInputs { + T tolerance; + int rows, cols; + unsigned long long int seed; + }; + + template + ::std::ostream& operator<<(::std::ostream& os, const SliceInputs& I) + { + os << "{ " << I.tolerance << ", " << I.rows << ", " << I.cols << ", " + << I.seed << '}' << std::endl; + return os; + } + +// Col-major slice reference test + template + void naiveSlice( + const Type* in, Type* out, int rows, int cols, int x1, int y1, int x2, int y2) +{ + int out_rows = x2 - x1; + // int out_cols = y2 - y1; + for (int j = y1; j < y2; ++j) { + for (int i = x1; i < x2; ++i) { + out[(i - x1) + (j - y1) * out_rows] = in[i + j * rows]; + } + } +} + + template + class SliceTest : public ::testing::TestWithParam> { + public: + SliceTest() + : params(::testing::TestWithParam>::GetParam()), + stream(handle.get_stream()), + data(params.rows * params.cols, stream) + { + } + + void SetUp() override + { + std::random_device rd; + std::default_random_engine dre(rd()); + raft::random::RngState r(params.seed); + int rows = params.rows, cols = params.cols, len = rows * cols; + uniform(handle, r, data.data(), len, T(-10.0), T(10.0)); + + std::uniform_int_distribution rowGenerator(0, rows / 2); + int row1 = rowGenerator(dre); + int row2 = rowGenerator(dre) + rows / 2; + + std::uniform_int_distribution colGenerator(0, cols / 2); + int col1 = colGenerator(dre); + int col2 = colGenerator(dre) + cols / 2; + + std::vector h_data(rows*cols); + raft::update_host(h_data.data(), data.data(), rows*cols, stream); + exp_result = naiveSlice(h_data.data(), rows, cols, row1, col1, row2, col2); + auto input = raft::make_device_matrix_view( + data.data(), params.rows, params.cols, row1, col1, row2, col2); + act_result = slice(handle, input, output); + handle.sync_stream(stream); + } + + protected: + raft::handle_t handle; + cudaStream_t stream; + + SliceInputs 
params; + rmm::device_uvector data; + rmm::device_uvector exp_result, act_result; + }; + + ///// Row- and column-wise tests + const std::vector> inputsf = {{0.00001f, 32, 1024, 1234ULL}, + {0.00001f, 64, 1024, 1234ULL}, + {0.00001f, 128, 1024, 1234ULL}, + {0.00001f, 256, 1024, 1234ULL}, + {0.00001f, 512, 512, 1234ULL}, + {0.00001f, 1024, 32, 1234ULL}, + {0.00001f, 1024, 64, 1234ULL}, + {0.00001f, 1024, 128, 1234ULL}, + {0.00001f, 1024, 256, 1234ULL}}; + + const std::vector> inputsd = { + {0.00000001, 32, 1024, 1234ULL}, + {0.00000001, 64, 1024, 1234ULL}, + {0.00000001, 128, 1024, 1234ULL}, + {0.00000001, 256, 1024, 1234ULL}, + {0.00000001, 512, 512, 1234ULL}, + {0.00000001, 1024, 32, 1234ULL}, + {0.00000001, 1024, 64, 1234ULL}, + {0.00000001, 1024, 128, 1234ULL}, + {0.00000001, 1024, 256, 1234ULL},}; + + typedef SliceTest SliceTestF; + TEST_P(SliceTestF, Result) + { + ASSERT_NEAR(exp_result, act_result, params.tolerance); + ASSERT_TRUE(devArrMatch( + d_out_exp.data(), d_out_act.data(), params.map_length * params.ncols, raft::Compare())); + } + + typedef SliceTest SliceTestD; + TEST_P(SliceTestD, Result) + { + ASSERT_NEAR(exp_result, act_result, params.tolerance); + } + + INSTANTIATE_TEST_CASE_P(SliceTests, SliceTestF, ::testing::ValuesIn(inputsf)); + + INSTANTIATE_TEST_CASE_P(SliceTests, SliceTestD, ::testing::ValuesIn(inputsd)); + + } // end namespace matrix + } // end namespace raft + \ No newline at end of file From f07019a3ff56fc3d4b4f8b72b2a068a7f138cd76 Mon Sep 17 00:00:00 2001 From: Mickael Ide Date: Mon, 17 Oct 2022 17:48:04 +0200 Subject: [PATCH 07/13] Complete slice test and add constness --- cpp/include/raft/matrix/detail/matrix.cuh | 7 +- cpp/include/raft/matrix/slice.cuh | 6 +- cpp/test/matrix/norm.cu | 223 ++++++++++--------- cpp/test/matrix/slice.cu | 249 +++++++++++----------- 4 files changed, 244 insertions(+), 241 deletions(-) diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh index 
ddcbb7ce55..fd57d21621 100644 --- a/cpp/include/raft/matrix/detail/matrix.cuh +++ b/cpp/include/raft/matrix/detail/matrix.cuh @@ -170,7 +170,7 @@ void printHost(const m_t* in, idx_t n_rows, idx_t n_cols) */ template __global__ void slice( - m_t* src_d, idx_t m, idx_t n, m_t* dst_d, idx_t x1, idx_t y1, idx_t x2, idx_t y2) + const m_t* src_d, idx_t m, idx_t n, m_t* dst_d, idx_t x1, idx_t y1, idx_t x2, idx_t y2) { idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; idx_t dm = x2 - x1, dn = y2 - y1; @@ -182,7 +182,7 @@ __global__ void slice( } template -void sliceMatrix(m_t* in, +void sliceMatrix(const m_t* in, idx_t n_rows, idx_t n_cols, m_t* out, @@ -306,7 +306,8 @@ m_t getL2Norm(const raft::handle_t& handle, const m_t* in, idx_t size, cudaStrea m_t normval = 0; RAFT_EXPECTS(std::is_integral_v && size <= std::numeric_limits::max(), "Index type not supported"); - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasnrm2(cublasH, static_cast(size), in, 1, &normval, stream)); + RAFT_CUBLAS_TRY( + raft::linalg::detail::cublasnrm2(cublasH, static_cast(size), in, 1, &normval, stream)); return normval; } diff --git a/cpp/include/raft/matrix/slice.cuh b/cpp/include/raft/matrix/slice.cuh index b556b2f903..837eacca94 100644 --- a/cpp/include/raft/matrix/slice.cuh +++ b/cpp/include/raft/matrix/slice.cuh @@ -35,7 +35,7 @@ namespace raft::matrix { */ template void slice(const raft::handle_t& handle, - raft::device_matrix_view in, + raft::device_matrix_view in, raft::device_matrix_view out, idx_t x1, idx_t y1, @@ -45,9 +45,9 @@ void slice(const raft::handle_t& handle, RAFT_EXPECTS(x2 > x1, "x2 must be > x1"); RAFT_EXPECTS(y2 > y1, "y2 must be > y1"); RAFT_EXPECTS(x1 >= 0, "x1 must be >= 0"); - RAFT_EXPECTS(x2 <= in.extents(0), "x2 must be <= number of rows in the input matrix") + RAFT_EXPECTS(x2 <= in.extent(0), "x2 must be <= number of rows in the input matrix"); RAFT_EXPECTS(y1 >= 0, "y1 must be >= 0"); - RAFT_EXPECTS(y2 <= in.extents(1), "y2 must be <= number of columns in the input 
matrix"); + RAFT_EXPECTS(y2 <= in.extent(1), "y2 must be <= number of columns in the input matrix"); detail::sliceMatrix(in.data_handle(), in.extent(0), diff --git a/cpp/test/matrix/norm.cu b/cpp/test/matrix/norm.cu index 64614495d4..4047e9a707 100644 --- a/cpp/test/matrix/norm.cu +++ b/cpp/test/matrix/norm.cu @@ -14,115 +14,114 @@ * limitations under the License. */ - #include "../test_utils.h" - #include - #include - #include - #include - #include - - namespace raft { - namespace matrix { - - template - struct NormInputs { - T tolerance; - int rows, cols; - unsigned long long int seed; - }; - - template - ::std::ostream& operator<<(::std::ostream& os, const NormInputs& I) - { - os << "{ " << I.tolerance << ", " << I.rows << ", " << I.cols << ", " - << I.seed << '}' << std::endl; - return os; - } - - template - Type naiveNorm( - const Type* data, int D, int N) - { - Type out_scalar = 0; - for (int i = 0; i < N * D; ++i) { - out_scalar += data[i] * data[i]; - } - out_scalar = std::sqrt(out_scalar); - return out_scalar; - } - - template - class NormTest : public ::testing::TestWithParam> { - public: - NormTest() - : params(::testing::TestWithParam>::GetParam()), - stream(handle.get_stream()), - data(params.rows * params.cols, stream) - { - } - - void SetUp() override - { - raft::random::RngState r(params.seed); - int rows = params.rows, cols = params.cols, len = rows * cols; - uniform(handle, r, data.data(), len, T(-10.0), T(10.0)); - std::vector h_data(rows*cols); - raft::update_host(h_data.data(), data.data(), rows*cols, stream); - out_scalar_exp = naiveNorm(h_data.data(), cols, rows); - auto input = raft::make_device_matrix_view( - data.data(), params.rows, params.cols); - out_scalar_act = l2_norm(handle, input); - handle.sync_stream(stream); - } - - protected: - raft::handle_t handle; - cudaStream_t stream; - - NormInputs params; - rmm::device_uvector data; - T out_scalar_exp = 0; - T out_scalar_act = 0; - }; - - ///// Row- and column-wise tests - const 
std::vector> inputsf = {{0.00001f, 32, 1024, 1234ULL}, - {0.00001f, 64, 1024, 1234ULL}, - {0.00001f, 128, 1024, 1234ULL}, - {0.00001f, 256, 1024, 1234ULL}, - {0.00001f, 512, 512, 1234ULL}, - {0.00001f, 1024, 32, 1234ULL}, - {0.00001f, 1024, 64, 1234ULL}, - {0.00001f, 1024, 128, 1234ULL}, - {0.00001f, 1024, 256, 1234ULL}}; - - const std::vector> inputsd = { - {0.00000001, 32, 1024, 1234ULL}, - {0.00000001, 64, 1024, 1234ULL}, - {0.00000001, 128, 1024, 1234ULL}, - {0.00000001, 256, 1024, 1234ULL}, - {0.00000001, 512, 512, 1234ULL}, - {0.00000001, 1024, 32, 1234ULL}, - {0.00000001, 1024, 64, 1234ULL}, - {0.00000001, 1024, 128, 1234ULL}, - {0.00000001, 1024, 256, 1234ULL},}; - - typedef NormTest NormTestF; - TEST_P(NormTestF, Result) - { - ASSERT_NEAR(out_scalar_exp, out_scalar_act, params.tolerance * params.rows * params.cols); - } - - typedef NormTest NormTestD; - TEST_P(NormTestD, Result) - { - ASSERT_NEAR(out_scalar_exp, out_scalar_act, params.tolerance * params.rows * params.cols); - } - - INSTANTIATE_TEST_CASE_P(NormTests, NormTestF, ::testing::ValuesIn(inputsf)); - - INSTANTIATE_TEST_CASE_P(NormTests, NormTestD, ::testing::ValuesIn(inputsd)); - - } // end namespace matrix - } // end namespace raft - \ No newline at end of file +#include "../test_utils.h" +#include +#include +#include +#include +#include + +namespace raft { +namespace matrix { + +template +struct NormInputs { + T tolerance; + int rows, cols; + unsigned long long int seed; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const NormInputs& I) +{ + os << "{ " << I.tolerance << ", " << I.rows << ", " << I.cols << ", " << I.seed << '}' + << std::endl; + return os; +} + +template +Type naiveNorm(const Type* data, int D, int N) +{ + Type out_scalar = 0; + for (int i = 0; i < N * D; ++i) { + out_scalar += data[i] * data[i]; + } + out_scalar = std::sqrt(out_scalar); + return out_scalar; +} + +template +class NormTest : public ::testing::TestWithParam> { + public: + NormTest() + : 
params(::testing::TestWithParam>::GetParam()), + stream(handle.get_stream()), + data(params.rows * params.cols, stream) + { + } + + void SetUp() override + { + raft::random::RngState r(params.seed); + int rows = params.rows, cols = params.cols, len = rows * cols; + uniform(handle, r, data.data(), len, T(-10.0), T(10.0)); + std::vector h_data(rows * cols); + raft::update_host(h_data.data(), data.data(), rows * cols, stream); + out_scalar_exp = naiveNorm(h_data.data(), cols, rows); + auto input = + raft::make_device_matrix_view(data.data(), params.rows, params.cols); + out_scalar_act = l2_norm(handle, input); + handle.sync_stream(stream); + } + + protected: + raft::handle_t handle; + cudaStream_t stream; + + NormInputs params; + rmm::device_uvector data; + T out_scalar_exp = 0; + T out_scalar_act = 0; +}; + +///// Row- and column-wise tests +const std::vector> inputsf = {{0.00001f, 32, 1024, 1234ULL}, + {0.00001f, 64, 1024, 1234ULL}, + {0.00001f, 128, 1024, 1234ULL}, + {0.00001f, 256, 1024, 1234ULL}, + {0.00001f, 512, 512, 1234ULL}, + {0.00001f, 1024, 32, 1234ULL}, + {0.00001f, 1024, 64, 1234ULL}, + {0.00001f, 1024, 128, 1234ULL}, + {0.00001f, 1024, 256, 1234ULL}}; + +const std::vector> inputsd = { + {0.00000001, 32, 1024, 1234ULL}, + {0.00000001, 64, 1024, 1234ULL}, + {0.00000001, 128, 1024, 1234ULL}, + {0.00000001, 256, 1024, 1234ULL}, + {0.00000001, 512, 512, 1234ULL}, + {0.00000001, 1024, 32, 1234ULL}, + {0.00000001, 1024, 64, 1234ULL}, + {0.00000001, 1024, 128, 1234ULL}, + {0.00000001, 1024, 256, 1234ULL}, +}; + +typedef NormTest NormTestF; +TEST_P(NormTestF, Result) +{ + ASSERT_NEAR(out_scalar_exp, out_scalar_act, params.tolerance * params.rows * params.cols); +} + +typedef NormTest NormTestD; +TEST_P(NormTestD, Result) +{ + ASSERT_NEAR(out_scalar_exp, out_scalar_act, params.tolerance * params.rows * params.cols); +} + +INSTANTIATE_TEST_CASE_P(NormTests, NormTestF, ::testing::ValuesIn(inputsf)); + +INSTANTIATE_TEST_CASE_P(NormTests, NormTestD, 
::testing::ValuesIn(inputsd)); + +} // end namespace matrix +} // end namespace raft diff --git a/cpp/test/matrix/slice.cu b/cpp/test/matrix/slice.cu index 6ffa0c9045..f0cce2c184 100644 --- a/cpp/test/matrix/slice.cu +++ b/cpp/test/matrix/slice.cu @@ -14,129 +14,132 @@ * limitations under the License. */ - #include "../test_utils.h" - #include - #include - #include - #include - #include - - namespace raft { - namespace matrix { - - template - struct SliceInputs { - T tolerance; - int rows, cols; - unsigned long long int seed; - }; - - template - ::std::ostream& operator<<(::std::ostream& os, const SliceInputs& I) - { - os << "{ " << I.tolerance << ", " << I.rows << ", " << I.cols << ", " - << I.seed << '}' << std::endl; - return os; - } - +#include "../test_utils.h" +#include +#include +#include +#include +#include + +namespace raft { +namespace matrix { + +template +struct SliceInputs { + int rows, cols; + unsigned long long int seed; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const SliceInputs& I) +{ + os << "{ " << I.rows << ", " << I.cols << ", " << I.seed << '}' << std::endl; + return os; +} + // Col-major slice reference test - template - void naiveSlice( - const Type* in, Type* out, int rows, int cols, int x1, int y1, int x2, int y2) +template +void naiveSlice(const Type* in, Type* out, int rows, int cols, int x1, int y1, int x2, int y2) +{ + int out_rows = x2 - x1; + // int out_cols = y2 - y1; + for (int j = y1; j < y2; ++j) { + for (int i = x1; i < x2; ++i) { + out[(i - x1) + (j - y1) * out_rows] = in[i + j * rows]; + } + } +} + +template +class SliceTest : public ::testing::TestWithParam> { + public: + SliceTest() + : params(::testing::TestWithParam>::GetParam()), + stream(handle.get_stream()), + data(params.rows * params.cols, stream) + { + } + + void SetUp() override + { + std::random_device rd; + std::default_random_engine dre(rd()); + raft::random::RngState r(params.seed); + int rows = params.rows, cols = params.cols, len = rows * 
cols; + uniform(handle, r, data.data(), len, T(-10.0), T(10.0)); + + std::uniform_int_distribution rowGenerator(0, rows / 2); + auto row1 = rowGenerator(dre); + auto row2 = rowGenerator(dre) + rows / 2; + + std::uniform_int_distribution colGenerator(0, cols / 2); + auto col1 = colGenerator(dre); + auto col2 = colGenerator(dre) + cols / 2; + + rmm::device_uvector d_act_result((row2 - row1) * (col2 - col1), stream); + act_result.resize((row2 - row1) * (col2 - col1)); + exp_result.resize((row2 - row1) * (col2 - col1)); + + std::vector h_data(rows * cols); + raft::update_host(h_data.data(), data.data(), rows * cols, stream); + naiveSlice(h_data.data(), exp_result.data(), rows, cols, row1, col1, row2, col2); + auto input = + raft::make_device_matrix_view(data.data(), rows, cols); + auto output = raft::make_device_matrix_view( + d_act_result.data(), row2 - row1, col2 - col1); + slice(handle, input, output, row1, col1, row2, col2); + + raft::update_host(act_result.data(), d_act_result.data(), d_act_result.size(), stream); + handle.sync_stream(stream); + } + + protected: + raft::handle_t handle; + cudaStream_t stream; + + SliceInputs params; + rmm::device_uvector data; + std::vector exp_result, act_result; +}; + +///// Row- and column-wise tests +const std::vector> inputsf = {{32, 1024, 1234ULL}, + {64, 1024, 1234ULL}, + {128, 1024, 1234ULL}, + {256, 1024, 1234ULL}, + {512, 512, 1234ULL}, + {1024, 32, 1234ULL}, + {1024, 64, 1234ULL}, + {1024, 128, 1234ULL}, + {1024, 256, 1234ULL}}; + +const std::vector> inputsd = { + {32, 1024, 1234ULL}, + {64, 1024, 1234ULL}, + {128, 1024, 1234ULL}, + {256, 1024, 1234ULL}, + {512, 512, 1234ULL}, + {1024, 32, 1234ULL}, + {1024, 64, 1234ULL}, + {1024, 128, 1234ULL}, + {1024, 256, 1234ULL}, +}; + +typedef SliceTest SliceTestF; +TEST_P(SliceTestF, Result) { - int out_rows = x2 - x1; - // int out_cols = y2 - y1; - for (int j = y1; j < y2; ++j) { - for (int i = x1; i < x2; ++i) { - out[(i - x1) + (j - y1) * out_rows] = in[i + j * rows]; - } - } 
+ ASSERT_TRUE(hostVecMatch(exp_result, act_result, raft::Compare())); } - - template - class SliceTest : public ::testing::TestWithParam> { - public: - SliceTest() - : params(::testing::TestWithParam>::GetParam()), - stream(handle.get_stream()), - data(params.rows * params.cols, stream) - { - } - - void SetUp() override - { - std::random_device rd; - std::default_random_engine dre(rd()); - raft::random::RngState r(params.seed); - int rows = params.rows, cols = params.cols, len = rows * cols; - uniform(handle, r, data.data(), len, T(-10.0), T(10.0)); - - std::uniform_int_distribution rowGenerator(0, rows / 2); - int row1 = rowGenerator(dre); - int row2 = rowGenerator(dre) + rows / 2; - - std::uniform_int_distribution colGenerator(0, cols / 2); - int col1 = colGenerator(dre); - int col2 = colGenerator(dre) + cols / 2; - - std::vector h_data(rows*cols); - raft::update_host(h_data.data(), data.data(), rows*cols, stream); - exp_result = naiveSlice(h_data.data(), rows, cols, row1, col1, row2, col2); - auto input = raft::make_device_matrix_view( - data.data(), params.rows, params.cols, row1, col1, row2, col2); - act_result = slice(handle, input, output); - handle.sync_stream(stream); - } - - protected: - raft::handle_t handle; - cudaStream_t stream; - - SliceInputs params; - rmm::device_uvector data; - rmm::device_uvector exp_result, act_result; - }; - - ///// Row- and column-wise tests - const std::vector> inputsf = {{0.00001f, 32, 1024, 1234ULL}, - {0.00001f, 64, 1024, 1234ULL}, - {0.00001f, 128, 1024, 1234ULL}, - {0.00001f, 256, 1024, 1234ULL}, - {0.00001f, 512, 512, 1234ULL}, - {0.00001f, 1024, 32, 1234ULL}, - {0.00001f, 1024, 64, 1234ULL}, - {0.00001f, 1024, 128, 1234ULL}, - {0.00001f, 1024, 256, 1234ULL}}; - - const std::vector> inputsd = { - {0.00000001, 32, 1024, 1234ULL}, - {0.00000001, 64, 1024, 1234ULL}, - {0.00000001, 128, 1024, 1234ULL}, - {0.00000001, 256, 1024, 1234ULL}, - {0.00000001, 512, 512, 1234ULL}, - {0.00000001, 1024, 32, 1234ULL}, - {0.00000001, 
1024, 64, 1234ULL}, - {0.00000001, 1024, 128, 1234ULL}, - {0.00000001, 1024, 256, 1234ULL},}; - - typedef SliceTest SliceTestF; - TEST_P(SliceTestF, Result) - { - ASSERT_NEAR(exp_result, act_result, params.tolerance); - ASSERT_TRUE(devArrMatch( - d_out_exp.data(), d_out_act.data(), params.map_length * params.ncols, raft::Compare())); - } - - typedef SliceTest SliceTestD; - TEST_P(SliceTestD, Result) - { - ASSERT_NEAR(exp_result, act_result, params.tolerance); - } - - INSTANTIATE_TEST_CASE_P(SliceTests, SliceTestF, ::testing::ValuesIn(inputsf)); - - INSTANTIATE_TEST_CASE_P(SliceTests, SliceTestD, ::testing::ValuesIn(inputsd)); - - } // end namespace matrix - } // end namespace raft - \ No newline at end of file + +typedef SliceTest SliceTestD; +TEST_P(SliceTestD, Result) +{ + ASSERT_TRUE(hostVecMatch(exp_result, act_result, raft::Compare())); +} + +INSTANTIATE_TEST_CASE_P(SliceTests, SliceTestF, ::testing::ValuesIn(inputsf)); + +INSTANTIATE_TEST_CASE_P(SliceTests, SliceTestD, ::testing::ValuesIn(inputsd)); + +} // end namespace matrix +} // end namespace raft From a2011d4f9d4d19430ba09bd1ab726497a0feb3e2 Mon Sep 17 00:00:00 2001 From: Mickael Ide Date: Wed, 19 Oct 2022 16:28:02 +0200 Subject: [PATCH 08/13] Add reverse test --- cpp/include/raft/matrix/detail/matrix.cuh | 17 +-- cpp/include/raft/matrix/matrix.cuh | 4 +- cpp/include/raft/matrix/reverse.cuh | 16 +- cpp/test/CMakeLists.txt | 6 +- cpp/test/matrix/reverse.cu | 176 ++++++++++++++++++++++ 5 files changed, 200 insertions(+), 19 deletions(-) create mode 100644 cpp/test/matrix/reverse.cu diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh index fd57d21621..a5e9bdcc8f 100644 --- a/cpp/include/raft/matrix/detail/matrix.cuh +++ b/cpp/include/raft/matrix/detail/matrix.cuh @@ -84,7 +84,7 @@ void truncZeroOrigin( } template -void colReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +void col_major_col_reverse(m_t* inout, idx_t n_rows, idx_t n_cols, 
cudaStream_t stream) { auto n = n_cols; auto m = n_rows; @@ -106,7 +106,7 @@ void colReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) } template -void rowReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +void col_major_row_reverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { auto m = n_rows; idx_t size = n_rows * n_cols; @@ -116,15 +116,14 @@ void rowReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) thrust::for_each( rmm::exec_policy(stream), counting, counting + (size / 2), [=] __device__(idx_t idx) { - idx_t dest_row = idx % m; - idx_t dest_col = idx / m; + idx_t dest_row = idx % (m / 2); + idx_t dest_col = idx / (m / 2); idx_t src_row = (m - dest_row) - 1; - ; - idx_t src_col = dest_col; + idx_t src_col = dest_col; - m_t temp = (m_t)d_q_reversed[idx]; - d_q_reversed[idx] = d_q[src_col * m + src_row]; - d_q[src_col * m + src_row] = temp; + m_t temp = (m_t)d_q_reversed[dest_col * m + dest_row]; + d_q_reversed[dest_col * m + dest_row] = d_q[src_col * m + src_row]; + d_q[src_col * m + src_row] = temp; }); } diff --git a/cpp/include/raft/matrix/matrix.cuh b/cpp/include/raft/matrix/matrix.cuh index 3a7e0dad47..9197757a8f 100644 --- a/cpp/include/raft/matrix/matrix.cuh +++ b/cpp/include/raft/matrix/matrix.cuh @@ -127,7 +127,7 @@ void truncZeroOrigin( template void colReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { - detail::colReverse(inout, n_rows, n_cols, stream); + detail::col_major_col_reverse(inout, n_rows, n_cols, stream); } /** @@ -141,7 +141,7 @@ void colReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) template void rowReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { - detail::rowReverse(inout, n_rows, n_cols, stream); + detail::col_major_row_reverse(inout, n_rows, n_cols, stream); } /** diff --git a/cpp/include/raft/matrix/reverse.cuh b/cpp/include/raft/matrix/reverse.cuh index c1025577a7..c6ec4b014f 100644 --- 
a/cpp/include/raft/matrix/reverse.cuh +++ b/cpp/include/raft/matrix/reverse.cuh @@ -32,11 +32,12 @@ template void col_reverse(const raft::handle_t& handle, raft::device_matrix_view inout) { - detail::colReverse(inout.data_handle(), inout.extent(0), inout.extent(1), stream); + detail::col_major_col_reverse( + inout.data_handle(), inout.extent(0), inout.extent(1), handle.get_stream()); } /** - * @brief Columns of a column major matrix are reversed in place (i.e. first column and + * @brief Columns of a row major matrix are reversed in place (i.e. first column and * last column are swapped) * @param handle: raft handle * @param inout: input and output matrix @@ -45,7 +46,8 @@ template void col_reverse(const raft::handle_t& handle, raft::device_matrix_view inout) { - detail::rowReverse(inout.data_handle(), inout.extent(0), inout.extent(1), stream); + detail::col_major_row_reverse( + inout.data_handle(), inout.extent(1), inout.extent(0), handle.get_stream()); } /** @@ -58,11 +60,12 @@ template void row_reverse(const raft::handle_t& handle, raft::device_matrix_view inout) { - detail::rowReverse(inout.data_handle(), inout.extent(0), inout.extent(1), handle.get_stream()); + detail::col_major_row_reverse( + inout.data_handle(), inout.extent(0), inout.extent(1), handle.get_stream()); } /** - * @brief Rows of a column major matrix are reversed in place (i.e. first row and last + * @brief Rows of a row major matrix are reversed in place (i.e. 
first row and last * row are swapped) * @param handle: raft handle * @param inout: input and output matrix @@ -71,7 +74,8 @@ template void row_reverse(const raft::handle_t& handle, raft::device_matrix_view inout) { - detail::colReverse(inout.data_handle(), inout.extent(0), inout.extent(1), handle.get_stream()); + detail::col_major_col_reverse( + inout.data_handle(), inout.extent(1), inout.extent(0), handle.get_stream()); } } // namespace raft::matrix \ No newline at end of file diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index cdb2f67822..31163af84c 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -158,13 +158,15 @@ if(BUILD_TESTS) ConfigureTest(NAME MATRIX_TEST PATH test/matrix/argmax.cu + test/matrix/columnSort.cu test/matrix/diagonal.cu test/matrix/gather.cu + test/matrix/linewise_op.cu test/matrix/math.cu test/matrix/matrix.cu test/matrix/norm.cu - test/matrix/columnSort.cu - test/matrix/linewise_op.cu + test/matrix/reverse.cu + test/matrix/slice.cu test/spectral_matrix.cu ) diff --git a/cpp/test/matrix/reverse.cu b/cpp/test/matrix/reverse.cu new file mode 100644 index 0000000000..6192ce2065 --- /dev/null +++ b/cpp/test/matrix/reverse.cu @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../test_utils.h" +#include +#include +#include +#include + +namespace raft { +namespace matrix { + +template +struct ReverseInputs { + bool row_major, row_reverse; + int rows, cols; + unsigned long long int seed; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const ReverseInputs& I) +{ + os << "{ " << I.row_major << ", " << I.row_reverse << ", " << I.rows << ", " << I.cols << ", " + << I.seed << '}' << std::endl; + return os; +} + +// col-reverse reference test +template +void naive_col_reverse(std::vector& data, int rows, int cols, bool row_major) +{ + for (int i = 0; i < rows; ++i) { + for (int j = 0; j < cols / 2; ++j) { + auto index_in = row_major ? i * cols + j : i + j * rows; + auto index_out = row_major ? i * cols + (cols - j - 1) : i + (cols - j - 1) * rows; + auto tmp = data[index_in]; + data[index_in] = data[index_out]; + data[index_out] = tmp; + } + } +} + +// row-reverse reference test +template +void naive_row_reverse(std::vector& data, int rows, int cols, bool row_major) +{ + for (int i = 0; i < rows / 2; ++i) { + for (int j = 0; j < cols; ++j) { + auto index_in = row_major ? i * cols + j : i + j * rows; + auto index_out = row_major ? 
(rows - i - 1) * cols + j : (rows - i - 1) + j * rows; + auto tmp = data[index_in]; + data[index_in] = data[index_out]; + data[index_out] = tmp; + } + } +} + +template +class ReverseTest : public ::testing::TestWithParam> { + public: + ReverseTest() + : params(::testing::TestWithParam>::GetParam()), + stream(handle.get_stream()), + data(params.rows * params.cols, stream) + { + } + + void SetUp() override + { + std::random_device rd; + std::default_random_engine dre(rd()); + raft::random::RngState r(params.seed); + int rows = params.rows, cols = params.cols, len = rows * cols; + + rmm::device_uvector d_act_result(len, stream); + act_result.resize(len); + exp_result.resize(len); + + uniform(handle, r, data.data(), len, T(-10.0), T(10.0)); + raft::update_host(exp_result.data(), data.data(), len, stream); + + auto input_col_major = + raft::make_device_matrix_view(data.data(), rows, cols); + auto input_row_major = + raft::make_device_matrix_view(data.data(), rows, cols); + if (params.row_major) { + if (params.row_reverse) { + row_reverse(handle, input_row_major); + naive_row_reverse(exp_result, rows, cols, params.row_major); + } else { + col_reverse(handle, input_row_major); + naive_col_reverse(exp_result, rows, cols, params.row_major); + } + } else { + if (params.row_reverse) { + row_reverse(handle, input_col_major); + naive_row_reverse(exp_result, rows, cols, params.row_major); + } else { + col_reverse(handle, input_col_major); + naive_col_reverse(exp_result, rows, cols, params.row_major); + } + } + + raft::update_host(act_result.data(), data.data(), len, stream); + handle.sync_stream(stream); + } + + protected: + raft::handle_t handle; + cudaStream_t stream; + + ReverseInputs params; + rmm::device_uvector data; + std::vector exp_result, act_result; +}; + +///// Row- and column-wise tests +const std::vector> inputsf = {{true, true, 4, 4, 1234ULL}, + {true, true, 2, 12, 1234ULL}, + {true, false, 2, 12, 1234ULL}, + {true, false, 2, 64, 1234ULL}, + {true, true, 64, 512, 
1234ULL}, + {true, false, 64, 1024, 1234ULL}, + {true, true, 128, 1024, 1234ULL}, + {true, false, 256, 1024, 1234ULL}, + {false, true, 512, 512, 1234ULL}, + {false, false, 1024, 32, 1234ULL}, + {false, true, 1024, 64, 1234ULL}, + {false, false, 1024, 128, 1234ULL}, + {false, true, 1024, 256, 1234ULL}}; + +const std::vector> inputsd = {{true, true, 4, 4, 1234ULL}, + {true, true, 2, 12, 1234ULL}, + {true, false, 2, 12, 1234ULL}, + {true, false, 2, 64, 1234ULL}, + {true, true, 64, 512, 1234ULL}, + {true, false, 64, 1024, 1234ULL}, + {true, true, 128, 1024, 1234ULL}, + {true, false, 256, 1024, 1234ULL}, + {false, true, 512, 512, 1234ULL}, + {false, false, 1024, 32, 1234ULL}, + {false, true, 1024, 64, 1234ULL}, + {false, false, 1024, 128, 1234ULL}, + {false, true, 1024, 256, 1234ULL}}; + +typedef ReverseTest ReverseTestF; +TEST_P(ReverseTestF, Result) +{ + ASSERT_TRUE(hostVecMatch(exp_result, act_result, raft::Compare())); +} + +typedef ReverseTest ReverseTestD; +TEST_P(ReverseTestD, Result) +{ + ASSERT_TRUE(hostVecMatch(exp_result, act_result, raft::Compare())); +} + +INSTANTIATE_TEST_CASE_P(ReverseTests, ReverseTestF, ::testing::ValuesIn(inputsf)); + +INSTANTIATE_TEST_CASE_P(ReverseTests, ReverseTestD, ::testing::ValuesIn(inputsd)); + +} // end namespace matrix +} // end namespace raft From 8409a240b8b9d6ad59e2a5cfbec5632e53e25b8a Mon Sep 17 00:00:00 2001 From: Mickael Ide Date: Thu, 20 Oct 2022 16:38:01 +0200 Subject: [PATCH 09/13] Add triangular test --- cpp/include/raft/matrix/detail/matrix.cuh | 9 +- cpp/include/raft/matrix/init.cuh | 23 +++- cpp/include/raft/matrix/norm.cuh | 1 - cpp/include/raft/matrix/reverse.cuh | 1 - cpp/include/raft/matrix/slice.cuh | 1 - cpp/include/raft/matrix/triangular.cuh | 4 +- cpp/test/CMakeLists.txt | 1 + cpp/test/matrix/norm.cu | 3 +- cpp/test/matrix/reverse.cu | 1 - cpp/test/matrix/triangular.cu | 148 ++++++++++++++++++++++ 10 files changed, 180 insertions(+), 12 deletions(-) create mode 100644 cpp/test/matrix/triangular.cu diff 
--git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh index a5e9bdcc8f..460242fb3d 100644 --- a/cpp/include/raft/matrix/detail/matrix.cuh +++ b/cpp/include/raft/matrix/detail/matrix.cuh @@ -206,7 +206,7 @@ void sliceMatrix(const m_t* in, * @param k: min(n_rows, n_cols) */ template -__global__ void getUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, idx_t k) +__global__ void getUpperTriangular(const m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, idx_t k) { idx_t idx = threadIdx.x + blockDim.x * blockIdx.x; idx_t m = n_rows, n = n_cols; @@ -217,7 +217,7 @@ __global__ void getUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_col } template -void copyUpperTriangular(m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +void copyUpperTriangular(const m_t* src, m_t* dst, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { idx_t m = n_rows, n = n_cols; idx_t k = std::min(m, n); @@ -303,8 +303,9 @@ m_t getL2Norm(const raft::handle_t& handle, const m_t* in, idx_t size, cudaStrea { cublasHandle_t cublasH = handle.get_cublas_handle(); m_t normval = 0; - RAFT_EXPECTS(std::is_integral_v && size <= std::numeric_limits::max(), - "Index type not supported"); + RAFT_EXPECTS( + std::is_integral_v && (std::size_t)size <= (std::size_t)std::numeric_limits::max(), + "Index type not supported"); RAFT_CUBLAS_TRY( raft::linalg::detail::cublasnrm2(cublasH, static_cast(size), in, 1, &normval, stream)); return normval; diff --git a/cpp/include/raft/matrix/init.cuh b/cpp/include/raft/matrix/init.cuh index 5810ffb502..caee2555a9 100644 --- a/cpp/include/raft/matrix/init.cuh +++ b/cpp/include/raft/matrix/init.cuh @@ -25,7 +25,7 @@ namespace raft::matrix { /** * @brief set values to scalar in matrix * @tparam math_t data-type upon which the math operation will be performed - * @tparam idx_t integer type used for indexing + * @tparam extents dimension and indexing type used for the input * @tparam layout layout of the 
matrix data (must be row or col major) * @param[in] handle: raft handle * @param[in] in input matrix @@ -38,8 +38,29 @@ void fill(const raft::handle_t& handle, raft::device_mdspan out, raft::host_scalar_view scalar) { + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Data layout not supported"); RAFT_EXPECTS(in.size() == out.size(), "Input and output matrices must be the same size."); + RAFT_EXPECTS(scalar.data_handle() != nullptr, "Empty scalar"); detail::setValue( out.data_handle(), in.data_handle(), *(scalar.data_handle()), in.size(), handle.get_stream()); } + +/** + * @brief set all values of a matrix to a scalar, in place + * @tparam math_t data-type upon which the math operation will be performed + * @tparam extents dimension and indexing type used for the input + * @tparam layout_t layout of the matrix data (must be row or col major) + * @param[in] handle: raft handle + * @param[inout] inout matrix to fill in place + * @param[in] scalar scalar value to fill matrix elements + */ +template +void fill(const raft::handle_t& handle, + raft::device_mdspan inout, + math_t scalar) +{ + RAFT_EXPECTS(raft::is_row_or_column_major(inout), "Data layout not supported"); + detail::setValue( + inout.data_handle(), inout.data_handle(), scalar, inout.size(), handle.get_stream()); +} } // namespace raft::matrix diff --git a/cpp/include/raft/matrix/norm.cuh b/cpp/include/raft/matrix/norm.cuh index 559ca2e47e..2bd755881d 100644 --- a/cpp/include/raft/matrix/norm.cuh +++ b/cpp/include/raft/matrix/norm.cuh @@ -18,7 +18,6 @@ #include #include -#include namespace raft::matrix { diff --git a/cpp/include/raft/matrix/reverse.cuh b/cpp/include/raft/matrix/reverse.cuh index c6ec4b014f..2424c18241 100644 --- a/cpp/include/raft/matrix/reverse.cuh +++ b/cpp/include/raft/matrix/reverse.cuh @@ -18,7 +18,6 @@ #include #include -#include namespace raft::matrix { diff --git a/cpp/include/raft/matrix/slice.cuh b/cpp/include/raft/matrix/slice.cuh index 837eacca94..507ec6cb02 100644 --- a/cpp/include/raft/matrix/slice.cuh +++ 
b/cpp/include/raft/matrix/slice.cuh @@ -18,7 +18,6 @@ #include #include -#include namespace raft::matrix { diff --git a/cpp/include/raft/matrix/triangular.cuh b/cpp/include/raft/matrix/triangular.cuh index b385201859..fad3dd77af 100644 --- a/cpp/include/raft/matrix/triangular.cuh +++ b/cpp/include/raft/matrix/triangular.cuh @@ -18,7 +18,6 @@ #include #include -#include namespace raft::matrix { @@ -33,6 +32,9 @@ void upper_triangular(const raft::handle_t& handle, raft::device_matrix_view src, raft::device_matrix_view dst) { + auto k = std::min(src.extent(0), src.extent(1)); + RAFT_EXPECTS(k == dst.extent(0) && k == dst.extent(1), + "dst should be of size kxk, k = min(n_rows, n_cols)"); detail::copyUpperTriangular( src.data_handle(), dst.data_handle(), src.extent(0), src.extent(1), handle.get_stream()); } diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 31163af84c..7312267f58 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -167,6 +167,7 @@ if(BUILD_TESTS) test/matrix/norm.cu test/matrix/reverse.cu test/matrix/slice.cu + test/matrix/triangular.cu test/spectral_matrix.cu ) diff --git a/cpp/test/matrix/norm.cu b/cpp/test/matrix/norm.cu index 4047e9a707..38fdd409eb 100644 --- a/cpp/test/matrix/norm.cu +++ b/cpp/test/matrix/norm.cu @@ -68,8 +68,7 @@ class NormTest : public ::testing::TestWithParam> { std::vector h_data(rows * cols); raft::update_host(h_data.data(), data.data(), rows * cols, stream); out_scalar_exp = naiveNorm(h_data.data(), cols, rows); - auto input = - raft::make_device_matrix_view(data.data(), params.rows, params.cols); + auto input = raft::make_device_matrix_view(data.data(), params.rows, params.cols); out_scalar_act = l2_norm(handle, input); handle.sync_stream(stream); } diff --git a/cpp/test/matrix/reverse.cu b/cpp/test/matrix/reverse.cu index 6192ce2065..c905b8711e 100644 --- a/cpp/test/matrix/reverse.cu +++ b/cpp/test/matrix/reverse.cu @@ -85,7 +85,6 @@ class ReverseTest : public ::testing::TestWithParam> { 
raft::random::RngState r(params.seed); int rows = params.rows, cols = params.cols, len = rows * cols; - rmm::device_uvector d_act_result(len, stream); act_result.resize(len); exp_result.resize(len); diff --git a/cpp/test/matrix/triangular.cu b/cpp/test/matrix/triangular.cu new file mode 100644 index 0000000000..f8a1a85aeb --- /dev/null +++ b/cpp/test/matrix/triangular.cu @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../test_utils.h" +#include +#include +#include +#include +#include + +namespace raft { +namespace matrix { + +template +struct TriangularInputs { + int rows, cols; + unsigned long long int seed; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const TriangularInputs& I) +{ + os << "{ " << I.rows << ", " << I.cols << ", " << I.seed << '}' << std::endl; + return os; +} + +// triangular reference test +template +void naive_triangular(std::vector& in, std::vector& out, int rows, int cols) +{ + auto k = std::min(rows, cols); + for (int i = 0; i < k; ++i) { + for (int j = 0; j <= i; ++j) { + auto index = i * rows + j; + out[i * k + j] = in[index]; + } + } +} + +template +class TriangularTest : public ::testing::TestWithParam> { + public: + TriangularTest() + : params(::testing::TestWithParam>::GetParam()), + stream(handle.get_stream()), + data(params.rows * params.cols, stream) + { + } + + void SetUp() override + { + std::random_device rd; + std::default_random_engine dre(rd()); + raft::random::RngState r(params.seed); + int rows = params.rows, cols = params.cols, len = rows * cols; + auto k = std::min(rows, cols); + + rmm::device_uvector d_act_result(len, stream); + std::vector h_data(len); + act_result.resize(k * k); + exp_result.resize(k * k); + + uniform(handle, r, data.data(), len, T(-10.0), T(10.0)); + raft::update_host(h_data.data(), data.data(), len, stream); + raft::matrix::fill( + handle, + raft::make_device_matrix_view(d_act_result.data(), k, k), + T(0)); + + upper_triangular( + handle, + raft::make_device_matrix_view(data.data(), rows, cols), + raft::make_device_matrix_view(d_act_result.data(), k, k)); + naive_triangular(h_data, exp_result, rows, cols); + + raft::update_host(act_result.data(), d_act_result.data(), k * k, stream); + handle.sync_stream(stream); + } + + protected: + raft::handle_t handle; + cudaStream_t stream; + + TriangularInputs params; + rmm::device_uvector data; + std::vector exp_result, act_result; +}; + 
+///// Row- and column-wise tests const std::vector> inputsf = {{4, 4, 1234ULL}, + {2, 12, 1234ULL}, + {2, 12, 1234ULL}, + {2, 64, 1234ULL}, + {64, 512, 1234ULL}, + {64, 1024, 1234ULL}, + {128, 1024, 1234ULL}, + {256, 1024, 1234ULL}, + {512, 512, 1234ULL}, + {1024, 32, 1234ULL}, + {1024, 64, 1234ULL}, + {1024, 128, 1234ULL}, + {1024, 256, 1234ULL}}; + +const std::vector> inputsd = {{4, 4, 1234ULL}, + {2, 12, 1234ULL}, + {2, 12, 1234ULL}, + {2, 64, 1234ULL}, + {64, 512, 1234ULL}, + {64, 1024, 1234ULL}, + {128, 1024, 1234ULL}, + {256, 1024, 1234ULL}, + {512, 512, 1234ULL}, + {1024, 32, 1234ULL}, + {1024, 64, 1234ULL}, + {1024, 128, 1234ULL}, + {1024, 256, 1234ULL}}; + +typedef TriangularTest TriangularTestF; +TEST_P(TriangularTestF, Result) +{ + ASSERT_TRUE(hostVecMatch(exp_result, act_result, raft::Compare())); +} + +typedef TriangularTest TriangularTestD; +TEST_P(TriangularTestD, Result) +{ + ASSERT_TRUE(hostVecMatch(exp_result, act_result, raft::Compare())); +} + +INSTANTIATE_TEST_CASE_P(TriangularTests, TriangularTestF, ::testing::ValuesIn(inputsf)); + +INSTANTIATE_TEST_CASE_P(TriangularTests, TriangularTestD, ::testing::ValuesIn(inputsd)); + +} // end namespace matrix +} // end namespace raft From ac6c72b5c7de093cabca71c9d4a61dc3ce19f56a Mon Sep 17 00:00:00 2001 From: Mickael Ide Date: Thu, 20 Oct 2022 18:04:38 +0200 Subject: [PATCH 10/13] Remove redundant tests --- cpp/test/matrix/triangular.cu | 8 -------- 1 file changed, 8 deletions(-) diff --git a/cpp/test/matrix/triangular.cu b/cpp/test/matrix/triangular.cu index f8a1a85aeb..9af3defb5d 100644 --- a/cpp/test/matrix/triangular.cu +++ b/cpp/test/matrix/triangular.cu @@ -101,30 +101,22 @@ class TriangularTest : public ::testing::TestWithParam> { ///// Row- and column-wise tests const std::vector> inputsf = {{4, 4, 1234ULL}, - {2, 12, 1234ULL}, - {2, 12, 1234ULL}, {2, 64, 1234ULL}, {64, 512, 1234ULL}, {64, 1024, 1234ULL}, - {128, 1024, 1234ULL}, {256, 1024, 1234ULL}, {512, 512, 1234ULL}, {1024, 32, 1234ULL}, -
{1024, 64, 1234ULL}, {1024, 128, 1234ULL}, {1024, 256, 1234ULL}}; const std::vector> inputsd = {{4, 4, 1234ULL}, - {2, 12, 1234ULL}, - {2, 12, 1234ULL}, {2, 64, 1234ULL}, {64, 512, 1234ULL}, {64, 1024, 1234ULL}, - {128, 1024, 1234ULL}, {256, 1024, 1234ULL}, {512, 512, 1234ULL}, {1024, 32, 1234ULL}, - {1024, 64, 1234ULL}, {1024, 128, 1234ULL}, {1024, 256, 1234ULL}}; From 91bc8e1424c8cfd20370a788bafb8214c99a051f Mon Sep 17 00:00:00 2001 From: Mickael Ide Date: Fri, 21 Oct 2022 16:54:26 +0200 Subject: [PATCH 11/13] Fix naming --- cpp/include/raft/matrix/detail/matrix.cuh | 4 ++-- cpp/include/raft/matrix/matrix.cuh | 4 ++-- cpp/include/raft/matrix/reverse.cuh | 12 ++++-------- 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/cpp/include/raft/matrix/detail/matrix.cuh b/cpp/include/raft/matrix/detail/matrix.cuh index 460242fb3d..17a40be5d6 100644 --- a/cpp/include/raft/matrix/detail/matrix.cuh +++ b/cpp/include/raft/matrix/detail/matrix.cuh @@ -84,7 +84,7 @@ void truncZeroOrigin( } template -void col_major_col_reverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +void colReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { auto n = n_cols; auto m = n_rows; @@ -106,7 +106,7 @@ void col_major_col_reverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t } template -void col_major_row_reverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) +void rowReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { auto m = n_rows; idx_t size = n_rows * n_cols; diff --git a/cpp/include/raft/matrix/matrix.cuh b/cpp/include/raft/matrix/matrix.cuh index 9197757a8f..3a7e0dad47 100644 --- a/cpp/include/raft/matrix/matrix.cuh +++ b/cpp/include/raft/matrix/matrix.cuh @@ -127,7 +127,7 @@ void truncZeroOrigin( template void colReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { - detail::col_major_col_reverse(inout, n_rows, n_cols, stream); + detail::colReverse(inout, n_rows, n_cols, stream); } /** 
@@ -141,7 +141,7 @@ void colReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) template void rowReverse(m_t* inout, idx_t n_rows, idx_t n_cols, cudaStream_t stream) { - detail::col_major_row_reverse(inout, n_rows, n_cols, stream); + detail::rowReverse(inout, n_rows, n_cols, stream); } /** diff --git a/cpp/include/raft/matrix/reverse.cuh b/cpp/include/raft/matrix/reverse.cuh index 2424c18241..5fbb8750af 100644 --- a/cpp/include/raft/matrix/reverse.cuh +++ b/cpp/include/raft/matrix/reverse.cuh @@ -31,8 +31,7 @@ template void col_reverse(const raft::handle_t& handle, raft::device_matrix_view inout) { - detail::col_major_col_reverse( - inout.data_handle(), inout.extent(0), inout.extent(1), handle.get_stream()); + detail::colReverse(inout.data_handle(), inout.extent(0), inout.extent(1), handle.get_stream()); } /** @@ -45,8 +44,7 @@ template void col_reverse(const raft::handle_t& handle, raft::device_matrix_view inout) { - detail::col_major_row_reverse( - inout.data_handle(), inout.extent(1), inout.extent(0), handle.get_stream()); + detail::rowReverse(inout.data_handle(), inout.extent(1), inout.extent(0), handle.get_stream()); } /** @@ -59,8 +57,7 @@ template void row_reverse(const raft::handle_t& handle, raft::device_matrix_view inout) { - detail::col_major_row_reverse( - inout.data_handle(), inout.extent(0), inout.extent(1), handle.get_stream()); + detail::rowReverse(inout.data_handle(), inout.extent(0), inout.extent(1), handle.get_stream()); } /** @@ -73,8 +70,7 @@ template void row_reverse(const raft::handle_t& handle, raft::device_matrix_view inout) { - detail::col_major_col_reverse( - inout.data_handle(), inout.extent(1), inout.extent(0), handle.get_stream()); + detail::colReverse(inout.data_handle(), inout.extent(1), inout.extent(0), handle.get_stream()); } } // namespace raft::matrix \ No newline at end of file From 5349b4b21c0c26a373b7a0e3a998dfa2db895617 Mon Sep 17 00:00:00 2001 From: Mickael Ide Date: Tue, 25 Oct 2022 00:54:16 +0200 Subject: 
[PATCH 12/13] Add matrix_vector_op gtests and in-out specification for doc --- cpp/include/raft/linalg/matrix_vector.cuh | 156 ++++++------ cpp/include/raft/matrix/argmax.cuh | 7 +- cpp/include/raft/matrix/norm.cuh | 4 +- cpp/include/raft/matrix/reverse.cuh | 16 +- cpp/include/raft/matrix/slice.cuh | 10 +- cpp/test/CMakeLists.txt | 1 + cpp/test/linalg/matrix_vector.cu | 285 ++++++++++++++++++++++ cpp/test/linalg/matrix_vector_op.cu | 1 + cpp/test/linalg/matrix_vector_op.cuh | 42 ++++ 9 files changed, 430 insertions(+), 92 deletions(-) create mode 100644 cpp/test/linalg/matrix_vector.cu diff --git a/cpp/include/raft/linalg/matrix_vector.cuh b/cpp/include/raft/linalg/matrix_vector.cuh index 05bff59c6f..57bc0cf21f 100644 --- a/cpp/include/raft/linalg/matrix_vector.cuh +++ b/cpp/include/raft/linalg/matrix_vector.cuh @@ -17,7 +17,8 @@ #pragma once #include -#include +#include +#include #include namespace raft::linalg { @@ -27,30 +28,32 @@ namespace raft::linalg { * @param [in] handle: raft handle for managing library resources * @param[inout] data: input matrix, results are in-place * @param[in] vec: input vector - * @param[in] bcast_along_rows: whether to broadcast vector along rows of matrix or columns + * @param[in] apply whether the broadcast of vector needs to happen along + * the rows of the matrix or columns using enum class raft::linalg::Apply */ -template +template void binary_mult_skip_zero(const raft::handle_t& handle, - raft::device_matrix_view data, - raft::device_vector_view vec, - bool bcast_along_rows) + raft::device_matrix_view data, + raft::device_vector_view vec, + Apply apply) { - bool row_major = raft::is_row_major(data); + bool row_major = raft::is_row_major(data); + auto bcast_along_rows = apply == Apply::ALONG_ROWS; - IdxType vec_size = bcast_along_rows ? data.extent(1) : data.extent(0); + idx_t vec_size = bcast_along_rows ? 
data.extent(1) : data.extent(0); RAFT_EXPECTS( vec.extent(0) == vec_size, "If `bcast_along_rows==true`, vector size must equal number of columns in the matrix." "If `bcast_along_rows==false`, vector size must equal number of rows in the matrix."); - matrix::detail::matrixVectorBinaryMultSkipZero(data.data_handle(), - vec.data_handle(), - matrix.extent(0), - matrix.extent(1), - row_major, - bcast_along_rows, - handle.get_stream()); + matrix::detail::matrixVectorBinaryMultSkipZero(data.data_handle(), + vec.data_handle(), + data.extent(0), + data.extent(1), + row_major, + bcast_along_rows, + handle.get_stream()); } /** @@ -58,30 +61,32 @@ void binary_mult_skip_zero(const raft::handle_t& handle, * @param[in] handle: raft handle for managing library resources * @param[inout] data: input matrix, results are in-place * @param[in] vec: input vector - * @param[in] bcast_along_rows: whether to broadcast vector along rows of matrix or columns + * @param[in] apply whether the broadcast of vector needs to happen along + * the rows of the matrix or columns using enum class raft::linalg::Apply */ -template +template void binary_div(const raft::handle_t& handle, - raft::device_matrix_view data, - raft::device_vector_view vec, - bool bcast_along_rows) + raft::device_matrix_view data, + raft::device_vector_view vec, + Apply apply) { - bool row_major = raft::is_row_major(data); + bool row_major = raft::is_row_major(data); + auto bcast_along_rows = apply == Apply::ALONG_ROWS; - IdxType vec_size = bcast_along_rows ? data.extent(1) : data.extent(0); + idx_t vec_size = bcast_along_rows ? data.extent(1) : data.extent(0); RAFT_EXPECTS( vec.extent(0) == vec_size, "If `bcast_along_rows==true`, vector size must equal number of columns in the matrix." 
"If `bcast_along_rows==false`, vector size must equal number of rows in the matrix."); - matrix::detail::matrixVectorBinaryDiv(data.data_handle(), - vec.data_handle(), - data.extent(0), - data.extent(1), - row_major, - bcast_along_rows, - handle.get_stream()); + matrix::detail::matrixVectorBinaryDiv(data.data_handle(), + vec.data_handle(), + data.extent(0), + data.extent(1), + row_major, + bcast_along_rows, + handle.get_stream()); } /** @@ -89,34 +94,36 @@ void binary_div(const raft::handle_t& handle, * @param[in] handle: raft handle for managing library resources * @param[inout] data: input matrix, results are in-place * @param[in] vec: input vector - * @param[in] bcast_along_rows: whether to broadcast vector along rows of matrix or columns + * @param[in] apply whether the broadcast of vector needs to happen along + * the rows of the matrix or columns using enum class raft::linalg::Apply * @param[in] return_zero: result is zero if true and vector value is below threshold, original * value if false */ -template +template void binary_div_skip_zero(const raft::handle_t& handle, - raft::device_matrix_view data, - raft::device_vector_view vec, - bool bcast_along_rows, + raft::device_matrix_view data, + raft::device_vector_view vec, + Apply apply, bool return_zero = false) { - bool row_major = raft::is_row_major(data); + bool row_major = raft::is_row_major(data); + auto bcast_along_rows = apply == Apply::ALONG_ROWS; - IdxType vec_size = bcast_along_rows ? data.extent(1) : data.extent(0); + idx_t vec_size = bcast_along_rows ? data.extent(1) : data.extent(0); RAFT_EXPECTS( vec.extent(0) == vec_size, "If `bcast_along_rows==true`, vector size must equal number of columns in the matrix." 
"If `bcast_along_rows==false`, vector size must equal number of rows in the matrix."); - matrix::detail::matrixVectorBinaryDivSkipZero(data.data_handle(), - vec.data_handle(), - data.extent(0), - data.extent(1), - row_major, - bcast_along_rows, - handle.get_stream(), - return_zero); + matrix::detail::matrixVectorBinaryDivSkipZero(data.data_handle(), + vec.data_handle(), + data.extent(0), + data.extent(1), + row_major, + bcast_along_rows, + handle.get_stream(), + return_zero); } /** @@ -124,30 +131,32 @@ void binary_div_skip_zero(const raft::handle_t& handle, * @param[in] handle: raft handle for managing library resources * @param[inout] data: input matrix, results are in-place * @param[in] vec: input vector - * @param[in] bcast_along_rows: whether to broadcast vector along rows of matrix or columns + * @param[in] apply whether the broadcast of vector needs to happen along + * the rows of the matrix or columns using enum class raft::linalg::Apply */ -template +template void binary_add(const raft::handle_t& handle, - raft::device_matrix_view data, - raft::device_vector_view vec, - bool bcast_along_rows) + raft::device_matrix_view data, + raft::device_vector_view vec, + Apply apply) { - bool row_major = raft::is_row_major(data); + bool row_major = raft::is_row_major(data); + auto bcast_along_rows = apply == Apply::ALONG_ROWS; - IdxType vec_size = bcast_along_rows ? data.extent(1) : data.extent(0); + idx_t vec_size = bcast_along_rows ? data.extent(1) : data.extent(0); RAFT_EXPECTS( vec.extent(0) == vec_size, "If `bcast_along_rows==true`, vector size must equal number of columns in the matrix." 
"If `bcast_along_rows==false`, vector size must equal number of rows in the matrix."); - matrix::detail::matrixVectorBinaryAdd(data.data_handle(), - vec.data_handle(), - data.extent(0), - data.extent(1), - row_major, - bcast_along_rows, - handle.get_stream()); + matrix::detail::matrixVectorBinaryAdd(data.data_handle(), + vec.data_handle(), + data.extent(0), + data.extent(1), + row_major, + bcast_along_rows, + handle.get_stream()); } /** @@ -155,30 +164,31 @@ void binary_add(const raft::handle_t& handle, * @param[in] handle: raft handle for managing library resources * @param[inout] data: input matrix, results are in-place * @param[in] vec: input vector - * @param[in] bcast_along_rows: whether to broadcast vector along rows of matrix or columns + * @param[in] apply whether the broadcast of vector needs to happen along + * the rows of the matrix or columns using enum class raft::linalg::Apply */ -template +template void binary_sub(const raft::handle_t& handle, - raft::device_matrix_view data, - raft::device_vector_view vec, - bool bcast_along_rows) + raft::device_matrix_view data, + raft::device_vector_view vec, + Apply apply) { - bool row_major = raft::is_row_major(data); + bool row_major = raft::is_row_major(data); + auto bcast_along_rows = apply == Apply::ALONG_ROWS; - IdxType vec_size = bcast_along_rows ? data.extent(1) : data.extent(0); + idx_t vec_size = bcast_along_rows ? data.extent(1) : data.extent(0); RAFT_EXPECTS( vec.extent(0) == vec_size, "If `bcast_along_rows==true`, vector size must equal number of columns in the matrix." 
"If `bcast_along_rows==false`, vector size must equal number of rows in the matrix."); - matrix::detail::matrixVectorBinarySub(data.data_handle(), - vec.data_handle(), - data.extent(0), - data.extent(1), - row_major, - bcast_along_rows, - handle.get_stream()); + matrix::detail::matrixVectorBinarySub(data.data_handle(), + vec.data_handle(), + data.extent(0), + data.extent(1), + row_major, + bcast_along_rows, + handle.get_stream()); } - } // namespace raft::linalg \ No newline at end of file diff --git a/cpp/include/raft/matrix/argmax.cuh b/cpp/include/raft/matrix/argmax.cuh index 5cfd3eb1bf..b3face1012 100644 --- a/cpp/include/raft/matrix/argmax.cuh +++ b/cpp/include/raft/matrix/argmax.cuh @@ -18,15 +18,14 @@ #include #include -#include namespace raft::matrix { /** * @brief Argmax: find the row idx with maximum value for each column - * @param handle: raft handle - * @param in: input matrix of size (n_rows, n_cols) - * @param out: output vector of size n_cols + * @param[in] handle: raft handle + * @param[in] in: input matrix of size (n_rows, n_cols) + * @param[out] out: output vector of size n_cols */ template void argmax(const raft::handle_t& handle, diff --git a/cpp/include/raft/matrix/norm.cuh b/cpp/include/raft/matrix/norm.cuh index 2bd755881d..deb3657905 100644 --- a/cpp/include/raft/matrix/norm.cuh +++ b/cpp/include/raft/matrix/norm.cuh @@ -23,8 +23,8 @@ namespace raft::matrix { /** * @brief Get the L2/F-norm of a matrix - * @param handle: raft handle - * @param in: input matrix/vector with totally size elements + * @param[in] handle: raft handle + * @param[in] in: input matrix/vector with totally size elements * @returns matrix l2 norm */ template diff --git a/cpp/include/raft/matrix/reverse.cuh b/cpp/include/raft/matrix/reverse.cuh index 5fbb8750af..13000332a0 100644 --- a/cpp/include/raft/matrix/reverse.cuh +++ b/cpp/include/raft/matrix/reverse.cuh @@ -24,8 +24,8 @@ namespace raft::matrix { /** * @brief Columns of a column major matrix are reversed in place 
(i.e. first column and * last column are swapped) - * @param handle: raft handle - * @param inout: input and output matrix + * @param[in] handle: raft handle + * @param[inout] inout: input and output matrix */ template void col_reverse(const raft::handle_t& handle, @@ -37,8 +37,8 @@ void col_reverse(const raft::handle_t& handle, /** * @brief Columns of a row major matrix are reversed in place (i.e. first column and * last column are swapped) - * @param handle: raft handle - * @param inout: input and output matrix + * @param[in] handle: raft handle + * @param[inout] inout: input and output matrix */ template void col_reverse(const raft::handle_t& handle, @@ -50,8 +50,8 @@ void col_reverse(const raft::handle_t& handle, /** * @brief Rows of a column major matrix are reversed in place (i.e. first row and last * row are swapped) - * @param handle: raft handle - * @param inout: input and output matrix + * @param[in] handle: raft handle + * @param[inout] inout: input and output matrix */ template void row_reverse(const raft::handle_t& handle, @@ -63,8 +63,8 @@ void row_reverse(const raft::handle_t& handle, /** * @brief Rows of a row major matrix are reversed in place (i.e. 
first row and last * row are swapped) - * @param handle: raft handle - * @param inout: input and output matrix + * @param[in] handle: raft handle + * @param[inout] inout: input and output matrix */ template void row_reverse(const raft::handle_t& handle, diff --git a/cpp/include/raft/matrix/slice.cuh b/cpp/include/raft/matrix/slice.cuh index 507ec6cb02..ef7ff3d28d 100644 --- a/cpp/include/raft/matrix/slice.cuh +++ b/cpp/include/raft/matrix/slice.cuh @@ -23,11 +23,11 @@ namespace raft::matrix { /** * @brief Slice a matrix (in-place) - * @param handle: raft handle - * @param in: input matrix (column-major) - * @param out: output matrix (column-major) - * @param x1, y1: coordinate of the top-left point of the wanted area (0-based) - * @param x2, y2: coordinate of the bottom-right point of the wanted area + * @param[in] handle: raft handle + * @param[in] in: input matrix (column-major) + * @param[inout] out: output matrix (column-major) + * @param[in] x1, y1: coordinate of the top-left point of the wanted area (0-based) + * @param[in] x2, y2: coordinate of the bottom-right point of the wanted area * (1-based) * example: Slice the 2nd and 3rd columns of a 4x3 matrix: slice_matrix(M_d, 4, * 3, 0, 1, 4, 3); diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 7312267f58..c55a3602d7 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -138,6 +138,7 @@ if(BUILD_TESTS) test/linalg/gemv.cu test/linalg/map.cu test/linalg/map_then_reduce.cu + test/linalg/matrix_vector.cu test/linalg/matrix_vector_op.cu test/linalg/multiply.cu test/linalg/norm.cu diff --git a/cpp/test/linalg/matrix_vector.cu b/cpp/test/linalg/matrix_vector.cu new file mode 100644 index 0000000000..9062f3be4d --- /dev/null +++ b/cpp/test/linalg/matrix_vector.cu @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.h" +#include "matrix_vector_op.cuh" +#include +#include +#include +#include +#include + +namespace raft { +namespace linalg { + +template +struct MatrixVectorInputs { + T tolerance; + IdxType rows, cols; + int operation_type; + bool row_major, bcast_along_rows; + unsigned long long int seed; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const MatrixVectorInputs& dims) +{ + return os; +} + +// Or else, we get the following compilation error +// for an extended __device__ lambda cannot have private or protected access +// within its class +template +void matrix_vector_op_launch(const raft::handle_t& handle, + T* in, + const T* vec1, + IdxType D, + IdxType N, + bool row_major, + bool bcast_along_rows, + int operation_type) +{ + auto in_row_major = raft::make_device_matrix_view(in, N, D); + auto in_col_major = raft::make_device_matrix_view(in, N, D); + + auto apply = bcast_along_rows ? Apply::ALONG_ROWS : Apply::ALONG_COLUMNS; + auto len = bcast_along_rows ? 
D : N; + auto vec1_view = raft::make_device_vector_view(vec1, len); + + if (operation_type == 0) { + if (row_major) { + binary_mult_skip_zero(handle, in_row_major, vec1_view, apply); + } else { + binary_mult_skip_zero(handle, in_col_major, vec1_view, apply); + } + } else if (operation_type == 1) { + if (row_major) { + binary_div(handle, in_row_major, vec1_view, apply); + } else { + binary_div(handle, in_col_major, vec1_view, apply); + } + } else if (operation_type == 2) { + if (row_major) { + binary_div_skip_zero(handle, in_row_major, vec1_view, apply); + } else { + binary_div_skip_zero(handle, in_col_major, vec1_view, apply); + } + } else if (operation_type == 3) { + if (row_major) { + binary_add(handle, in_row_major, vec1_view, apply); + } else { + binary_add(handle, in_col_major, vec1_view, apply); + } + } else if (operation_type == 4) { + if (row_major) { + binary_sub(handle, in_row_major, vec1_view, apply); + } else { + binary_sub(handle, in_col_major, vec1_view, apply); + } + } else { + THROW("Unknown operation type '%d'!", (int)operation_type); + } +} + +template +void naive_matrix_vector_op_launch(const raft::handle_t& handle, + T* in, + const T* vec1, + IdxType D, + IdxType N, + bool row_major, + bool bcast_along_rows, + int operation_type) +{ + auto stream = handle.get_stream(); + auto operation_bin_mult_skip_zero = [] __device__(T mat_element, T vec_element) { + if (vec_element != T(0)) { + return mat_element * vec_element; + } else { + return mat_element; + } + }; + auto operation_div = [] __device__(T mat_element, T vec_element) { + return mat_element / vec_element; + }; + auto operation_bin_div_skip_zero = [] __device__(T mat_element, T vec_element) { + if (raft::myAbs(vec_element) < T(1e-10)) + return T(0); + else + return mat_element / vec_element; + }; + auto operation_bin_add = [] __device__(T mat_element, T vec_element) { + return mat_element + vec_element; + }; + auto operation_bin_sub = [] __device__(T mat_element, T vec_element) { + return 
mat_element - vec_element; + }; + + if (operation_type == 0) { + naiveMatVecOp( + in, vec1, D, N, row_major, bcast_along_rows, operation_bin_mult_skip_zero, stream); + } else if (operation_type == 1) { + naiveMatVecOp(in, vec1, D, N, row_major, bcast_along_rows, operation_div, stream); + } else if (operation_type == 2) { + naiveMatVecOp(in, vec1, D, N, row_major, bcast_along_rows, operation_bin_div_skip_zero, stream); + } else if (operation_type == 3) { + naiveMatVecOp(in, vec1, D, N, row_major, bcast_along_rows, operation_bin_add, stream); + } else if (operation_type == 4) { + naiveMatVecOp(in, vec1, D, N, row_major, bcast_along_rows, operation_bin_sub, stream); + } else { + THROW("Unknown operation type '%d'!", (int)operation_type); + } +} + +template +class MatrixVectorTest : public ::testing::TestWithParam> { + public: + MatrixVectorTest() + : params(::testing::TestWithParam>::GetParam()), + stream(handle.get_stream()), + in(params.rows * params.cols, stream), + out_ref(params.rows * params.cols, stream), + out(params.rows * params.cols, stream), + vec1(params.bcast_along_rows ? params.cols : params.rows, stream) + { + } + + protected: + void SetUp() override + { + raft::random::RngState r(params.seed); + IdxType N = params.rows, D = params.cols; + IdxType len = N * D; + IdxType vecLen = params.bcast_along_rows ? 
D : N; + uniform(handle, r, in.data(), len, (T)-1.0, (T)1.0); + uniform(handle, r, vec1.data(), vecLen, (T)-1.0, (T)1.0); + raft::copy(out_ref.data(), in.data(), len, handle.get_stream()); + raft::copy(out.data(), in.data(), len, handle.get_stream()); + naive_matrix_vector_op_launch(handle, + out_ref.data(), + vec1.data(), + D, + N, + params.row_major, + params.bcast_along_rows, + params.operation_type); + matrix_vector_op_launch(handle, + out.data(), + vec1.data(), + D, + N, + params.row_major, + params.bcast_along_rows, + params.operation_type); + handle.sync_stream(); + } + + protected: + raft::handle_t handle; + cudaStream_t stream; + + MatrixVectorInputs params; + rmm::device_uvector in, out, out_ref, vec1; +}; + +const std::vector> inputsf_i32 = { + {0.00001f, 1024, 32, 0, true, true, 1234ULL}, + {0.00001f, 1024, 64, 1, true, true, 1234ULL}, + {0.00001f, 1024, 32, 2, true, false, 1234ULL}, + {0.00001f, 1024, 64, 3, true, false, 1234ULL}, + {0.00001f, 1024, 32, 4, false, true, 1234ULL}, + {0.00001f, 1024, 64, 0, false, true, 1234ULL}, + {0.00001f, 1024, 32, 1, false, false, 1234ULL}, + {0.00001f, 1024, 64, 2, false, false, 1234ULL}, + + {0.00001f, 1024, 32, 3, true, true, 1234ULL}, + {0.00001f, 1024, 64, 4, true, true, 1234ULL}, + {0.00001f, 1024, 32, 0, true, false, 1234ULL}, + {0.00001f, 1024, 64, 1, true, false, 1234ULL}, + {0.00001f, 1024, 32, 2, false, true, 1234ULL}, + {0.00001f, 1024, 64, 3, false, true, 1234ULL}, + {0.00001f, 1024, 32, 4, false, false, 1234ULL}, + {0.00001f, 1024, 64, 0, false, false, 1234ULL}}; +typedef MatrixVectorTest MatrixVectorTestF_i32; +TEST_P(MatrixVectorTestF_i32, Result) +{ + ASSERT_TRUE(devArrMatch( + out_ref.data(), out.data(), params.rows * params.cols, CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(MatrixVectorTests, + MatrixVectorTestF_i32, + ::testing::ValuesIn(inputsf_i32)); + +const std::vector> inputsf_i64 = { + {0.00001f, 2500, 250, 0, false, false, 1234ULL}, {0.00001f, 2500, 250, 1, false, false, 
1234ULL}}; +typedef MatrixVectorTest MatrixVectorTestF_i64; +TEST_P(MatrixVectorTestF_i64, Result) +{ + ASSERT_TRUE(devArrMatch( + out_ref.data(), out.data(), params.rows * params.cols, CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(MatrixVectorTests, + MatrixVectorTestF_i64, + ::testing::ValuesIn(inputsf_i64)); + +const std::vector> inputsd_i32 = { + {0.0000001, 1024, 32, 0, true, true, 1234ULL}, + {0.0000001, 1024, 64, 1, true, true, 1234ULL}, + {0.0000001, 1024, 32, 2, true, false, 1234ULL}, + {0.0000001, 1024, 64, 3, true, false, 1234ULL}, + {0.0000001, 1024, 32, 4, false, true, 1234ULL}, + {0.0000001, 1024, 64, 0, false, true, 1234ULL}, + {0.0000001, 1024, 32, 1, false, false, 1234ULL}, + {0.0000001, 1024, 64, 2, false, false, 1234ULL}, + + {0.0000001, 1024, 32, 3, true, true, 1234ULL}, + {0.0000001, 1024, 64, 4, true, true, 1234ULL}, + {0.0000001, 1024, 32, 0, true, false, 1234ULL}, + {0.0000001, 1024, 64, 1, true, false, 1234ULL}, + {0.0000001, 1024, 32, 2, false, true, 1234ULL}, + {0.0000001, 1024, 64, 3, false, true, 1234ULL}, + {0.0000001, 1024, 32, 4, false, false, 1234ULL}, + {0.0000001, 1024, 64, 0, false, false, 1234ULL}}; +typedef MatrixVectorTest MatrixVectorTestD_i32; +TEST_P(MatrixVectorTestD_i32, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref.data(), + out.data(), + params.rows * params.cols, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(MatrixVectorTests, + MatrixVectorTestD_i32, + ::testing::ValuesIn(inputsd_i32)); + +const std::vector> inputsd_i64 = { + {0.0000001, 2500, 250, 0, false, false, 1234ULL}, + {0.0000001, 2500, 250, 1, false, false, 1234ULL}}; +typedef MatrixVectorTest MatrixVectorTestD_i64; +TEST_P(MatrixVectorTestD_i64, Result) +{ + ASSERT_TRUE(devArrMatch(out_ref.data(), + out.data(), + params.rows * params.cols, + CompareApprox(params.tolerance))); +} +INSTANTIATE_TEST_SUITE_P(MatrixVectorTests, + MatrixVectorTestD_i64, + ::testing::ValuesIn(inputsd_i64)); + +} // end namespace linalg +} // end 
namespace raft diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu index 2023ce4121..b5a3168a06 100644 --- a/cpp/test/linalg/matrix_vector_op.cu +++ b/cpp/test/linalg/matrix_vector_op.cu @@ -17,6 +17,7 @@ #include "../test_utils.h" #include "matrix_vector_op.cuh" #include +#include #include #include diff --git a/cpp/test/linalg/matrix_vector_op.cuh b/cpp/test/linalg/matrix_vector_op.cuh index f46d70eaa3..934c2f3e0d 100644 --- a/cpp/test/linalg/matrix_vector_op.cuh +++ b/cpp/test/linalg/matrix_vector_op.cuh @@ -21,6 +21,48 @@ namespace raft { namespace linalg { +template +__global__ void naiveMatVecOpKernel(Type* mat, + const Type* vec, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + LambdaOp operation) +{ + IdxType idx = threadIdx.x + blockIdx.x * blockDim.x; + IdxType len = N * D; + IdxType col; + if (rowMajor && bcastAlongRows) { + col = idx % D; + } else if (!rowMajor && !bcastAlongRows) { + col = idx % N; + } else if (rowMajor && !bcastAlongRows) { + col = idx / D; + } else { + col = idx / N; + } + if (idx < len) { mat[idx] = operation(mat[idx], vec[col]); } +} + +template +void naiveMatVecOp(Type* mat, + const Type* vec, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + LambdaOp operation, + cudaStream_t stream) +{ + static const IdxType TPB = 64; + IdxType len = N * D; + IdxType nblks = raft::ceildiv(len, TPB); + naiveMatVecOpKernel + <<>>(mat, vec, D, N, rowMajor, bcastAlongRows, operation); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + template __global__ void naiveMatVecKernel(Type* out, const Type* mat, From ddc8a52f1238c0fb6ef9a7934aeb59fef76a9c60 Mon Sep 17 00:00:00 2001 From: Mickael Ide Date: Thu, 27 Oct 2022 19:24:20 +0200 Subject: [PATCH 13/13] Add struct for slice coordinates and simplify reverse code --- cpp/include/raft/matrix/reverse.cuh | 56 ++++++++++------------------- cpp/include/raft/matrix/slice.cuh | 50 +++++++++++++++----------- cpp/test/matrix/slice.cu | 2 +- 3 
files changed, 50 insertions(+), 58 deletions(-) diff --git a/cpp/include/raft/matrix/reverse.cuh b/cpp/include/raft/matrix/reverse.cuh index 13000332a0..e00a240577 100644 --- a/cpp/include/raft/matrix/reverse.cuh +++ b/cpp/include/raft/matrix/reverse.cuh @@ -18,59 +18,41 @@ #include #include +#include namespace raft::matrix { /** - * @brief Columns of a column major matrix are reversed in place (i.e. first column and + * @brief Reverse the columns of a matrix in place (i.e. first column and * last column are swapped) * @param[in] handle: raft handle * @param[inout] inout: input and output matrix */ -template -void col_reverse(const raft::handle_t& handle, - raft::device_matrix_view inout) +template +void col_reverse(const raft::handle_t& handle, raft::device_matrix_view inout) { - detail::colReverse(inout.data_handle(), inout.extent(0), inout.extent(1), handle.get_stream()); + RAFT_EXPECTS(raft::is_row_or_column_major(inout), "Unsupported matrix layout"); + if (raft::is_col_major(inout)) { + detail::colReverse(inout.data_handle(), inout.extent(0), inout.extent(1), handle.get_stream()); + } else { + detail::rowReverse(inout.data_handle(), inout.extent(1), inout.extent(0), handle.get_stream()); + } } /** - * @brief Columns of a row major matrix are reversed in place (i.e. first column and - * last column are swapped) - * @param[in] handle: raft handle - * @param[inout] inout: input and output matrix - */ -template -void col_reverse(const raft::handle_t& handle, - raft::device_matrix_view inout) -{ - detail::rowReverse(inout.data_handle(), inout.extent(1), inout.extent(0), handle.get_stream()); -} - -/** - * @brief Rows of a column major matrix are reversed in place (i.e. first row and last + * @brief Reverse the rows of a matrix in place (i.e. 
first row and last * row are swapped) * @param[in] handle: raft handle * @param[inout] inout: input and output matrix */ -template -void row_reverse(const raft::handle_t& handle, - raft::device_matrix_view inout) +template +void row_reverse(const raft::handle_t& handle, raft::device_matrix_view inout) { - detail::rowReverse(inout.data_handle(), inout.extent(0), inout.extent(1), handle.get_stream()); + RAFT_EXPECTS(raft::is_row_or_column_major(inout), "Unsupported matrix layout"); + if (raft::is_col_major(inout)) { + detail::rowReverse(inout.data_handle(), inout.extent(0), inout.extent(1), handle.get_stream()); + } else { + detail::colReverse(inout.data_handle(), inout.extent(1), inout.extent(0), handle.get_stream()); + } } - -/** - * @brief Rows of a row major matrix are reversed in place (i.e. first row and last - * row are swapped) - * @param[in] handle: raft handle - * @param[inout] inout: input and output matrix - */ -template -void row_reverse(const raft::handle_t& handle, - raft::device_matrix_view inout) -{ - detail::colReverse(inout.data_handle(), inout.extent(1), inout.extent(0), handle.get_stream()); -} - } // namespace raft::matrix \ No newline at end of file diff --git a/cpp/include/raft/matrix/slice.cuh b/cpp/include/raft/matrix/slice.cuh index ef7ff3d28d..eda2853c78 100644 --- a/cpp/include/raft/matrix/slice.cuh +++ b/cpp/include/raft/matrix/slice.cuh @@ -21,41 +21,51 @@ namespace raft::matrix { +template +struct slice_coordinates { + idx_t row1; ///< row coordinate of the top-left point of the wanted area (0-based) + idx_t col1; ///< column coordinate of the top-left point of the wanted area (0-based) + idx_t row2; ///< row coordinate of the bottom-right point of the wanted area (1-based) + idx_t col2; ///< column coordinate of the bottom-right point of the wanted area (1-based) + + slice_coordinates(idx_t row1_, idx_t col1_, idx_t row2_, idx_t col2_) + : row1(row1_), col1(col1_), row2(row2_), col2(col2_) + { + } +}; + /** * @brief Slice a matrix 
(in-place) + * @tparam m_t type of matrix elements + * @tparam idx_t integer type used for indexing * @param[in] handle: raft handle * @param[in] in: input matrix (column-major) - * @param[inout] out: output matrix (column-major) - * @param[in] x1, y1: coordinate of the top-left point of the wanted area (0-based) - * @param[in] x2, y2: coordinate of the bottom-right point of the wanted area - * (1-based) - * example: Slice the 2nd and 3rd columns of a 4x3 matrix: slice_matrix(M_d, 4, - * 3, 0, 1, 4, 3); + * @param[out] out: output matrix (column-major) + * @param[in] coords: coordinates of the wanted slice + * example: Slice the 2nd and 3rd columns of a 4x3 matrix: slice(handle, in, out, {0, 1, 4, 3}); */ template void slice(const raft::handle_t& handle, raft::device_matrix_view in, raft::device_matrix_view out, - idx_t x1, - idx_t y1, - idx_t x2, - idx_t y2) + slice_coordinates coords) { - RAFT_EXPECTS(x2 > x1, "x2 must be > x1"); - RAFT_EXPECTS(y2 > y1, "y2 must be > y1"); - RAFT_EXPECTS(x1 >= 0, "x1 must be >= 0"); - RAFT_EXPECTS(x2 <= in.extent(0), "x2 must be <= number of rows in the input matrix"); - RAFT_EXPECTS(y1 >= 0, "y1 must be >= 0"); - RAFT_EXPECTS(y2 <= in.extent(1), "y2 must be <= number of columns in the input matrix"); + RAFT_EXPECTS(coords.row2 > coords.row1, "row2 must be > row1"); + RAFT_EXPECTS(coords.col2 > coords.col1, "col2 must be > col1"); + RAFT_EXPECTS(coords.row1 >= 0, "row1 must be >= 0"); + RAFT_EXPECTS(coords.row2 <= in.extent(0), "row2 must be <= number of rows in the input matrix"); + RAFT_EXPECTS(coords.col1 >= 0, "col1 must be >= 0"); + RAFT_EXPECTS(coords.col2 <= in.extent(1), + "col2 must be <= number of columns in the input matrix"); detail::sliceMatrix(in.data_handle(), in.extent(0), in.extent(1), out.data_handle(), - x1, - y1, - x2, - y2, + coords.row1, + coords.col1, + coords.row2, + coords.col2, handle.get_stream()); } } // namespace raft::matrix \ No newline at end of file diff --git a/cpp/test/matrix/slice.cu 
b/cpp/test/matrix/slice.cu index f0cce2c184..9744e3724a 100644 --- a/cpp/test/matrix/slice.cu +++ b/cpp/test/matrix/slice.cu @@ -87,7 +87,7 @@ class SliceTest : public ::testing::TestWithParam> { raft::make_device_matrix_view(data.data(), rows, cols); auto output = raft::make_device_matrix_view( d_act_result.data(), row2 - row1, col2 - col1); - slice(handle, input, output, row1, col1, row2, col2); + slice(handle, input, output, slice_coordinates(row1, col1, row2, col2)); raft::update_host(act_result.data(), d_act_result.data(), d_act_result.size(), stream); handle.sync_stream(stream);