From af05bcce28dcfa764121879fe6e6dcb374be4d69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Malte=20F=C3=B6rster?= <97973773+mfoerste4@users.noreply.github.com> Date: Thu, 27 Oct 2022 04:42:05 +0200 Subject: [PATCH] Adding padded layout 'layout_padded_general' (#725) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a different approach / followup PR of https://github.com/rapidsai/raft/pull/663 for issue https://github.com/rapidsai/raft/issues/497. I implemented a `layout_padded_general` within raft to statically enforce padding on mdpsan accesses. * The layout has template parameters for `ValueType`, `StorageOrder `(default `row_major_t`), and `ByteAlignment `(default 128) * in order to *not* require changes upstream I skipped `submdspan `functionality right now. I have a branch on a mdspan fork where I tested this though (https://github.com/mfoerste4/mdspan/tree/layout_padded). Authors: - Malte Förster (https://github.com/mfoerste4) Approvers: - Artem M. Chirkin (https://github.com/achirkin) - Corey J. Nolet (https://github.com/cjnolet) URL: https://github.com/rapidsai/raft/pull/725 --- cpp/include/raft/core/device_mdspan.hpp | 45 + cpp/include/raft/core/host_mdspan.hpp | 45 + cpp/include/raft/core/mdspan.hpp | 35 + .../raft/matrix/detail/linewise_op.cuh | 233 ++++++ cpp/include/raft/matrix/linewise_op.cuh | 39 +- .../__p0009_bits/aligned_accessor.hpp | 189 +++++ .../__p0009_bits/layout_padded.hpp | 787 ++++++++++++++++++ .../mdspan/include/experimental/mdspan | 2 + cpp/test/matrix/linewise_op.cu | 157 +++- cpp/test/mdarray.cu | 420 ++++++++++ 10 files changed, 1942 insertions(+), 10 deletions(-) create mode 100644 cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/aligned_accessor.hpp create mode 100644 cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_padded.hpp diff --git a/cpp/include/raft/core/device_mdspan.hpp b/cpp/include/raft/core/device_mdspan.hpp index ffbbe43d01..7257b65f58 100644 --- a/cpp/include/raft/core/device_mdspan.hpp +++ b/cpp/include/raft/core/device_mdspan.hpp @@ -161,6 +161,51 @@ template using device_matrix_view = device_mdspan, LayoutPolicy>; +/** + * @brief Shorthand for 128 byte aligned device matrix view. + * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy must be of type layout_{left/right}_padded + */ +template , + typename = enable_if_layout_padded> +using device_aligned_matrix_view = + device_mdspan, + LayoutPolicy, + std::experimental::aligned_accessor>; + +/** + * @brief Create a 2-dim 128 byte aligned mdspan instance for device pointer. It's + * expected that the given layout policy match the layout of the underlying + * pointer. 
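+ *
+ * A minimal usage sketch (illustrative): `d_ptr` is assumed to be a `float*`
+ * to device memory that is 128 byte aligned and padded to the matching row
+ * pitch, with `n_rows`/`n_cols` of type `int`.
+ * @code
+ *   // LayoutPolicy defaults to raft::layout_right_padded<float> (row-major,
+ *   // each row padded to a multiple of 128 bytes).
+ *   auto padded_view = raft::make_device_aligned_matrix_view(d_ptr, n_rows, n_cols);
+ *   // padded_view(i, j) addresses row i, column j; the per-row padding
+ *   // elements are not part of the extents.
+ * @endcode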
+ * @tparam ElementType the data type of the matrix elements + * @tparam LayoutPolicy must be of type layout_{left/right}_padded + * @tparam IndexType the index type of the extents + * @param[in] ptr on device to wrap + * @param[in] n_rows number of rows in pointer + * @param[in] n_cols number of columns in pointer + */ +template > +auto make_device_aligned_matrix_view(ElementType* ptr, IndexType n_rows, IndexType n_cols) +{ + using data_handle_type = + typename std::experimental::aligned_accessor::data_handle_type; + static_assert(std::is_same>::value || + std::is_same>::value); + assert(ptr == alignTo(ptr, detail::alignment::value)); + + data_handle_type aligned_pointer = ptr; + + matrix_extent extents{n_rows, n_cols}; + return device_aligned_matrix_view{aligned_pointer, extents}; +} + /** * @brief Create a raft::managed_mdspan * @tparam ElementType the data type of the matrix elements diff --git a/cpp/include/raft/core/host_mdspan.hpp b/cpp/include/raft/core/host_mdspan.hpp index 3e76dbb9ce..961a7a7ccb 100644 --- a/cpp/include/raft/core/host_mdspan.hpp +++ b/cpp/include/raft/core/host_mdspan.hpp @@ -110,6 +110,51 @@ template using host_matrix_view = host_mdspan, LayoutPolicy>; +/** + * @brief Shorthand for 128 byte aligned host matrix view. + * @tparam ElementType the data type of the matrix elements + * @tparam IndexType the index type of the extents + * @tparam LayoutPolicy must be of type layout_{left/right}_padded + */ +template , + typename = enable_if_layout_padded> +using host_aligned_matrix_view = + host_mdspan, + LayoutPolicy, + std::experimental::aligned_accessor>; + +/** + * @brief Create a 2-dim 128 byte aligned mdspan instance for host pointer. It's + * expected that the given layout policy match the layout of the underlying + * pointer. + * @tparam ElementType the data type of the matrix elements + * @tparam LayoutPolicy must be of type layout_{left/right}_padded + * @tparam IndexType the index type of the extents + * @param[in] ptr on host to wrap + * @param[in] n_rows number of rows in pointer + * @param[in] n_cols number of columns in pointer + */ +template > +auto make_host_aligned_matrix_view(ElementType* ptr, IndexType n_rows, IndexType n_cols) +{ + using data_handle_type = + typename std::experimental::aligned_accessor::data_handle_type; + + static_assert(std::is_same>::value || + std::is_same>::value); + assert(ptr == alignTo(ptr, detail::alignment::value)); + data_handle_type aligned_pointer = ptr; + + matrix_extent extents{n_rows, n_cols}; + return host_aligned_matrix_view{aligned_pointer, extents}; +} + /** * @brief Create a 0-dim (scalar) mdspan instance for host value. 
* diff --git a/cpp/include/raft/core/mdspan.hpp b/cpp/include/raft/core/mdspan.hpp index 1b98a7a937..1faac44cc8 100644 --- a/cpp/include/raft/core/mdspan.hpp +++ b/cpp/include/raft/core/mdspan.hpp @@ -32,6 +32,40 @@ template > using mdspan = std::experimental::mdspan; +namespace detail { + +// keeping ByteAlignment as optional to allow testing +template +struct padding { + static_assert(std::is_same, ValueType>::value, + "std::experimental::padding ValueType has to be provided without " + "const or volatile specifiers."); + static_assert(ByteAlignment % sizeof(ValueType) == 0 || sizeof(ValueType) % ByteAlignment == 0, + "std::experimental::padding sizeof(ValueType) has to be multiple or " + "divider of ByteAlignment."); + static constexpr size_t value = std::max(ByteAlignment / sizeof(ValueType), 1ul); +}; + +// alignment fixed to 128 bytes +struct alignment { + static constexpr size_t value = 128; +}; + +} // namespace detail + +template +using layout_right_padded = std::experimental::layout_right_padded< + detail::padding>>::value>; + +template +using layout_left_padded = std::experimental::layout_left_padded< + detail::padding>>::value>; + +template +using enable_if_layout_padded = + std::enable_if_t>::value || + std::is_same>::value>; + /** * Ensure all types listed in the parameter pack `Extents` are integral types. * Usage: @@ -254,4 +288,5 @@ RAFT_INLINE_FUNCTION auto unravel_index(Idx idx, return unravel_index_impl(static_cast(idx), shape); } } + } // namespace raft diff --git a/cpp/include/raft/matrix/detail/linewise_op.cuh b/cpp/include/raft/matrix/detail/linewise_op.cuh index 15f5204382..8180b88c8a 100644 --- a/cpp/include/raft/matrix/detail/linewise_op.cuh +++ b/cpp/include/raft/matrix/detail/linewise_op.cuh @@ -16,6 +16,8 @@ #pragma once +#include +#include #include #include #include @@ -176,6 +178,37 @@ struct Linewise { return out; } } + + /** + * @brief Same as loadVec, but padds data with Ones + * + * @param shm + * @param p + * @param blockOffset + * @param rowLen + * @param rowLenPadded + * @return a contiguous chunk of a vector, suitable for `vectorRows`. + */ + static __device__ __forceinline__ Vec loadVecPadded(Type* shm, + const Type* p, + const IdxType blockOffset, + const IdxType rowLen, + const IdxType rowLenPadded) noexcept + { + IdxType j = blockOffset + threadIdx.x; +#pragma unroll VecElems + for (int k = threadIdx.x; k < VecElems * BlockSize; k += BlockSize, j += BlockSize) { + while (j >= rowLenPadded) + j -= rowLenPadded; + shm[k] = j < rowLen ? p[j] : Type(1); + } + __syncthreads(); + { + Vec out; + *out.vectorized_data() = reinterpret_cast(shm)[threadIdx.x]; + return out; + } + } }; /** @@ -325,6 +358,48 @@ __global__ void __launch_bounds__(BlockSize) (workOffset ^= workSize, L::loadVec(shm + workOffset, vecs, blockOffset, rowLen))...); } +/** + * Simplified version of `matrixLinewiseVecRowsMainKernel` for use with padded data. + * Data is required to be aligned and padded. 
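+ *
+ * Note (derived from `loadVecPadded` above): vector elements that fall into
+ * the padding tail of a row (indices in [rowLen, rowLenPadded)) are
+ * substituted by `Type(1)`, so `op` also runs over the padded positions of
+ * the matrix; the values written there lie outside the unpadded extents and
+ * are never read through the aligned matrix view.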
+ * + * @param [out] out the start of the *aligned* part of the output matrix + * @param [in] in the start of the *aligned* part of the input matrix + * @param [in] arrOffset such an offset into the matrices that makes them aligned to `VecBytes` + * @param [in] rowLen number of elements in a row (= the vector size) + * @param [in] len the total length of the aligned part of the matrices + * @param [in] op the function to apply + * @param [in] vecs pointers to the argument vectors + */ +template +__global__ void __launch_bounds__(BlockSize) + matrixLinewiseVecRowsSpanKernel(Type* out, + const Type* in, + const IdxType rowLen, + const IdxType rowLenPadded, + const IdxType lenPadded, + Lambda op, + Vecs... vecs) +{ + typedef Linewise L; + constexpr uint workSize = L::VecElems * BlockSize; + uint workOffset = workSize; + __shared__ __align__(sizeof(Type) * L::VecElems) + Type shm[workSize * ((sizeof...(Vecs)) > 1 ? 2 : 1)]; + const IdxType blockOffset = (BlockSize * L::VecElems * blockIdx.x) % rowLenPadded; + return L::vectorRows( + reinterpret_cast(out), + reinterpret_cast(in), + L::AlignElems::div(lenPadded), + op, + (workOffset ^= workSize, + L::loadVecPadded(shm + workOffset, vecs, blockOffset, rowLen, rowLenPadded))...); +} + /** * This kernel is similar to `matrixLinewiseVecRowsMainKernel`, but processes only the unaligned * head and tail parts of the matrix. @@ -444,6 +519,59 @@ void matrixLinewiseVecCols(Type* out, } } +/** + * input/output data is expected to be aligned and padded + * we simply extend the operation over the padded elements to be fully aligned + */ +template +void matrixLinewiseVecColsSpan( + raft::device_aligned_matrix_view out, + raft::device_aligned_matrix_view in, + const IdxType rowLen, + const IdxType nRows, + Lambda op, + cudaStream_t stream, + Vecs... vecs) +{ + typedef raft::Pow2 AlignBytes; + constexpr std::size_t VecElems = VecBytes / sizeof(Type); + + typedef raft::Pow2::padding> AlignPadding; + + const uint paddedRowLen = AlignPadding::roundUp(rowLen); + const IdxType alignedLen = paddedRowLen * nRows; + + if (rowLen * nRows > 0) { + constexpr dim3 bs(BlockSize, 1, 1); + // Minimum size of the grid to make the device well occupied + const uint occupy = getOptimalGridSize(); + // does not make sense to have more blocks than this + const uint maxBlocks = raft::ceildiv(uint(alignedLen), bs.x * VecElems); + const dim3 gs(std::min(maxBlocks, occupy), 1, 1); + // The work arrangement is blocked on the block and warp levels; + // see more details at Linewise::vectorCols. + // The value below determines how many scalar elements are processed by on thread in total. + const IdxType elemsPerThread = + raft::ceildiv(alignedLen, gs.x * VecElems * BlockSize) * VecElems; + matrixLinewiseVecColsMainKernel + <<>>(out.data_handle(), + in.data_handle(), + 0, + paddedRowLen, + alignedLen, + elemsPerThread, + op, + vecs...); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } +} + template +void matrixLinewiseVecRowsSpan( + raft::device_aligned_matrix_view out, + raft::device_aligned_matrix_view in, + const IdxType rowLen, + const IdxType nRows, + Lambda op, + cudaStream_t stream, + Vecs... 
vecs) +{ + constexpr std::size_t VecElems = VecBytes / sizeof(Type); + typedef raft::Pow2 AlignBytes; + + typedef raft::Pow2::padding> AlignPadding; + + const uint paddedRowLen = AlignPadding::roundUp(rowLen); + const IdxType alignedLen = paddedRowLen * nRows; + + if (rowLen * nRows > 0) { + constexpr dim3 bs(BlockSize, 1, 1); + // The work arrangement is striped; + // see more details at Linewise::vectorRows. + // Below is the work amount performed by one block in one iteration. + constexpr uint block_work_size = bs.x * uint(VecElems); + /* Here I would define `grid_work_size = lcm(block_work_size, rowLen)` (Least Common Multiple) + This way, the grid spans a set of one or more rows each iteration, and, most importantly, + on every iteration each row processes the same set of indices within a row (= the same set + of vector indices). + This means, each block needs to load the values from the vector arguments only once. + Sadly, sometimes `grid_work_size > rowLen*nRows`, and sometimes grid_work_size > UINT_MAX. + That's why I don't declare it here explicitly. + Instead, I straightaway compute the + expected_grid_size = lcm(block_work_size, rowLen) / block_work_size + */ + const uint expected_grid_size = paddedRowLen / raft::gcd(block_work_size, uint(paddedRowLen)); + // Minimum size of the grid to make the device well occupied + const uint occupy = getOptimalGridSize(); + const dim3 gs(std::min( + // does not make sense to have more blocks than this + raft::ceildiv(uint(alignedLen), block_work_size), + // increase the grid size to be not less than `occupy` while + // still being the multiple of `expected_grid_size` + raft::ceildiv(occupy, expected_grid_size) * expected_grid_size), + 1, + 1); + + matrixLinewiseVecRowsSpanKernel + <<>>( + out.data_handle(), in.data_handle(), rowLen, paddedRowLen, alignedLen, op, vecs...); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } +} + /** * Select one of the implementations: * a. vectors applied along/across lines @@ -541,6 +733,47 @@ struct MatrixLinewiseOp { return matrixLinewiseVecCols( out, in, lineLen, nLines, op, stream, vecs...); } + + template + static void runPadded(raft::device_aligned_matrix_view out, + raft::device_aligned_matrix_view in, + const IdxType lineLen, + const IdxType nLines, + const bool alongLines, + Lambda op, + cudaStream_t stream, + Vecs... vecs) + { + constexpr auto is_rowmajor = std::is_same_v>; + constexpr auto is_colmajor = std::is_same_v>; + + static_assert(is_rowmajor || is_colmajor, + "layout for in and out must be either padded row or col major"); + + // also statically assert padded matrix alignment == 2^i*VecBytes + assert(raft::Pow2::areSameAlignOffsets(in, out)); + + if (alongLines) + return matrixLinewiseVecRowsSpan(out, in, lineLen, nLines, op, stream, vecs...); + else + return matrixLinewiseVecColsSpan(out, in, lineLen, nLines, op, stream, vecs...); + } }; } // end namespace detail diff --git a/cpp/include/raft/matrix/linewise_op.cuh b/cpp/include/raft/matrix/linewise_op.cuh index 6b383b14f5..77f70239ea 100644 --- a/cpp/include/raft/matrix/linewise_op.cuh +++ b/cpp/include/raft/matrix/linewise_op.cuh @@ -42,9 +42,12 @@ namespace raft::matrix { * @param [in] alongLines whether vectors are indices along or across lines. * @param [in] op the operation applied on each line: * for i in [0..lineLen) and j in [0..nLines): + * out[j, i] = op(in[j, i], vec1[i], vec2[i], ... veck[i]) if alongLines = true + * out[j, i] = op(in[j, i], vec1[j], vec2[j], ... 
veck[j]) if alongLines = false + * where matrix indexing is row-major ([j, i] = [i + lineLen * j]). * out[i, j] = op(in[i, j], vec1[i], vec2[i], ... veck[i]) if alongLines = true * out[i, j] = op(in[i, j], vec1[j], vec2[j], ... veck[j]) if alongLines = false - * where matrix indexing is row-major ([i, j] = [i + lineLen * j]). + * where matrix indexing is col-major ([i, j] = [i + lineLen * j]). * @param [in] vecs zero or more vectors to be passed as arguments, * size of each vector is `alongLines ? lineLen : nLines`. */ @@ -67,8 +70,8 @@ void linewise_op(const raft::handle_t& handle, static_assert(is_rowmajor || is_colmajor, "layout for in and out must be either row or col major"); - const idx_t lineLen = is_rowmajor ? in.extent(0) : in.extent(1); - const idx_t nLines = is_rowmajor ? in.extent(1) : in.extent(0); + const idx_t nLines = is_rowmajor ? in.extent(0) : in.extent(1); + const idx_t lineLen = is_rowmajor ? in.extent(1) : in.extent(0); RAFT_EXPECTS(out.extent(0) == in.extent(0) && out.extent(1) == in.extent(1), "Input and output must have the same shape."); @@ -82,4 +85,34 @@ void linewise_op(const raft::handle_t& handle, handle.get_stream(), vecs.data_handle()...); } + +template > +void linewise_op(const raft::handle_t& handle, + raft::device_aligned_matrix_view in, + raft::device_aligned_matrix_view out, + const bool alongLines, + Lambda op, + vec_t... vecs) +{ + constexpr auto is_rowmajor = std::is_same_v>; + constexpr auto is_colmajor = std::is_same_v>; + + static_assert(is_rowmajor || is_colmajor, + "layout for in and out must be either padded row or col major"); + + const idx_t nLines = is_rowmajor ? in.extent(0) : in.extent(1); + const idx_t lineLen = is_rowmajor ? in.extent(1) : in.extent(0); + + RAFT_EXPECTS(out.extent(0) == in.extent(0) && out.extent(1) == in.extent(1), + "Input and output must have the same shape."); + + detail::MatrixLinewiseOp<16, 256>::runPadded( + out, in, lineLen, nLines, alongLines, op, handle.get_stream(), vecs.data_handle()...); +} + } // namespace raft::matrix diff --git a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/aligned_accessor.hpp b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/aligned_accessor.hpp new file mode 100644 index 0000000000..67356785c0 --- /dev/null +++ b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/aligned_accessor.hpp @@ -0,0 +1,189 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2019) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + + +// NOTE: This code is prematurely taken from an example based on +// https://github.com/kokkos/mdspan/pull/176 + +#pragma once + +#include "macros.hpp" +#include "trait_backports.hpp" +#include "default_accessor.hpp" +#include "extents.hpp" +#include +#include +#include + +namespace std { +namespace experimental { + +namespace stdex = std::experimental; + + +// Prefer std::assume_aligned if available, as it is in the C++ Standard. +// Otherwise, use a compiler-specific equivalent if available. + +// NOTE (mfh 2022/08/08) BYTE_ALIGNMENT must be unsigned and a power of 2. +#if defined(__cpp_lib_assume_aligned) +# define _MDSPAN_ASSUME_ALIGNED( ELEMENT_TYPE, POINTER, BYTE_ALIGNMENT ) (std::assume_aligned< BYTE_ALIGNMENT >( POINTER )) + constexpr char assume_aligned_method[] = "std::assume_aligned"; +#elif defined(__ICL) +# define _MDSPAN_ASSUME_ALIGNED( ELEMENT_TYPE, POINTER, BYTE_ALIGNMENT ) POINTER + constexpr char assume_aligned_method[] = "(none)"; +#elif defined(__ICC) +# define _MDSPAN_ASSUME_ALIGNED( ELEMENT_TYPE, POINTER, BYTE_ALIGNMENT ) POINTER + constexpr char assume_aligned_method[] = "(none)"; +#elif defined(__clang__) +# define _MDSPAN_ASSUME_ALIGNED( ELEMENT_TYPE, POINTER, BYTE_ALIGNMENT ) POINTER + constexpr char assume_aligned_method[] = "(none)"; +#elif defined(__GNUC__) + // __builtin_assume_aligned returns void* +# define _MDSPAN_ASSUME_ALIGNED( ELEMENT_TYPE, POINTER, BYTE_ALIGNMENT ) reinterpret_cast< ELEMENT_TYPE* >(__builtin_assume_aligned( POINTER, BYTE_ALIGNMENT )) + constexpr char assume_aligned_method[] = "__builtin_assume_aligned"; +#else +# define _MDSPAN_ASSUME_ALIGNED( ELEMENT_TYPE, POINTER, BYTE_ALIGNMENT ) POINTER + constexpr char assume_aligned_method[] = "(none)"; +#endif + +// Some compilers other than Clang or GCC like to define __clang__ or __GNUC__. +// Thus, we order the tests from most to least specific. 
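+// (For example, the classic Intel compilers also define __GNUC__, so the
+// __ICL / __ICC branches must be tested before the generic GCC/Clang ones.)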
+#if defined(__ICL) +# define _MDSPAN_ALIGN_VALUE_ATTRIBUTE( BYTE_ALIGNMENT ) __declspec(align_value( BYTE_ALIGNMENT )); + constexpr char align_attribute_method[] = "__declspec(align_value(BYTE_ALIGNMENT))"; +#elif defined(__ICC) +# define _MDSPAN_ALIGN_VALUE_ATTRIBUTE( BYTE_ALIGNMENT ) __attribute__((align_value( BYTE_ALIGNMENT ))); + constexpr char align_attribute_method[] = "__attribute__((align_value(BYTE_ALIGNMENT)))"; +#elif defined(__clang__) +# define _MDSPAN_ALIGN_VALUE_ATTRIBUTE( BYTE_ALIGNMENT ) __attribute__((align_value( BYTE_ALIGNMENT ))); + constexpr char align_attribute_method[] = "__attribute__((align_value(BYTE_ALIGNMENT)))"; +#else +# define _MDSPAN_ALIGN_VALUE_ATTRIBUTE( BYTE_ALIGNMENT ) + constexpr char align_attribute_method[] = "(none)"; +#endif + +constexpr bool +is_nonzero_power_of_two(const std::size_t x) +{ +// Just checking __cpp_lib_int_pow2 isn't enough for some GCC versions. +// The header exists, but std::has_single_bit does not. +#if defined(__cpp_lib_int_pow2) && __cplusplus >= 202002L + return std::has_single_bit(x); +#else + return x != 0 && (x & (x - 1)) == 0; +#endif +} + +template +constexpr bool +valid_byte_alignment(const std::size_t byte_alignment) +{ + return is_nonzero_power_of_two(byte_alignment) && byte_alignment >= alignof(ElementType); +} + +// We define aligned_pointer_t through a struct +// so we can check whether the byte alignment is valid. +// This makes it impossible to use the alias +// with an invalid byte alignment. +template +struct aligned_pointer { + static_assert(valid_byte_alignment(byte_alignment), + "byte_alignment must be a power of two no less than " + "the minimum required alignment of ElementType."); + using type = ElementType* _MDSPAN_ALIGN_VALUE_ATTRIBUTE( byte_alignment ); +}; + + +template +using aligned_pointer_t = typename aligned_pointer::type; + +template +aligned_pointer_t +bless(ElementType* ptr, std::integral_constant /* ba */ ) +{ + return _MDSPAN_ASSUME_ALIGNED( ElementType, ptr, byte_alignment ); +} + + +template +struct aligned_accessor { + using offset_policy = stdex::default_accessor; + using element_type = ElementType; + using reference = ElementType&; + using data_handle_type = aligned_pointer_t; + + constexpr aligned_accessor() noexcept = default; + + MDSPAN_TEMPLATE_REQUIRES( + class OtherElementType, + std::size_t other_byte_alignment, + /* requires */ (std::is_convertible::value && other_byte_alignment == byte_alignment) + ) + constexpr aligned_accessor(aligned_accessor) noexcept {} + + constexpr reference access(data_handle_type p, size_t i) const noexcept { + // This may declare alignment twice, depending on + // if we have an attribute for marking pointer types. + return _MDSPAN_ASSUME_ALIGNED( ElementType, p, byte_alignment )[i]; + } + + constexpr typename offset_policy::data_handle_type + offset(data_handle_type p, size_t i) const noexcept { + return p + i; + } +}; + +template +struct delete_raw { + void operator()(ElementType* p) const { + if (p != nullptr) { + // All the aligned allocation methods below go with std::free. + // If we implement a new method that uses a different + // deallocation function, that function would go here. 
+ std::free(p); + } + } +}; + +} // end namespace experimental +} // end namespace std diff --git a/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_padded.hpp b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_padded.hpp new file mode 100644 index 0000000000..cd9c9c19bf --- /dev/null +++ b/cpp/include/raft/thirdparty/mdspan/include/experimental/__p0009_bits/layout_padded.hpp @@ -0,0 +1,787 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2019) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + + +// NOTE: This code is prematurely taken from https://github.com/kokkos/mdspan/pull/180 +// and matches requirements described in https://github.com/ORNL/cpp-proposals-pub/pull/296 +// Some parts (as submdspan integration) are missing +// EDIT: the meaning of the template argument 'padding_stride' was adjusted from a +// fixed stride to a padding alignment, allowing dimensions > padding_stride to be padded +// to multiples of 'padding_stride' + +#pragma once + +#include "macros.hpp" +#include "trait_backports.hpp" +#include "extents.hpp" +#include "layout_left.hpp" +#include "layout_right.hpp" +#include +#include +#include + +namespace std { +namespace experimental { + +namespace stdex = std::experimental; + +namespace details { + + // offset_index_sequence idea comes from "offset_sequence" here: + // https://devblogs.microsoft.com/oldnewthing/20200625-00/?p=103903 + // + // offset_index_sequence adds N to each element of the given IndexSequence. + // We can't just template on the parameter pack of indices directly; + // the pack needs to be contained in some type. 
+ // We choose index_sequence because it stores no run-time data. + template struct offset_index_sequence; + + template + struct offset_index_sequence> + { + using type = std::index_sequence<(Indices + N)...>; + }; + + template + using offset_index_sequence_t = typename offset_index_sequence::type; + + static_assert(std::is_same< + offset_index_sequence_t<3, std::make_index_sequence<4>>, + std::index_sequence<3, 4, 5, 6>>::value, + "offset_index_sequence defined incorrectly." ); + + // iota_index_sequence defines the half-open sequence + // begin, begin+1, begin+2, ..., end-1. + // If end == begin, then the sequence is empty (we permit this). + // + // Defining the struct first, rather than going straight to the type alias, + // lets us check the template arguments. + template + struct iota_index_sequence { + static_assert(end >= begin, "end must be >= begin."); + using type = + offset_index_sequence_t< begin, std::make_index_sequence >; + }; + + // iota_index_sequence_t is like make_index_sequence, + // except that it starts with begin instead of 0. + template + using iota_index_sequence_t = typename iota_index_sequence::type; + + static_assert(std::is_same< + iota_index_sequence_t<3, 6>, + std::index_sequence<3, 4, 5>>::value, + "iota_index_sequence defined incorrectly." ); + + static_assert(std::is_same< + iota_index_sequence_t<3, 3>, + std::index_sequence<>>::value, + "iota_index_sequence defined incorrectly." ); + + static_assert(std::is_same< + iota_index_sequence_t<3, 4>, + std::index_sequence<3>>::value, + "iota_index_sequence defined incorrectly." ); + + template + constexpr IndexType ceildiv(IndexType a, IndexType b) + { + return (a + b - 1) / b; + } + + template + constexpr IndexType alignTo(IndexType a, IndexType b) + { + return ceildiv(a, b) * b; + } + +} // namespace details + +// layout_padded_left implementation + +namespace details { + + + // The *_helper functions work around not having C++20 + // templated lambdas: []{} . + + // The third argument should always be + // iota_index_sequence_t<1, ReturnExtents::rank()>. + template + MDSPAN_INLINE_FUNCTION constexpr ReturnExtents + layout_left_extents_helper(const stdex::extents& unpadded_extent, + const InnerExtents& inner_extents, + std::index_sequence) + { + static_assert(sizeof...(TrailingIndices) + 1 == ReturnExtents::rank(), + "sizeof...(TrailingIndices) + 1 != ReturnExtents::rank()"); + static_assert(InnerExtents::rank() == ReturnExtents::rank(), + "InnerExtents::rank() != ReturnExtents::rank()"); + using index_type = typename ReturnExtents::index_type; + return ReturnExtents{ + unpadded_extent.extent(0), + index_type(inner_extents.extent(TrailingIndices))... + }; + } + + // The third argument should always be + // iota_index_sequence_t<0, ReturnExtents::rank() - 1>. 
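+  // It reassembles the user-visible extents: the leading rank()-1 extents are
+  // taken from the inner (padded) mapping unchanged, while the trailing,
+  // padded extent is replaced by the stored unpadded extent.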
+ template + MDSPAN_INLINE_FUNCTION constexpr ReturnExtents + layout_right_extents_helper(const InnerExtents& inner_extents, + const stdex::extents& unpadded_extent, + std::index_sequence) + { + static_assert(sizeof...(LeadingIndices) + 1 == ReturnExtents::rank(), + "sizeof...(LeadingIndices) + 1 != ReturnExtents::rank()"); + static_assert(InnerExtents::rank() == ReturnExtents::rank(), + "InnerExtents::rank() != ReturnExtents::rank()"); + using index_type = typename ReturnExtents::index_type; + return ReturnExtents{ + index_type(inner_extents.extent(LeadingIndices))..., + unpadded_extent.extent(0) + }; + } + + template + MDSPAN_INLINE_FUNCTION constexpr ReturnExtents + layout_left_extents(const stdex::extents& unpadded_extent, + const stdex::extents& inner_extents) + { + return layout_left_extents_helper( + unpadded_extent, + inner_extents, + details::iota_index_sequence_t<1, ReturnExtents::rank()>{} + ); + } + + // Rank-0 unpadded_extent means rank-0 input, + // but the latter turns out not to matter here. + + template + MDSPAN_INLINE_FUNCTION constexpr ReturnExtents + layout_left_extents(const stdex::extents& /* unpadded_extent */ , + const stdex::extents& inner_extents) + { + return inner_extents; + } + + template + MDSPAN_INLINE_FUNCTION constexpr ReturnExtents + layout_right_extents(const stdex::extents& inner_extents, + const stdex::extents& unpadded_extent) + { + // If rank() is zero, size_t(-1) would be a very large upper bound. + static_assert(ReturnExtents::rank() != 0, + "ReturnExtents::rank() must not be 0"); + return layout_right_extents_helper( + inner_extents, + unpadded_extent, + details::iota_index_sequence_t<0, ReturnExtents::rank() - 1>{} + ); + } + + // Rank-0 unpadded_extent means rank-0 input, + // but the latter turns out not to matter here. + + template + MDSPAN_INLINE_FUNCTION constexpr ReturnExtents + layout_right_extents(const stdex::extents& inner_extents, + const stdex::extents& /* unpadded_extent */ ) + { + return inner_extents; + } + + template< + class InputExtentsType, + std::size_t PaddingExtent, + std::size_t ... Indices + > + MDSPAN_INLINE_FUNCTION constexpr auto + pad_extents_left_helper(const InputExtentsType& input, + const stdex::extents& padding, + std::index_sequence) + { + // NOTE (mfh 2022/09/04) This can be if constexpr, + // if the compiler supports it. + if /* constexpr */ (PaddingExtent == stdex::dynamic_extent) { + assert(padding.extent(0) != stdex::dynamic_extent); + } + using input_type = std::remove_cv_t>; + using index_type = typename input_type::index_type; + constexpr std::size_t rank = input_type::rank(); + static_assert(sizeof...(Indices) == std::size_t(rank - 1), + "Indices pack has the wrong size."); + using return_type = stdex::extents< + index_type, + stdex::dynamic_extent, + input_type::static_extent(Indices)... + >; + return return_type{ + index_type(details::alignTo(input.extent(0), padding.extent(0))), + input.extent(Indices)... + }; + } + + template< + class InputExtentsType, + std::size_t PaddingExtent, + std::size_t ... Indices + > + MDSPAN_INLINE_FUNCTION constexpr auto + pad_extents_right_helper(const InputExtentsType& input, + const stdex::extents& padding, + std::index_sequence) + { + // NOTE (mfh 2022/09/04) This can be if constexpr, + // if the compiler supports it. 
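+    // The run-time assert below only applies to the dynamic-extent case: it
+    // verifies that an actual padding value was supplied rather than the
+    // dynamic_extent sentinel.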
+ if /* constexpr */ (PaddingExtent == stdex::dynamic_extent) { + assert(padding.extent(0) != stdex::dynamic_extent); + } + using input_type = std::remove_cv_t>; + using index_type = typename input_type::index_type; + constexpr std::size_t rank = input_type::rank(); + static_assert(sizeof...(Indices) == std::size_t(rank - 1), + "Indices pack has the wrong size."); + + using return_type = stdex::extents< + index_type, + input_type::static_extent(Indices)..., + stdex::dynamic_extent + >; + return return_type{ + input.extent(Indices)..., + index_type(details::alignTo(input.extent(rank - 1), padding.extent(0))) + }; + } + + // Rank-0 and rank-1 mdspan don't need extra padding from their layout. + // They rely on an "aligned_accessor" and on the data_handle's alignment. + + MDSPAN_TEMPLATE_REQUIRES( + class IndexType, + std::size_t PaddingExtent, + std::size_t ... InputExtents, + /* requires */ (sizeof...(InputExtents) <= std::size_t(1)) + ) + MDSPAN_INLINE_FUNCTION constexpr auto + pad_extents_left(const stdex::extents& input, + const stdex::extents /* padding */ ) + { + return input; + } + + MDSPAN_TEMPLATE_REQUIRES( + class IndexType, + std::size_t PaddingExtent, + std::size_t ... InputExtents, + /* requires */ (sizeof...(InputExtents) <= std::size_t(1)) + ) + MDSPAN_INLINE_FUNCTION constexpr auto + pad_extents_right(const stdex::extents& input, + const stdex::extents /* padding */ ) + { + return input; + } + + // rank > 1 case follows. + + MDSPAN_TEMPLATE_REQUIRES( + class IndexType, + std::size_t PaddingExtent, + std::size_t ... InputExtents, + /* requires */ (sizeof...(InputExtents) > std::size_t(1)) + ) + MDSPAN_INLINE_FUNCTION constexpr auto + pad_extents_left(const stdex::extents& input, + const stdex::extents padding) + { + constexpr std::size_t rank = sizeof...(InputExtents); + return details::pad_extents_left_helper + (input, padding, details::iota_index_sequence_t<1, rank>{}); + } + + MDSPAN_TEMPLATE_REQUIRES( + class IndexType, + std::size_t PaddingExtent, + std::size_t ... InputExtents, + /* requires */ (sizeof...(InputExtents) > std::size_t(1)) + ) + MDSPAN_INLINE_FUNCTION constexpr auto + pad_extents_right(const stdex::extents& input, + const stdex::extents padding) + { + constexpr std::size_t rank = sizeof...(InputExtents); + return details::pad_extents_right_helper + (input, padding, details::iota_index_sequence_t<0, rank - 1>{}); + } + + MDSPAN_TEMPLATE_REQUIRES( + class IndexType, + std::size_t ... InputExtents, + /* requires */ (sizeof...(InputExtents) != std::size_t(0)) + ) + MDSPAN_INLINE_FUNCTION constexpr auto + unpadded_extent_left(const stdex::extents& input) + { + using input_type = stdex::extents; + return stdex::extents{input.extent(0)}; + } + + MDSPAN_TEMPLATE_REQUIRES( + class IndexType, + std::size_t ... InputExtents, + /* requires */ (sizeof...(InputExtents) != std::size_t(0)) + ) + MDSPAN_INLINE_FUNCTION constexpr auto + unpadded_extent_right(const stdex::extents& input) + { + using input_type = stdex::extents; + const auto rank = input_type::rank(); + return stdex::extents{input.extent(rank - 1)}; + } + + template + MDSPAN_INLINE_FUNCTION constexpr auto + unpadded_extent_left(const stdex::extents& /* input */ ) + { + return stdex::extents{}; + } + + template + MDSPAN_INLINE_FUNCTION constexpr auto + unpadded_extent_right(const stdex::extents& /* input */ ) + { + return stdex::extents{}; + } + + // Helper functions to work around C++14's lack of "if constexpr." 
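+  // left_padding_extents / right_padding_extents below select between the
+  // rank-0 and rank >= 1 cases by overloading on an integral_constant rank tag.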
+ + template + MDSPAN_INLINE_FUNCTION constexpr PaddingExtentsType + left_padding_extents(const InnerMappingType& inner_mapping, + std::integral_constant /* rank */ ) + { + return PaddingExtentsType{inner_mapping.extent(0)}; + } + + template + MDSPAN_INLINE_FUNCTION constexpr PaddingExtentsType + left_padding_extents(const InnerMappingType& /* inner_mapping */ , + std::integral_constant /* rank */ ) + { + return PaddingExtentsType{}; + } + + template + MDSPAN_INLINE_FUNCTION constexpr PaddingExtentsType + right_padding_extents(const InnerMappingType& inner_mapping, + std::integral_constant /* rank */ ) + { + return PaddingExtentsType{inner_mapping.extent(Rank - 1)}; + } + + template + MDSPAN_INLINE_FUNCTION constexpr PaddingExtentsType + right_padding_extents(const InnerMappingType& /* inner_mapping */ , + std::integral_constant /* rank */ ) + { + return PaddingExtentsType{}; + } + +} // namespace details + + + +// TODO (mfh 2022/08/30) Private inheritance from layout_left::mapping +// resp. layout_right::mapping would reduce inlining depth. + +// layout_left_padded is like layout_left, +// except that stride(0) == 1 always, +// and the leftmost extent may be padded +// (so that stride(1) could possibly be greater than extent(0)). +// +// This layout exists for two reasons: +// +// 1. Appropriate choice of padding, plus use of overaligned memory, +// can ensure any desired power-of-two overalignment of the +// beginning of each contiguous segment of elements in an mdspan. +// This is useful for hardware that optimizes for overaligned +// access. +// +// 2. For rank-2 mdspan, this is exactly the layout supported by the +// BLAS and LAPACK (where the "leading dimension" of the matrix +// (LDA), i.e., the stride, is greater than or equal to the number +// of rows). +// +// The padding can be either a compile-time value or a run-time value. +// It is a template parameter of layout_left_padded (the "tag type"), +// and NOT of the mapping, because mdspan requires that the mapping be +// a metafunction of the tag type and the extents specialization type. +template +struct layout_left_padded { + static constexpr size_t padding = padding_stride; + template + class mapping { + public: + using extents_type = Extents; + using index_type = typename extents_type::index_type; + using size_type = typename extents_type::size_type; + using rank_type = typename extents_type::rank_type; + using layout_type = layout_left_padded; + + private: + using padding_extents_type = + stdex::extents; + using inner_layout_type = stdex::layout_left; + using inner_extents_type = decltype( + details::pad_extents_left( + std::declval(), + std::declval() + ) + ); + using inner_mapping_type = + typename inner_layout_type::template mapping; + using unpadded_extent_type = + decltype(details::unpadded_extent_left(std::declval())); + + inner_mapping_type inner_mapping_; + unpadded_extent_type unpadded_extent_; + + padding_extents_type padding_extents() const { + return details::left_padding_extents( + inner_mapping_, + std::integral_constant{}); + } + + public: + // mapping constructor that takes ONLY an extents_type. + // + // This constructor makes it possible to construct an mdspan + // from a pointer and extents, since that requires that + // the mapping be constructible from extents alone. 
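+    //
+    // Illustrative example: with layout_left_padded<4> and run-time extents
+    // {6, 7}, the leftmost extent is padded up to alignTo(6, 4) = 8, so
+    // stride(1) == 8 and required_span_size() == 8 * 7, while extents()
+    // still reports {6, 7}.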
+ MDSPAN_INLINE_FUNCTION constexpr + mapping(const extents_type& ext) : + inner_mapping_(details::pad_extents_left( + ext, + padding_extents_type{padding_stride})), + unpadded_extent_(details::unpadded_extent_left(ext)) + {} + + // mapping constructor that takes an extents_type, + // AND an integral padding_value. + // + // This constructor always exists, even if padding is known at + // compile time -- just like the extents constructor lets you pass + // in all rank() extents, even if some of them are known at + // compile time. + template + MDSPAN_INLINE_FUNCTION constexpr + mapping(const extents_type& ext, + Size padding_value, + std::enable_if_t< + std::is_convertible::value && + std::is_nothrow_constructible::value + >* = nullptr) : + inner_mapping_(details::pad_extents_left(ext, padding_extents_type{padding_value})), + unpadded_extent_(details::unpadded_extent_left(ext)) + { + // We don't have to check padding_value here, because the + // padding_extents_type constructor already has a precondition. + } + + // Pass in the padding as an extents object. + MDSPAN_INLINE_FUNCTION constexpr + mapping(const extents_type& ext, + const stdex::extents& padding_extents) : + inner_mapping_(details::pad_extents_left(ext, padding_extents)), + unpadded_extent_(details::unpadded_extent_left(ext)) + {} + + // FIXME (mfh 2022/09/28) Converting constructor taking + // layout_right_padded::mapping + // is in the proposal, but missing here. + + // layout_stride::mapping deliberately only defines the copy + // constructor and copy assignment operator, not the move + // constructor or move assignment operator. This is fine because + // all the storage is std::array-like; there's no advantage to + // move construction or move assignment. We imitate this. + MDSPAN_INLINE_FUNCTION_DEFAULTED + constexpr mapping(const mapping&) noexcept = default; + MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED + mapping& operator=(const mapping&) noexcept = default; + + MDSPAN_INLINE_FUNCTION + constexpr extents_type extents() const noexcept + { + return details::layout_left_extents( + unpadded_extent_, + inner_mapping_.extents() + ); + } + + MDSPAN_INLINE_FUNCTION + constexpr std::array + strides() const noexcept + { + return inner_mapping_.strides(); + } + + MDSPAN_INLINE_FUNCTION + constexpr index_type required_span_size() const noexcept + { + return inner_mapping_.required_span_size(); + } + + MDSPAN_TEMPLATE_REQUIRES( + class... Indices, + /* requires */ (sizeof...(Indices) == Extents::rank() && + _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, Indices, index_type) /*&& ...*/ ) && + _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, Indices) /*&& ...*/) + ) + ) + MDSPAN_INLINE_FUNCTION + constexpr size_t operator()(Indices... idxs) const noexcept { + // TODO (mfh 2022/08/30) in debug mode, check precondition before forwarding to inner mapping. + return inner_mapping_(std::forward(idxs)...); + } + + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept { + return extents_type::rank() == 0 ? 
true : + (extents_type::static_extent(0) != stdex::dynamic_extent && + extents_type::static_extent(0) == unpadded_extent_type::static_extent(0)); + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { return true; } + + MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { return true; } + MDSPAN_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 bool is_exhaustive() const noexcept { + return extents_type::rank() == 0 ? true : + inner_mapping_.extent(0) == unpadded_extent_.extent(0); + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { return true; } + + MDSPAN_INLINE_FUNCTION + constexpr index_type stride(rank_type r) const noexcept { + return inner_mapping_.stride(r); + } + }; +}; + +template +struct layout_right_padded { + static constexpr size_t padding = padding_stride; + template + class mapping { + public: + using extents_type = Extents; + using index_type = typename extents_type::index_type; + using size_type = typename extents_type::size_type; + using rank_type = typename extents_type::rank_type; + using layout_type = layout_right_padded; + + private: + using padding_extents_type = + stdex::extents; + using inner_layout_type = stdex::layout_right; + using inner_extents_type = decltype( + details::pad_extents_right( + std::declval(), + std::declval() + ) + ); + using inner_mapping_type = + typename inner_layout_type::template mapping; + using unpadded_extent_type = + decltype(details::unpadded_extent_right(std::declval())); + + inner_mapping_type inner_mapping_; + unpadded_extent_type unpadded_extent_; + + padding_extents_type padding_extents() const { + return details::right_padding_extents( + inner_mapping_, + std::integral_constant{}); + } + + public: + // mapping constructor that takes ONLY an extents_type. + // + // This constructor makes it possible to construct an mdspan + // from a pointer and extents, since that requires that + // the mapping be constructible from extents alone. + MDSPAN_INLINE_FUNCTION constexpr + mapping(const extents_type& ext) : + inner_mapping_(details::pad_extents_right( + ext, + padding_extents_type{padding_stride})), + unpadded_extent_(details::unpadded_extent_right(ext)) + {} + + // mapping constructor that takes an extents_type, + // AND an integral padding_value. + // + // This constructor always exists, even if padding is known at + // compile time -- just like the extents constructor lets you pass + // in all rank() extents, even if some of them are known at + // compile time. + template + MDSPAN_INLINE_FUNCTION constexpr + mapping(const extents_type& ext, + Size padding_value, + std::enable_if_t< + std::is_convertible::value && + std::is_nothrow_constructible::value + >* = nullptr) : + inner_mapping_(details::pad_extents_right(ext, padding_extents_type{padding_value})), + unpadded_extent_(details::unpadded_extent_right(ext)) + { + // We don't have to check padding_value here, because the + // padding_extents_type constructor already has a precondition. + } + + // Pass in the padding as an extents object. + MDSPAN_INLINE_FUNCTION constexpr + mapping(const extents_type& ext, + const stdex::extents& padding_extents) : + inner_mapping_(details::pad_extents_right(ext, padding_extents)), + unpadded_extent_(details::unpadded_extent_right(ext)) + {} + + // FIXME (mfh 2022/09/28) The converting constructor taking + // layout_left_padded::mapping + // is in the proposal (missing other_padding_stride in R0), + // but missing here. 
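+
+    // Illustrative example: with layout_right_padded<32> (the padding RAFT
+    // derives for float at 128 byte alignment) and run-time extents {6, 7},
+    // the rightmost extent is padded up to alignTo(7, 32) = 32, so
+    // stride(0) == 32 and required_span_size() == 6 * 32, while extents()
+    // still reports {6, 7}.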
+ + // layout_stride::mapping deliberately only defines the copy + // constructor and copy assignment operator, not the move + // constructor or move assignment operator. This is fine because + // all the storage is std::array-like; there's no advantage to + // move construction or move assignment. We imitate this. + MDSPAN_INLINE_FUNCTION_DEFAULTED + constexpr mapping(const mapping&) noexcept = default; + MDSPAN_INLINE_FUNCTION_DEFAULTED _MDSPAN_CONSTEXPR_14_DEFAULTED + mapping& operator=(const mapping&) noexcept = default; + + MDSPAN_INLINE_FUNCTION + constexpr extents_type extents() const noexcept + { + return details::layout_right_extents( + inner_mapping_.extents(), + unpadded_extent_ + ); + } + + MDSPAN_INLINE_FUNCTION + constexpr std::array + strides() const noexcept + { + return inner_mapping_.strides(); + } + + MDSPAN_INLINE_FUNCTION + constexpr index_type required_span_size() const noexcept + { + return inner_mapping_.required_span_size(); + } + + MDSPAN_TEMPLATE_REQUIRES( + class... Indices, + /* requires */ (sizeof...(Indices) == Extents::rank() && + _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_convertible, Indices, index_type) /*&& ...*/ ) && + _MDSPAN_FOLD_AND(_MDSPAN_TRAIT(std::is_nothrow_constructible, index_type, Indices) /*&& ...*/) + ) + ) + MDSPAN_INLINE_FUNCTION + constexpr size_t operator()(Indices... idxs) const noexcept { + // TODO (mfh 2022/08/30) in debug mode, check precondition before forwarding to inner mapping. + return inner_mapping_(std::forward(idxs)...); + } + + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_unique() noexcept { return true; } + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_exhaustive() noexcept { + return extents_type::rank() == 0 ? true : + (extents_type::static_extent(Extents::rank() - 1) != stdex::dynamic_extent && + extents_type::static_extent(Extents::rank() - 1) == unpadded_extent_type::static_extent(0)); + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_always_strided() noexcept { return true; } + + MDSPAN_INLINE_FUNCTION static constexpr bool is_unique() noexcept { return true; } + MDSPAN_INLINE_FUNCTION _MDSPAN_CONSTEXPR_14 bool is_exhaustive() const noexcept { + return extents_type::rank() == 0 ? 
true : + inner_mapping_.extent(Extents::rank() - 1) == unpadded_extent_.extent(0); + } + MDSPAN_INLINE_FUNCTION static constexpr bool is_strided() noexcept { return true; } + + MDSPAN_INLINE_FUNCTION + constexpr index_type stride(rank_type r) const noexcept { + return inner_mapping_.stride(r); + } + }; +}; + +} // end namespace experimental +} // end namespace std diff --git a/cpp/include/raft/thirdparty/mdspan/include/experimental/mdspan b/cpp/include/raft/thirdparty/mdspan/include/experimental/mdspan index ca6f6b8686..c5fac4c0bd 100644 --- a/cpp/include/raft/thirdparty/mdspan/include/experimental/mdspan +++ b/cpp/include/raft/thirdparty/mdspan/include/experimental/mdspan @@ -44,6 +44,7 @@ #pragma once #include "__p0009_bits/default_accessor.hpp" +#include "__p0009_bits/aligned_accessor.hpp" #include "__p0009_bits/full_extent_t.hpp" #include "__p0009_bits/mdspan.hpp" #include "__p0009_bits/dynamic_extent.hpp" @@ -51,6 +52,7 @@ #include "__p0009_bits/layout_stride.hpp" #include "__p0009_bits/layout_left.hpp" #include "__p0009_bits/layout_right.hpp" +#include "__p0009_bits/layout_padded.hpp" #include "__p0009_bits/macros.hpp" #include "__p0009_bits/static_array.hpp" #include "__p0009_bits/submdspan.hpp" diff --git a/cpp/test/matrix/linewise_op.cu b/cpp/test/matrix/linewise_op.cu index 9d3d5af51e..931c3ecb16 100644 --- a/cpp/test/matrix/linewise_op.cu +++ b/cpp/test/matrix/linewise_op.cu @@ -64,10 +64,10 @@ struct LinewiseTest : public ::testing::TestWithParam(in, m, n); - auto out_view = raft::make_device_matrix_view(out, m, n); + auto in_view = raft::make_device_matrix_view(in, n, m); + auto out_view = raft::make_device_matrix_view(out, n, m); - auto vec_view = raft::make_device_vector_view(vec, m); + auto vec_view = raft::make_device_vector_view(vec, lineLen); matrix::linewise_op(handle, in_view, out_view, raft::is_row_major(in_view), f, vec_view); } @@ -81,10 +81,10 @@ struct LinewiseTest : public ::testing::TestWithParam(in, m, n); - auto out_view = raft::make_device_matrix_view(out, m, n); - auto vec1_view = raft::make_device_vector_view(vec1, m); - auto vec2_view = raft::make_device_vector_view(vec2, m); + auto in_view = raft::make_device_matrix_view(in, n, m); + auto out_view = raft::make_device_matrix_view(out, n, m); + auto vec1_view = raft::make_device_vector_view(vec1, lineLen); + auto vec2_view = raft::make_device_vector_view(vec2, lineLen); matrix::linewise_op( handle, in_view, out_view, raft::is_row_major(in_view), f, vec1_view, vec2_view); @@ -99,6 +99,19 @@ struct LinewiseTest : public ::testing::TestWithParam + void runLinewiseSumPadded(raft::device_aligned_matrix_view out, + raft::device_aligned_matrix_view in, + const I lineLen, + const I nLines, + const bool alongLines, + const T* vec) + { + auto f = [] __device__(T a, T b) -> T { return a + b; }; + auto vec_view = raft::make_device_vector_view(vec, lineLen); + matrix::linewise_op(handle, in, out, alongLines, f, vec_view); + } + /** * Suggest multiple versions of matrix dimensions (n, m), such that * @@ -205,11 +218,129 @@ struct LinewiseTest : public ::testing::TestWithParam>&& dims, + rmm::device_uvector&& blob) + { + rmm::device_uvector blob_val(params.checkCorrectness ? 
blob.size() / 2 : 0, stream); + + stream.synchronize(); + cudaProfilerStart(); + testing::AssertionResult r = testing::AssertionSuccess(); + for (auto alongRows : ::testing::Bool()) { + for (auto [n, m] : dims) { + if (!r) break; + // take dense testdata + auto [out, in, vec1, vec2] = assignSafePtrs(blob, n, m); + common::nvtx::range dims_scope("Dims-%zu-%zu", std::size_t(n), std::size_t(m)); + common::nvtx::range dir_scope(alongRows ? "alongRows" : "acrossRows"); + + auto lineLen = m; + auto nLines = n; + + // create a padded span based on testdata (just for functional testing) + size_t matrix_size_padded; + if (alongRows) { + auto extents = matrix_extent{n, m}; + typename raft::layout_right_padded::mapping> layout{extents}; + matrix_size_padded = layout.required_span_size(); + } else { + auto extents = matrix_extent{m, n}; + typename raft::layout_left_padded::mapping> layout{extents}; + matrix_size_padded = layout.required_span_size(); + } + + rmm::device_uvector blob_in(matrix_size_padded, stream); + rmm::device_uvector blob_out(matrix_size_padded, stream); + + { + auto in2 = in; + + // actual testrun + common::nvtx::range vecs_scope("one vec"); + if (alongRows) { + auto inSpan = make_device_aligned_matrix_view>( + blob_in.data(), nLines, lineLen); + auto outSpan = make_device_aligned_matrix_view>( + blob_out.data(), nLines, lineLen); + // prep padded input data + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0ul), + nLines * lineLen, + [inSpan, in2, lineLen] __device__(size_t i) { + inSpan(i / lineLen, i % lineLen) = in2[i]; + }); + auto inSpanConst = + make_device_aligned_matrix_view>( + blob_in.data(), nLines, lineLen); + runLinewiseSumPadded>( + outSpan, inSpanConst, lineLen, nLines, alongRows, vec1); + + if (params.checkCorrectness) { + runLinewiseSum(out, in, lineLen, nLines, vec1); + auto out_dense = blob_val.data(); + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0ul), + nLines * lineLen, + [outSpan, out_dense, lineLen] __device__(size_t i) { + out_dense[i] = outSpan(i / lineLen, i % lineLen); + }); + r = devArrMatch(out_dense, out, n * m, CompareApprox(params.tolerance)) + << " " << (alongRows ? "alongRows" : "acrossRows") + << " with one vec; lineLen: " << lineLen << "; nLines " << nLines; + if (!r) break; + } + + } else { + auto inSpan = make_device_aligned_matrix_view>( + blob_in.data(), lineLen, nLines); + auto outSpan = make_device_aligned_matrix_view>( + blob_out.data(), lineLen, nLines); + // prep padded input data + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0ul), + nLines * lineLen, + [inSpan, in2, lineLen] __device__(size_t i) { + inSpan(i % lineLen, i / lineLen) = in2[i]; + }); + auto inSpanConst = + make_device_aligned_matrix_view>( + blob_in.data(), lineLen, nLines); + runLinewiseSumPadded>( + outSpan, inSpanConst, lineLen, nLines, alongRows, vec1); + + if (params.checkCorrectness) { + runLinewiseSum(out, in, lineLen, nLines, vec1); + auto out_dense = blob_val.data(); + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0ul), + nLines * lineLen, + [outSpan, out_dense, lineLen] __device__(size_t i) { + out_dense[i] = outSpan(i % lineLen, i / lineLen); + }); + r = devArrMatch(out_dense, out, n * m, CompareApprox(params.tolerance)) + << " " << (alongRows ? 
"alongRows" : "acrossRows") + << " with one vec; lineLen: " << lineLen << "; nLines " << nLines; + if (!r) break; + } + } + } + } + } + cudaProfilerStop(); + + return r; + } + testing::AssertionResult run() { return run(suggestDimensions(2), genData(params.workSizeBytes)); } + testing::AssertionResult runWithPaddedSpan() + { + return runWithPaddedSpan(suggestDimensions(2), genData(params.workSizeBytes)); + } + testing::AssertionResult runEdgeCases() { std::vector sizes = {1, 2, 3, 4, 7, 16}; @@ -230,6 +361,13 @@ struct LinewiseTest : public ::testing::TestWithParam TestClass##Span_##ElemType##_##IndexType; \ + TEST_P(TestClass##Span_##ElemType##_##IndexType, fun) { ASSERT_TRUE(fun()); } \ + INSTANTIATE_TEST_SUITE_P(LinewiseOpSpan, TestClass##Span_##ElemType##_##IndexType, SpanParams) + +auto SpanParams = ::testing::Combine(::testing::Values(0), ::testing::Values(0)); + auto TinyParams = ::testing::Combine(::testing::Values(0, 1, 2, 4), ::testing::Values(0, 1, 2, 3)); struct Tiny { @@ -299,5 +437,10 @@ TEST_IT(run, Gigabyte, double, int); TEST_IT(run, TenGigs, float, uint64_t); TEST_IT(run, TenGigs, double, uint64_t); +TEST_IT_SPAN(runWithPaddedSpan, Megabyte, float, int); +TEST_IT_SPAN(runWithPaddedSpan, Megabyte, double, int); +TEST_IT_SPAN(runWithPaddedSpan, Gigabyte, float, int); +TEST_IT_SPAN(runWithPaddedSpan, Gigabyte, double, int); + } // namespace matrix } // end namespace raft diff --git a/cpp/test/mdarray.cu b/cpp/test/mdarray.cu index 3931ff224c..c292feb894 100644 --- a/cpp/test/mdarray.cu +++ b/cpp/test/mdarray.cu @@ -436,6 +436,425 @@ TEST(MDArray, FuncArg) } } +void test_mdspan_layout_right_padded() +{ + { + // 5x2 example, + constexpr int n_rows = 2; + constexpr int n_cols = 5; + constexpr int alignment = 8; + constexpr int alignment_bytes = sizeof(int) * alignment; + + int data_row_major[] = { + 1, + 2, + 3, + 4, + 5, /* X X X */ + 6, + 7, + 8, + 9, + 10 /* X X X */ + }; + // manually aligning the above, using -1 as filler + static constexpr int X = -1; + int data_padded[] = {1, 2, 3, 4, 5, X, X, X, 6, 7, 8, 9, 10, X, X, X}; + + using extents_type = stdex::extents; + using padded_layout_row_major = + stdex::layout_right_padded::value>; + using padded_mdspan = stdex::mdspan; + using row_major_mdspan = stdex::mdspan; + + padded_layout_row_major::mapping layout{extents_type{n_rows, n_cols}}; + + auto padded = padded_mdspan(data_padded, layout); + auto row_major = row_major_mdspan(data_row_major, n_rows, n_cols); + + int failures = 0; + for (int irow = 0; irow < n_rows; ++irow) { + for (int icol = 0; icol < n_cols; ++icol) { + if (padded(irow, icol) != row_major(irow, icol)) { ++failures; } + } + } + ASSERT_EQ(failures, 0); + } +} + +TEST(MDSpan, LayoutRightPadded) { test_mdspan_layout_right_padded(); } + +void test_mdarray_padding() +{ + using extents_type = stdex::extents; + auto s = rmm::cuda_stream_default; + + { + constexpr int rows = 6; + constexpr int cols = 7; + constexpr int alignment = 5; + constexpr int alignment_bytes = sizeof(int) * alignment; + + /** + * padded device array + */ + using padded_layout_row_major = + stdex::layout_right_padded::value>; + + using padded_mdarray_type = device_mdarray; + padded_layout_row_major::mapping layout(extents_type(rows, cols)); + + auto device_policy = padded_mdarray_type::container_policy_type{s}; + static_assert(std::is_same_v>); + padded_mdarray_type padded_device_array{layout, device_policy}; + + // direct access mdarray + padded_device_array(0, 3) = 1; + ASSERT_EQ(padded_device_array(0, 3), 1); + + // non-const access via 
mdspan + auto d_view = padded_device_array.view(); + static_assert(!decltype(d_view)::accessor_type::is_host_type::value); + + thrust::device_vector status(1, 0); + auto p_status = status.data().get(); + thrust::for_each_n(rmm::exec_policy(s), + thrust::make_counting_iterator(0ul), + 1, + [d_view, p_status] __device__(size_t i) { + if (d_view(0, 3) != 1) { myAtomicAdd(p_status, 1); } + d_view(0, 2) = 3; + if (d_view(0, 2) != 3) { myAtomicAdd(p_status, 1); } + }); + check_status(p_status, s); + + // const ref access via mdspan + auto const& arr = padded_device_array; + ASSERT_EQ(arr(0, 3), 1); + auto const_d_view = arr.view(); + thrust::for_each_n(rmm::exec_policy(s), + thrust::make_counting_iterator(0ul), + 1, + [const_d_view, p_status] __device__(size_t i) { + if (const_d_view(0, 3) != 1) { myAtomicAdd(p_status, 1); } + }); + check_status(p_status, s); + + // initialize with sequence + thrust::for_each_n( + rmm::exec_policy(s), + thrust::make_counting_iterator(0ul), + rows * cols, + [d_view, rows, cols] __device__(size_t i) { d_view(i / cols, i % cols) = i; }); + + // manually create span with layout + { + auto data_padded = padded_device_array.data_handle(); + using padded_mdspan_type = device_mdspan; + auto padded_span = padded_mdspan_type(data_padded, layout); + thrust::for_each_n(rmm::exec_policy(s), + thrust::make_counting_iterator(0ul), + rows * cols, + [padded_span, rows, cols, p_status] __device__(size_t i) { + if (padded_span(i / cols, i % cols) != i) myAtomicAdd(p_status, 1); + }); + check_status(p_status, s); + } + + // utilities + static_assert(padded_device_array.rank_dynamic() == 2); + static_assert(padded_device_array.rank() == 2); + static_assert(padded_device_array.is_unique()); + static_assert(padded_device_array.is_strided()); + + static_assert( + !std::is_nothrow_default_constructible::value); // cuda stream + static_assert(std::is_nothrow_move_constructible::value); + static_assert(std::is_nothrow_move_assignable::value); + } +} + +TEST(MDArray, Padding) { test_mdarray_padding(); } + +// Test deactivated as submdspan support requires upstream changes +/*void test_submdspan_padding() +{ + using extents_type = stdex::extents; + auto s = rmm::cuda_stream_default; + + { + constexpr int rows = 6; + constexpr int cols = 7; + constexpr int alignment = 5; + constexpr int alignment_bytes = sizeof(int) * alignment; + + using layout_padded_general = + stdex::layout_padded_general; + using padded_mdarray_type = device_mdarray; + using padded_mdspan_type = device_mdspan; + layout_padded_general::mapping layout{extents_type{rows, cols}}; + + auto device_policy = padded_mdarray_type::container_policy_type{s}; + static_assert(std::is_same_v>); + padded_mdarray_type padded_device_array{layout, device_policy}; + + // test status + thrust::device_vector status(1, 0); + auto p_status = status.data().get(); + + // initialize with sequence + { + auto d_view = padded_device_array.view(); + static_assert(std::is_same_v); + thrust::for_each_n( + rmm::exec_policy(s), + thrust::make_counting_iterator(0ul), + rows * cols, + [d_view, rows, cols] __device__(size_t i) { d_view(i / cols, i % cols) = i; }); + } + + // get mdspan manually from raw data + { + auto data_padded = padded_device_array.data(); + auto padded_span = padded_mdspan_type(data_padded, layout); + thrust::for_each_n(rmm::exec_policy(s), + thrust::make_counting_iterator(0ul), + rows * cols, + [padded_span, rows, cols, p_status] __device__(size_t i) { + if (padded_span(i / cols, i % cols) != i) myAtomicAdd(p_status, 1); + }); + 
check_status(p_status, s); + } + + // full subspan + { + auto padded_span = padded_device_array.view(); + auto subspan_full = stdex::submdspan(padded_span, stdex::full_extent, stdex::full_extent); + thrust::for_each_n(rmm::exec_policy(s), + thrust::make_counting_iterator(0ul), + cols * rows, + [subspan_full, padded_span, rows, cols, p_status] __device__(size_t i) { + if (subspan_full(i / cols, i % cols) != padded_span(i / cols, i % cols)) + myAtomicAdd(p_status, 1); + }); + check_status(p_status, s); + + // resulting submdspan should still be padded + static_assert( + std::is_same_v); + } + + // slicing a row + { + auto padded_span = padded_device_array.view(); + auto row3 = stdex::submdspan(padded_span, 3, stdex::full_extent); + thrust::for_each_n(rmm::exec_policy(s), + thrust::make_counting_iterator(0ul), + cols, + [row3, padded_span, p_status] __device__(size_t i) { + if (row3(i) != padded_span(3, i)) myAtomicAdd(p_status, 1); + }); + check_status(p_status, s); + + // resulting submdspan should still be padded + static_assert(std::is_same_v); + } + + // slicing a column + { + auto padded_span = padded_device_array.view(); + auto col1 = stdex::submdspan(padded_span, stdex::full_extent, 1); + thrust::for_each_n(rmm::exec_policy(s), + thrust::make_counting_iterator(0ul), + rows, + [col1, padded_span, p_status] __device__(size_t i) { + if (col1(i) != padded_span(i, 1)) myAtomicAdd(p_status, 1); + }); + check_status(p_status, s); + + // resulting submdspan is *NOT* padded anymore + static_assert(std::is_same_v); + } + + // sub-rectangle of 6x7 + { + auto padded_span = padded_device_array.view(); + auto subspan = + stdex::submdspan(padded_span, std::make_tuple(1ul, 4ul), std::make_tuple(2ul, 5ul)); + thrust::for_each_n(rmm::exec_policy(s), + thrust::make_counting_iterator(0ul), + (rows - 1) * (cols - 2), + [subspan, rows, cols, padded_span, p_status] __device__(size_t i) { + size_t idx = i / (cols - 2); + size_t idy = i % (cols - 2); + // elements > subspan range can be accessed as well + if (subspan(idx, idy) != padded_span(idx + 1, idy + 2)) + myAtomicAdd(p_status, 1); + }); + check_status(p_status, s); + + // resulting submdspan is *NOT* padded anymore + static_assert(std::is_same_v); + } + + // sub-rectangle retaining padded layout + { + auto padded_span = padded_device_array.view(); + auto subspan = + stdex::submdspan(padded_span, std::make_tuple(1ul, 4ul), std::make_tuple(2ul, 5ul)); + thrust::for_each_n(rmm::exec_policy(s), + thrust::make_counting_iterator(0ul), + (rows - 1) * (cols - 2), + [subspan, rows, cols, padded_span, p_status] __device__(size_t i) { + size_t idx = i / (cols - 2); + size_t idy = i % (cols - 2); + // elements > subspan range can be accessed as well + if (subspan(idx, idy) != padded_span(idx + 1, idy + 2)) + myAtomicAdd(p_status, 1); + }); + check_status(p_status, s); + + // resulting submdspan is *NOT* padded anymore + static_assert(std::is_same_v); + } + } +} + +TEST(MDSpan, SubmdspanPadding) { test_submdspan_padding(); }*/ + +struct TestElement1 { + int a, b; +}; + +void test_mdspan_padding_by_type() +{ + using extents_type = stdex::extents; + auto s = rmm::cuda_stream_default; + + { + constexpr int rows = 6; + constexpr int cols = 7; + constexpr int alignment_bytes = 16; + + thrust::device_vector status(1, 0); + auto p_status = status.data().get(); + + // manually check strides for row major (c style) padding + { + using padded_layout_row_major = stdex::layout_right_padded< + detail::padding>, + alignment_bytes>::value>; + + using padded_mdarray_type = + 
device_mdarray; + auto device_policy = padded_mdarray_type::container_policy_type{s}; + + padded_layout_row_major::mapping layout{extents_type{rows, cols}}; + padded_mdarray_type padded_device_array{layout, device_policy}; + int alignment_elements = detail::padding::value; + auto padded_span = padded_device_array.view(); + thrust::for_each_n( + rmm::exec_policy(s), + thrust::make_counting_iterator(0ul), + rows * cols, + [rows, cols, padded_span, alignment_elements, p_status] __device__(size_t i) { + size_t idx = i / cols; + size_t idy = i % cols; + if ((&(padded_span(idx, idy)) - &(padded_span(0, idy))) % alignment_elements != 0) + myAtomicAdd(p_status, 1); + if ((&(padded_span(idx, idy)) - &(padded_span(idx, 0))) != idy) myAtomicAdd(p_status, 1); + }); + check_status(p_status, s); + } + + // manually check strides for col major (f style) padding + { + using padded_layout_col_major = stdex::layout_left_padded< + detail::padding>, + alignment_bytes>::value>; + using padded_mdarray_type = + device_mdarray; + auto device_policy = padded_mdarray_type::container_policy_type{s}; + + padded_layout_col_major::mapping layout{extents_type{rows, cols}}; + padded_mdarray_type padded_device_array{layout, device_policy}; + int alignment_elements = detail::padding::value; + auto padded_span = padded_device_array.view(); + thrust::for_each_n( + rmm::exec_policy(s), + thrust::make_counting_iterator(0ul), + rows * cols, + [rows, cols, padded_span, alignment_elements, p_status] __device__(size_t i) { + size_t idx = i / cols; + size_t idy = i % cols; + if ((&(padded_span(idx, idy)) - &(padded_span(idx, 0))) % alignment_elements != 0) + myAtomicAdd(p_status, 1); + if ((&(padded_span(idx, idy)) - &(padded_span(0, idy))) != idx) myAtomicAdd(p_status, 1); + }); + check_status(p_status, s); + } + } +} + +TEST(MDSpan, MDSpanPaddingType) { test_mdspan_padding_by_type(); } + +void test_mdspan_aligned_matrix() +{ + using extents_type = stdex::extents; + constexpr int rows = 2; + constexpr int cols = 10; + + // manually aligning the above, using -1 as filler + static constexpr int X = -1; + long data_padded[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, X, X, X, X, X, X, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, X, X, X, X, X, X}; + + auto my_aligned_host_span = + make_host_aligned_matrix_view>(data_padded, rows, cols); + + int failures = 0; + for (int irow = 0; irow < rows; ++irow) { + for (int icol = 0; icol < cols; ++icol) { + if (my_aligned_host_span(irow, icol) != irow * cols + icol) { ++failures; } + } + } + ASSERT_EQ(failures, 0); + + // now work with device memory + // use simple 1D array to allocate some space + auto s = rmm::cuda_stream_default; + using extent_1d = stdex::extents; + layout_c_contiguous::mapping layout_1d{extent_1d{rows * 32}}; + using mdarray_t = device_mdarray; + auto device_policy = mdarray_t::container_policy_type{s}; + mdarray_t device_array_1d{layout_1d, device_policy}; + + // direct access mdarray -- initialize with above data + for (int i = 0; i < 32; ++i) { + device_array_1d(i) = data_padded[i]; + } + + auto my_aligned_device_span = + make_device_aligned_matrix_view>( + device_array_1d.data_handle(), rows, cols); + + thrust::device_vector status(1, 0); + auto p_status = status.data().get(); + thrust::for_each_n(rmm::exec_policy(s), + thrust::make_counting_iterator(0ul), + rows * cols, + [rows, cols, my_aligned_device_span, p_status] __device__(size_t i) { + size_t idx = i / cols; + size_t idy = i % cols; + if (my_aligned_device_span(idx, idy) != i) myAtomicAdd(p_status, 1); + }); + 
check_status(p_status, s); +} + +TEST(MDSpan, MDSpanAlignedMatrix) { test_mdspan_aligned_matrix(); } + namespace { void test_mdarray_unravel() { @@ -527,4 +946,5 @@ void test_mdarray_unravel() } // anonymous namespace TEST(MDArray, Unravel) { test_mdarray_unravel(); } + } // namespace raft
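
For anyone trying the new views outside the test suite, below is a minimal usage sketch. It assumes the `raft::layout_right_padded<T>` alias and the default 128-byte alignment that the `make_*_aligned_matrix_view` factories in this patch are built around; the buffer sizes and names are illustrative only and not part of the patch itself.

#include <raft/core/device_mdspan.hpp>
#include <raft/core/host_mdspan.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

int main()
{
  constexpr int n_rows = 2;
  constexpr int n_cols = 5;

  // 128-byte alignment for float means a padded row stride of 128 / sizeof(float) = 32
  // elements, so each logical row of 5 values occupies 32 slots of the backing buffer.
  alignas(128) float host_data[n_rows * 32] = {};

  auto host_view =
    raft::make_host_aligned_matrix_view<float, int, raft::layout_right_padded<float>>(
      host_data, n_rows, n_cols);
  host_view(1, 4) = 42.0f;  // maps to host_data[1 * 32 + 4]

  // Device side: any 128-byte aligned allocation works; the default rmm memory resource
  // returns pointers aligned to at least 256 bytes, which satisfies the factory's
  // alignment assert.
  rmm::cuda_stream_view stream = rmm::cuda_stream_default;
  rmm::device_uvector<float> device_data(n_rows * 32, stream);
  auto device_view =
    raft::make_device_aligned_matrix_view<float, int, raft::layout_right_padded<float>>(
      device_data.data(), n_rows, n_cols);
  // device_view can now be captured by value in device lambdas or kernels, in the same way
  // the padded spans are exercised in test_mdspan_aligned_matrix above.

  return host_view(1, 4) == 42.0f ? 0 : 1;
}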