diff --git a/cpp/include/raft/linalg/binary_op.cuh b/cpp/include/raft/linalg/binary_op.cuh index 966e84965d..ed083a1590 100644 --- a/cpp/include/raft/linalg/binary_op.cuh +++ b/cpp/include/raft/linalg/binary_op.cuh @@ -18,12 +18,9 @@ #pragma once -#include "detail/binary_op.cuh" - #include #include -#include -#include +#include namespace raft { namespace linalg { @@ -52,7 +49,7 @@ template (stream, out, len, op, in1, in2); } /** @@ -80,22 +77,7 @@ template > void binary_op(raft::device_resources const& handle, InType in1, InType in2, OutType out, Lambda op) { - RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); - RAFT_EXPECTS(raft::is_row_or_column_major(in1), "Input 1 must be contiguous"); - RAFT_EXPECTS(raft::is_row_or_column_major(in2), "Input 2 must be contiguous"); - RAFT_EXPECTS(out.size() == in1.size() && in1.size() == in2.size(), - "Size mismatch between Output and Inputs"); - - using in_value_t = typename InType::value_type; - using out_value_t = typename OutType::value_type; - - if (out.size() <= std::numeric_limits::max()) { - binaryOp( - out.data_handle(), in1.data_handle(), in2.data_handle(), out.size(), op, handle.get_stream()); - } else { - binaryOp( - out.data_handle(), in1.data_handle(), in2.data_handle(), out.size(), op, handle.get_stream()); - } + return map(handle, in1, in2, out, op); } /** @} */ // end of group binary_op @@ -103,4 +85,4 @@ void binary_op(raft::device_resources const& handle, InType in1, InType in2, Out }; // end namespace linalg }; // end namespace raft -#endif \ No newline at end of file +#endif diff --git a/cpp/include/raft/linalg/detail/binary_op.cuh b/cpp/include/raft/linalg/detail/binary_op.cuh deleted file mode 100644 index d073e164fd..0000000000 --- a/cpp/include/raft/linalg/detail/binary_op.cuh +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace raft { -namespace linalg { -namespace detail { - -template -__global__ void binaryOpKernel( - OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op) -{ - typedef TxN_t InVecType; - typedef TxN_t OutVecType; - InVecType a, b; - OutVecType c; - IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); - idx *= InVecType::Ratio; - if (idx >= len) return; - a.load(in1, idx); - b.load(in2, idx); -#pragma unroll - for (int i = 0; i < InVecType::Ratio; ++i) { - c.val.data[i] = op(a.val.data[i], b.val.data[i]); - } - c.store(out, idx); -} - -template -void binaryOpImpl( - OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream) -{ - const IdxType nblks = raft::ceildiv(VecLen ? 
len / VecLen : len, (IdxType)TPB); - binaryOpKernel - <<>>(out, in1, in2, len, op); - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} - -/** - * @brief Checks if addresses are aligned on N bytes - */ -inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, uint64_t N) -{ - return addr1 % N == 0 && addr2 % N == 0 && addr3 % N == 0; -} - -template -void binaryOp( - OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream) -{ - constexpr auto maxSize = sizeof(InType) > sizeof(OutType) ? sizeof(InType) : sizeof(OutType); - size_t bytes = len * maxSize; - uint64_t in1Addr = uint64_t(in1); - uint64_t in2Addr = uint64_t(in2); - uint64_t outAddr = uint64_t(out); - if (16 / maxSize && bytes % 16 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 16)) { - binaryOpImpl( - out, in1, in2, len, op, stream); - } else if (8 / maxSize && bytes % 8 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 8)) { - binaryOpImpl( - out, in1, in2, len, op, stream); - } else if (4 / maxSize && bytes % 4 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 4)) { - binaryOpImpl( - out, in1, in2, len, op, stream); - } else if (2 / maxSize && bytes % 2 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 2)) { - binaryOpImpl( - out, in1, in2, len, op, stream); - } else if (1 / maxSize) { - binaryOpImpl( - out, in1, in2, len, op, stream); - } else { - binaryOpImpl(out, in1, in2, len, op, stream); - } -} - -} // namespace detail -} // namespace linalg -} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/detail/init.hpp b/cpp/include/raft/linalg/detail/init.hpp deleted file mode 100644 index 4718a2cb0e..0000000000 --- a/cpp/include/raft/linalg/detail/init.hpp +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -namespace raft { -namespace linalg { -namespace detail { - -template -void range(T* out, int start, int end, cudaStream_t stream) -{ - thrust::counting_iterator first(start); - thrust::counting_iterator last = first + (end - start); - thrust::device_ptr ptr(out); - thrust::copy(rmm::exec_policy(stream), first, last, ptr); -} - -/** - * @brief Like Python range. - * - * Fills the output as out[i] = i. 
- * - * \param [out] out device array, size [n] - * \param [in] n length of the array - * \param [in] stream cuda stream - */ -template -void range(T* out, int n, cudaStream_t stream) -{ - range(out, 0, n, stream); -} - -} // namespace detail -} // namespace linalg -} // namespace raft diff --git a/cpp/include/raft/linalg/detail/map.cuh b/cpp/include/raft/linalg/detail/map.cuh index e0b473bdd4..90b653b711 100644 --- a/cpp/include/raft/linalg/detail/map.cuh +++ b/cpp/include/raft/linalg/detail/map.cuh @@ -16,43 +16,207 @@ #pragma once -#include +#include #include #include +#include +#include +#include #include -namespace raft { -namespace linalg { -namespace detail { +#include -template -__global__ void mapKernel(OutType* out, IdxType len, MapOp map, const InType* in, Args... args) +namespace raft::linalg::detail { + +template +__device__ __forceinline__ auto map_apply(Func f, const IdxT& offset, const InTs&... ins) -> OutT +{ + if constexpr (PassOffset) { + return f(offset, ins...); + } else { + return f(ins...); + } +} + +template +__device__ __forceinline__ void map_kernel_mainloop( + OutT* out_ptr, IdxT offset, IdxT len, Func f, const InTs*... in_ptrs, std::index_sequence) { - auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); + TxN_t wide; + thrust::tuple...> wide_args; + if (offset + R <= len) { + (thrust::get(wide_args).load(in_ptrs, offset), ...); +#pragma unroll + for (int j = 0; j < R; ++j) { + wide.val.data[j] = map_apply( + f, offset + j, thrust::get(wide_args).val.data[j]...); + } + wide.store(out_ptr, offset); + } +} + +template +__global__ void map_kernel(OutT* out_ptr, IdxT len, Func f, const InTs*... in_ptrs) +{ + const IdxT tid = blockIdx.x * blockDim.x + threadIdx.x; + if constexpr (R <= 1) { + if (tid < len) { + out_ptr[tid] = map_apply(f, tid, in_ptrs[tid]...); + } + } else { + using align_bytes = Pow2; + using align_elems = Pow2; + + // how many elements to skip in order to do aligned vectorized store + const IdxT skip_cnt_left = std::min(IdxT(align_bytes::roundUp(out_ptr) - out_ptr), len); + + // The main loop: process all aligned data + map_kernel_mainloop( + out_ptr, tid * R + skip_cnt_left, len, f, in_ptrs..., std::index_sequence_for()); - if (idx < len) { out[idx] = map(in[idx], args[idx]...); } + static_assert(WarpSize >= R); + // Processes the skipped elements on the left + if (tid < skip_cnt_left) { + out_ptr[tid] = map_apply(f, tid, in_ptrs[tid]...); + } + // Processes the skipped elements on the right + const IdxT skip_cnt_right = align_elems::mod(len - skip_cnt_left); + const IdxT remain_i = len - skip_cnt_right + tid; + if (remain_i < len) { + out_ptr[remain_i] = + map_apply(f, remain_i, in_ptrs[remain_i]...); + } + } } -template +void map_call(rmm::cuda_stream_view stream, OutT* out_ptr, IdxT len, Func f, const InTs*... 
in_ptrs) +{ + const IdxT len_vectorized = raft::div_rounding_up_safe(len, R); + const int threads = + std::max(WarpSize, std::min(raft::bound_by_power_of_two(len_vectorized), 256)); + const IdxT blocks = raft::div_rounding_up_unsafe(len_vectorized, threads); + map_kernel<<>>(out_ptr, len, f, in_ptrs...); +} + +constexpr int kCoalescedVectorSize = 16; +constexpr int kSmallInputThreshold = 1024; + +struct ratio_selector { + int ratio; + int align; + constexpr inline ratio_selector(int r, int a) : ratio(r), align(a) {} + + template + constexpr static auto ignoring_alignment() -> ratio_selector + { + return ratio_selector{raft::div_rounding_up_safe(kCoalescedVectorSize, sizeof(T)), 0}; + } + + template + explicit ratio_selector(const T* ptr) + { + constexpr auto s = ignoring_alignment(); // NOLINT + align = int(Pow2::roundUp(ptr) - ptr); + ratio = int(s.ratio); + } +}; + +constexpr inline auto operator*(const ratio_selector& a, const ratio_selector& b) -> ratio_selector +{ + auto ratio = std::min(a.ratio, b.ratio); + while ((a.align % ratio) != (b.align % ratio)) { + ratio >>= 1; + } + return ratio_selector{ratio, a.align % ratio}; +} + +template +void map_call_rt( + int r, rmm::cuda_stream_view stream, OutT* out_ptr, IdxT len, Func f, const InTs*... in_ptrs) +{ + if (r >= R) { return map_call(stream, out_ptr, len, f, in_ptrs...); } + if constexpr (R > 1) { + return map_call_rt<(R >> 1), PassOffset>(r, stream, out_ptr, len, f, in_ptrs...); + } +} + +template +void map(rmm::cuda_stream_view stream, OutT* out_ptr, IdxT len, Func f, const InTs*... in_ptrs) +{ + // don't vectorize on small inputs + if (len <= kSmallInputThreshold) { + return map_call<1, PassOffset>(stream, out_ptr, len, f, in_ptrs...); + } + constexpr int kRatio = + (ratio_selector::ignoring_alignment() * ... * ratio_selector::ignoring_alignment()) + .ratio; + static_assert(kRatio > 0, "Unexpected zero vector size."); + const int ratio = (ratio_selector(out_ptr) * ... * ratio_selector(in_ptrs)).ratio; + return map_call_rt(ratio, stream, out_ptr, len, f, in_ptrs...); +} + +template , + typename = raft::enable_if_input_device_mdspan> +void map_check_shape(OutType out, InType in) +{ + RAFT_EXPECTS(raft::is_row_or_column_major(in) && out.size() == in.size(), + "All inputs must be contiguous and have the same size as the output"); +} + +/** + * @brief Map a function over a zero or more inputs and optionally a 0-based flat index + * (element offset). + * + * _Performance note_: when possible, this function loads the argument arrays and stores the output + * array using vectorized cuda load/store instructions. The size of the vectorization depends on the + * size of the largest input/output element type and on the alignment of all pointers. + * + * @tparam PassOffset whether to pass an offset as a first argument to Func + * @tparam OutType data-type of the result (device_mdspan) + * @tparam Func the device-lambda performing the actual operation + * @tparam InTypes data-types of the inputs (device_mdspan) + * + * @param[in] res raft::device_resources + * @param[out] out the output of the map operation (device_mdspan) + * @param[in] f device lambda of type + * ([auto offset], InTypes::value_type xs...) -> OutType::value_type + * @param[in] ins the inputs (each of the same size as the output) (device_mdspan) + */ +template -void mapImpl( - OutType* out, IdxType len, MapOp map, cudaStream_t stream, const InType* in, Args... 
args) -{ - const int nblks = raft::ceildiv(len, (IdxType)TPB); - mapKernel - <<>>(out, len, map, in, args...); - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} - -} // namespace detail -} // namespace linalg -}; // namespace raft + typename Func, + typename... InTypes, + typename = raft::enable_if_output_device_mdspan, + typename = raft::enable_if_input_device_mdspan> +void map(const raft::device_resources& res, OutType out, Func f, InTypes... ins) +{ + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + (map_check_shape(out, ins), ...); + + if (out.size() <= std::numeric_limits::max()) { + map( + res.get_stream(), out.data_handle(), uint32_t(out.size()), f, ins.data_handle()...); + } else { + map( + res.get_stream(), out.data_handle(), uint64_t(out.size()), f, ins.data_handle()...); + } +} + +} // namespace raft::linalg::detail diff --git a/cpp/include/raft/linalg/detail/strided_reduction.cuh b/cpp/include/raft/linalg/detail/strided_reduction.cuh index 0e516b4750..42e79a9285 100644 --- a/cpp/include/raft/linalg/detail/strided_reduction.cuh +++ b/cpp/include/raft/linalg/detail/strided_reduction.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,6 @@ #pragma once -#include "unary_op.cuh" #include #include #include diff --git a/cpp/include/raft/linalg/detail/ternary_op.cuh b/cpp/include/raft/linalg/detail/ternary_op.cuh deleted file mode 100644 index 7874f20f56..0000000000 --- a/cpp/include/raft/linalg/detail/ternary_op.cuh +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace raft { -namespace linalg { -namespace detail { -template -__global__ void ternaryOpKernel( - out_t* out, const math_t* in1, const math_t* in2, const math_t* in3, IdxType len, Lambda op) -{ - typedef raft::TxN_t VecType; - VecType a, b, c; - IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); - idx *= VecType::Ratio; - if (idx >= len) return; - a.load(in1, idx); - b.load(in2, idx); - c.load(in3, idx); -#pragma unroll - for (int i = 0; i < VecType::Ratio; ++i) { - a.val.data[i] = op(a.val.data[i], b.val.data[i], c.val.data[i]); - } - a.store(out, idx); -} - -template -void ternaryOpImpl(out_t* out, - const math_t* in1, - const math_t* in2, - const math_t* in3, - IdxType len, - Lambda op, - cudaStream_t stream) -{ - const IdxType nblks = raft::ceildiv(veclen_ ? 
len / veclen_ : len, (IdxType)TPB); - ternaryOpKernel - <<>>(out, in1, in2, in3, len, op); - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} - -/** - * @brief perform element-wise ternary operation on the input arrays - * @tparam math_t data-type upon which the math operation will be performed - * @tparam Lambda the device-lambda performing the actual operation - * @tparam IdxType Integer type used to for addressing - * @tparam TPB threads-per-block in the final kernel launched - * @param out the output array - * @param in1 the first input array - * @param in2 the second input array - * @param in3 the third input array - * @param len number of elements in the input array - * @param op the device-lambda - * @param stream cuda stream where to launch work - */ -template -void ternaryOp(out_t* out, - const math_t* in1, - const math_t* in2, - const math_t* in3, - IdxType len, - Lambda op, - cudaStream_t stream) -{ - size_t bytes = len * sizeof(math_t); - if (16 / sizeof(math_t) && bytes % 16 == 0) { - ternaryOpImpl( - out, in1, in2, in3, len, op, stream); - } else if (8 / sizeof(math_t) && bytes % 8 == 0) { - ternaryOpImpl( - out, in1, in2, in3, len, op, stream); - } else if (4 / sizeof(math_t) && bytes % 4 == 0) { - ternaryOpImpl( - out, in1, in2, in3, len, op, stream); - } else if (2 / sizeof(math_t) && bytes % 2 == 0) { - ternaryOpImpl( - out, in1, in2, in3, len, op, stream); - } else if (1 / sizeof(math_t)) { - ternaryOpImpl( - out, in1, in2, in3, len, op, stream); - } else { - ternaryOpImpl(out, in1, in2, in3, len, op, stream); - } -} - -}; // end namespace detail -}; // end namespace linalg -}; // end namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/detail/unary_op.cuh b/cpp/include/raft/linalg/detail/unary_op.cuh deleted file mode 100644 index cdadc6f868..0000000000 --- a/cpp/include/raft/linalg/detail/unary_op.cuh +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -namespace raft { -namespace linalg { -namespace detail { - -template -__global__ void unaryOpKernel(OutType* out, const InType* in, IdxType len, Lambda op) -{ - typedef TxN_t InVecType; - typedef TxN_t OutVecType; - InVecType a; - OutVecType b; - IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); - idx *= InVecType::Ratio; - if (idx >= len) return; - a.load(in, idx); -#pragma unroll - for (int i = 0; i < InVecType::Ratio; ++i) { - b.val.data[i] = op(a.val.data[i]); - } - b.store(out, idx); -} - -template -void unaryOpImpl(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) -{ - const IdxType nblks = raft::ceildiv(VecLen ? 
len / VecLen : len, (IdxType)TPB); - unaryOpKernel - <<>>(out, in, len, op); - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} - -template -void unaryOpCaller(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) -{ - if (len <= 0) return; // silently skip in case of 0 length input - constexpr auto maxSize = sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType); - size_t bytes = len * maxSize; - uint64_t inAddr = uint64_t(in); - uint64_t outAddr = uint64_t(out); - if (16 / maxSize && bytes % 16 == 0 && inAddr % 16 == 0 && outAddr % 16 == 0) { - unaryOpImpl(out, in, len, op, stream); - } else if (8 / maxSize && bytes % 8 == 0 && inAddr % 8 == 0 && outAddr % 8 == 0) { - unaryOpImpl(out, in, len, op, stream); - } else if (4 / maxSize && bytes % 4 == 0 && inAddr % 4 == 0 && outAddr % 4 == 0) { - unaryOpImpl(out, in, len, op, stream); - } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 && outAddr % 2 == 0) { - unaryOpImpl(out, in, len, op, stream); - } else if (1 / maxSize) { - unaryOpImpl(out, in, len, op, stream); - } else { - unaryOpImpl(out, in, len, op, stream); - } -} - -template -__global__ void writeOnlyUnaryOpKernel(OutType* out, IdxType len, Lambda op) -{ - IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); - if (idx < len) { op(out + idx, idx); } -} - -template -void writeOnlyUnaryOpCaller(OutType* out, IdxType len, Lambda op, cudaStream_t stream) -{ - if (len <= 0) return; // silently skip in case of 0 length input - auto nblks = raft::ceildiv(len, TPB); - writeOnlyUnaryOpKernel<<>>(out, len, op); - RAFT_CUDA_TRY(cudaGetLastError()); -} - -}; // end namespace detail -}; // end namespace linalg -}; // end namespace raft diff --git a/cpp/include/raft/linalg/init.cuh b/cpp/include/raft/linalg/init.cuh index 5a810bf2ba..ad772b3989 100644 --- a/cpp/include/raft/linalg/init.cuh +++ b/cpp/include/raft/linalg/init.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,11 +18,10 @@ #pragma once -#include "detail/init.hpp" +#include #include -namespace raft { -namespace linalg { +namespace raft::linalg { /** * @brief Like Python range. 
@@ -37,7 +36,8 @@ namespace linalg { template void range(T* out, int start, int end, cudaStream_t stream) { - detail::range(out, start, end, stream); + return detail::map( + stream, out, end - start, compose_op{cast_op{}, add_const_op{start}}); } /** @@ -52,7 +52,7 @@ void range(T* out, int start, int end, cudaStream_t stream) template void range(T* out, int n, cudaStream_t stream) { - detail::range(out, n, stream); + return detail::map(stream, out, n, cast_op{}); } /** @@ -68,7 +68,6 @@ void zero(T* out, int n, cudaStream_t stream) RAFT_CUDA_TRY(cudaMemsetAsync(static_cast(out), 0, n * sizeof(T), stream)); } -} // namespace linalg -} // namespace raft +} // namespace raft::linalg -#endif \ No newline at end of file +#endif diff --git a/cpp/include/raft/linalg/map.cuh b/cpp/include/raft/linalg/map.cuh index 2b9e6c80a0..305a8aa3cc 100644 --- a/cpp/include/raft/linalg/map.cuh +++ b/cpp/include/raft/linalg/map.cuh @@ -22,119 +22,267 @@ #include #include -#include -#include -namespace raft { -namespace linalg { +namespace raft::linalg { /** - * @brief CUDA version of map - * @tparam InType data-type upon which the math operation will be performed - * @tparam MapOp the device-lambda performing the actual operation - * @tparam TPB threads-per-block in the final kernel launched - * @tparam Args additional parameters - * @tparam OutType data-type in which the result will be stored - * @param out the output of the map operation (assumed to be a device pointer) - * @param len number of elements in the input array - * @param map the device-lambda - * @param stream cuda-stream where to launch this kernel - * @param in the input array - * @param args additional input arrays + * @defgroup map Mapping ops + * @{ + */ + +/** + * @brief Map a function over zero or more input mdspans of the same size. + * + * The algorithm applied on `k` inputs can be described in the following pseudo-code: + * @code + * for (auto i: [0 ... out.size()]) { + * out[i] = f(in_0[i], in_1[i], ..., in_k[i]) + * } + * @endcode + * + * _Performance note_: when possible, this function loads the argument arrays and stores the output + * array using vectorized cuda load/store instructions. The size of the vectorization depends on the + * size of the largest input/output element type and on the alignment of all pointers. + * + * Usage example: + * @code{.cpp} + * #include + * #include + * #include + * #include + * + * auto input = raft::make_device_vector(res, n); + * ... fill input .. + * auto squares = raft::make_device_vector(res, n); + * raft::linalg::map(res, squares.view(), raft::sq_op{}, input.view()); + * @endcode + * + * @tparam OutType data-type of the result (device_mdspan) + * @tparam Func the device-lambda performing the actual operation + * @tparam InTypes data-types of the inputs (device_mdspan) + * + * @param[in] res raft::device_resources + * @param[out] out the output of the map operation (device_mdspan) + * @param[in] f device lambda + * (InTypes::value_type xs...) -> OutType::value_type + * @param[in] ins the inputs (each of the same size as the output) (device_mdspan) */ -template -void map_k( - OutType* out, IdxType len, MapOp map, cudaStream_t stream, const InType* in, Args... args) +template , + typename = raft::enable_if_input_device_mdspan> +void map(const raft::device_resources& res, OutType out, Func f, InTypes... 
ins) { - detail::mapImpl( - out, len, map, stream, in, args...); + return detail::map(res, out, f, ins...); } /** - * @defgroup map Mapping ops - * @{ + * @brief Map a function over one mdspan. + * + * @tparam InType1 data-type of the input (device_mdspan) + * @tparam OutType data-type of the result (device_mdspan) + * @tparam Func the device-lambda performing the actual operation + * + * @param[in] res raft::device_resources + * @param[in] in1 the input (the same size as the output) (device_mdspan) + * @param[out] out the output of the map operation (device_mdspan) + * @param[in] f device lambda + * (InType1::value_type x) -> OutType::value_type */ +template , + typename = raft::enable_if_input_device_mdspan> +void map(const raft::device_resources& res, InType1 in1, OutType out, Func f) +{ + return detail::map(res, out, f, in1); +} /** - * @brief CUDA version of map - * @tparam InType data-type for math operation of type raft::device_mdspan - * @tparam MapOp the device-lambda performing the actual operation - * @tparam TPB threads-per-block in the final kernel launched - * @tparam OutType data-type of result of type raft::device_mdspan - * @tparam Args additional parameters - * @param[in] handle raft::device_resources - * @param[in] in the input of type raft::device_mdspan - * @param[out] out the output of the map operation of type raft::device_mdspan - * @param[in] map the device-lambda - * @param[in] args additional input arrays + * @brief Map a function over two mdspans. + * + * @tparam InType1 data-type of the input (device_mdspan) + * @tparam InType2 data-type of the input (device_mdspan) + * @tparam OutType data-type of the result (device_mdspan) + * @tparam Func the device-lambda performing the actual operation + * + * @param[in] res raft::device_resources + * @param[in] in1 the input (the same size as the output) (device_mdspan) + * @param[in] in2 the input (the same size as the output) (device_mdspan) + * @param[out] out the output of the map operation (device_mdspan) + * @param[in] f device lambda + * (InType1::value_type x1, InType2::value_type x2) -> OutType::value_type */ -template , - typename = raft::enable_if_output_device_mdspan> -void map(raft::device_resources const& handle, InType in, OutType out, MapOp map, Args... args) + typename Func, + typename = raft::enable_if_output_device_mdspan, + typename = raft::enable_if_input_device_mdspan> +void map(const raft::device_resources& res, InType1 in1, InType2 in2, OutType out, Func f) { - using in_value_t = typename InType::value_type; - using out_value_t = typename OutType::value_type; - - RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output is not exhaustive"); - RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input is not exhaustive"); - RAFT_EXPECTS(out.size() == in.size(), "Size mismatch between Input and Output"); - - if (out.size() <= std::numeric_limits::max()) { - map_k( - out.data_handle(), out.size(), map, handle.get_stream(), in.data_handle(), args...); - } else { - map_k( - out.data_handle(), out.size(), map, handle.get_stream(), in.data_handle(), args...); - } + return detail::map(res, out, f, in1, in2); } /** - * @brief Perform an element-wise unary operation on the input offset into the output array + * @brief Map a function over three mdspans. 
+ * + * @tparam InType1 data-type of the input 1 (device_mdspan) + * @tparam InType2 data-type of the input 2 (device_mdspan) + * @tparam InType3 data-type of the input 3 (device_mdspan) + * @tparam OutType data-type of the result (device_mdspan) + * @tparam Func the device-lambda performing the actual operation + * + * @param[in] res raft::device_resources + * @param[in] in1 the input 1 (the same size as the output) (device_mdspan) + * @param[in] in2 the input 2 (the same size as the output) (device_mdspan) + * @param[in] in3 the input 3 (the same size as the output) (device_mdspan) + * @param[out] out the output of the map operation (device_mdspan) + * @param[in] f device lambda + * (InType1::value_type x1, InType2::value_type x2, InType3::value_type x3) -> OutType::value_type + */ +template , + typename = raft::enable_if_input_device_mdspan> +void map( + const raft::device_resources& res, InType1 in1, InType2 in2, InType3 in3, OutType out, Func f) +{ + return detail::map(res, out, f, in1, in2, in3); +} + +/** + * @brief Map a function over zero-based flat index (element offset) and zero or more inputs. + * + * The algorithm applied on `k` inputs can be described in a following pseudo-code: + * @code + * for (auto i: [0 ... out.size()]) { + * out[i] = f(i, in_0[i], in_1[i], ..., in_k[i]) + * } + * @endcode + * + * _Performance note_: when possible, this function loads the argument arrays and stores the output + * array using vectorized cuda load/store instructions. The size of the vectorization depends on the + * size of the largest input/output element type and on the alignment of all pointers. * * Usage example: * @code{.cpp} * #include - * #include + * #include * #include * #include - * ... - * raft::handle_t handle; + * * auto squares = raft::make_device_vector(handle, n); - * raft::linalg::map_offset(handle, squares.view(), raft::sq_op()); + * raft::linalg::map_offset(res, squares.view(), raft::sq_op{}); * @endcode * - * @tparam OutType Output mdspan type - * @tparam MapOp The unary operation type with signature `OutT func(const IdxT& idx);` - * @param[in] handle The raft handle - * @param[out] out Output array - * @param[in] op The unary operation + * @tparam OutType data-type of the result (device_mdspan) + * @tparam Func the device-lambda performing the actual operation + * @tparam InTypes data-types of the inputs (device_mdspan) + * + * @param[in] res raft::device_resources + * @param[out] out the output of the map operation (device_mdspan) + * @param[in] f device lambda + * (auto offset, InTypes::value_type xs...) -> OutType::value_type + * @param[in] ins the inputs (each of the same size as the output) (device_mdspan) */ template > -void map_offset(const raft::device_resources& handle, OutType out, MapOp op) + typename Func, + typename... InTypes, + typename = raft::enable_if_output_device_mdspan, + typename = raft::enable_if_input_device_mdspan> +void map_offset(const raft::device_resources& res, OutType out, Func f, InTypes... ins) { - RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + return detail::map(res, out, f, ins...); +} - using out_value_t = typename OutType::value_type; +/** + * @brief Map a function over zero-based flat index (element offset) and one mdspan. 
+ * + * @tparam InType1 data-type of the input (device_mdspan) + * @tparam OutType data-type of the result (device_mdspan) + * @tparam Func the device-lambda performing the actual operation + * + * @param[in] res raft::device_resources + * @param[in] in1 the input (the same size as the output) (device_mdspan) + * @param[out] out the output of the map operation (device_mdspan) + * @param[in] f device lambda + * (auto offset, InType1::value_type x) -> OutType::value_type + */ +template , + typename = raft::enable_if_input_device_mdspan> +void map_offset(const raft::device_resources& res, InType1 in1, OutType out, Func f) +{ + return detail::map(res, out, f, in1); +} - thrust::tabulate( - handle.get_thrust_policy(), out.data_handle(), out.data_handle() + out.size(), op); +/** + * @brief Map a function over zero-based flat index (element offset) and two mdspans. + * + * @tparam InType1 data-type of the input (device_mdspan) + * @tparam InType2 data-type of the input (device_mdspan) + * @tparam OutType data-type of the result (device_mdspan) + * @tparam Func the device-lambda performing the actual operation + * + * @param[in] res raft::device_resources + * @param[in] in1 the input (the same size as the output) (device_mdspan) + * @param[in] in2 the input (the same size as the output) (device_mdspan) + * @param[out] out the output of the map operation (device_mdspan) + * @param[in] f device lambda + * (auto offset, InType1::value_type x1, InType2::value_type x2) -> OutType::value_type + */ +template , + typename = raft::enable_if_input_device_mdspan> +void map_offset(const raft::device_resources& res, InType1 in1, InType2 in2, OutType out, Func f) +{ + return detail::map(res, out, f, in1, in2); +} + +/** + * @brief Map a function over zero-based flat index (element offset) and three mdspans. 
+ * + * @tparam InType1 data-type of the input 1 (device_mdspan) + * @tparam InType2 data-type of the input 2 (device_mdspan) + * @tparam InType3 data-type of the input 3 (device_mdspan) + * @tparam OutType data-type of the result (device_mdspan) + * @tparam Func the device-lambda performing the actual operation + * + * @param[in] res raft::device_resources + * @param[in] in1 the input 1 (the same size as the output) (device_mdspan) + * @param[in] in2 the input 2 (the same size as the output) (device_mdspan) + * @param[in] in3 the input 3 (the same size as the output) (device_mdspan) + * @param[out] out the output of the map operation (device_mdspan) + * @param[in] f device lambda + * (auto offset, InType1::value_type x1, InType2::value_type x2, InType3::value_type x3) + * -> OutType::value_type + */ +template , + typename = raft::enable_if_input_device_mdspan> +void map_offset( + const raft::device_resources& res, InType1 in1, InType2 in2, InType3 in3, OutType out, Func f) +{ + return detail::map(res, out, f, in1, in2, in3); } /** @} */ // end of map -} // namespace linalg -}; // namespace raft +} // namespace raft::linalg #endif diff --git a/cpp/include/raft/linalg/ternary_op.cuh b/cpp/include/raft/linalg/ternary_op.cuh index aa3859bc23..1e347d69be 100644 --- a/cpp/include/raft/linalg/ternary_op.cuh +++ b/cpp/include/raft/linalg/ternary_op.cuh @@ -19,11 +19,9 @@ #pragma once -#include "detail/ternary_op.cuh" - #include #include -#include +#include namespace raft { namespace linalg { @@ -50,7 +48,7 @@ void ternaryOp(out_t* out, Lambda op, cudaStream_t stream) { - detail::ternaryOp(out, in1, in2, in3, len, op, stream); + return detail::map(stream, out, len, op, in1, in2, in3); } /** @@ -80,33 +78,7 @@ template ::max()) { - ternaryOp(out.data_handle(), - in1.data_handle(), - in2.data_handle(), - in3.data_handle(), - out.size(), - op, - handle.get_stream()); - } else { - ternaryOp(out.data_handle(), - in1.data_handle(), - in2.data_handle(), - in3.data_handle(), - out.size(), - op, - handle.get_stream()); - } + return map(handle, in1, in2, in3, out, op); } /** @} */ // end of group ternary_op @@ -114,4 +86,4 @@ void ternary_op( }; // end namespace linalg }; // end namespace raft -#endif \ No newline at end of file +#endif diff --git a/cpp/include/raft/linalg/unary_op.cuh b/cpp/include/raft/linalg/unary_op.cuh index ce102adfd2..23f932d2f2 100644 --- a/cpp/include/raft/linalg/unary_op.cuh +++ b/cpp/include/raft/linalg/unary_op.cuh @@ -18,11 +18,9 @@ #pragma once -#include "detail/unary_op.cuh" - #include #include -#include +#include namespace raft { namespace linalg { @@ -48,7 +46,7 @@ template void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) { - detail::unaryOpCaller(out, in, len, op, stream); + return detail::map(stream, out, len, op, in); } /** @@ -71,7 +69,11 @@ void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_ template void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream) { - detail::writeOnlyUnaryOpCaller(out, len, op, stream); + return detail::map(stream, out, len, [op] __device__(IdxType offset) { + OutType r; + op(&r, offset); + return r; + }); } /** @@ -97,20 +99,7 @@ template > void unary_op(raft::device_resources const& handle, InType in, OutType out, Lambda op) { - RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); - RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input must be contiguous"); - RAFT_EXPECTS(out.size() == in.size(), "Size mismatch between 
Output and Input"); - - using in_value_t = typename InType::value_type; - using out_value_t = typename OutType::value_type; - - if (out.size() <= std::numeric_limits::max()) { - unaryOp( - out.data_handle(), in.data_handle(), out.size(), op, handle.get_stream()); - } else { - unaryOp( - out.data_handle(), in.data_handle(), out.size(), op, handle.get_stream()); - } + return map(handle, in, out, op); } /** @@ -130,17 +119,7 @@ template > void write_only_unary_op(const raft::device_resources& handle, OutType out, Lambda op) { - RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); - - using out_value_t = typename OutType::value_type; - - if (out.size() <= std::numeric_limits::max()) { - writeOnlyUnaryOp( - out.data_handle(), out.size(), op, handle.get_stream()); - } else { - writeOnlyUnaryOp( - out.data_handle(), out.size(), op, handle.get_stream()); - } + return writeOnlyUnaryOp(out.data_handle(), out.size(), op, handle.get_stream()); } /** @} */ // end of group unary_op diff --git a/cpp/include/raft/matrix/init.cuh b/cpp/include/raft/matrix/init.cuh index f597bbd1c6..ed2fb4d209 100644 --- a/cpp/include/raft/matrix/init.cuh +++ b/cpp/include/raft/matrix/init.cuh @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -65,9 +66,7 @@ void fill(raft::device_resources const& handle, raft::device_mdspan inout, math_t scalar) { - RAFT_EXPECTS(raft::is_row_or_column_major(inout), "Data layout not supported"); - detail::setValue( - inout.data_handle(), inout.data_handle(), scalar, inout.size(), handle.get_stream()); + linalg::map(handle, inout, raft::const_op{scalar}); } /** @} */ // end of group matrix_init diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh index 4c6ef0e30b..4b6e6f5e31 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh @@ -42,11 +42,11 @@ #include #include -#include -#include #include +#include + namespace raft::neighbors::ivf_pq::detail { /** @@ -1311,13 +1311,13 @@ void ivfpq_search_worker(raft::device_resources const& handle, // of a cluster by processing the cluster at the same time as much as // possible. 
index_list_sorted_buf.resize(n_queries * n_probes, stream); - rmm::device_uvector index_list_buf(n_queries * n_probes, stream, mr); + auto index_list_buf = + make_device_mdarray(handle, mr, make_extents(n_queries * n_probes)); rmm::device_uvector cluster_labels_out(n_queries * n_probes, stream, mr); - auto index_list = index_list_buf.data(); + auto index_list = index_list_buf.data_handle(); index_list_sorted = index_list_sorted_buf.data(); - thrust::sequence(handle.get_thrust_policy(), - thrust::device_pointer_cast(index_list), - thrust::device_pointer_cast(index_list + n_queries * n_probes)); + + linalg::map_offset(handle, index_list_buf.view(), identity_op{}); int begin_bit = 0; int end_bit = sizeof(uint32_t) * 8; @@ -1376,13 +1376,15 @@ void ivfpq_search_worker(raft::device_resources const& handle, topK); rmm::device_uvector device_lut(search_instance.device_lut_size, stream, mr); - rmm::device_uvector query_kths(0, stream, mr); + std::optional> query_kths_buf{std::nullopt}; + float* query_kths = nullptr; if (manage_local_topk) { - query_kths.resize(n_queries, stream); - thrust::fill_n(handle.get_thrust_policy(), - query_kths.data(), - n_queries, - float(dummy_block_sort_t::queue_t::kDummy)); + query_kths_buf.emplace( + make_device_mdarray(handle, mr, make_extents(n_queries))); + linalg::map(handle, + query_kths_buf->view(), + raft::const_op{dummy_block_sort_t::queue_t::kDummy}); + query_kths = query_kths_buf->data_handle(); } search_instance(stream, index.size(), @@ -1401,7 +1403,7 @@ void ivfpq_search_worker(raft::device_resources const& handle, chunk_index.data(), query, index_list_sorted, - query_kths.data(), + query_kths, device_lut.data(), distances_buf.data(), neighbors_ptr); diff --git a/cpp/test/linalg/map.cu b/cpp/test/linalg/map.cu index 5b52374789..15b40808ee 100644 --- a/cpp/test/linalg/map.cu +++ b/cpp/test/linalg/map.cu @@ -37,13 +37,15 @@ void mapLaunch(OutType* out, raft::device_resources handle{stream}; auto out_view = raft::make_device_vector_view(out, len); auto in1_view = raft::make_device_vector_view(in1, len); + auto in2_view = raft::make_device_vector_view(in2, len); + auto in3_view = raft::make_device_vector_view(in3, len); map( handle, - in1_view, out_view, [=] __device__(InType a, InType b, InType c) { return a + b + c + scalar; }, - in2, - in3); + in1_view, + in2_view, + in3_view); } template
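With the detail headers for unary_op, binary_op, ternary_op and init removed, all of the element-wise wrappers above (unaryOp, binaryOp, ternaryOp, writeOnlyUnaryOp, range, matrix::fill) funnel into raft::linalg::map / map_offset or detail::map. A minimal usage sketch of the consolidated public API (the resource handle res, the length n, the example() function and the vector names are illustrative, not part of the patch):

  #include <raft/core/device_mdarray.hpp>
  #include <raft/core/device_resources.hpp>
  #include <raft/core/operators.hpp>
  #include <raft/linalg/binary_op.cuh>
  #include <raft/linalg/map.cuh>

  void example(raft::device_resources const& res, int n)
  {
    auto a   = raft::make_device_vector<float>(res, n);
    auto b   = raft::make_device_vector<float>(res, n);
    auto out = raft::make_device_vector<float>(res, n);

    // mdspan form of an element-wise binary op: out[i] = a[i] + b[i]
    raft::linalg::map(res, a.view(), b.view(), out.view(), raft::add_op{});

    // zero-input map_offset: out[i] = float(i)
    raft::linalg::map_offset(res, out.view(), raft::cast_op<float>{});

    // zero-input map: fill with a constant (this is what matrix::fill now does)
    raft::linalg::map(res, out.view(), raft::const_op{42.0f});

    // the raw-pointer wrapper keeps its signature and now forwards to detail::map
    raft::linalg::binaryOp(
      out.data_handle(), a.data_handle(), b.data_handle(), n, raft::add_op{}, res.get_stream());
  }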
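The performance note in the map documentation reflects the structure of detail::map_kernel: a scalar pass over the few elements before the first 16-byte-aligned output address (skip_cnt_left), a TxN_t-vectorized main loop, and a scalar pass over the remaining tail, with ratio_selector shrinking the vector width at runtime until all pointers share the same alignment. A stripped-down sketch of that pattern for float data and a fixed width of four (an illustration of the idea only, not the kernel added here):

  #include <cstdint>

  // Assumes in and out have the same address modulo 16 bytes; the real code guarantees
  // the equivalent property by shrinking the vector width until the alignments match.
  // Launch with at least len / 4 + 4 threads in total.
  __global__ void scale_vec4(float* out, const float* in, size_t len, float alpha)
  {
    const size_t tid = blockIdx.x * size_t(blockDim.x) + threadIdx.x;

    // Scalar head: elements before the first 16-byte-aligned output address.
    size_t head = (16 - reinterpret_cast<std::uintptr_t>(out) % 16) % 16 / sizeof(float);
    if (head > len) { head = len; }
    if (tid < head) { out[tid] = alpha * in[tid]; }

    // Vectorized main loop: four elements per thread via one 16-byte load/store.
    const size_t i = head + 4 * tid;
    if (i + 4 <= len) {
      float4 v = *reinterpret_cast<const float4*>(in + i);
      v.x *= alpha; v.y *= alpha; v.z *= alpha; v.w *= alpha;
      *reinterpret_cast<float4*>(out + i) = v;
    }

    // Scalar tail: the (len - head) % 4 elements after the last full float4.
    const size_t j = head + (len - head) / 4 * 4 + tid;
    if (tid < (len - head) % 4) { out[j] = alpha * in[j]; }
  }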
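linalg::range keeps its semantics under the rewrite: detail::map with the element offset as the only argument composes add_const_op{start} with cast_op, so out[i] ends up as start + i converted to T, the same sequence the removed thrust::counting_iterator / thrust::copy implementation produced. For instance (vector size and values illustrative):

  #include <raft/core/device_mdarray.hpp>
  #include <raft/core/device_resources.hpp>
  #include <raft/linalg/init.cuh>

  void make_indices(raft::device_resources const& res)
  {
    auto idx = raft::make_device_vector<int>(res, 10);
    // Fills idx with 5, 6, ..., 14 (the signature of range() itself is unchanged).
    raft::linalg::range(idx.data_handle(), 5, 15, res.get_stream());
  }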