diff --git a/cpp/include/raft/linalg/binary_op.cuh b/cpp/include/raft/linalg/binary_op.cuh
index ed083a1590..966e84965d 100644
--- a/cpp/include/raft/linalg/binary_op.cuh
+++ b/cpp/include/raft/linalg/binary_op.cuh
@@ -18,9 +18,12 @@
 #pragma once
+#include "detail/binary_op.cuh"
+
+#include
 #include
-#include
+#include
+#include
 namespace raft {
 namespace linalg {
@@ -49,7 +52,7 @@ template (stream, out, len, op, in1, in2);
+  detail::binaryOp(out, in1, in2, len, op, stream);
 }
 /**
@@ -77,7 +80,22 @@ template > void binary_op(raft::device_resources const& handle, InType in1, InType in2, OutType out, Lambda op)
 {
-  return map(handle, in1, in2, out, op);
+  RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous");
+  RAFT_EXPECTS(raft::is_row_or_column_major(in1), "Input 1 must be contiguous");
+  RAFT_EXPECTS(raft::is_row_or_column_major(in2), "Input 2 must be contiguous");
+  RAFT_EXPECTS(out.size() == in1.size() && in1.size() == in2.size(),
+               "Size mismatch between Output and Inputs");
+
+  using in_value_t = typename InType::value_type;
+  using out_value_t = typename OutType::value_type;
+
+  if (out.size() <= std::numeric_limits::max()) {
+    binaryOp(
+      out.data_handle(), in1.data_handle(), in2.data_handle(), out.size(), op, handle.get_stream());
+  } else {
+    binaryOp(
+      out.data_handle(), in1.data_handle(), in2.data_handle(), out.size(), op, handle.get_stream());
+  }
 }
 /** @} */ // end of group binary_op
@@ -85,4 +103,4 @@ void binary_op(raft::device_resources const& handle, InType in1, InType in2, Out
 }; // end namespace linalg
 }; // end namespace raft
-#endif
+#endif
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/detail/binary_op.cuh b/cpp/include/raft/linalg/detail/binary_op.cuh
new file mode 100644
index 0000000000..d073e164fd
--- /dev/null
+++ b/cpp/include/raft/linalg/detail/binary_op.cuh
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+
+namespace raft {
+namespace linalg {
+namespace detail {
+
+template
+__global__ void binaryOpKernel(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op)
+{
+  typedef TxN_t InVecType;
+  typedef TxN_t OutVecType;
+  InVecType a, b;
+  OutVecType c;
+  IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x);
+  idx *= InVecType::Ratio;
+  if (idx >= len) return;
+  a.load(in1, idx);
+  b.load(in2, idx);
+#pragma unroll
+  for (int i = 0; i < InVecType::Ratio; ++i) {
+    c.val.data[i] = op(a.val.data[i], b.val.data[i]);
+  }
+  c.store(out, idx);
+}
+
+template
+void binaryOpImpl(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream)
+{
+  const IdxType nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB);
+  binaryOpKernel
+    <<>>(out, in1, in2, len, op);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
+/**
+ * @brief Checks if addresses are aligned on N bytes
+ */
+inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, uint64_t N)
+{
+  return addr1 % N == 0 && addr2 % N == 0 && addr3 % N == 0;
+}
+
+template
+void binaryOp(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream)
+{
+  constexpr auto maxSize = sizeof(InType) > sizeof(OutType) ? sizeof(InType) : sizeof(OutType);
+  size_t bytes = len * maxSize;
+  uint64_t in1Addr = uint64_t(in1);
+  uint64_t in2Addr = uint64_t(in2);
+  uint64_t outAddr = uint64_t(out);
+  if (16 / maxSize && bytes % 16 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 16)) {
+    binaryOpImpl(
+      out, in1, in2, len, op, stream);
+  } else if (8 / maxSize && bytes % 8 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 8)) {
+    binaryOpImpl(
+      out, in1, in2, len, op, stream);
+  } else if (4 / maxSize && bytes % 4 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 4)) {
+    binaryOpImpl(
+      out, in1, in2, len, op, stream);
+  } else if (2 / maxSize && bytes % 2 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 2)) {
+    binaryOpImpl(
+      out, in1, in2, len, op, stream);
+  } else if (1 / maxSize) {
+    binaryOpImpl(
+      out, in1, in2, len, op, stream);
+  } else {
+    binaryOpImpl(out, in1, in2, len, op, stream);
+  }
+}
+
+} // namespace detail
+} // namespace linalg
+} // namespace raft
\ No newline at end of file
diff --git a/cpp/include/raft/linalg/detail/init.hpp b/cpp/include/raft/linalg/detail/init.hpp
new file mode 100644
index 0000000000..4718a2cb0e
--- /dev/null
+++ b/cpp/include/raft/linalg/detail/init.hpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+
+namespace raft {
+namespace linalg {
+namespace detail {
+
+template
+void range(T* out, int start, int end, cudaStream_t stream)
+{
+  thrust::counting_iterator first(start);
+  thrust::counting_iterator last = first + (end - start);
+  thrust::device_ptr ptr(out);
+  thrust::copy(rmm::exec_policy(stream), first, last, ptr);
+}
+
+/**
+ * @brief Like Python range.
+ *
+ * Fills the output as out[i] = i.
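+ *
+ * A minimal usage sketch (the device buffer `out` and stream `s` below are
+ * hypothetical names, not part of this patch):
+ * @code{.cpp}
+ * // after this call, out holds {0, 1, 2, 3, 4}
+ * raft::linalg::detail::range(out, 5, s);
+ * @endcode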
+ * + * \param [out] out device array, size [n] + * \param [in] n length of the array + * \param [in] stream cuda stream + */ +template +void range(T* out, int n, cudaStream_t stream) +{ + range(out, 0, n, stream); +} + +} // namespace detail +} // namespace linalg +} // namespace raft diff --git a/cpp/include/raft/linalg/detail/map.cuh b/cpp/include/raft/linalg/detail/map.cuh index 90b653b711..e0b473bdd4 100644 --- a/cpp/include/raft/linalg/detail/map.cuh +++ b/cpp/include/raft/linalg/detail/map.cuh @@ -16,207 +16,43 @@ #pragma once -#include +#include #include #include -#include -#include -#include #include -#include +namespace raft { +namespace linalg { +namespace detail { -namespace raft::linalg::detail { - -template -__device__ __forceinline__ auto map_apply(Func f, const IdxT& offset, const InTs&... ins) -> OutT -{ - if constexpr (PassOffset) { - return f(offset, ins...); - } else { - return f(ins...); - } -} - -template -__device__ __forceinline__ void map_kernel_mainloop( - OutT* out_ptr, IdxT offset, IdxT len, Func f, const InTs*... in_ptrs, std::index_sequence) -{ - TxN_t wide; - thrust::tuple...> wide_args; - if (offset + R <= len) { - (thrust::get(wide_args).load(in_ptrs, offset), ...); -#pragma unroll - for (int j = 0; j < R; ++j) { - wide.val.data[j] = map_apply( - f, offset + j, thrust::get(wide_args).val.data[j]...); - } - wide.store(out_ptr, offset); - } -} - -template -__global__ void map_kernel(OutT* out_ptr, IdxT len, Func f, const InTs*... in_ptrs) -{ - const IdxT tid = blockIdx.x * blockDim.x + threadIdx.x; - if constexpr (R <= 1) { - if (tid < len) { - out_ptr[tid] = map_apply(f, tid, in_ptrs[tid]...); - } - } else { - using align_bytes = Pow2; - using align_elems = Pow2; - - // how many elements to skip in order to do aligned vectorized store - const IdxT skip_cnt_left = std::min(IdxT(align_bytes::roundUp(out_ptr) - out_ptr), len); - - // The main loop: process all aligned data - map_kernel_mainloop( - out_ptr, tid * R + skip_cnt_left, len, f, in_ptrs..., std::index_sequence_for()); - - static_assert(WarpSize >= R); - // Processes the skipped elements on the left - if (tid < skip_cnt_left) { - out_ptr[tid] = map_apply(f, tid, in_ptrs[tid]...); - } - // Processes the skipped elements on the right - const IdxT skip_cnt_right = align_elems::mod(len - skip_cnt_left); - const IdxT remain_i = len - skip_cnt_right + tid; - if (remain_i < len) { - out_ptr[remain_i] = - map_apply(f, remain_i, in_ptrs[remain_i]...); - } - } -} - -template -void map_call(rmm::cuda_stream_view stream, OutT* out_ptr, IdxT len, Func f, const InTs*... 
in_ptrs) -{ - const IdxT len_vectorized = raft::div_rounding_up_safe(len, R); - const int threads = - std::max(WarpSize, std::min(raft::bound_by_power_of_two(len_vectorized), 256)); - const IdxT blocks = raft::div_rounding_up_unsafe(len_vectorized, threads); - map_kernel<<>>(out_ptr, len, f, in_ptrs...); -} - -constexpr int kCoalescedVectorSize = 16; -constexpr int kSmallInputThreshold = 1024; - -struct ratio_selector { - int ratio; - int align; - constexpr inline ratio_selector(int r, int a) : ratio(r), align(a) {} - - template - constexpr static auto ignoring_alignment() -> ratio_selector - { - return ratio_selector{raft::div_rounding_up_safe(kCoalescedVectorSize, sizeof(T)), 0}; - } - - template - explicit ratio_selector(const T* ptr) - { - constexpr auto s = ignoring_alignment(); // NOLINT - align = int(Pow2::roundUp(ptr) - ptr); - ratio = int(s.ratio); - } -}; - -constexpr inline auto operator*(const ratio_selector& a, const ratio_selector& b) -> ratio_selector -{ - auto ratio = std::min(a.ratio, b.ratio); - while ((a.align % ratio) != (b.align % ratio)) { - ratio >>= 1; - } - return ratio_selector{ratio, a.align % ratio}; -} - -template -void map_call_rt( - int r, rmm::cuda_stream_view stream, OutT* out_ptr, IdxT len, Func f, const InTs*... in_ptrs) -{ - if (r >= R) { return map_call(stream, out_ptr, len, f, in_ptrs...); } - if constexpr (R > 1) { - return map_call_rt<(R >> 1), PassOffset>(r, stream, out_ptr, len, f, in_ptrs...); - } -} - -template -void map(rmm::cuda_stream_view stream, OutT* out_ptr, IdxT len, Func f, const InTs*... in_ptrs) -{ - // don't vectorize on small inputs - if (len <= kSmallInputThreshold) { - return map_call<1, PassOffset>(stream, out_ptr, len, f, in_ptrs...); - } - constexpr int kRatio = - (ratio_selector::ignoring_alignment() * ... * ratio_selector::ignoring_alignment()) - .ratio; - static_assert(kRatio > 0, "Unexpected zero vector size."); - const int ratio = (ratio_selector(out_ptr) * ... * ratio_selector(in_ptrs)).ratio; - return map_call_rt(ratio, stream, out_ptr, len, f, in_ptrs...); -} - -template , - typename = raft::enable_if_input_device_mdspan> -void map_check_shape(OutType out, InType in) -{ - RAFT_EXPECTS(raft::is_row_or_column_major(in) && out.size() == in.size(), - "All inputs must be contiguous and have the same size as the output"); -} - -/** - * @brief Map a function over a zero or more inputs and optionally a 0-based flat index - * (element offset). - * - * _Performance note_: when possible, this function loads the argument arrays and stores the output - * array using vectorized cuda load/store instructions. The size of the vectorization depends on the - * size of the largest input/output element type and on the alignment of all pointers. - * - * @tparam PassOffset whether to pass an offset as a first argument to Func - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * @tparam InTypes data-types of the inputs (device_mdspan) - * - * @param[in] res raft::device_resources - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda of type - * ([auto offset], InTypes::value_type xs...) -> OutType::value_type - * @param[in] ins the inputs (each of the same size as the output) (device_mdspan) - */ -template , - typename = raft::enable_if_input_device_mdspan> -void map(const raft::device_resources& res, OutType out, Func f, InTypes... ins) + typename IdxType, + typename MapOp, + int TPB, + typename... 
Args> +__global__ void mapKernel(OutType* out, IdxType len, MapOp map, const InType* in, Args... args) { - RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); - (map_check_shape(out, ins), ...); + auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); - if (out.size() <= std::numeric_limits::max()) { - map( - res.get_stream(), out.data_handle(), uint32_t(out.size()), f, ins.data_handle()...); - } else { - map( - res.get_stream(), out.data_handle(), uint64_t(out.size()), f, ins.data_handle()...); - } + if (idx < len) { out[idx] = map(in[idx], args[idx]...); } } -} // namespace raft::linalg::detail +template +void mapImpl( + OutType* out, IdxType len, MapOp map, cudaStream_t stream, const InType* in, Args... args) +{ + const int nblks = raft::ceildiv(len, (IdxType)TPB); + mapKernel + <<>>(out, len, map, in, args...); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +} // namespace detail +} // namespace linalg +}; // namespace raft diff --git a/cpp/include/raft/linalg/detail/strided_reduction.cuh b/cpp/include/raft/linalg/detail/strided_reduction.cuh index 42e79a9285..0e516b4750 100644 --- a/cpp/include/raft/linalg/detail/strided_reduction.cuh +++ b/cpp/include/raft/linalg/detail/strided_reduction.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #pragma once +#include "unary_op.cuh" #include #include #include diff --git a/cpp/include/raft/linalg/detail/ternary_op.cuh b/cpp/include/raft/linalg/detail/ternary_op.cuh new file mode 100644 index 0000000000..7874f20f56 --- /dev/null +++ b/cpp/include/raft/linalg/detail/ternary_op.cuh @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace raft { +namespace linalg { +namespace detail { +template +__global__ void ternaryOpKernel( + out_t* out, const math_t* in1, const math_t* in2, const math_t* in3, IdxType len, Lambda op) +{ + typedef raft::TxN_t VecType; + VecType a, b, c; + IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); + idx *= VecType::Ratio; + if (idx >= len) return; + a.load(in1, idx); + b.load(in2, idx); + c.load(in3, idx); +#pragma unroll + for (int i = 0; i < VecType::Ratio; ++i) { + a.val.data[i] = op(a.val.data[i], b.val.data[i], c.val.data[i]); + } + a.store(out, idx); +} + +template +void ternaryOpImpl(out_t* out, + const math_t* in1, + const math_t* in2, + const math_t* in3, + IdxType len, + Lambda op, + cudaStream_t stream) +{ + const IdxType nblks = raft::ceildiv(veclen_ ? 
len / veclen_ : len, (IdxType)TPB); + ternaryOpKernel + <<>>(out, in1, in2, in3, len, op); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +/** + * @brief perform element-wise ternary operation on the input arrays + * @tparam math_t data-type upon which the math operation will be performed + * @tparam Lambda the device-lambda performing the actual operation + * @tparam IdxType Integer type used to for addressing + * @tparam TPB threads-per-block in the final kernel launched + * @param out the output array + * @param in1 the first input array + * @param in2 the second input array + * @param in3 the third input array + * @param len number of elements in the input array + * @param op the device-lambda + * @param stream cuda stream where to launch work + */ +template +void ternaryOp(out_t* out, + const math_t* in1, + const math_t* in2, + const math_t* in3, + IdxType len, + Lambda op, + cudaStream_t stream) +{ + size_t bytes = len * sizeof(math_t); + if (16 / sizeof(math_t) && bytes % 16 == 0) { + ternaryOpImpl( + out, in1, in2, in3, len, op, stream); + } else if (8 / sizeof(math_t) && bytes % 8 == 0) { + ternaryOpImpl( + out, in1, in2, in3, len, op, stream); + } else if (4 / sizeof(math_t) && bytes % 4 == 0) { + ternaryOpImpl( + out, in1, in2, in3, len, op, stream); + } else if (2 / sizeof(math_t) && bytes % 2 == 0) { + ternaryOpImpl( + out, in1, in2, in3, len, op, stream); + } else if (1 / sizeof(math_t)) { + ternaryOpImpl( + out, in1, in2, in3, len, op, stream); + } else { + ternaryOpImpl(out, in1, in2, in3, len, op, stream); + } +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/detail/unary_op.cuh b/cpp/include/raft/linalg/detail/unary_op.cuh new file mode 100644 index 0000000000..cdadc6f868 --- /dev/null +++ b/cpp/include/raft/linalg/detail/unary_op.cuh @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +__global__ void unaryOpKernel(OutType* out, const InType* in, IdxType len, Lambda op) +{ + typedef TxN_t InVecType; + typedef TxN_t OutVecType; + InVecType a; + OutVecType b; + IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); + idx *= InVecType::Ratio; + if (idx >= len) return; + a.load(in, idx); +#pragma unroll + for (int i = 0; i < InVecType::Ratio; ++i) { + b.val.data[i] = op(a.val.data[i]); + } + b.store(out, idx); +} + +template +void unaryOpImpl(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) +{ + const IdxType nblks = raft::ceildiv(VecLen ? 
len / VecLen : len, (IdxType)TPB); + unaryOpKernel + <<>>(out, in, len, op); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +template +void unaryOpCaller(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) +{ + if (len <= 0) return; // silently skip in case of 0 length input + constexpr auto maxSize = sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType); + size_t bytes = len * maxSize; + uint64_t inAddr = uint64_t(in); + uint64_t outAddr = uint64_t(out); + if (16 / maxSize && bytes % 16 == 0 && inAddr % 16 == 0 && outAddr % 16 == 0) { + unaryOpImpl(out, in, len, op, stream); + } else if (8 / maxSize && bytes % 8 == 0 && inAddr % 8 == 0 && outAddr % 8 == 0) { + unaryOpImpl(out, in, len, op, stream); + } else if (4 / maxSize && bytes % 4 == 0 && inAddr % 4 == 0 && outAddr % 4 == 0) { + unaryOpImpl(out, in, len, op, stream); + } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 && outAddr % 2 == 0) { + unaryOpImpl(out, in, len, op, stream); + } else if (1 / maxSize) { + unaryOpImpl(out, in, len, op, stream); + } else { + unaryOpImpl(out, in, len, op, stream); + } +} + +template +__global__ void writeOnlyUnaryOpKernel(OutType* out, IdxType len, Lambda op) +{ + IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); + if (idx < len) { op(out + idx, idx); } +} + +template +void writeOnlyUnaryOpCaller(OutType* out, IdxType len, Lambda op, cudaStream_t stream) +{ + if (len <= 0) return; // silently skip in case of 0 length input + auto nblks = raft::ceildiv(len, TPB); + writeOnlyUnaryOpKernel<<>>(out, len, op); + RAFT_CUDA_TRY(cudaGetLastError()); +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/init.cuh b/cpp/include/raft/linalg/init.cuh index ad772b3989..5a810bf2ba 100644 --- a/cpp/include/raft/linalg/init.cuh +++ b/cpp/include/raft/linalg/init.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,10 +18,11 @@ #pragma once -#include +#include "detail/init.hpp" #include -namespace raft::linalg { +namespace raft { +namespace linalg { /** * @brief Like Python range. @@ -36,8 +37,7 @@ namespace raft::linalg { template void range(T* out, int start, int end, cudaStream_t stream) { - return detail::map( - stream, out, end - start, compose_op{cast_op{}, add_const_op{start}}); + detail::range(out, start, end, stream); } /** @@ -52,7 +52,7 @@ void range(T* out, int start, int end, cudaStream_t stream) template void range(T* out, int n, cudaStream_t stream) { - return detail::map(stream, out, n, cast_op{}); + detail::range(out, n, stream); } /** @@ -68,6 +68,7 @@ void zero(T* out, int n, cudaStream_t stream) RAFT_CUDA_TRY(cudaMemsetAsync(static_cast(out), 0, n * sizeof(T), stream)); } -} // namespace raft::linalg +} // namespace linalg +} // namespace raft -#endif +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/map.cuh b/cpp/include/raft/linalg/map.cuh index 305a8aa3cc..2b9e6c80a0 100644 --- a/cpp/include/raft/linalg/map.cuh +++ b/cpp/include/raft/linalg/map.cuh @@ -22,267 +22,119 @@ #include #include +#include +#include -namespace raft::linalg { +namespace raft { +namespace linalg { /** - * @defgroup map Mapping ops - * @{ - */ - -/** - * @brief Map a function over zero or more input mdspans of the same size. 
- * - * The algorithm applied on `k` inputs can be described in a following pseudo-code: - * @code - * for (auto i: [0 ... out.size()]) { - * out[i] = f(in_0[i], in_1[i], ..., in_k[i]) - * } - * @endcode - * - * _Performance note_: when possible, this function loads the argument arrays and stores the output - * array using vectorized cuda load/store instructions. The size of the vectorization depends on the - * size of the largest input/output element type and on the alignment of all pointers. - * - * Usage example: - * @code{.cpp} - * #include - * #include - * #include - * #include - * - * auto input = raft::make_device_vector(res, n); - * ... fill input .. - * auto squares = raft::make_device_vector(res, n); - * raft::linalg::map_offset(res, squares.view(), raft::sq_op{}, input.view()); - * @endcode - * - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * @tparam InTypes data-types of the inputs (device_mdspan) - * - * @param[in] res raft::device_resources - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (InTypes::value_type xs...) -> OutType::value_type - * @param[in] ins the inputs (each of the same size as the output) (device_mdspan) + * @brief CUDA version of map + * @tparam InType data-type upon which the math operation will be performed + * @tparam MapOp the device-lambda performing the actual operation + * @tparam TPB threads-per-block in the final kernel launched + * @tparam Args additional parameters + * @tparam OutType data-type in which the result will be stored + * @param out the output of the map operation (assumed to be a device pointer) + * @param len number of elements in the input array + * @param map the device-lambda + * @param stream cuda-stream where to launch this kernel + * @param in the input array + * @param args additional input arrays */ -template , - typename = raft::enable_if_input_device_mdspan> -void map(const raft::device_resources& res, OutType out, Func f, InTypes... ins) +template +void map_k( + OutType* out, IdxType len, MapOp map, cudaStream_t stream, const InType* in, Args... args) { - return detail::map(res, out, f, ins...); + detail::mapImpl( + out, len, map, stream, in, args...); } /** - * @brief Map a function over one mdspan. - * - * @tparam InType1 data-type of the input (device_mdspan) - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * - * @param[in] res raft::device_resources - * @param[in] in1 the input (the same size as the output) (device_mdspan) - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (InType1::value_type x) -> OutType::value_type + * @defgroup map Mapping ops + * @{ */ -template , - typename = raft::enable_if_input_device_mdspan> -void map(const raft::device_resources& res, InType1 in1, OutType out, Func f) -{ - return detail::map(res, out, f, in1); -} /** - * @brief Map a function over two mdspans. 
- * - * @tparam InType1 data-type of the input (device_mdspan) - * @tparam InType2 data-type of the input (device_mdspan) - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * - * @param[in] res raft::device_resources - * @param[in] in1 the input (the same size as the output) (device_mdspan) - * @param[in] in2 the input (the same size as the output) (device_mdspan) - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (InType1::value_type x1, InType2::value_type x2) -> OutType::value_type + * @brief CUDA version of map + * @tparam InType data-type for math operation of type raft::device_mdspan + * @tparam MapOp the device-lambda performing the actual operation + * @tparam TPB threads-per-block in the final kernel launched + * @tparam OutType data-type of result of type raft::device_mdspan + * @tparam Args additional parameters + * @param[in] handle raft::device_resources + * @param[in] in the input of type raft::device_mdspan + * @param[out] out the output of the map operation of type raft::device_mdspan + * @param[in] map the device-lambda + * @param[in] args additional input arrays */ -template , - typename = raft::enable_if_input_device_mdspan> -void map(const raft::device_resources& res, InType1 in1, InType2 in2, OutType out, Func f) + int TPB = 256, + typename... Args, + typename = raft::enable_if_input_device_mdspan, + typename = raft::enable_if_output_device_mdspan> +void map(raft::device_resources const& handle, InType in, OutType out, MapOp map, Args... args) { - return detail::map(res, out, f, in1, in2); + using in_value_t = typename InType::value_type; + using out_value_t = typename OutType::value_type; + + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output is not exhaustive"); + RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input is not exhaustive"); + RAFT_EXPECTS(out.size() == in.size(), "Size mismatch between Input and Output"); + + if (out.size() <= std::numeric_limits::max()) { + map_k( + out.data_handle(), out.size(), map, handle.get_stream(), in.data_handle(), args...); + } else { + map_k( + out.data_handle(), out.size(), map, handle.get_stream(), in.data_handle(), args...); + } } /** - * @brief Map a function over three mdspans. - * - * @tparam InType1 data-type of the input 1 (device_mdspan) - * @tparam InType2 data-type of the input 2 (device_mdspan) - * @tparam InType3 data-type of the input 3 (device_mdspan) - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * - * @param[in] res raft::device_resources - * @param[in] in1 the input 1 (the same size as the output) (device_mdspan) - * @param[in] in2 the input 2 (the same size as the output) (device_mdspan) - * @param[in] in3 the input 3 (the same size as the output) (device_mdspan) - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (InType1::value_type x1, InType2::value_type x2, InType3::value_type x3) -> OutType::value_type - */ -template , - typename = raft::enable_if_input_device_mdspan> -void map( - const raft::device_resources& res, InType1 in1, InType2 in2, InType3 in3, OutType out, Func f) -{ - return detail::map(res, out, f, in1, in2, in3); -} - -/** - * @brief Map a function over zero-based flat index (element offset) and zero or more inputs. 
- * - * The algorithm applied on `k` inputs can be described in a following pseudo-code: - * @code - * for (auto i: [0 ... out.size()]) { - * out[i] = f(i, in_0[i], in_1[i], ..., in_k[i]) - * } - * @endcode - * - * _Performance note_: when possible, this function loads the argument arrays and stores the output - * array using vectorized cuda load/store instructions. The size of the vectorization depends on the - * size of the largest input/output element type and on the alignment of all pointers. + * @brief Perform an element-wise unary operation on the input offset into the output array * * Usage example: * @code{.cpp} * #include - * #include + * #include * #include * #include - * + * ... + * raft::handle_t handle; * auto squares = raft::make_device_vector(handle, n); - * raft::linalg::map_offset(res, squares.view(), raft::sq_op{}); + * raft::linalg::map_offset(handle, squares.view(), raft::sq_op()); * @endcode * - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * @tparam InTypes data-types of the inputs (device_mdspan) - * - * @param[in] res raft::device_resources - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (auto offset, InTypes::value_type xs...) -> OutType::value_type - * @param[in] ins the inputs (each of the same size as the output) (device_mdspan) + * @tparam OutType Output mdspan type + * @tparam MapOp The unary operation type with signature `OutT func(const IdxT& idx);` + * @param[in] handle The raft handle + * @param[out] out Output array + * @param[in] op The unary operation */ template , - typename = raft::enable_if_input_device_mdspan> -void map_offset(const raft::device_resources& res, OutType out, Func f, InTypes... ins) + typename MapOp, + typename = raft::enable_if_output_device_mdspan> +void map_offset(const raft::device_resources& handle, OutType out, MapOp op) { - return detail::map(res, out, f, ins...); -} + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); -/** - * @brief Map a function over zero-based flat index (element offset) and one mdspan. - * - * @tparam InType1 data-type of the input (device_mdspan) - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * - * @param[in] res raft::device_resources - * @param[in] in1 the input (the same size as the output) (device_mdspan) - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (auto offset, InType1::value_type x) -> OutType::value_type - */ -template , - typename = raft::enable_if_input_device_mdspan> -void map_offset(const raft::device_resources& res, InType1 in1, OutType out, Func f) -{ - return detail::map(res, out, f, in1); -} + using out_value_t = typename OutType::value_type; -/** - * @brief Map a function over zero-based flat index (element offset) and two mdspans. 
- * - * @tparam InType1 data-type of the input (device_mdspan) - * @tparam InType2 data-type of the input (device_mdspan) - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * - * @param[in] res raft::device_resources - * @param[in] in1 the input (the same size as the output) (device_mdspan) - * @param[in] in2 the input (the same size as the output) (device_mdspan) - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (auto offset, InType1::value_type x1, InType2::value_type x2) -> OutType::value_type - */ -template , - typename = raft::enable_if_input_device_mdspan> -void map_offset(const raft::device_resources& res, InType1 in1, InType2 in2, OutType out, Func f) -{ - return detail::map(res, out, f, in1, in2); -} - -/** - * @brief Map a function over zero-based flat index (element offset) and three mdspans. - * - * @tparam InType1 data-type of the input 1 (device_mdspan) - * @tparam InType2 data-type of the input 2 (device_mdspan) - * @tparam InType3 data-type of the input 3 (device_mdspan) - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * - * @param[in] res raft::device_resources - * @param[in] in1 the input 1 (the same size as the output) (device_mdspan) - * @param[in] in2 the input 2 (the same size as the output) (device_mdspan) - * @param[in] in3 the input 3 (the same size as the output) (device_mdspan) - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (auto offset, InType1::value_type x1, InType2::value_type x2, InType3::value_type x3) - * -> OutType::value_type - */ -template , - typename = raft::enable_if_input_device_mdspan> -void map_offset( - const raft::device_resources& res, InType1 in1, InType2 in2, InType3 in3, OutType out, Func f) -{ - return detail::map(res, out, f, in1, in2, in3); + thrust::tabulate( + handle.get_thrust_policy(), out.data_handle(), out.data_handle() + out.size(), op); } /** @} */ // end of map -} // namespace raft::linalg +} // namespace linalg +}; // namespace raft #endif diff --git a/cpp/include/raft/linalg/ternary_op.cuh b/cpp/include/raft/linalg/ternary_op.cuh index 1e347d69be..aa3859bc23 100644 --- a/cpp/include/raft/linalg/ternary_op.cuh +++ b/cpp/include/raft/linalg/ternary_op.cuh @@ -19,9 +19,11 @@ #pragma once +#include "detail/ternary_op.cuh" + #include #include -#include +#include namespace raft { namespace linalg { @@ -48,7 +50,7 @@ void ternaryOp(out_t* out, Lambda op, cudaStream_t stream) { - return detail::map(stream, out, len, op, in1, in2, in3); + detail::ternaryOp(out, in1, in2, in3, len, op, stream); } /** @@ -78,7 +80,33 @@ template ::max()) { + ternaryOp(out.data_handle(), + in1.data_handle(), + in2.data_handle(), + in3.data_handle(), + out.size(), + op, + handle.get_stream()); + } else { + ternaryOp(out.data_handle(), + in1.data_handle(), + in2.data_handle(), + in3.data_handle(), + out.size(), + op, + handle.get_stream()); + } } /** @} */ // end of group ternary_op @@ -86,4 +114,4 @@ void ternary_op( }; // end namespace linalg }; // end namespace raft -#endif +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/unary_op.cuh b/cpp/include/raft/linalg/unary_op.cuh index 23f932d2f2..ce102adfd2 100644 --- a/cpp/include/raft/linalg/unary_op.cuh +++ b/cpp/include/raft/linalg/unary_op.cuh @@ -18,9 +18,11 @@ #pragma once +#include "detail/unary_op.cuh" + #include 
#include -#include +#include namespace raft { namespace linalg { @@ -46,7 +48,7 @@ template void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) { - return detail::map(stream, out, len, op, in); + detail::unaryOpCaller(out, in, len, op, stream); } /** @@ -69,11 +71,7 @@ void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_ template void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream) { - return detail::map(stream, out, len, [op] __device__(IdxType offset) { - OutType r; - op(&r, offset); - return r; - }); + detail::writeOnlyUnaryOpCaller(out, len, op, stream); } /** @@ -99,7 +97,20 @@ template > void unary_op(raft::device_resources const& handle, InType in, OutType out, Lambda op) { - return map(handle, in, out, op); + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input must be contiguous"); + RAFT_EXPECTS(out.size() == in.size(), "Size mismatch between Output and Input"); + + using in_value_t = typename InType::value_type; + using out_value_t = typename OutType::value_type; + + if (out.size() <= std::numeric_limits::max()) { + unaryOp( + out.data_handle(), in.data_handle(), out.size(), op, handle.get_stream()); + } else { + unaryOp( + out.data_handle(), in.data_handle(), out.size(), op, handle.get_stream()); + } } /** @@ -119,7 +130,17 @@ template > void write_only_unary_op(const raft::device_resources& handle, OutType out, Lambda op) { - return writeOnlyUnaryOp(out.data_handle(), out.size(), op, handle.get_stream()); + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + + using out_value_t = typename OutType::value_type; + + if (out.size() <= std::numeric_limits::max()) { + writeOnlyUnaryOp( + out.data_handle(), out.size(), op, handle.get_stream()); + } else { + writeOnlyUnaryOp( + out.data_handle(), out.size(), op, handle.get_stream()); + } } /** @} */ // end of group unary_op diff --git a/cpp/include/raft/matrix/init.cuh b/cpp/include/raft/matrix/init.cuh index 1c0234e0bd..aaf92b795b 100644 --- a/cpp/include/raft/matrix/init.cuh +++ b/cpp/include/raft/matrix/init.cuh @@ -18,7 +18,6 @@ #include #include -#include #include #include @@ -66,7 +65,9 @@ void fill(raft::device_resources const& handle, raft::device_mdspan inout, math_t scalar) { - linalg::map(handle, inout, raft::const_op{scalar}); + RAFT_EXPECTS(raft::is_row_or_column_major(inout), "Data layout not supported"); + detail::setValue( + inout.data_handle(), inout.data_handle(), scalar, inout.size(), handle.get_stream()); } template diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh index 4b6e6f5e31..4c6ef0e30b 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh @@ -42,11 +42,11 @@ #include #include +#include +#include #include -#include - namespace raft::neighbors::ivf_pq::detail { /** @@ -1311,13 +1311,13 @@ void ivfpq_search_worker(raft::device_resources const& handle, // of a cluster by processing the cluster at the same time as much as // possible. 
   index_list_sorted_buf.resize(n_queries * n_probes, stream);
-  auto index_list_buf =
-    make_device_mdarray(handle, mr, make_extents(n_queries * n_probes));
+  rmm::device_uvector index_list_buf(n_queries * n_probes, stream, mr);
   rmm::device_uvector cluster_labels_out(n_queries * n_probes, stream, mr);
-  auto index_list = index_list_buf.data_handle();
+  auto index_list = index_list_buf.data();
   index_list_sorted = index_list_sorted_buf.data();
-
-  linalg::map_offset(handle, index_list_buf.view(), identity_op{});
+  thrust::sequence(handle.get_thrust_policy(),
+                   thrust::device_pointer_cast(index_list),
+                   thrust::device_pointer_cast(index_list + n_queries * n_probes));
   int begin_bit = 0;
   int end_bit = sizeof(uint32_t) * 8;
@@ -1376,15 +1376,13 @@ void ivfpq_search_worker(raft::device_resources const& handle,
                          topK);
   rmm::device_uvector device_lut(search_instance.device_lut_size, stream, mr);
-  std::optional> query_kths_buf{std::nullopt};
-  float* query_kths = nullptr;
+  rmm::device_uvector query_kths(0, stream, mr);
   if (manage_local_topk) {
-    query_kths_buf.emplace(
-      make_device_mdarray(handle, mr, make_extents(n_queries)));
-    linalg::map(handle,
-                query_kths_buf->view(),
-                raft::const_op{dummy_block_sort_t::queue_t::kDummy});
-    query_kths = query_kths_buf->data_handle();
+    query_kths.resize(n_queries, stream);
+    thrust::fill_n(handle.get_thrust_policy(),
+                   query_kths.data(),
+                   n_queries,
+                   float(dummy_block_sort_t::queue_t::kDummy));
   }
   search_instance(stream,
                   index.size(),
@@ -1403,7 +1401,7 @@ void ivfpq_search_worker(raft::device_resources const& handle,
                   chunk_index.data(),
                   query,
                   index_list_sorted,
-                  query_kths,
+                  query_kths.data(),
                   device_lut.data(),
                   distances_buf.data(),
                   neighbors_ptr);
diff --git a/cpp/test/linalg/map.cu b/cpp/test/linalg/map.cu
index 15b40808ee..5b52374789 100644
--- a/cpp/test/linalg/map.cu
+++ b/cpp/test/linalg/map.cu
@@ -37,15 +37,13 @@ void mapLaunch(OutType* out,
   raft::device_resources handle{stream};
   auto out_view = raft::make_device_vector_view(out, len);
   auto in1_view = raft::make_device_vector_view(in1, len);
-  auto in2_view = raft::make_device_vector_view(in2, len);
-  auto in3_view = raft::make_device_vector_view(in3, len);
   map(
     handle,
+    in1_view,
     out_view,
     [=] __device__(InType a, InType b, InType c) { return a + b + c + scalar; },
-    in1_view,
-    in2_view,
-    in3_view);
+    in2,
+    in3);
 }
 template