From 7e328ff8eed6410fea3c79db8bfca6dd07050c66 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Mon, 13 Mar 2023 20:54:27 -0400 Subject: [PATCH] Revert "Generic linalg::map (#1329)" (#1336) The change to the public API was significant enough that it will require updates on the cuml side, which invokes `map_k` with very different arguments (e.g. mdspan, a different argument order). For now, it's best to revert this commit to unblock cuml; we can then proceed by keeping the old APIs and deprecating them until cuml is updated. Authors: - Corey J. Nolet (https://github.com/cjnolet) Approvers: - Ben Frederickson (https://github.com/benfred) URL: https://github.com/rapidsai/raft/pull/1336 --- cpp/include/raft/linalg/binary_op.cuh | 26 +- cpp/include/raft/linalg/detail/binary_op.cuh | 98 ++++++ cpp/include/raft/linalg/detail/init.hpp | 54 ++++ cpp/include/raft/linalg/detail/map.cuh | 224 ++----------- .../raft/linalg/detail/strided_reduction.cuh | 3 +- cpp/include/raft/linalg/detail/ternary_op.cuh | 105 ++++++ cpp/include/raft/linalg/detail/unary_op.cuh | 99 ++++++ cpp/include/raft/linalg/init.cuh | 17 +- cpp/include/raft/linalg/map.cuh | 306 +++++------------- cpp/include/raft/linalg/ternary_op.cuh | 36 ++- cpp/include/raft/linalg/unary_op.cuh | 39 ++- cpp/include/raft/matrix/init.cuh | 5 +- .../raft/neighbors/detail/ivf_pq_search.cuh | 30 +- cpp/test/linalg/map.cu | 8 +- 14 files changed, 580 insertions(+), 470 deletions(-) create mode 100644 cpp/include/raft/linalg/detail/binary_op.cuh create mode 100644 cpp/include/raft/linalg/detail/init.hpp create mode 100644 cpp/include/raft/linalg/detail/ternary_op.cuh create mode 100644 cpp/include/raft/linalg/detail/unary_op.cuh diff --git a/cpp/include/raft/linalg/binary_op.cuh b/cpp/include/raft/linalg/binary_op.cuh index ed083a1590..966e84965d 100644 --- a/cpp/include/raft/linalg/binary_op.cuh +++ b/cpp/include/raft/linalg/binary_op.cuh @@ -18,9 +18,12 @@ #pragma once +#include "detail/binary_op.cuh" + #include #include -#include +#include +#include namespace raft { namespace linalg { @@ -49,7 +52,7 @@ template (stream, out, len, op, in1, in2); + detail::binaryOp(out, in1, in2, len, op, stream); } /** @@ -77,7 +80,22 @@ template > void binary_op(raft::device_resources const& handle, InType in1, InType in2, OutType out, Lambda op) { - return map(handle, in1, in2, out, op); + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(in1), "Input 1 must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(in2), "Input 2 must be contiguous"); + RAFT_EXPECTS(out.size() == in1.size() && in1.size() == in2.size(), + "Size mismatch between Output and Inputs"); + + using in_value_t = typename InType::value_type; + using out_value_t = typename OutType::value_type; + + if (out.size() <= std::numeric_limits::max()) { + binaryOp( + out.data_handle(), in1.data_handle(), in2.data_handle(), out.size(), op, handle.get_stream()); + } else { + binaryOp( + out.data_handle(), in1.data_handle(), in2.data_handle(), out.size(), op, handle.get_stream()); + } } /** @} */ // end of group binary_op @@ -85,4 +103,4 @@ void binary_op(raft::device_resources const& handle, InType in1, InType in2, Out }; // end namespace linalg }; // end namespace raft -#endif +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/detail/binary_op.cuh b/cpp/include/raft/linalg/detail/binary_op.cuh new file mode 100644 index 0000000000..d073e164fd --- /dev/null +++ 
b/cpp/include/raft/linalg/detail/binary_op.cuh @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +__global__ void binaryOpKernel( + OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op) +{ + typedef TxN_t InVecType; + typedef TxN_t OutVecType; + InVecType a, b; + OutVecType c; + IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); + idx *= InVecType::Ratio; + if (idx >= len) return; + a.load(in1, idx); + b.load(in2, idx); +#pragma unroll + for (int i = 0; i < InVecType::Ratio; ++i) { + c.val.data[i] = op(a.val.data[i], b.val.data[i]); + } + c.store(out, idx); +} + +template +void binaryOpImpl( + OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream) +{ + const IdxType nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB); + binaryOpKernel + <<>>(out, in1, in2, len, op); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +/** + * @brief Checks if addresses are aligned on N bytes + */ +inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, uint64_t N) +{ + return addr1 % N == 0 && addr2 % N == 0 && addr3 % N == 0; +} + +template +void binaryOp( + OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream) +{ + constexpr auto maxSize = sizeof(InType) > sizeof(OutType) ? sizeof(InType) : sizeof(OutType); + size_t bytes = len * maxSize; + uint64_t in1Addr = uint64_t(in1); + uint64_t in2Addr = uint64_t(in2); + uint64_t outAddr = uint64_t(out); + if (16 / maxSize && bytes % 16 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 16)) { + binaryOpImpl( + out, in1, in2, len, op, stream); + } else if (8 / maxSize && bytes % 8 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 8)) { + binaryOpImpl( + out, in1, in2, len, op, stream); + } else if (4 / maxSize && bytes % 4 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 4)) { + binaryOpImpl( + out, in1, in2, len, op, stream); + } else if (2 / maxSize && bytes % 2 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 2)) { + binaryOpImpl( + out, in1, in2, len, op, stream); + } else if (1 / maxSize) { + binaryOpImpl( + out, in1, in2, len, op, stream); + } else { + binaryOpImpl(out, in1, in2, len, op, stream); + } +} + +} // namespace detail +} // namespace linalg +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/detail/init.hpp b/cpp/include/raft/linalg/detail/init.hpp new file mode 100644 index 0000000000..4718a2cb0e --- /dev/null +++ b/cpp/include/raft/linalg/detail/init.hpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +void range(T* out, int start, int end, cudaStream_t stream) +{ + thrust::counting_iterator first(start); + thrust::counting_iterator last = first + (end - start); + thrust::device_ptr ptr(out); + thrust::copy(rmm::exec_policy(stream), first, last, ptr); +} + +/** + * @brief Like Python range. + * + * Fills the output as out[i] = i. + * + * \param [out] out device array, size [n] + * \param [in] n length of the array + * \param [in] stream cuda stream + */ +template +void range(T* out, int n, cudaStream_t stream) +{ + range(out, 0, n, stream); +} + +} // namespace detail +} // namespace linalg +} // namespace raft diff --git a/cpp/include/raft/linalg/detail/map.cuh b/cpp/include/raft/linalg/detail/map.cuh index 90b653b711..e0b473bdd4 100644 --- a/cpp/include/raft/linalg/detail/map.cuh +++ b/cpp/include/raft/linalg/detail/map.cuh @@ -16,207 +16,43 @@ #pragma once -#include +#include #include #include -#include -#include -#include #include -#include +namespace raft { +namespace linalg { +namespace detail { -namespace raft::linalg::detail { - -template -__device__ __forceinline__ auto map_apply(Func f, const IdxT& offset, const InTs&... ins) -> OutT -{ - if constexpr (PassOffset) { - return f(offset, ins...); - } else { - return f(ins...); - } -} - -template -__device__ __forceinline__ void map_kernel_mainloop( - OutT* out_ptr, IdxT offset, IdxT len, Func f, const InTs*... in_ptrs, std::index_sequence) -{ - TxN_t wide; - thrust::tuple...> wide_args; - if (offset + R <= len) { - (thrust::get(wide_args).load(in_ptrs, offset), ...); -#pragma unroll - for (int j = 0; j < R; ++j) { - wide.val.data[j] = map_apply( - f, offset + j, thrust::get(wide_args).val.data[j]...); - } - wide.store(out_ptr, offset); - } -} - -template -__global__ void map_kernel(OutT* out_ptr, IdxT len, Func f, const InTs*... in_ptrs) -{ - const IdxT tid = blockIdx.x * blockDim.x + threadIdx.x; - if constexpr (R <= 1) { - if (tid < len) { - out_ptr[tid] = map_apply(f, tid, in_ptrs[tid]...); - } - } else { - using align_bytes = Pow2; - using align_elems = Pow2; - - // how many elements to skip in order to do aligned vectorized store - const IdxT skip_cnt_left = std::min(IdxT(align_bytes::roundUp(out_ptr) - out_ptr), len); - - // The main loop: process all aligned data - map_kernel_mainloop( - out_ptr, tid * R + skip_cnt_left, len, f, in_ptrs..., std::index_sequence_for()); - - static_assert(WarpSize >= R); - // Processes the skipped elements on the left - if (tid < skip_cnt_left) { - out_ptr[tid] = map_apply(f, tid, in_ptrs[tid]...); - } - // Processes the skipped elements on the right - const IdxT skip_cnt_right = align_elems::mod(len - skip_cnt_left); - const IdxT remain_i = len - skip_cnt_right + tid; - if (remain_i < len) { - out_ptr[remain_i] = - map_apply(f, remain_i, in_ptrs[remain_i]...); - } - } -} - -template -void map_call(rmm::cuda_stream_view stream, OutT* out_ptr, IdxT len, Func f, const InTs*... 
in_ptrs) -{ - const IdxT len_vectorized = raft::div_rounding_up_safe(len, R); - const int threads = - std::max(WarpSize, std::min(raft::bound_by_power_of_two(len_vectorized), 256)); - const IdxT blocks = raft::div_rounding_up_unsafe(len_vectorized, threads); - map_kernel<<>>(out_ptr, len, f, in_ptrs...); -} - -constexpr int kCoalescedVectorSize = 16; -constexpr int kSmallInputThreshold = 1024; - -struct ratio_selector { - int ratio; - int align; - constexpr inline ratio_selector(int r, int a) : ratio(r), align(a) {} - - template - constexpr static auto ignoring_alignment() -> ratio_selector - { - return ratio_selector{raft::div_rounding_up_safe(kCoalescedVectorSize, sizeof(T)), 0}; - } - - template - explicit ratio_selector(const T* ptr) - { - constexpr auto s = ignoring_alignment(); // NOLINT - align = int(Pow2::roundUp(ptr) - ptr); - ratio = int(s.ratio); - } -}; - -constexpr inline auto operator*(const ratio_selector& a, const ratio_selector& b) -> ratio_selector -{ - auto ratio = std::min(a.ratio, b.ratio); - while ((a.align % ratio) != (b.align % ratio)) { - ratio >>= 1; - } - return ratio_selector{ratio, a.align % ratio}; -} - -template -void map_call_rt( - int r, rmm::cuda_stream_view stream, OutT* out_ptr, IdxT len, Func f, const InTs*... in_ptrs) -{ - if (r >= R) { return map_call(stream, out_ptr, len, f, in_ptrs...); } - if constexpr (R > 1) { - return map_call_rt<(R >> 1), PassOffset>(r, stream, out_ptr, len, f, in_ptrs...); - } -} - -template -void map(rmm::cuda_stream_view stream, OutT* out_ptr, IdxT len, Func f, const InTs*... in_ptrs) -{ - // don't vectorize on small inputs - if (len <= kSmallInputThreshold) { - return map_call<1, PassOffset>(stream, out_ptr, len, f, in_ptrs...); - } - constexpr int kRatio = - (ratio_selector::ignoring_alignment() * ... * ratio_selector::ignoring_alignment()) - .ratio; - static_assert(kRatio > 0, "Unexpected zero vector size."); - const int ratio = (ratio_selector(out_ptr) * ... * ratio_selector(in_ptrs)).ratio; - return map_call_rt(ratio, stream, out_ptr, len, f, in_ptrs...); -} - -template , - typename = raft::enable_if_input_device_mdspan> -void map_check_shape(OutType out, InType in) -{ - RAFT_EXPECTS(raft::is_row_or_column_major(in) && out.size() == in.size(), - "All inputs must be contiguous and have the same size as the output"); -} - -/** - * @brief Map a function over a zero or more inputs and optionally a 0-based flat index - * (element offset). - * - * _Performance note_: when possible, this function loads the argument arrays and stores the output - * array using vectorized cuda load/store instructions. The size of the vectorization depends on the - * size of the largest input/output element type and on the alignment of all pointers. - * - * @tparam PassOffset whether to pass an offset as a first argument to Func - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * @tparam InTypes data-types of the inputs (device_mdspan) - * - * @param[in] res raft::device_resources - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda of type - * ([auto offset], InTypes::value_type xs...) -> OutType::value_type - * @param[in] ins the inputs (each of the same size as the output) (device_mdspan) - */ -template , - typename = raft::enable_if_input_device_mdspan> -void map(const raft::device_resources& res, OutType out, Func f, InTypes... ins) + typename IdxType, + typename MapOp, + int TPB, + typename... 
Args> +__global__ void mapKernel(OutType* out, IdxType len, MapOp map, const InType* in, Args... args) { - RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); - (map_check_shape(out, ins), ...); + auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); - if (out.size() <= std::numeric_limits::max()) { - map( - res.get_stream(), out.data_handle(), uint32_t(out.size()), f, ins.data_handle()...); - } else { - map( - res.get_stream(), out.data_handle(), uint64_t(out.size()), f, ins.data_handle()...); - } + if (idx < len) { out[idx] = map(in[idx], args[idx]...); } } -} // namespace raft::linalg::detail +template +void mapImpl( + OutType* out, IdxType len, MapOp map, cudaStream_t stream, const InType* in, Args... args) +{ + const int nblks = raft::ceildiv(len, (IdxType)TPB); + mapKernel + <<>>(out, len, map, in, args...); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +} // namespace detail +} // namespace linalg +}; // namespace raft diff --git a/cpp/include/raft/linalg/detail/strided_reduction.cuh b/cpp/include/raft/linalg/detail/strided_reduction.cuh index 42e79a9285..0e516b4750 100644 --- a/cpp/include/raft/linalg/detail/strided_reduction.cuh +++ b/cpp/include/raft/linalg/detail/strided_reduction.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #pragma once +#include "unary_op.cuh" #include #include #include diff --git a/cpp/include/raft/linalg/detail/ternary_op.cuh b/cpp/include/raft/linalg/detail/ternary_op.cuh new file mode 100644 index 0000000000..7874f20f56 --- /dev/null +++ b/cpp/include/raft/linalg/detail/ternary_op.cuh @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace raft { +namespace linalg { +namespace detail { +template +__global__ void ternaryOpKernel( + out_t* out, const math_t* in1, const math_t* in2, const math_t* in3, IdxType len, Lambda op) +{ + typedef raft::TxN_t VecType; + VecType a, b, c; + IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); + idx *= VecType::Ratio; + if (idx >= len) return; + a.load(in1, idx); + b.load(in2, idx); + c.load(in3, idx); +#pragma unroll + for (int i = 0; i < VecType::Ratio; ++i) { + a.val.data[i] = op(a.val.data[i], b.val.data[i], c.val.data[i]); + } + a.store(out, idx); +} + +template +void ternaryOpImpl(out_t* out, + const math_t* in1, + const math_t* in2, + const math_t* in3, + IdxType len, + Lambda op, + cudaStream_t stream) +{ + const IdxType nblks = raft::ceildiv(veclen_ ? 
len / veclen_ : len, (IdxType)TPB); + ternaryOpKernel + <<>>(out, in1, in2, in3, len, op); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +/** + * @brief perform element-wise ternary operation on the input arrays + * @tparam math_t data-type upon which the math operation will be performed + * @tparam Lambda the device-lambda performing the actual operation + * @tparam IdxType Integer type used to for addressing + * @tparam TPB threads-per-block in the final kernel launched + * @param out the output array + * @param in1 the first input array + * @param in2 the second input array + * @param in3 the third input array + * @param len number of elements in the input array + * @param op the device-lambda + * @param stream cuda stream where to launch work + */ +template +void ternaryOp(out_t* out, + const math_t* in1, + const math_t* in2, + const math_t* in3, + IdxType len, + Lambda op, + cudaStream_t stream) +{ + size_t bytes = len * sizeof(math_t); + if (16 / sizeof(math_t) && bytes % 16 == 0) { + ternaryOpImpl( + out, in1, in2, in3, len, op, stream); + } else if (8 / sizeof(math_t) && bytes % 8 == 0) { + ternaryOpImpl( + out, in1, in2, in3, len, op, stream); + } else if (4 / sizeof(math_t) && bytes % 4 == 0) { + ternaryOpImpl( + out, in1, in2, in3, len, op, stream); + } else if (2 / sizeof(math_t) && bytes % 2 == 0) { + ternaryOpImpl( + out, in1, in2, in3, len, op, stream); + } else if (1 / sizeof(math_t)) { + ternaryOpImpl( + out, in1, in2, in3, len, op, stream); + } else { + ternaryOpImpl(out, in1, in2, in3, len, op, stream); + } +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/detail/unary_op.cuh b/cpp/include/raft/linalg/detail/unary_op.cuh new file mode 100644 index 0000000000..cdadc6f868 --- /dev/null +++ b/cpp/include/raft/linalg/detail/unary_op.cuh @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +__global__ void unaryOpKernel(OutType* out, const InType* in, IdxType len, Lambda op) +{ + typedef TxN_t InVecType; + typedef TxN_t OutVecType; + InVecType a; + OutVecType b; + IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); + idx *= InVecType::Ratio; + if (idx >= len) return; + a.load(in, idx); +#pragma unroll + for (int i = 0; i < InVecType::Ratio; ++i) { + b.val.data[i] = op(a.val.data[i]); + } + b.store(out, idx); +} + +template +void unaryOpImpl(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) +{ + const IdxType nblks = raft::ceildiv(VecLen ? 
len / VecLen : len, (IdxType)TPB); + unaryOpKernel + <<>>(out, in, len, op); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +template +void unaryOpCaller(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) +{ + if (len <= 0) return; // silently skip in case of 0 length input + constexpr auto maxSize = sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType); + size_t bytes = len * maxSize; + uint64_t inAddr = uint64_t(in); + uint64_t outAddr = uint64_t(out); + if (16 / maxSize && bytes % 16 == 0 && inAddr % 16 == 0 && outAddr % 16 == 0) { + unaryOpImpl(out, in, len, op, stream); + } else if (8 / maxSize && bytes % 8 == 0 && inAddr % 8 == 0 && outAddr % 8 == 0) { + unaryOpImpl(out, in, len, op, stream); + } else if (4 / maxSize && bytes % 4 == 0 && inAddr % 4 == 0 && outAddr % 4 == 0) { + unaryOpImpl(out, in, len, op, stream); + } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 && outAddr % 2 == 0) { + unaryOpImpl(out, in, len, op, stream); + } else if (1 / maxSize) { + unaryOpImpl(out, in, len, op, stream); + } else { + unaryOpImpl(out, in, len, op, stream); + } +} + +template +__global__ void writeOnlyUnaryOpKernel(OutType* out, IdxType len, Lambda op) +{ + IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); + if (idx < len) { op(out + idx, idx); } +} + +template +void writeOnlyUnaryOpCaller(OutType* out, IdxType len, Lambda op, cudaStream_t stream) +{ + if (len <= 0) return; // silently skip in case of 0 length input + auto nblks = raft::ceildiv(len, TPB); + writeOnlyUnaryOpKernel<<>>(out, len, op); + RAFT_CUDA_TRY(cudaGetLastError()); +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/init.cuh b/cpp/include/raft/linalg/init.cuh index ad772b3989..5a810bf2ba 100644 --- a/cpp/include/raft/linalg/init.cuh +++ b/cpp/include/raft/linalg/init.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,10 +18,11 @@ #pragma once -#include +#include "detail/init.hpp" #include -namespace raft::linalg { +namespace raft { +namespace linalg { /** * @brief Like Python range. @@ -36,8 +37,7 @@ namespace raft::linalg { template void range(T* out, int start, int end, cudaStream_t stream) { - return detail::map( - stream, out, end - start, compose_op{cast_op{}, add_const_op{start}}); + detail::range(out, start, end, stream); } /** @@ -52,7 +52,7 @@ void range(T* out, int start, int end, cudaStream_t stream) template void range(T* out, int n, cudaStream_t stream) { - return detail::map(stream, out, n, cast_op{}); + detail::range(out, n, stream); } /** @@ -68,6 +68,7 @@ void zero(T* out, int n, cudaStream_t stream) RAFT_CUDA_TRY(cudaMemsetAsync(static_cast(out), 0, n * sizeof(T), stream)); } -} // namespace raft::linalg +} // namespace linalg +} // namespace raft -#endif +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/map.cuh b/cpp/include/raft/linalg/map.cuh index 305a8aa3cc..2b9e6c80a0 100644 --- a/cpp/include/raft/linalg/map.cuh +++ b/cpp/include/raft/linalg/map.cuh @@ -22,267 +22,119 @@ #include #include +#include +#include -namespace raft::linalg { +namespace raft { +namespace linalg { /** - * @defgroup map Mapping ops - * @{ - */ - -/** - * @brief Map a function over zero or more input mdspans of the same size. 
- * - * The algorithm applied on `k` inputs can be described in a following pseudo-code: - * @code - * for (auto i: [0 ... out.size()]) { - * out[i] = f(in_0[i], in_1[i], ..., in_k[i]) - * } - * @endcode - * - * _Performance note_: when possible, this function loads the argument arrays and stores the output - * array using vectorized cuda load/store instructions. The size of the vectorization depends on the - * size of the largest input/output element type and on the alignment of all pointers. - * - * Usage example: - * @code{.cpp} - * #include - * #include - * #include - * #include - * - * auto input = raft::make_device_vector(res, n); - * ... fill input .. - * auto squares = raft::make_device_vector(res, n); - * raft::linalg::map_offset(res, squares.view(), raft::sq_op{}, input.view()); - * @endcode - * - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * @tparam InTypes data-types of the inputs (device_mdspan) - * - * @param[in] res raft::device_resources - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (InTypes::value_type xs...) -> OutType::value_type - * @param[in] ins the inputs (each of the same size as the output) (device_mdspan) + * @brief CUDA version of map + * @tparam InType data-type upon which the math operation will be performed + * @tparam MapOp the device-lambda performing the actual operation + * @tparam TPB threads-per-block in the final kernel launched + * @tparam Args additional parameters + * @tparam OutType data-type in which the result will be stored + * @param out the output of the map operation (assumed to be a device pointer) + * @param len number of elements in the input array + * @param map the device-lambda + * @param stream cuda-stream where to launch this kernel + * @param in the input array + * @param args additional input arrays */ -template , - typename = raft::enable_if_input_device_mdspan> -void map(const raft::device_resources& res, OutType out, Func f, InTypes... ins) +template +void map_k( + OutType* out, IdxType len, MapOp map, cudaStream_t stream, const InType* in, Args... args) { - return detail::map(res, out, f, ins...); + detail::mapImpl( + out, len, map, stream, in, args...); } /** - * @brief Map a function over one mdspan. - * - * @tparam InType1 data-type of the input (device_mdspan) - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * - * @param[in] res raft::device_resources - * @param[in] in1 the input (the same size as the output) (device_mdspan) - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (InType1::value_type x) -> OutType::value_type + * @defgroup map Mapping ops + * @{ */ -template , - typename = raft::enable_if_input_device_mdspan> -void map(const raft::device_resources& res, InType1 in1, OutType out, Func f) -{ - return detail::map(res, out, f, in1); -} /** - * @brief Map a function over two mdspans. 
- * - * @tparam InType1 data-type of the input (device_mdspan) - * @tparam InType2 data-type of the input (device_mdspan) - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * - * @param[in] res raft::device_resources - * @param[in] in1 the input (the same size as the output) (device_mdspan) - * @param[in] in2 the input (the same size as the output) (device_mdspan) - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (InType1::value_type x1, InType2::value_type x2) -> OutType::value_type + * @brief CUDA version of map + * @tparam InType data-type for math operation of type raft::device_mdspan + * @tparam MapOp the device-lambda performing the actual operation + * @tparam TPB threads-per-block in the final kernel launched + * @tparam OutType data-type of result of type raft::device_mdspan + * @tparam Args additional parameters + * @param[in] handle raft::device_resources + * @param[in] in the input of type raft::device_mdspan + * @param[out] out the output of the map operation of type raft::device_mdspan + * @param[in] map the device-lambda + * @param[in] args additional input arrays */ -template , - typename = raft::enable_if_input_device_mdspan> -void map(const raft::device_resources& res, InType1 in1, InType2 in2, OutType out, Func f) + int TPB = 256, + typename... Args, + typename = raft::enable_if_input_device_mdspan, + typename = raft::enable_if_output_device_mdspan> +void map(raft::device_resources const& handle, InType in, OutType out, MapOp map, Args... args) { - return detail::map(res, out, f, in1, in2); + using in_value_t = typename InType::value_type; + using out_value_t = typename OutType::value_type; + + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output is not exhaustive"); + RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input is not exhaustive"); + RAFT_EXPECTS(out.size() == in.size(), "Size mismatch between Input and Output"); + + if (out.size() <= std::numeric_limits::max()) { + map_k( + out.data_handle(), out.size(), map, handle.get_stream(), in.data_handle(), args...); + } else { + map_k( + out.data_handle(), out.size(), map, handle.get_stream(), in.data_handle(), args...); + } } /** - * @brief Map a function over three mdspans. - * - * @tparam InType1 data-type of the input 1 (device_mdspan) - * @tparam InType2 data-type of the input 2 (device_mdspan) - * @tparam InType3 data-type of the input 3 (device_mdspan) - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * - * @param[in] res raft::device_resources - * @param[in] in1 the input 1 (the same size as the output) (device_mdspan) - * @param[in] in2 the input 2 (the same size as the output) (device_mdspan) - * @param[in] in3 the input 3 (the same size as the output) (device_mdspan) - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (InType1::value_type x1, InType2::value_type x2, InType3::value_type x3) -> OutType::value_type - */ -template , - typename = raft::enable_if_input_device_mdspan> -void map( - const raft::device_resources& res, InType1 in1, InType2 in2, InType3 in3, OutType out, Func f) -{ - return detail::map(res, out, f, in1, in2, in3); -} - -/** - * @brief Map a function over zero-based flat index (element offset) and zero or more inputs. 
- * - * The algorithm applied on `k` inputs can be described in a following pseudo-code: - * @code - * for (auto i: [0 ... out.size()]) { - * out[i] = f(i, in_0[i], in_1[i], ..., in_k[i]) - * } - * @endcode - * - * _Performance note_: when possible, this function loads the argument arrays and stores the output - * array using vectorized cuda load/store instructions. The size of the vectorization depends on the - * size of the largest input/output element type and on the alignment of all pointers. + * @brief Perform an element-wise unary operation on the input offset into the output array * * Usage example: * @code{.cpp} * #include - * #include + * #include * #include * #include - * + * ... + * raft::handle_t handle; * auto squares = raft::make_device_vector(handle, n); - * raft::linalg::map_offset(res, squares.view(), raft::sq_op{}); + * raft::linalg::map_offset(handle, squares.view(), raft::sq_op()); * @endcode * - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * @tparam InTypes data-types of the inputs (device_mdspan) - * - * @param[in] res raft::device_resources - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (auto offset, InTypes::value_type xs...) -> OutType::value_type - * @param[in] ins the inputs (each of the same size as the output) (device_mdspan) + * @tparam OutType Output mdspan type + * @tparam MapOp The unary operation type with signature `OutT func(const IdxT& idx);` + * @param[in] handle The raft handle + * @param[out] out Output array + * @param[in] op The unary operation */ template , - typename = raft::enable_if_input_device_mdspan> -void map_offset(const raft::device_resources& res, OutType out, Func f, InTypes... ins) + typename MapOp, + typename = raft::enable_if_output_device_mdspan> +void map_offset(const raft::device_resources& handle, OutType out, MapOp op) { - return detail::map(res, out, f, ins...); -} + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); -/** - * @brief Map a function over zero-based flat index (element offset) and one mdspan. - * - * @tparam InType1 data-type of the input (device_mdspan) - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * - * @param[in] res raft::device_resources - * @param[in] in1 the input (the same size as the output) (device_mdspan) - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (auto offset, InType1::value_type x) -> OutType::value_type - */ -template , - typename = raft::enable_if_input_device_mdspan> -void map_offset(const raft::device_resources& res, InType1 in1, OutType out, Func f) -{ - return detail::map(res, out, f, in1); -} + using out_value_t = typename OutType::value_type; -/** - * @brief Map a function over zero-based flat index (element offset) and two mdspans. 
- * - * @tparam InType1 data-type of the input (device_mdspan) - * @tparam InType2 data-type of the input (device_mdspan) - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * - * @param[in] res raft::device_resources - * @param[in] in1 the input (the same size as the output) (device_mdspan) - * @param[in] in2 the input (the same size as the output) (device_mdspan) - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (auto offset, InType1::value_type x1, InType2::value_type x2) -> OutType::value_type - */ -template , - typename = raft::enable_if_input_device_mdspan> -void map_offset(const raft::device_resources& res, InType1 in1, InType2 in2, OutType out, Func f) -{ - return detail::map(res, out, f, in1, in2); -} - -/** - * @brief Map a function over zero-based flat index (element offset) and three mdspans. - * - * @tparam InType1 data-type of the input 1 (device_mdspan) - * @tparam InType2 data-type of the input 2 (device_mdspan) - * @tparam InType3 data-type of the input 3 (device_mdspan) - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * - * @param[in] res raft::device_resources - * @param[in] in1 the input 1 (the same size as the output) (device_mdspan) - * @param[in] in2 the input 2 (the same size as the output) (device_mdspan) - * @param[in] in3 the input 3 (the same size as the output) (device_mdspan) - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (auto offset, InType1::value_type x1, InType2::value_type x2, InType3::value_type x3) - * -> OutType::value_type - */ -template , - typename = raft::enable_if_input_device_mdspan> -void map_offset( - const raft::device_resources& res, InType1 in1, InType2 in2, InType3 in3, OutType out, Func f) -{ - return detail::map(res, out, f, in1, in2, in3); + thrust::tabulate( + handle.get_thrust_policy(), out.data_handle(), out.data_handle() + out.size(), op); } /** @} */ // end of map -} // namespace raft::linalg +} // namespace linalg +}; // namespace raft #endif diff --git a/cpp/include/raft/linalg/ternary_op.cuh b/cpp/include/raft/linalg/ternary_op.cuh index 1e347d69be..aa3859bc23 100644 --- a/cpp/include/raft/linalg/ternary_op.cuh +++ b/cpp/include/raft/linalg/ternary_op.cuh @@ -19,9 +19,11 @@ #pragma once +#include "detail/ternary_op.cuh" + #include #include -#include +#include namespace raft { namespace linalg { @@ -48,7 +50,7 @@ void ternaryOp(out_t* out, Lambda op, cudaStream_t stream) { - return detail::map(stream, out, len, op, in1, in2, in3); + detail::ternaryOp(out, in1, in2, in3, len, op, stream); } /** @@ -78,7 +80,33 @@ template ::max()) { + ternaryOp(out.data_handle(), + in1.data_handle(), + in2.data_handle(), + in3.data_handle(), + out.size(), + op, + handle.get_stream()); + } else { + ternaryOp(out.data_handle(), + in1.data_handle(), + in2.data_handle(), + in3.data_handle(), + out.size(), + op, + handle.get_stream()); + } } /** @} */ // end of group ternary_op @@ -86,4 +114,4 @@ void ternary_op( }; // end namespace linalg }; // end namespace raft -#endif +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/unary_op.cuh b/cpp/include/raft/linalg/unary_op.cuh index 23f932d2f2..ce102adfd2 100644 --- a/cpp/include/raft/linalg/unary_op.cuh +++ b/cpp/include/raft/linalg/unary_op.cuh @@ -18,9 +18,11 @@ #pragma once +#include "detail/unary_op.cuh" + #include 
#include -#include +#include namespace raft { namespace linalg { @@ -46,7 +48,7 @@ template void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) { - return detail::map(stream, out, len, op, in); + detail::unaryOpCaller(out, in, len, op, stream); } /** @@ -69,11 +71,7 @@ void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_ template void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream) { - return detail::map(stream, out, len, [op] __device__(IdxType offset) { - OutType r; - op(&r, offset); - return r; - }); + detail::writeOnlyUnaryOpCaller(out, len, op, stream); } /** @@ -99,7 +97,20 @@ template > void unary_op(raft::device_resources const& handle, InType in, OutType out, Lambda op) { - return map(handle, in, out, op); + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input must be contiguous"); + RAFT_EXPECTS(out.size() == in.size(), "Size mismatch between Output and Input"); + + using in_value_t = typename InType::value_type; + using out_value_t = typename OutType::value_type; + + if (out.size() <= std::numeric_limits::max()) { + unaryOp( + out.data_handle(), in.data_handle(), out.size(), op, handle.get_stream()); + } else { + unaryOp( + out.data_handle(), in.data_handle(), out.size(), op, handle.get_stream()); + } } /** @@ -119,7 +130,17 @@ template > void write_only_unary_op(const raft::device_resources& handle, OutType out, Lambda op) { - return writeOnlyUnaryOp(out.data_handle(), out.size(), op, handle.get_stream()); + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + + using out_value_t = typename OutType::value_type; + + if (out.size() <= std::numeric_limits::max()) { + writeOnlyUnaryOp( + out.data_handle(), out.size(), op, handle.get_stream()); + } else { + writeOnlyUnaryOp( + out.data_handle(), out.size(), op, handle.get_stream()); + } } /** @} */ // end of group unary_op diff --git a/cpp/include/raft/matrix/init.cuh b/cpp/include/raft/matrix/init.cuh index ed2fb4d209..f597bbd1c6 100644 --- a/cpp/include/raft/matrix/init.cuh +++ b/cpp/include/raft/matrix/init.cuh @@ -18,7 +18,6 @@ #include #include -#include #include #include @@ -66,7 +65,9 @@ void fill(raft::device_resources const& handle, raft::device_mdspan inout, math_t scalar) { - linalg::map(handle, inout, raft::const_op{scalar}); + RAFT_EXPECTS(raft::is_row_or_column_major(inout), "Data layout not supported"); + detail::setValue( + inout.data_handle(), inout.data_handle(), scalar, inout.size(), handle.get_stream()); } /** @} */ // end of group matrix_init diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh index 4b6e6f5e31..4c6ef0e30b 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh @@ -42,11 +42,11 @@ #include #include +#include +#include #include -#include - namespace raft::neighbors::ivf_pq::detail { /** @@ -1311,13 +1311,13 @@ void ivfpq_search_worker(raft::device_resources const& handle, // of a cluster by processing the cluster at the same time as much as // possible. 
index_list_sorted_buf.resize(n_queries * n_probes, stream); - auto index_list_buf = - make_device_mdarray(handle, mr, make_extents(n_queries * n_probes)); + rmm::device_uvector index_list_buf(n_queries * n_probes, stream, mr); rmm::device_uvector cluster_labels_out(n_queries * n_probes, stream, mr); - auto index_list = index_list_buf.data_handle(); + auto index_list = index_list_buf.data(); index_list_sorted = index_list_sorted_buf.data(); - - linalg::map_offset(handle, index_list_buf.view(), identity_op{}); + thrust::sequence(handle.get_thrust_policy(), + thrust::device_pointer_cast(index_list), + thrust::device_pointer_cast(index_list + n_queries * n_probes)); int begin_bit = 0; int end_bit = sizeof(uint32_t) * 8; @@ -1376,15 +1376,13 @@ void ivfpq_search_worker(raft::device_resources const& handle, topK); rmm::device_uvector device_lut(search_instance.device_lut_size, stream, mr); - std::optional> query_kths_buf{std::nullopt}; - float* query_kths = nullptr; + rmm::device_uvector query_kths(0, stream, mr); if (manage_local_topk) { - query_kths_buf.emplace( - make_device_mdarray(handle, mr, make_extents(n_queries))); - linalg::map(handle, - query_kths_buf->view(), - raft::const_op{dummy_block_sort_t::queue_t::kDummy}); - query_kths = query_kths_buf->data_handle(); + query_kths.resize(n_queries, stream); + thrust::fill_n(handle.get_thrust_policy(), + query_kths.data(), + n_queries, + float(dummy_block_sort_t::queue_t::kDummy)); } search_instance(stream, index.size(), @@ -1403,7 +1401,7 @@ void ivfpq_search_worker(raft::device_resources const& handle, chunk_index.data(), query, index_list_sorted, - query_kths, + query_kths.data(), device_lut.data(), distances_buf.data(), neighbors_ptr); diff --git a/cpp/test/linalg/map.cu b/cpp/test/linalg/map.cu index 15b40808ee..5b52374789 100644 --- a/cpp/test/linalg/map.cu +++ b/cpp/test/linalg/map.cu @@ -37,15 +37,13 @@ void mapLaunch(OutType* out, raft::device_resources handle{stream}; auto out_view = raft::make_device_vector_view(out, len); auto in1_view = raft::make_device_vector_view(in1, len); - auto in2_view = raft::make_device_vector_view(in2, len); - auto in3_view = raft::make_device_vector_view(in3, len); map( handle, + in1_view, out_view, [=] __device__(InType a, InType b, InType c) { return a + b + c + scalar; }, - in1_view, - in2_view, - in3_view); + in2, + in3); } template
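
For reference, a minimal usage sketch (not part of the patch itself) contrasting the call patterns involved in this revert. The resource handle, the length n, and the vector names below are hypothetical; raft::sq_op is the element-wise square functor from raft/core/operators.hpp, and the file is assumed to be compiled as CUDA (.cu).

#include <raft/core/device_mdarray.hpp>
#include <raft/core/device_resources.hpp>
#include <raft/core/operators.hpp>
#include <raft/linalg/map.cuh>

void map_usage_sketch(raft::device_resources const& handle)
{
  int n    = 1024;                                             // hypothetical length
  auto in  = raft::make_device_vector<float, int>(handle, n);  // hypothetical input
  auto out = raft::make_device_vector<float, int>(handle, n);  // hypothetical output

  // Restored pointer-based API: output pointer, length, functor, stream, then input pointer(s).
  raft::linalg::map_k(out.data_handle(), n, raft::sq_op{}, handle.get_stream(), in.data_handle());

  // Restored mdspan overload: input mdspan first, then output, then the functor.
  raft::linalg::map(handle, in.view(), out.view(), raft::sq_op{});

  // The generic API introduced in #1329 (removed by this revert) instead took the output before
  // the functor and the inputs last, e.g.
  //   raft::linalg::map(handle, out.view(), raft::sq_op{}, in.view());
}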