rapidsai · rapids-bot · Mar 14, 2023 · Mar 13, 2023
@@ -18,9 +18,12 @@
 
 #pragma once
 
+#include "detail/binary_op.cuh"
+
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/device_resources.hpp>
-#include <raft/linalg/map.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/input_validation.hpp>
 
 namespace raft {
 namespace linalg {
@@ -49,7 +52,7 @@ template <typename InType,
 void binaryOp(
   OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream)
 {
-  return detail::map<false>(stream, out, len, op, in1, in2);
+  detail::binaryOp(out, in1, in2, len, op, stream);
 }
 
 /**
@@ -77,12 +80,27 @@ template <typename InType,
           typename = raft::enable_if_output_device_mdspan<OutType>>
 void binary_op(raft::device_resources const& handle, InType in1, InType in2, OutType out, Lambda op)
 {
-  return map(handle, in1, in2, out, op);
+  RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous");
+  RAFT_EXPECTS(raft::is_row_or_column_major(in1), "Input 1 must be contiguous");
+  RAFT_EXPECTS(raft::is_row_or_column_major(in2), "Input 2 must be contiguous");
+  RAFT_EXPECTS(out.size() == in1.size() && in1.size() == in2.size(),
+               "Size mismatch between Output and Inputs");
+
+  using in_value_t  = typename InType::value_type;
+  using out_value_t = typename OutType::value_type;
+
+  if (out.size() <= std::numeric_limits<std::uint32_t>::max()) {
+    binaryOp<in_value_t, Lambda, out_value_t, std::uint32_t>(
+      out.data_handle(), in1.data_handle(), in2.data_handle(), out.size(), op, handle.get_stream());
+  } else {
+    binaryOp<in_value_t, Lambda, out_value_t, std::uint64_t>(
+      out.data_handle(), in1.data_handle(), in2.data_handle(), out.size(), op, handle.get_stream());
+  }
 }
 
 /** @} */  // end of group binary_op
 
 };  // end namespace linalg
 };  // end namespace raft
 
-#endif
+#endif
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/util/vectorized.cuh>
+
+namespace raft {
+namespace linalg {
+namespace detail {
+
+/**
+ * @brief Element-wise binary operation kernel using vectorized loads/stores.
+ *
+ * Each thread computes a starting offset of (global thread id) * InVecType::Ratio,
+ * loads one TxN_t vector from each input at that offset, applies `op` to the
+ * corresponding lanes and stores the resulting vector to `out` at the same offset.
+ * Threads whose offset is >= len do nothing.
+ *
+ * NOTE(review): only the starting offset is bounds-checked, so `len` is presumably
+ * expected to be a multiple of InVecType::Ratio — confirm against the launcher.
+ *
+ * @param[out] out output array
+ * @param[in]  in1 first input array
+ * @param[in]  in2 second input array
+ * @param[in]  len number of elements
+ * @param[in]  op  callable invoked as op(in1[i], in2[i])
+ */
+template <typename InType, int VecLen, typename Lambda, typename IdxType, typename OutType>
+__global__ void binaryOpKernel(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op)
+{
+  using InVecType  = TxN_t<InType, VecLen>;
+  using OutVecType = TxN_t<OutType, VecLen>;
+
+  // Global thread id, scaled so that consecutive threads own consecutive vectors.
+  IdxType offset = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x);
+  offset *= InVecType::Ratio;
+  if (offset >= len) { return; }
+
+  InVecType lhs, rhs;
+  lhs.load(in1, offset);
+  rhs.load(in2, offset);
+
+  OutVecType result;
+#pragma unroll
+  for (int lane = 0; lane < InVecType::Ratio; ++lane) {
+    result.val.data[lane] = op(lhs.val.data[lane], rhs.val.data[lane]);
+  }
+  result.store(out, offset);
+}
+
+/**
+ * @brief Launches binaryOpKernel for a fixed vector length.
+ *
+ * The grid is sized so that there is one thread per vector of VecLen elements
+ * (or one thread per element when VecLen is 0), with TPB threads per block.
+ * The launch is asynchronous on `stream`; only launch errors are checked here.
+ *
+ * @param[out] out    output array
+ * @param[in]  in1    first input array
+ * @param[in]  in2    second input array
+ * @param[in]  len    number of elements; an empty input is a no-op
+ * @param[in]  op     callable invoked as op(in1[i], in2[i])
+ * @param[in]  stream CUDA stream to launch on
+ */
+template <typename InType, int VecLen, typename Lambda, typename IdxType, typename OutType, int TPB>
+void binaryOpImpl(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream)
+{
+  // A zero-length input would yield a grid of 0 blocks, which CUDA rejects as an
+  // invalid launch configuration. There is nothing to compute, so skip the launch.
+  if (len == 0) { return; }
+
+  // VecLen == 0 means "no vectorization": one thread per element.
+  const IdxType nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB);
+  binaryOpKernel<InType, VecLen, Lambda, IdxType, OutType>
+    <<<nblks, TPB, 0, stream>>>(out, in1, in2, len, op);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
+/**
+ * @brief Tells whether all three addresses are multiples of N bytes.
+ *
+ * @param addr1 first address, as an integer
+ * @param addr2 second address, as an integer
+ * @param addr3 third address, as an integer
+ * @param N     required alignment in bytes (must be non-zero)
+ * @return true only if every address is divisible by N
+ */
+inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, uint64_t N)
+{
+  if (addr1 % N != 0) { return false; }
+  if (addr2 % N != 0) { return false; }
+  return addr3 % N == 0;
+}
+
+/**
+ * @brief Element-wise binary operation, dispatching on the widest usable vector width.
+ *
+ * Tries 16, 8, 4 and 2 byte wide accesses in that order. A width is used when it
+ * holds at least one element of the larger of InType/OutType, the total byte count
+ * (len times that element size) is a multiple of the width, and `out`, `in1` and
+ * `in2` are all aligned to it. Otherwise the operation falls back to a vector
+ * length of 1.
+ *
+ * @tparam InType  input element type
+ * @tparam Lambda  type of the binary callable
+ * @tparam OutType output element type
+ * @tparam IdxType integer type used for lengths and indexing
+ * @tparam TPB     threads per block
+ *
+ * @param[out] out    output array
+ * @param[in]  in1    first input array
+ * @param[in]  in2    second input array
+ * @param[in]  len    number of elements
+ * @param[in]  op     callable invoked as op(in1[i], in2[i])
+ * @param[in]  stream CUDA stream to launch on
+ */
+template <typename InType,
+          typename Lambda,
+          typename OutType = InType,
+          typename IdxType = int,
+          int TPB          = 256>
+void binaryOp(
+  OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream)
+{
+  // Size of the larger of the two element types drives the vector length.
+  constexpr auto elemBytes = sizeof(InType) > sizeof(OutType) ? sizeof(InType) : sizeof(OutType);
+  const size_t totalBytes  = len * elemBytes;
+  const uint64_t lhsAddr   = uint64_t(in1);
+  const uint64_t rhsAddr   = uint64_t(in2);
+  const uint64_t dstAddr   = uint64_t(out);
+
+  // True when the data size is a multiple of `width` bytes and every pointer is
+  // aligned to `width` bytes.
+  auto usable = [&](uint64_t width) {
+    return totalBytes % width == 0 && addressAligned(lhsAddr, rhsAddr, dstAddr, width);
+  };
+
+  if (16 / elemBytes && usable(16)) {
+    binaryOpImpl<InType, 16 / elemBytes, Lambda, IdxType, OutType, TPB>(
+      out, in1, in2, len, op, stream);
+    return;
+  }
+  if (8 / elemBytes && usable(8)) {
+    binaryOpImpl<InType, 8 / elemBytes, Lambda, IdxType, OutType, TPB>(
+      out, in1, in2, len, op, stream);
+    return;
+  }
+  if (4 / elemBytes && usable(4)) {
+    binaryOpImpl<InType, 4 / elemBytes, Lambda, IdxType, OutType, TPB>(
+      out, in1, in2, len, op, stream);
+    return;
+  }
+  if (2 / elemBytes && usable(2)) {
+    binaryOpImpl<InType, 2 / elemBytes, Lambda, IdxType, OutType, TPB>(
+      out, in1, in2, len, op, stream);
+    return;
+  }
+  if (1 / elemBytes) {
+    binaryOpImpl<InType, 1 / elemBytes, Lambda, IdxType, OutType, TPB>(
+      out, in1, in2, len, op, stream);
+    return;
+  }
+  // No vector width applies: process with a vector length of 1.
+  binaryOpImpl<InType, 1, Lambda, IdxType, OutType, TPB>(out, in1, in2, len, op, stream);
+}
+
+}  // namespace detail
+}  // namespace linalg
+}  // namespace raft
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <rmm/exec_policy.hpp>
+#include <thrust/copy.h>
+#include <thrust/device_ptr.h>
+#include <thrust/iterator/counting_iterator.h>
+
+namespace raft {
+namespace linalg {
+namespace detail {
+
+/**
+ * @brief Fills the output with the integer sequence start, start + 1, ..., end - 1.
+ *
+ * The sequence is produced by an int counting iterator and copied with
+ * thrust::copy using the rmm execution policy for `stream`; `out` is wrapped
+ * as a thrust::device_ptr.
+ *
+ * \param [out] out destination array, receives (end - start) values
+ * \param [in] start first value of the sequence
+ * \param [in] end one past the last value of the sequence
+ * \param [in] stream cuda stream
+ */
+template <typename T>
+void range(T* out, int start, int end, cudaStream_t stream)
+{
+  const auto seq_begin = thrust::make_counting_iterator<int>(start);
+  const auto seq_end   = seq_begin + (end - start);
+  thrust::copy(rmm::exec_policy(stream), seq_begin, seq_end, thrust::device_pointer_cast(out));
+}
+
+/**
+ * @brief Like Python range: fills the output as out[i] = i.
+ *
+ * Forwards to the four-argument range overload with start = 0 and end = n.
+ * The TPB template parameter is not used by this implementation.
+ *
+ * \param [out] out device array, size [n]
+ * \param [in] n length of the array
+ * \param [in] stream cuda stream
+ */
+template <typename T, int TPB = 256>
+void range(T* out, int n, cudaStream_t stream)
+{
+  constexpr int kStart = 0;
+  range<T>(out, kStart, n, stream);
+}
+
+}  // namespace detail
+}  // namespace linalg
+}  // namespace raft