diff --git a/cpp/include/raft/linalg/binary_op.cuh b/cpp/include/raft/linalg/binary_op.cuh index ed083a1590..966e84965d 100644 --- a/cpp/include/raft/linalg/binary_op.cuh +++ b/cpp/include/raft/linalg/binary_op.cuh @@ -18,9 +18,12 @@ #pragma once +#include "detail/binary_op.cuh" + #include #include -#include +#include +#include namespace raft { namespace linalg { @@ -49,7 +52,7 @@ template (stream, out, len, op, in1, in2); + detail::binaryOp(out, in1, in2, len, op, stream); } /** @@ -77,7 +80,22 @@ template > void binary_op(raft::device_resources const& handle, InType in1, InType in2, OutType out, Lambda op) { - return map(handle, in1, in2, out, op); + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(in1), "Input 1 must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(in2), "Input 2 must be contiguous"); + RAFT_EXPECTS(out.size() == in1.size() && in1.size() == in2.size(), + "Size mismatch between Output and Inputs"); + + using in_value_t = typename InType::value_type; + using out_value_t = typename OutType::value_type; + + if (out.size() <= std::numeric_limits::max()) { + binaryOp( + out.data_handle(), in1.data_handle(), in2.data_handle(), out.size(), op, handle.get_stream()); + } else { + binaryOp( + out.data_handle(), in1.data_handle(), in2.data_handle(), out.size(), op, handle.get_stream()); + } } /** @} */ // end of group binary_op @@ -85,4 +103,4 @@ void binary_op(raft::device_resources const& handle, InType in1, InType in2, Out }; // end namespace linalg }; // end namespace raft -#endif +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/detail/binary_op.cuh b/cpp/include/raft/linalg/detail/binary_op.cuh new file mode 100644 index 0000000000..d073e164fd --- /dev/null +++ b/cpp/include/raft/linalg/detail/binary_op.cuh @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +__global__ void binaryOpKernel( + OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op) +{ + typedef TxN_t InVecType; + typedef TxN_t OutVecType; + InVecType a, b; + OutVecType c; + IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); + idx *= InVecType::Ratio; + if (idx >= len) return; + a.load(in1, idx); + b.load(in2, idx); +#pragma unroll + for (int i = 0; i < InVecType::Ratio; ++i) { + c.val.data[i] = op(a.val.data[i], b.val.data[i]); + } + c.store(out, idx); +} + +template +void binaryOpImpl( + OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream) +{ + const IdxType nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB); + binaryOpKernel + <<>>(out, in1, in2, len, op); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +/** + * @brief Checks if addresses are aligned on N bytes + */ +inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, uint64_t N) +{ + return addr1 % N == 0 && addr2 % N == 0 && addr3 % N == 0; +} + +template +void binaryOp( + OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream) +{ + constexpr auto maxSize = sizeof(InType) > sizeof(OutType) ? sizeof(InType) : sizeof(OutType); + size_t bytes = len * maxSize; + uint64_t in1Addr = uint64_t(in1); + uint64_t in2Addr = uint64_t(in2); + uint64_t outAddr = uint64_t(out); + if (16 / maxSize && bytes % 16 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 16)) { + binaryOpImpl( + out, in1, in2, len, op, stream); + } else if (8 / maxSize && bytes % 8 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 8)) { + binaryOpImpl( + out, in1, in2, len, op, stream); + } else if (4 / maxSize && bytes % 4 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 4)) { + binaryOpImpl( + out, in1, in2, len, op, stream); + } else if (2 / maxSize && bytes % 2 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 2)) { + binaryOpImpl( + out, in1, in2, len, op, stream); + } else if (1 / maxSize) { + binaryOpImpl( + out, in1, in2, len, op, stream); + } else { + binaryOpImpl(out, in1, in2, len, op, stream); + } +} + +} // namespace detail +} // namespace linalg +} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/detail/init.hpp b/cpp/include/raft/linalg/detail/init.hpp new file mode 100644 index 0000000000..4718a2cb0e --- /dev/null +++ b/cpp/include/raft/linalg/detail/init.hpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +void range(T* out, int start, int end, cudaStream_t stream) +{ + thrust::counting_iterator first(start); + thrust::counting_iterator last = first + (end - start); + thrust::device_ptr ptr(out); + thrust::copy(rmm::exec_policy(stream), first, last, ptr); +} + +/** + * @brief Like Python range. + * + * Fills the output as out[i] = i. + * + * \param [out] out device array, size [n] + * \param [in] n length of the array + * \param [in] stream cuda stream + */ +template +void range(T* out, int n, cudaStream_t stream) +{ + range(out, 0, n, stream); +} + +} // namespace detail +} // namespace linalg +} // namespace raft diff --git a/cpp/include/raft/linalg/detail/map.cuh b/cpp/include/raft/linalg/detail/map.cuh index 90b653b711..e0b473bdd4 100644 --- a/cpp/include/raft/linalg/detail/map.cuh +++ b/cpp/include/raft/linalg/detail/map.cuh @@ -16,207 +16,43 @@ #pragma once -#include +#include #include #include -#include -#include -#include #include -#include +namespace raft { +namespace linalg { +namespace detail { -namespace raft::linalg::detail { - -template -__device__ __forceinline__ auto map_apply(Func f, const IdxT& offset, const InTs&... ins) -> OutT -{ - if constexpr (PassOffset) { - return f(offset, ins...); - } else { - return f(ins...); - } -} - -template -__device__ __forceinline__ void map_kernel_mainloop( - OutT* out_ptr, IdxT offset, IdxT len, Func f, const InTs*... in_ptrs, std::index_sequence) -{ - TxN_t wide; - thrust::tuple...> wide_args; - if (offset + R <= len) { - (thrust::get(wide_args).load(in_ptrs, offset), ...); -#pragma unroll - for (int j = 0; j < R; ++j) { - wide.val.data[j] = map_apply( - f, offset + j, thrust::get(wide_args).val.data[j]...); - } - wide.store(out_ptr, offset); - } -} - -template -__global__ void map_kernel(OutT* out_ptr, IdxT len, Func f, const InTs*... in_ptrs) -{ - const IdxT tid = blockIdx.x * blockDim.x + threadIdx.x; - if constexpr (R <= 1) { - if (tid < len) { - out_ptr[tid] = map_apply(f, tid, in_ptrs[tid]...); - } - } else { - using align_bytes = Pow2; - using align_elems = Pow2; - - // how many elements to skip in order to do aligned vectorized store - const IdxT skip_cnt_left = std::min(IdxT(align_bytes::roundUp(out_ptr) - out_ptr), len); - - // The main loop: process all aligned data - map_kernel_mainloop( - out_ptr, tid * R + skip_cnt_left, len, f, in_ptrs..., std::index_sequence_for()); - - static_assert(WarpSize >= R); - // Processes the skipped elements on the left - if (tid < skip_cnt_left) { - out_ptr[tid] = map_apply(f, tid, in_ptrs[tid]...); - } - // Processes the skipped elements on the right - const IdxT skip_cnt_right = align_elems::mod(len - skip_cnt_left); - const IdxT remain_i = len - skip_cnt_right + tid; - if (remain_i < len) { - out_ptr[remain_i] = - map_apply(f, remain_i, in_ptrs[remain_i]...); - } - } -} - -template -void map_call(rmm::cuda_stream_view stream, OutT* out_ptr, IdxT len, Func f, const InTs*... in_ptrs) -{ - const IdxT len_vectorized = raft::div_rounding_up_safe(len, R); - const int threads = - std::max(WarpSize, std::min(raft::bound_by_power_of_two(len_vectorized), 256)); - const IdxT blocks = raft::div_rounding_up_unsafe(len_vectorized, threads); - map_kernel<<>>(out_ptr, len, f, in_ptrs...); -} - -constexpr int kCoalescedVectorSize = 16; -constexpr int kSmallInputThreshold = 1024; - -struct ratio_selector { - int ratio; - int align; - constexpr inline ratio_selector(int r, int a) : ratio(r), align(a) {} - - template - constexpr static auto ignoring_alignment() -> ratio_selector - { - return ratio_selector{raft::div_rounding_up_safe(kCoalescedVectorSize, sizeof(T)), 0}; - } - - template - explicit ratio_selector(const T* ptr) - { - constexpr auto s = ignoring_alignment(); // NOLINT - align = int(Pow2::roundUp(ptr) - ptr); - ratio = int(s.ratio); - } -}; - -constexpr inline auto operator*(const ratio_selector& a, const ratio_selector& b) -> ratio_selector -{ - auto ratio = std::min(a.ratio, b.ratio); - while ((a.align % ratio) != (b.align % ratio)) { - ratio >>= 1; - } - return ratio_selector{ratio, a.align % ratio}; -} - -template -void map_call_rt( - int r, rmm::cuda_stream_view stream, OutT* out_ptr, IdxT len, Func f, const InTs*... in_ptrs) -{ - if (r >= R) { return map_call(stream, out_ptr, len, f, in_ptrs...); } - if constexpr (R > 1) { - return map_call_rt<(R >> 1), PassOffset>(r, stream, out_ptr, len, f, in_ptrs...); - } -} - -template -void map(rmm::cuda_stream_view stream, OutT* out_ptr, IdxT len, Func f, const InTs*... in_ptrs) -{ - // don't vectorize on small inputs - if (len <= kSmallInputThreshold) { - return map_call<1, PassOffset>(stream, out_ptr, len, f, in_ptrs...); - } - constexpr int kRatio = - (ratio_selector::ignoring_alignment() * ... * ratio_selector::ignoring_alignment()) - .ratio; - static_assert(kRatio > 0, "Unexpected zero vector size."); - const int ratio = (ratio_selector(out_ptr) * ... * ratio_selector(in_ptrs)).ratio; - return map_call_rt(ratio, stream, out_ptr, len, f, in_ptrs...); -} - -template , - typename = raft::enable_if_input_device_mdspan> -void map_check_shape(OutType out, InType in) -{ - RAFT_EXPECTS(raft::is_row_or_column_major(in) && out.size() == in.size(), - "All inputs must be contiguous and have the same size as the output"); -} - -/** - * @brief Map a function over a zero or more inputs and optionally a 0-based flat index - * (element offset). - * - * _Performance note_: when possible, this function loads the argument arrays and stores the output - * array using vectorized cuda load/store instructions. The size of the vectorization depends on the - * size of the largest input/output element type and on the alignment of all pointers. - * - * @tparam PassOffset whether to pass an offset as a first argument to Func - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * @tparam InTypes data-types of the inputs (device_mdspan) - * - * @param[in] res raft::device_resources - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda of type - * ([auto offset], InTypes::value_type xs...) -> OutType::value_type - * @param[in] ins the inputs (each of the same size as the output) (device_mdspan) - */ -template , - typename = raft::enable_if_input_device_mdspan> -void map(const raft::device_resources& res, OutType out, Func f, InTypes... ins) + typename IdxType, + typename MapOp, + int TPB, + typename... Args> +__global__ void mapKernel(OutType* out, IdxType len, MapOp map, const InType* in, Args... args) { - RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); - (map_check_shape(out, ins), ...); + auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); - if (out.size() <= std::numeric_limits::max()) { - map( - res.get_stream(), out.data_handle(), uint32_t(out.size()), f, ins.data_handle()...); - } else { - map( - res.get_stream(), out.data_handle(), uint64_t(out.size()), f, ins.data_handle()...); - } + if (idx < len) { out[idx] = map(in[idx], args[idx]...); } } -} // namespace raft::linalg::detail +template +void mapImpl( + OutType* out, IdxType len, MapOp map, cudaStream_t stream, const InType* in, Args... args) +{ + const int nblks = raft::ceildiv(len, (IdxType)TPB); + mapKernel + <<>>(out, len, map, in, args...); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +} // namespace detail +} // namespace linalg +}; // namespace raft diff --git a/cpp/include/raft/linalg/detail/strided_reduction.cuh b/cpp/include/raft/linalg/detail/strided_reduction.cuh index 42e79a9285..0e516b4750 100644 --- a/cpp/include/raft/linalg/detail/strided_reduction.cuh +++ b/cpp/include/raft/linalg/detail/strided_reduction.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #pragma once +#include "unary_op.cuh" #include #include #include diff --git a/cpp/include/raft/linalg/detail/ternary_op.cuh b/cpp/include/raft/linalg/detail/ternary_op.cuh new file mode 100644 index 0000000000..7874f20f56 --- /dev/null +++ b/cpp/include/raft/linalg/detail/ternary_op.cuh @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace raft { +namespace linalg { +namespace detail { +template +__global__ void ternaryOpKernel( + out_t* out, const math_t* in1, const math_t* in2, const math_t* in3, IdxType len, Lambda op) +{ + typedef raft::TxN_t VecType; + VecType a, b, c; + IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); + idx *= VecType::Ratio; + if (idx >= len) return; + a.load(in1, idx); + b.load(in2, idx); + c.load(in3, idx); +#pragma unroll + for (int i = 0; i < VecType::Ratio; ++i) { + a.val.data[i] = op(a.val.data[i], b.val.data[i], c.val.data[i]); + } + a.store(out, idx); +} + +template +void ternaryOpImpl(out_t* out, + const math_t* in1, + const math_t* in2, + const math_t* in3, + IdxType len, + Lambda op, + cudaStream_t stream) +{ + const IdxType nblks = raft::ceildiv(veclen_ ? len / veclen_ : len, (IdxType)TPB); + ternaryOpKernel + <<>>(out, in1, in2, in3, len, op); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +/** + * @brief perform element-wise ternary operation on the input arrays + * @tparam math_t data-type upon which the math operation will be performed + * @tparam Lambda the device-lambda performing the actual operation + * @tparam IdxType Integer type used to for addressing + * @tparam TPB threads-per-block in the final kernel launched + * @param out the output array + * @param in1 the first input array + * @param in2 the second input array + * @param in3 the third input array + * @param len number of elements in the input array + * @param op the device-lambda + * @param stream cuda stream where to launch work + */ +template +void ternaryOp(out_t* out, + const math_t* in1, + const math_t* in2, + const math_t* in3, + IdxType len, + Lambda op, + cudaStream_t stream) +{ + size_t bytes = len * sizeof(math_t); + if (16 / sizeof(math_t) && bytes % 16 == 0) { + ternaryOpImpl( + out, in1, in2, in3, len, op, stream); + } else if (8 / sizeof(math_t) && bytes % 8 == 0) { + ternaryOpImpl( + out, in1, in2, in3, len, op, stream); + } else if (4 / sizeof(math_t) && bytes % 4 == 0) { + ternaryOpImpl( + out, in1, in2, in3, len, op, stream); + } else if (2 / sizeof(math_t) && bytes % 2 == 0) { + ternaryOpImpl( + out, in1, in2, in3, len, op, stream); + } else if (1 / sizeof(math_t)) { + ternaryOpImpl( + out, in1, in2, in3, len, op, stream); + } else { + ternaryOpImpl(out, in1, in2, in3, len, op, stream); + } +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/detail/unary_op.cuh b/cpp/include/raft/linalg/detail/unary_op.cuh new file mode 100644 index 0000000000..cdadc6f868 --- /dev/null +++ b/cpp/include/raft/linalg/detail/unary_op.cuh @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace raft { +namespace linalg { +namespace detail { + +template +__global__ void unaryOpKernel(OutType* out, const InType* in, IdxType len, Lambda op) +{ + typedef TxN_t InVecType; + typedef TxN_t OutVecType; + InVecType a; + OutVecType b; + IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); + idx *= InVecType::Ratio; + if (idx >= len) return; + a.load(in, idx); +#pragma unroll + for (int i = 0; i < InVecType::Ratio; ++i) { + b.val.data[i] = op(a.val.data[i]); + } + b.store(out, idx); +} + +template +void unaryOpImpl(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) +{ + const IdxType nblks = raft::ceildiv(VecLen ? len / VecLen : len, (IdxType)TPB); + unaryOpKernel + <<>>(out, in, len, op); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +template +void unaryOpCaller(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) +{ + if (len <= 0) return; // silently skip in case of 0 length input + constexpr auto maxSize = sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType); + size_t bytes = len * maxSize; + uint64_t inAddr = uint64_t(in); + uint64_t outAddr = uint64_t(out); + if (16 / maxSize && bytes % 16 == 0 && inAddr % 16 == 0 && outAddr % 16 == 0) { + unaryOpImpl(out, in, len, op, stream); + } else if (8 / maxSize && bytes % 8 == 0 && inAddr % 8 == 0 && outAddr % 8 == 0) { + unaryOpImpl(out, in, len, op, stream); + } else if (4 / maxSize && bytes % 4 == 0 && inAddr % 4 == 0 && outAddr % 4 == 0) { + unaryOpImpl(out, in, len, op, stream); + } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 && outAddr % 2 == 0) { + unaryOpImpl(out, in, len, op, stream); + } else if (1 / maxSize) { + unaryOpImpl(out, in, len, op, stream); + } else { + unaryOpImpl(out, in, len, op, stream); + } +} + +template +__global__ void writeOnlyUnaryOpKernel(OutType* out, IdxType len, Lambda op) +{ + IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); + if (idx < len) { op(out + idx, idx); } +} + +template +void writeOnlyUnaryOpCaller(OutType* out, IdxType len, Lambda op, cudaStream_t stream) +{ + if (len <= 0) return; // silently skip in case of 0 length input + auto nblks = raft::ceildiv(len, TPB); + writeOnlyUnaryOpKernel<<>>(out, len, op); + RAFT_CUDA_TRY(cudaGetLastError()); +} + +}; // end namespace detail +}; // end namespace linalg +}; // end namespace raft diff --git a/cpp/include/raft/linalg/init.cuh b/cpp/include/raft/linalg/init.cuh index ad772b3989..5a810bf2ba 100644 --- a/cpp/include/raft/linalg/init.cuh +++ b/cpp/include/raft/linalg/init.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,10 +18,11 @@ #pragma once -#include +#include "detail/init.hpp" #include -namespace raft::linalg { +namespace raft { +namespace linalg { /** * @brief Like Python range. @@ -36,8 +37,7 @@ namespace raft::linalg { template void range(T* out, int start, int end, cudaStream_t stream) { - return detail::map( - stream, out, end - start, compose_op{cast_op{}, add_const_op{start}}); + detail::range(out, start, end, stream); } /** @@ -52,7 +52,7 @@ void range(T* out, int start, int end, cudaStream_t stream) template void range(T* out, int n, cudaStream_t stream) { - return detail::map(stream, out, n, cast_op{}); + detail::range(out, n, stream); } /** @@ -68,6 +68,7 @@ void zero(T* out, int n, cudaStream_t stream) RAFT_CUDA_TRY(cudaMemsetAsync(static_cast(out), 0, n * sizeof(T), stream)); } -} // namespace raft::linalg +} // namespace linalg +} // namespace raft -#endif +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/map.cuh b/cpp/include/raft/linalg/map.cuh index 305a8aa3cc..2b9e6c80a0 100644 --- a/cpp/include/raft/linalg/map.cuh +++ b/cpp/include/raft/linalg/map.cuh @@ -22,267 +22,119 @@ #include #include +#include +#include -namespace raft::linalg { +namespace raft { +namespace linalg { /** - * @defgroup map Mapping ops - * @{ - */ - -/** - * @brief Map a function over zero or more input mdspans of the same size. - * - * The algorithm applied on `k` inputs can be described in a following pseudo-code: - * @code - * for (auto i: [0 ... out.size()]) { - * out[i] = f(in_0[i], in_1[i], ..., in_k[i]) - * } - * @endcode - * - * _Performance note_: when possible, this function loads the argument arrays and stores the output - * array using vectorized cuda load/store instructions. The size of the vectorization depends on the - * size of the largest input/output element type and on the alignment of all pointers. - * - * Usage example: - * @code{.cpp} - * #include - * #include - * #include - * #include - * - * auto input = raft::make_device_vector(res, n); - * ... fill input .. - * auto squares = raft::make_device_vector(res, n); - * raft::linalg::map_offset(res, squares.view(), raft::sq_op{}, input.view()); - * @endcode - * - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * @tparam InTypes data-types of the inputs (device_mdspan) - * - * @param[in] res raft::device_resources - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (InTypes::value_type xs...) -> OutType::value_type - * @param[in] ins the inputs (each of the same size as the output) (device_mdspan) + * @brief CUDA version of map + * @tparam InType data-type upon which the math operation will be performed + * @tparam MapOp the device-lambda performing the actual operation + * @tparam TPB threads-per-block in the final kernel launched + * @tparam Args additional parameters + * @tparam OutType data-type in which the result will be stored + * @param out the output of the map operation (assumed to be a device pointer) + * @param len number of elements in the input array + * @param map the device-lambda + * @param stream cuda-stream where to launch this kernel + * @param in the input array + * @param args additional input arrays */ -template , - typename = raft::enable_if_input_device_mdspan> -void map(const raft::device_resources& res, OutType out, Func f, InTypes... ins) +template +void map_k( + OutType* out, IdxType len, MapOp map, cudaStream_t stream, const InType* in, Args... args) { - return detail::map(res, out, f, ins...); + detail::mapImpl( + out, len, map, stream, in, args...); } /** - * @brief Map a function over one mdspan. - * - * @tparam InType1 data-type of the input (device_mdspan) - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * - * @param[in] res raft::device_resources - * @param[in] in1 the input (the same size as the output) (device_mdspan) - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (InType1::value_type x) -> OutType::value_type + * @defgroup map Mapping ops + * @{ */ -template , - typename = raft::enable_if_input_device_mdspan> -void map(const raft::device_resources& res, InType1 in1, OutType out, Func f) -{ - return detail::map(res, out, f, in1); -} /** - * @brief Map a function over two mdspans. - * - * @tparam InType1 data-type of the input (device_mdspan) - * @tparam InType2 data-type of the input (device_mdspan) - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * - * @param[in] res raft::device_resources - * @param[in] in1 the input (the same size as the output) (device_mdspan) - * @param[in] in2 the input (the same size as the output) (device_mdspan) - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (InType1::value_type x1, InType2::value_type x2) -> OutType::value_type + * @brief CUDA version of map + * @tparam InType data-type for math operation of type raft::device_mdspan + * @tparam MapOp the device-lambda performing the actual operation + * @tparam TPB threads-per-block in the final kernel launched + * @tparam OutType data-type of result of type raft::device_mdspan + * @tparam Args additional parameters + * @param[in] handle raft::device_resources + * @param[in] in the input of type raft::device_mdspan + * @param[out] out the output of the map operation of type raft::device_mdspan + * @param[in] map the device-lambda + * @param[in] args additional input arrays */ -template , - typename = raft::enable_if_input_device_mdspan> -void map(const raft::device_resources& res, InType1 in1, InType2 in2, OutType out, Func f) + int TPB = 256, + typename... Args, + typename = raft::enable_if_input_device_mdspan, + typename = raft::enable_if_output_device_mdspan> +void map(raft::device_resources const& handle, InType in, OutType out, MapOp map, Args... args) { - return detail::map(res, out, f, in1, in2); + using in_value_t = typename InType::value_type; + using out_value_t = typename OutType::value_type; + + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output is not exhaustive"); + RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input is not exhaustive"); + RAFT_EXPECTS(out.size() == in.size(), "Size mismatch between Input and Output"); + + if (out.size() <= std::numeric_limits::max()) { + map_k( + out.data_handle(), out.size(), map, handle.get_stream(), in.data_handle(), args...); + } else { + map_k( + out.data_handle(), out.size(), map, handle.get_stream(), in.data_handle(), args...); + } } /** - * @brief Map a function over three mdspans. - * - * @tparam InType1 data-type of the input 1 (device_mdspan) - * @tparam InType2 data-type of the input 2 (device_mdspan) - * @tparam InType3 data-type of the input 3 (device_mdspan) - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * - * @param[in] res raft::device_resources - * @param[in] in1 the input 1 (the same size as the output) (device_mdspan) - * @param[in] in2 the input 2 (the same size as the output) (device_mdspan) - * @param[in] in3 the input 3 (the same size as the output) (device_mdspan) - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (InType1::value_type x1, InType2::value_type x2, InType3::value_type x3) -> OutType::value_type - */ -template , - typename = raft::enable_if_input_device_mdspan> -void map( - const raft::device_resources& res, InType1 in1, InType2 in2, InType3 in3, OutType out, Func f) -{ - return detail::map(res, out, f, in1, in2, in3); -} - -/** - * @brief Map a function over zero-based flat index (element offset) and zero or more inputs. - * - * The algorithm applied on `k` inputs can be described in a following pseudo-code: - * @code - * for (auto i: [0 ... out.size()]) { - * out[i] = f(i, in_0[i], in_1[i], ..., in_k[i]) - * } - * @endcode - * - * _Performance note_: when possible, this function loads the argument arrays and stores the output - * array using vectorized cuda load/store instructions. The size of the vectorization depends on the - * size of the largest input/output element type and on the alignment of all pointers. + * @brief Perform an element-wise unary operation on the input offset into the output array * * Usage example: * @code{.cpp} * #include - * #include + * #include * #include * #include - * + * ... + * raft::handle_t handle; * auto squares = raft::make_device_vector(handle, n); - * raft::linalg::map_offset(res, squares.view(), raft::sq_op{}); + * raft::linalg::map_offset(handle, squares.view(), raft::sq_op()); * @endcode * - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * @tparam InTypes data-types of the inputs (device_mdspan) - * - * @param[in] res raft::device_resources - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (auto offset, InTypes::value_type xs...) -> OutType::value_type - * @param[in] ins the inputs (each of the same size as the output) (device_mdspan) + * @tparam OutType Output mdspan type + * @tparam MapOp The unary operation type with signature `OutT func(const IdxT& idx);` + * @param[in] handle The raft handle + * @param[out] out Output array + * @param[in] op The unary operation */ template , - typename = raft::enable_if_input_device_mdspan> -void map_offset(const raft::device_resources& res, OutType out, Func f, InTypes... ins) + typename MapOp, + typename = raft::enable_if_output_device_mdspan> +void map_offset(const raft::device_resources& handle, OutType out, MapOp op) { - return detail::map(res, out, f, ins...); -} + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); -/** - * @brief Map a function over zero-based flat index (element offset) and one mdspan. - * - * @tparam InType1 data-type of the input (device_mdspan) - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * - * @param[in] res raft::device_resources - * @param[in] in1 the input (the same size as the output) (device_mdspan) - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (auto offset, InType1::value_type x) -> OutType::value_type - */ -template , - typename = raft::enable_if_input_device_mdspan> -void map_offset(const raft::device_resources& res, InType1 in1, OutType out, Func f) -{ - return detail::map(res, out, f, in1); -} + using out_value_t = typename OutType::value_type; -/** - * @brief Map a function over zero-based flat index (element offset) and two mdspans. - * - * @tparam InType1 data-type of the input (device_mdspan) - * @tparam InType2 data-type of the input (device_mdspan) - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * - * @param[in] res raft::device_resources - * @param[in] in1 the input (the same size as the output) (device_mdspan) - * @param[in] in2 the input (the same size as the output) (device_mdspan) - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (auto offset, InType1::value_type x1, InType2::value_type x2) -> OutType::value_type - */ -template , - typename = raft::enable_if_input_device_mdspan> -void map_offset(const raft::device_resources& res, InType1 in1, InType2 in2, OutType out, Func f) -{ - return detail::map(res, out, f, in1, in2); -} - -/** - * @brief Map a function over zero-based flat index (element offset) and three mdspans. - * - * @tparam InType1 data-type of the input 1 (device_mdspan) - * @tparam InType2 data-type of the input 2 (device_mdspan) - * @tparam InType3 data-type of the input 3 (device_mdspan) - * @tparam OutType data-type of the result (device_mdspan) - * @tparam Func the device-lambda performing the actual operation - * - * @param[in] res raft::device_resources - * @param[in] in1 the input 1 (the same size as the output) (device_mdspan) - * @param[in] in2 the input 2 (the same size as the output) (device_mdspan) - * @param[in] in3 the input 3 (the same size as the output) (device_mdspan) - * @param[out] out the output of the map operation (device_mdspan) - * @param[in] f device lambda - * (auto offset, InType1::value_type x1, InType2::value_type x2, InType3::value_type x3) - * -> OutType::value_type - */ -template , - typename = raft::enable_if_input_device_mdspan> -void map_offset( - const raft::device_resources& res, InType1 in1, InType2 in2, InType3 in3, OutType out, Func f) -{ - return detail::map(res, out, f, in1, in2, in3); + thrust::tabulate( + handle.get_thrust_policy(), out.data_handle(), out.data_handle() + out.size(), op); } /** @} */ // end of map -} // namespace raft::linalg +} // namespace linalg +}; // namespace raft #endif diff --git a/cpp/include/raft/linalg/ternary_op.cuh b/cpp/include/raft/linalg/ternary_op.cuh index 1e347d69be..aa3859bc23 100644 --- a/cpp/include/raft/linalg/ternary_op.cuh +++ b/cpp/include/raft/linalg/ternary_op.cuh @@ -19,9 +19,11 @@ #pragma once +#include "detail/ternary_op.cuh" + #include #include -#include +#include namespace raft { namespace linalg { @@ -48,7 +50,7 @@ void ternaryOp(out_t* out, Lambda op, cudaStream_t stream) { - return detail::map(stream, out, len, op, in1, in2, in3); + detail::ternaryOp(out, in1, in2, in3, len, op, stream); } /** @@ -78,7 +80,33 @@ template ::max()) { + ternaryOp(out.data_handle(), + in1.data_handle(), + in2.data_handle(), + in3.data_handle(), + out.size(), + op, + handle.get_stream()); + } else { + ternaryOp(out.data_handle(), + in1.data_handle(), + in2.data_handle(), + in3.data_handle(), + out.size(), + op, + handle.get_stream()); + } } /** @} */ // end of group ternary_op @@ -86,4 +114,4 @@ void ternary_op( }; // end namespace linalg }; // end namespace raft -#endif +#endif \ No newline at end of file diff --git a/cpp/include/raft/linalg/unary_op.cuh b/cpp/include/raft/linalg/unary_op.cuh index 23f932d2f2..ce102adfd2 100644 --- a/cpp/include/raft/linalg/unary_op.cuh +++ b/cpp/include/raft/linalg/unary_op.cuh @@ -18,9 +18,11 @@ #pragma once +#include "detail/unary_op.cuh" + #include #include -#include +#include namespace raft { namespace linalg { @@ -46,7 +48,7 @@ template void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) { - return detail::map(stream, out, len, op, in); + detail::unaryOpCaller(out, in, len, op, stream); } /** @@ -69,11 +71,7 @@ void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_ template void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream) { - return detail::map(stream, out, len, [op] __device__(IdxType offset) { - OutType r; - op(&r, offset); - return r; - }); + detail::writeOnlyUnaryOpCaller(out, len, op, stream); } /** @@ -99,7 +97,20 @@ template > void unary_op(raft::device_resources const& handle, InType in, OutType out, Lambda op) { - return map(handle, in, out, op); + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input must be contiguous"); + RAFT_EXPECTS(out.size() == in.size(), "Size mismatch between Output and Input"); + + using in_value_t = typename InType::value_type; + using out_value_t = typename OutType::value_type; + + if (out.size() <= std::numeric_limits::max()) { + unaryOp( + out.data_handle(), in.data_handle(), out.size(), op, handle.get_stream()); + } else { + unaryOp( + out.data_handle(), in.data_handle(), out.size(), op, handle.get_stream()); + } } /** @@ -119,7 +130,17 @@ template > void write_only_unary_op(const raft::device_resources& handle, OutType out, Lambda op) { - return writeOnlyUnaryOp(out.data_handle(), out.size(), op, handle.get_stream()); + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + + using out_value_t = typename OutType::value_type; + + if (out.size() <= std::numeric_limits::max()) { + writeOnlyUnaryOp( + out.data_handle(), out.size(), op, handle.get_stream()); + } else { + writeOnlyUnaryOp( + out.data_handle(), out.size(), op, handle.get_stream()); + } } /** @} */ // end of group unary_op diff --git a/cpp/include/raft/matrix/init.cuh b/cpp/include/raft/matrix/init.cuh index ed2fb4d209..f597bbd1c6 100644 --- a/cpp/include/raft/matrix/init.cuh +++ b/cpp/include/raft/matrix/init.cuh @@ -18,7 +18,6 @@ #include #include -#include #include #include @@ -66,7 +65,9 @@ void fill(raft::device_resources const& handle, raft::device_mdspan inout, math_t scalar) { - linalg::map(handle, inout, raft::const_op{scalar}); + RAFT_EXPECTS(raft::is_row_or_column_major(inout), "Data layout not supported"); + detail::setValue( + inout.data_handle(), inout.data_handle(), scalar, inout.size(), handle.get_stream()); } /** @} */ // end of group matrix_init diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh index 4b6e6f5e31..4c6ef0e30b 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh @@ -42,11 +42,11 @@ #include #include +#include +#include #include -#include - namespace raft::neighbors::ivf_pq::detail { /** @@ -1311,13 +1311,13 @@ void ivfpq_search_worker(raft::device_resources const& handle, // of a cluster by processing the cluster at the same time as much as // possible. index_list_sorted_buf.resize(n_queries * n_probes, stream); - auto index_list_buf = - make_device_mdarray(handle, mr, make_extents(n_queries * n_probes)); + rmm::device_uvector index_list_buf(n_queries * n_probes, stream, mr); rmm::device_uvector cluster_labels_out(n_queries * n_probes, stream, mr); - auto index_list = index_list_buf.data_handle(); + auto index_list = index_list_buf.data(); index_list_sorted = index_list_sorted_buf.data(); - - linalg::map_offset(handle, index_list_buf.view(), identity_op{}); + thrust::sequence(handle.get_thrust_policy(), + thrust::device_pointer_cast(index_list), + thrust::device_pointer_cast(index_list + n_queries * n_probes)); int begin_bit = 0; int end_bit = sizeof(uint32_t) * 8; @@ -1376,15 +1376,13 @@ void ivfpq_search_worker(raft::device_resources const& handle, topK); rmm::device_uvector device_lut(search_instance.device_lut_size, stream, mr); - std::optional> query_kths_buf{std::nullopt}; - float* query_kths = nullptr; + rmm::device_uvector query_kths(0, stream, mr); if (manage_local_topk) { - query_kths_buf.emplace( - make_device_mdarray(handle, mr, make_extents(n_queries))); - linalg::map(handle, - query_kths_buf->view(), - raft::const_op{dummy_block_sort_t::queue_t::kDummy}); - query_kths = query_kths_buf->data_handle(); + query_kths.resize(n_queries, stream); + thrust::fill_n(handle.get_thrust_policy(), + query_kths.data(), + n_queries, + float(dummy_block_sort_t::queue_t::kDummy)); } search_instance(stream, index.size(), @@ -1403,7 +1401,7 @@ void ivfpq_search_worker(raft::device_resources const& handle, chunk_index.data(), query, index_list_sorted, - query_kths, + query_kths.data(), device_lut.data(), distances_buf.data(), neighbors_ptr); diff --git a/cpp/test/linalg/map.cu b/cpp/test/linalg/map.cu index 15b40808ee..5b52374789 100644 --- a/cpp/test/linalg/map.cu +++ b/cpp/test/linalg/map.cu @@ -37,15 +37,13 @@ void mapLaunch(OutType* out, raft::device_resources handle{stream}; auto out_view = raft::make_device_vector_view(out, len); auto in1_view = raft::make_device_vector_view(in1, len); - auto in2_view = raft::make_device_vector_view(in2, len); - auto in3_view = raft::make_device_vector_view(in3, len); map( handle, + in1_view, out_view, [=] __device__(InType a, InType b, InType c) { return a + b + c + scalar; }, - in1_view, - in2_view, - in3_view); + in2, + in3); } template