diff --git a/cpp/include/raft/linalg/binary_op.cuh b/cpp/include/raft/linalg/binary_op.cuh index 966e84965d..ed083a1590 100644 --- a/cpp/include/raft/linalg/binary_op.cuh +++ b/cpp/include/raft/linalg/binary_op.cuh @@ -18,12 +18,9 @@ #pragma once -#include "detail/binary_op.cuh" - #include #include -#include -#include +#include namespace raft { namespace linalg { @@ -52,7 +49,7 @@ template (stream, out, len, op, in1, in2); } /** @@ -80,22 +77,7 @@ template > void binary_op(raft::device_resources const& handle, InType in1, InType in2, OutType out, Lambda op) { - RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); - RAFT_EXPECTS(raft::is_row_or_column_major(in1), "Input 1 must be contiguous"); - RAFT_EXPECTS(raft::is_row_or_column_major(in2), "Input 2 must be contiguous"); - RAFT_EXPECTS(out.size() == in1.size() && in1.size() == in2.size(), - "Size mismatch between Output and Inputs"); - - using in_value_t = typename InType::value_type; - using out_value_t = typename OutType::value_type; - - if (out.size() <= std::numeric_limits::max()) { - binaryOp( - out.data_handle(), in1.data_handle(), in2.data_handle(), out.size(), op, handle.get_stream()); - } else { - binaryOp( - out.data_handle(), in1.data_handle(), in2.data_handle(), out.size(), op, handle.get_stream()); - } + return map(handle, in1, in2, out, op); } /** @} */ // end of group binary_op @@ -103,4 +85,4 @@ void binary_op(raft::device_resources const& handle, InType in1, InType in2, Out }; // end namespace linalg }; // end namespace raft -#endif \ No newline at end of file +#endif diff --git a/cpp/include/raft/linalg/detail/binary_op.cuh b/cpp/include/raft/linalg/detail/binary_op.cuh deleted file mode 100644 index d073e164fd..0000000000 --- a/cpp/include/raft/linalg/detail/binary_op.cuh +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -namespace raft { -namespace linalg { -namespace detail { - -template -__global__ void binaryOpKernel( - OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op) -{ - typedef TxN_t InVecType; - typedef TxN_t OutVecType; - InVecType a, b; - OutVecType c; - IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); - idx *= InVecType::Ratio; - if (idx >= len) return; - a.load(in1, idx); - b.load(in2, idx); -#pragma unroll - for (int i = 0; i < InVecType::Ratio; ++i) { - c.val.data[i] = op(a.val.data[i], b.val.data[i]); - } - c.store(out, idx); -} - -template -void binaryOpImpl( - OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream) -{ - const IdxType nblks = raft::ceildiv(VecLen ? 
len / VecLen : len, (IdxType)TPB); - binaryOpKernel - <<>>(out, in1, in2, len, op); - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} - -/** - * @brief Checks if addresses are aligned on N bytes - */ -inline bool addressAligned(uint64_t addr1, uint64_t addr2, uint64_t addr3, uint64_t N) -{ - return addr1 % N == 0 && addr2 % N == 0 && addr3 % N == 0; -} - -template -void binaryOp( - OutType* out, const InType* in1, const InType* in2, IdxType len, Lambda op, cudaStream_t stream) -{ - constexpr auto maxSize = sizeof(InType) > sizeof(OutType) ? sizeof(InType) : sizeof(OutType); - size_t bytes = len * maxSize; - uint64_t in1Addr = uint64_t(in1); - uint64_t in2Addr = uint64_t(in2); - uint64_t outAddr = uint64_t(out); - if (16 / maxSize && bytes % 16 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 16)) { - binaryOpImpl( - out, in1, in2, len, op, stream); - } else if (8 / maxSize && bytes % 8 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 8)) { - binaryOpImpl( - out, in1, in2, len, op, stream); - } else if (4 / maxSize && bytes % 4 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 4)) { - binaryOpImpl( - out, in1, in2, len, op, stream); - } else if (2 / maxSize && bytes % 2 == 0 && addressAligned(in1Addr, in2Addr, outAddr, 2)) { - binaryOpImpl( - out, in1, in2, len, op, stream); - } else if (1 / maxSize) { - binaryOpImpl( - out, in1, in2, len, op, stream); - } else { - binaryOpImpl(out, in1, in2, len, op, stream); - } -} - -} // namespace detail -} // namespace linalg -} // namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/detail/init.hpp b/cpp/include/raft/linalg/detail/init.hpp deleted file mode 100644 index 4718a2cb0e..0000000000 --- a/cpp/include/raft/linalg/detail/init.hpp +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -namespace raft { -namespace linalg { -namespace detail { - -template -void range(T* out, int start, int end, cudaStream_t stream) -{ - thrust::counting_iterator first(start); - thrust::counting_iterator last = first + (end - start); - thrust::device_ptr ptr(out); - thrust::copy(rmm::exec_policy(stream), first, last, ptr); -} - -/** - * @brief Like Python range. - * - * Fills the output as out[i] = i. 
- * - * \param [out] out device array, size [n] - * \param [in] n length of the array - * \param [in] stream cuda stream - */ -template -void range(T* out, int n, cudaStream_t stream) -{ - range(out, 0, n, stream); -} - -} // namespace detail -} // namespace linalg -} // namespace raft diff --git a/cpp/include/raft/linalg/detail/map.cuh b/cpp/include/raft/linalg/detail/map.cuh index e0b473bdd4..90b653b711 100644 --- a/cpp/include/raft/linalg/detail/map.cuh +++ b/cpp/include/raft/linalg/detail/map.cuh @@ -16,43 +16,207 @@ #pragma once -#include +#include #include #include +#include +#include +#include #include -namespace raft { -namespace linalg { -namespace detail { +#include -template -__global__ void mapKernel(OutType* out, IdxType len, MapOp map, const InType* in, Args... args) +namespace raft::linalg::detail { + +template +__device__ __forceinline__ auto map_apply(Func f, const IdxT& offset, const InTs&... ins) -> OutT +{ + if constexpr (PassOffset) { + return f(offset, ins...); + } else { + return f(ins...); + } +} + +template +__device__ __forceinline__ void map_kernel_mainloop( + OutT* out_ptr, IdxT offset, IdxT len, Func f, const InTs*... in_ptrs, std::index_sequence) { - auto idx = (threadIdx.x + (blockIdx.x * blockDim.x)); + TxN_t wide; + thrust::tuple...> wide_args; + if (offset + R <= len) { + (thrust::get(wide_args).load(in_ptrs, offset), ...); +#pragma unroll + for (int j = 0; j < R; ++j) { + wide.val.data[j] = map_apply( + f, offset + j, thrust::get(wide_args).val.data[j]...); + } + wide.store(out_ptr, offset); + } +} + +template +__global__ void map_kernel(OutT* out_ptr, IdxT len, Func f, const InTs*... in_ptrs) +{ + const IdxT tid = blockIdx.x * blockDim.x + threadIdx.x; + if constexpr (R <= 1) { + if (tid < len) { + out_ptr[tid] = map_apply(f, tid, in_ptrs[tid]...); + } + } else { + using align_bytes = Pow2; + using align_elems = Pow2; + + // how many elements to skip in order to do aligned vectorized store + const IdxT skip_cnt_left = std::min(IdxT(align_bytes::roundUp(out_ptr) - out_ptr), len); + + // The main loop: process all aligned data + map_kernel_mainloop( + out_ptr, tid * R + skip_cnt_left, len, f, in_ptrs..., std::index_sequence_for()); - if (idx < len) { out[idx] = map(in[idx], args[idx]...); } + static_assert(WarpSize >= R); + // Processes the skipped elements on the left + if (tid < skip_cnt_left) { + out_ptr[tid] = map_apply(f, tid, in_ptrs[tid]...); + } + // Processes the skipped elements on the right + const IdxT skip_cnt_right = align_elems::mod(len - skip_cnt_left); + const IdxT remain_i = len - skip_cnt_right + tid; + if (remain_i < len) { + out_ptr[remain_i] = + map_apply(f, remain_i, in_ptrs[remain_i]...); + } + } } -template +void map_call(rmm::cuda_stream_view stream, OutT* out_ptr, IdxT len, Func f, const InTs*... 
in_ptrs) +{ + const IdxT len_vectorized = raft::div_rounding_up_safe(len, R); + const int threads = + std::max(WarpSize, std::min(raft::bound_by_power_of_two(len_vectorized), 256)); + const IdxT blocks = raft::div_rounding_up_unsafe(len_vectorized, threads); + map_kernel<<>>(out_ptr, len, f, in_ptrs...); +} + +constexpr int kCoalescedVectorSize = 16; +constexpr int kSmallInputThreshold = 1024; + +struct ratio_selector { + int ratio; + int align; + constexpr inline ratio_selector(int r, int a) : ratio(r), align(a) {} + + template + constexpr static auto ignoring_alignment() -> ratio_selector + { + return ratio_selector{raft::div_rounding_up_safe(kCoalescedVectorSize, sizeof(T)), 0}; + } + + template + explicit ratio_selector(const T* ptr) + { + constexpr auto s = ignoring_alignment(); // NOLINT + align = int(Pow2::roundUp(ptr) - ptr); + ratio = int(s.ratio); + } +}; + +constexpr inline auto operator*(const ratio_selector& a, const ratio_selector& b) -> ratio_selector +{ + auto ratio = std::min(a.ratio, b.ratio); + while ((a.align % ratio) != (b.align % ratio)) { + ratio >>= 1; + } + return ratio_selector{ratio, a.align % ratio}; +} + +template +void map_call_rt( + int r, rmm::cuda_stream_view stream, OutT* out_ptr, IdxT len, Func f, const InTs*... in_ptrs) +{ + if (r >= R) { return map_call(stream, out_ptr, len, f, in_ptrs...); } + if constexpr (R > 1) { + return map_call_rt<(R >> 1), PassOffset>(r, stream, out_ptr, len, f, in_ptrs...); + } +} + +template +void map(rmm::cuda_stream_view stream, OutT* out_ptr, IdxT len, Func f, const InTs*... in_ptrs) +{ + // don't vectorize on small inputs + if (len <= kSmallInputThreshold) { + return map_call<1, PassOffset>(stream, out_ptr, len, f, in_ptrs...); + } + constexpr int kRatio = + (ratio_selector::ignoring_alignment() * ... * ratio_selector::ignoring_alignment()) + .ratio; + static_assert(kRatio > 0, "Unexpected zero vector size."); + const int ratio = (ratio_selector(out_ptr) * ... * ratio_selector(in_ptrs)).ratio; + return map_call_rt(ratio, stream, out_ptr, len, f, in_ptrs...); +} + +template , + typename = raft::enable_if_input_device_mdspan> +void map_check_shape(OutType out, InType in) +{ + RAFT_EXPECTS(raft::is_row_or_column_major(in) && out.size() == in.size(), + "All inputs must be contiguous and have the same size as the output"); +} + +/** + * @brief Map a function over a zero or more inputs and optionally a 0-based flat index + * (element offset). + * + * _Performance note_: when possible, this function loads the argument arrays and stores the output + * array using vectorized cuda load/store instructions. The size of the vectorization depends on the + * size of the largest input/output element type and on the alignment of all pointers. + * + * @tparam PassOffset whether to pass an offset as a first argument to Func + * @tparam OutType data-type of the result (device_mdspan) + * @tparam Func the device-lambda performing the actual operation + * @tparam InTypes data-types of the inputs (device_mdspan) + * + * @param[in] res raft::device_resources + * @param[out] out the output of the map operation (device_mdspan) + * @param[in] f device lambda of type + * ([auto offset], InTypes::value_type xs...) -> OutType::value_type + * @param[in] ins the inputs (each of the same size as the output) (device_mdspan) + */ +template -void mapImpl( - OutType* out, IdxType len, MapOp map, cudaStream_t stream, const InType* in, Args... 
args) -{ - const int nblks = raft::ceildiv(len, (IdxType)TPB); - mapKernel - <<>>(out, len, map, in, args...); - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} - -} // namespace detail -} // namespace linalg -}; // namespace raft + typename Func, + typename... InTypes, + typename = raft::enable_if_output_device_mdspan, + typename = raft::enable_if_input_device_mdspan> +void map(const raft::device_resources& res, OutType out, Func f, InTypes... ins) +{ + RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + (map_check_shape(out, ins), ...); + + if (out.size() <= std::numeric_limits::max()) { + map( + res.get_stream(), out.data_handle(), uint32_t(out.size()), f, ins.data_handle()...); + } else { + map( + res.get_stream(), out.data_handle(), uint64_t(out.size()), f, ins.data_handle()...); + } +} + +} // namespace raft::linalg::detail diff --git a/cpp/include/raft/linalg/detail/strided_reduction.cuh b/cpp/include/raft/linalg/detail/strided_reduction.cuh index 0e516b4750..42e79a9285 100644 --- a/cpp/include/raft/linalg/detail/strided_reduction.cuh +++ b/cpp/include/raft/linalg/detail/strided_reduction.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,7 +16,6 @@ #pragma once -#include "unary_op.cuh" #include #include #include diff --git a/cpp/include/raft/linalg/detail/ternary_op.cuh b/cpp/include/raft/linalg/detail/ternary_op.cuh deleted file mode 100644 index 7874f20f56..0000000000 --- a/cpp/include/raft/linalg/detail/ternary_op.cuh +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -namespace raft { -namespace linalg { -namespace detail { -template -__global__ void ternaryOpKernel( - out_t* out, const math_t* in1, const math_t* in2, const math_t* in3, IdxType len, Lambda op) -{ - typedef raft::TxN_t VecType; - VecType a, b, c; - IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); - idx *= VecType::Ratio; - if (idx >= len) return; - a.load(in1, idx); - b.load(in2, idx); - c.load(in3, idx); -#pragma unroll - for (int i = 0; i < VecType::Ratio; ++i) { - a.val.data[i] = op(a.val.data[i], b.val.data[i], c.val.data[i]); - } - a.store(out, idx); -} - -template -void ternaryOpImpl(out_t* out, - const math_t* in1, - const math_t* in2, - const math_t* in3, - IdxType len, - Lambda op, - cudaStream_t stream) -{ - const IdxType nblks = raft::ceildiv(veclen_ ? 
len / veclen_ : len, (IdxType)TPB); - ternaryOpKernel - <<>>(out, in1, in2, in3, len, op); - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} - -/** - * @brief perform element-wise ternary operation on the input arrays - * @tparam math_t data-type upon which the math operation will be performed - * @tparam Lambda the device-lambda performing the actual operation - * @tparam IdxType Integer type used to for addressing - * @tparam TPB threads-per-block in the final kernel launched - * @param out the output array - * @param in1 the first input array - * @param in2 the second input array - * @param in3 the third input array - * @param len number of elements in the input array - * @param op the device-lambda - * @param stream cuda stream where to launch work - */ -template -void ternaryOp(out_t* out, - const math_t* in1, - const math_t* in2, - const math_t* in3, - IdxType len, - Lambda op, - cudaStream_t stream) -{ - size_t bytes = len * sizeof(math_t); - if (16 / sizeof(math_t) && bytes % 16 == 0) { - ternaryOpImpl( - out, in1, in2, in3, len, op, stream); - } else if (8 / sizeof(math_t) && bytes % 8 == 0) { - ternaryOpImpl( - out, in1, in2, in3, len, op, stream); - } else if (4 / sizeof(math_t) && bytes % 4 == 0) { - ternaryOpImpl( - out, in1, in2, in3, len, op, stream); - } else if (2 / sizeof(math_t) && bytes % 2 == 0) { - ternaryOpImpl( - out, in1, in2, in3, len, op, stream); - } else if (1 / sizeof(math_t)) { - ternaryOpImpl( - out, in1, in2, in3, len, op, stream); - } else { - ternaryOpImpl(out, in1, in2, in3, len, op, stream); - } -} - -}; // end namespace detail -}; // end namespace linalg -}; // end namespace raft \ No newline at end of file diff --git a/cpp/include/raft/linalg/detail/unary_op.cuh b/cpp/include/raft/linalg/detail/unary_op.cuh deleted file mode 100644 index cdadc6f868..0000000000 --- a/cpp/include/raft/linalg/detail/unary_op.cuh +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -namespace raft { -namespace linalg { -namespace detail { - -template -__global__ void unaryOpKernel(OutType* out, const InType* in, IdxType len, Lambda op) -{ - typedef TxN_t InVecType; - typedef TxN_t OutVecType; - InVecType a; - OutVecType b; - IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); - idx *= InVecType::Ratio; - if (idx >= len) return; - a.load(in, idx); -#pragma unroll - for (int i = 0; i < InVecType::Ratio; ++i) { - b.val.data[i] = op(a.val.data[i]); - } - b.store(out, idx); -} - -template -void unaryOpImpl(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) -{ - const IdxType nblks = raft::ceildiv(VecLen ? 
len / VecLen : len, (IdxType)TPB); - unaryOpKernel - <<>>(out, in, len, op); - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} - -template -void unaryOpCaller(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) -{ - if (len <= 0) return; // silently skip in case of 0 length input - constexpr auto maxSize = sizeof(InType) >= sizeof(OutType) ? sizeof(InType) : sizeof(OutType); - size_t bytes = len * maxSize; - uint64_t inAddr = uint64_t(in); - uint64_t outAddr = uint64_t(out); - if (16 / maxSize && bytes % 16 == 0 && inAddr % 16 == 0 && outAddr % 16 == 0) { - unaryOpImpl(out, in, len, op, stream); - } else if (8 / maxSize && bytes % 8 == 0 && inAddr % 8 == 0 && outAddr % 8 == 0) { - unaryOpImpl(out, in, len, op, stream); - } else if (4 / maxSize && bytes % 4 == 0 && inAddr % 4 == 0 && outAddr % 4 == 0) { - unaryOpImpl(out, in, len, op, stream); - } else if (2 / maxSize && bytes % 2 == 0 && inAddr % 2 == 0 && outAddr % 2 == 0) { - unaryOpImpl(out, in, len, op, stream); - } else if (1 / maxSize) { - unaryOpImpl(out, in, len, op, stream); - } else { - unaryOpImpl(out, in, len, op, stream); - } -} - -template -__global__ void writeOnlyUnaryOpKernel(OutType* out, IdxType len, Lambda op) -{ - IdxType idx = threadIdx.x + ((IdxType)blockIdx.x * blockDim.x); - if (idx < len) { op(out + idx, idx); } -} - -template -void writeOnlyUnaryOpCaller(OutType* out, IdxType len, Lambda op, cudaStream_t stream) -{ - if (len <= 0) return; // silently skip in case of 0 length input - auto nblks = raft::ceildiv(len, TPB); - writeOnlyUnaryOpKernel<<>>(out, len, op); - RAFT_CUDA_TRY(cudaGetLastError()); -} - -}; // end namespace detail -}; // end namespace linalg -}; // end namespace raft diff --git a/cpp/include/raft/linalg/init.cuh b/cpp/include/raft/linalg/init.cuh index 5a810bf2ba..ad772b3989 100644 --- a/cpp/include/raft/linalg/init.cuh +++ b/cpp/include/raft/linalg/init.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,11 +18,10 @@ #pragma once -#include "detail/init.hpp" +#include #include -namespace raft { -namespace linalg { +namespace raft::linalg { /** * @brief Like Python range. 
@@ -37,7 +36,8 @@ namespace linalg { template void range(T* out, int start, int end, cudaStream_t stream) { - detail::range(out, start, end, stream); + return detail::map( + stream, out, end - start, compose_op{cast_op{}, add_const_op{start}}); } /** @@ -52,7 +52,7 @@ void range(T* out, int start, int end, cudaStream_t stream) template void range(T* out, int n, cudaStream_t stream) { - detail::range(out, n, stream); + return detail::map(stream, out, n, cast_op{}); } /** @@ -68,7 +68,6 @@ void zero(T* out, int n, cudaStream_t stream) RAFT_CUDA_TRY(cudaMemsetAsync(static_cast(out), 0, n * sizeof(T), stream)); } -} // namespace linalg -} // namespace raft +} // namespace raft::linalg -#endif \ No newline at end of file +#endif diff --git a/cpp/include/raft/linalg/map.cuh b/cpp/include/raft/linalg/map.cuh index 2b9e6c80a0..305a8aa3cc 100644 --- a/cpp/include/raft/linalg/map.cuh +++ b/cpp/include/raft/linalg/map.cuh @@ -22,119 +22,267 @@ #include #include -#include -#include -namespace raft { -namespace linalg { +namespace raft::linalg { /** - * @brief CUDA version of map - * @tparam InType data-type upon which the math operation will be performed - * @tparam MapOp the device-lambda performing the actual operation - * @tparam TPB threads-per-block in the final kernel launched - * @tparam Args additional parameters - * @tparam OutType data-type in which the result will be stored - * @param out the output of the map operation (assumed to be a device pointer) - * @param len number of elements in the input array - * @param map the device-lambda - * @param stream cuda-stream where to launch this kernel - * @param in the input array - * @param args additional input arrays + * @defgroup map Mapping ops + * @{ + */ + +/** + * @brief Map a function over zero or more input mdspans of the same size. + * + * The algorithm applied on `k` inputs can be described in the following pseudo-code: + * @code + * for (auto i: [0 ... out.size()]) { + * out[i] = f(in_0[i], in_1[i], ..., in_k[i]) + * } + * @endcode + * + * _Performance note_: when possible, this function loads the argument arrays and stores the output + * array using vectorized cuda load/store instructions. The size of the vectorization depends on the + * size of the largest input/output element type and on the alignment of all pointers. + * + * Usage example: + * @code{.cpp} + * #include + * #include + * #include + * #include + * + * auto input = raft::make_device_vector(res, n); + * ... fill input .. + * auto squares = raft::make_device_vector(res, n); + * raft::linalg::map(res, squares.view(), raft::sq_op{}, input.view()); + * @endcode + * + * @tparam OutType data-type of the result (device_mdspan) + * @tparam Func the device-lambda performing the actual operation + * @tparam InTypes data-types of the inputs (device_mdspan) + * + * @param[in] res raft::device_resources + * @param[out] out the output of the map operation (device_mdspan) + * @param[in] f device lambda + * (InTypes::value_type xs...) -> OutType::value_type + * @param[in] ins the inputs (each of the same size as the output) (device_mdspan) */ -template -void map_k( - OutType* out, IdxType len, MapOp map, cudaStream_t stream, const InType* in, Args... args) +template , + typename = raft::enable_if_input_device_mdspan> +void map(const raft::device_resources& res, OutType out, Func f, InTypes... 
ins) { - detail::mapImpl( - out, len, map, stream, in, args...); + return detail::map(res, out, f, ins...); } /** - * @defgroup map Mapping ops - * @{ + * @brief Map a function over one mdspan. + * + * @tparam InType1 data-type of the input (device_mdspan) + * @tparam OutType data-type of the result (device_mdspan) + * @tparam Func the device-lambda performing the actual operation + * + * @param[in] res raft::device_resources + * @param[in] in1 the input (the same size as the output) (device_mdspan) + * @param[out] out the output of the map operation (device_mdspan) + * @param[in] f device lambda + * (InType1::value_type x) -> OutType::value_type */ +template , + typename = raft::enable_if_input_device_mdspan> +void map(const raft::device_resources& res, InType1 in1, OutType out, Func f) +{ + return detail::map(res, out, f, in1); +} /** - * @brief CUDA version of map - * @tparam InType data-type for math operation of type raft::device_mdspan - * @tparam MapOp the device-lambda performing the actual operation - * @tparam TPB threads-per-block in the final kernel launched - * @tparam OutType data-type of result of type raft::device_mdspan - * @tparam Args additional parameters - * @param[in] handle raft::device_resources - * @param[in] in the input of type raft::device_mdspan - * @param[out] out the output of the map operation of type raft::device_mdspan - * @param[in] map the device-lambda - * @param[in] args additional input arrays + * @brief Map a function over two mdspans. + * + * @tparam InType1 data-type of the input (device_mdspan) + * @tparam InType2 data-type of the input (device_mdspan) + * @tparam OutType data-type of the result (device_mdspan) + * @tparam Func the device-lambda performing the actual operation + * + * @param[in] res raft::device_resources + * @param[in] in1 the input (the same size as the output) (device_mdspan) + * @param[in] in2 the input (the same size as the output) (device_mdspan) + * @param[out] out the output of the map operation (device_mdspan) + * @param[in] f device lambda + * (InType1::value_type x1, InType2::value_type x2) -> OutType::value_type */ -template , - typename = raft::enable_if_output_device_mdspan> -void map(raft::device_resources const& handle, InType in, OutType out, MapOp map, Args... args) + typename Func, + typename = raft::enable_if_output_device_mdspan, + typename = raft::enable_if_input_device_mdspan> +void map(const raft::device_resources& res, InType1 in1, InType2 in2, OutType out, Func f) { - using in_value_t = typename InType::value_type; - using out_value_t = typename OutType::value_type; - - RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output is not exhaustive"); - RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input is not exhaustive"); - RAFT_EXPECTS(out.size() == in.size(), "Size mismatch between Input and Output"); - - if (out.size() <= std::numeric_limits::max()) { - map_k( - out.data_handle(), out.size(), map, handle.get_stream(), in.data_handle(), args...); - } else { - map_k( - out.data_handle(), out.size(), map, handle.get_stream(), in.data_handle(), args...); - } + return detail::map(res, out, f, in1, in2); } /** - * @brief Perform an element-wise unary operation on the input offset into the output array + * @brief Map a function over three mdspans. 
+ * + * @tparam InType1 data-type of the input 1 (device_mdspan) + * @tparam InType2 data-type of the input 2 (device_mdspan) + * @tparam InType3 data-type of the input 3 (device_mdspan) + * @tparam OutType data-type of the result (device_mdspan) + * @tparam Func the device-lambda performing the actual operation + * + * @param[in] res raft::device_resources + * @param[in] in1 the input 1 (the same size as the output) (device_mdspan) + * @param[in] in2 the input 2 (the same size as the output) (device_mdspan) + * @param[in] in3 the input 3 (the same size as the output) (device_mdspan) + * @param[out] out the output of the map operation (device_mdspan) + * @param[in] f device lambda + * (InType1::value_type x1, InType2::value_type x2, InType3::value_type x3) -> OutType::value_type + */ +template , + typename = raft::enable_if_input_device_mdspan> +void map( + const raft::device_resources& res, InType1 in1, InType2 in2, InType3 in3, OutType out, Func f) +{ + return detail::map(res, out, f, in1, in2, in3); +} + +/** + * @brief Map a function over zero-based flat index (element offset) and zero or more inputs. + * + * The algorithm applied on `k` inputs can be described in a following pseudo-code: + * @code + * for (auto i: [0 ... out.size()]) { + * out[i] = f(i, in_0[i], in_1[i], ..., in_k[i]) + * } + * @endcode + * + * _Performance note_: when possible, this function loads the argument arrays and stores the output + * array using vectorized cuda load/store instructions. The size of the vectorization depends on the + * size of the largest input/output element type and on the alignment of all pointers. * * Usage example: * @code{.cpp} * #include - * #include + * #include * #include * #include - * ... - * raft::handle_t handle; + * * auto squares = raft::make_device_vector(handle, n); - * raft::linalg::map_offset(handle, squares.view(), raft::sq_op()); + * raft::linalg::map_offset(res, squares.view(), raft::sq_op{}); * @endcode * - * @tparam OutType Output mdspan type - * @tparam MapOp The unary operation type with signature `OutT func(const IdxT& idx);` - * @param[in] handle The raft handle - * @param[out] out Output array - * @param[in] op The unary operation + * @tparam OutType data-type of the result (device_mdspan) + * @tparam Func the device-lambda performing the actual operation + * @tparam InTypes data-types of the inputs (device_mdspan) + * + * @param[in] res raft::device_resources + * @param[out] out the output of the map operation (device_mdspan) + * @param[in] f device lambda + * (auto offset, InTypes::value_type xs...) -> OutType::value_type + * @param[in] ins the inputs (each of the same size as the output) (device_mdspan) */ template > -void map_offset(const raft::device_resources& handle, OutType out, MapOp op) + typename Func, + typename... InTypes, + typename = raft::enable_if_output_device_mdspan, + typename = raft::enable_if_input_device_mdspan> +void map_offset(const raft::device_resources& res, OutType out, Func f, InTypes... ins) { - RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); + return detail::map(res, out, f, ins...); +} - using out_value_t = typename OutType::value_type; +/** + * @brief Map a function over zero-based flat index (element offset) and one mdspan. 
+ * + * @tparam InType1 data-type of the input (device_mdspan) + * @tparam OutType data-type of the result (device_mdspan) + * @tparam Func the device-lambda performing the actual operation + * + * @param[in] res raft::device_resources + * @param[in] in1 the input (the same size as the output) (device_mdspan) + * @param[out] out the output of the map operation (device_mdspan) + * @param[in] f device lambda + * (auto offset, InType1::value_type x) -> OutType::value_type + */ +template , + typename = raft::enable_if_input_device_mdspan> +void map_offset(const raft::device_resources& res, InType1 in1, OutType out, Func f) +{ + return detail::map(res, out, f, in1); +} - thrust::tabulate( - handle.get_thrust_policy(), out.data_handle(), out.data_handle() + out.size(), op); +/** + * @brief Map a function over zero-based flat index (element offset) and two mdspans. + * + * @tparam InType1 data-type of the input (device_mdspan) + * @tparam InType2 data-type of the input (device_mdspan) + * @tparam OutType data-type of the result (device_mdspan) + * @tparam Func the device-lambda performing the actual operation + * + * @param[in] res raft::device_resources + * @param[in] in1 the input (the same size as the output) (device_mdspan) + * @param[in] in2 the input (the same size as the output) (device_mdspan) + * @param[out] out the output of the map operation (device_mdspan) + * @param[in] f device lambda + * (auto offset, InType1::value_type x1, InType2::value_type x2) -> OutType::value_type + */ +template , + typename = raft::enable_if_input_device_mdspan> +void map_offset(const raft::device_resources& res, InType1 in1, InType2 in2, OutType out, Func f) +{ + return detail::map(res, out, f, in1, in2); +} + +/** + * @brief Map a function over zero-based flat index (element offset) and three mdspans. 
+ * + * @tparam InType1 data-type of the input 1 (device_mdspan) + * @tparam InType2 data-type of the input 2 (device_mdspan) + * @tparam InType3 data-type of the input 3 (device_mdspan) + * @tparam OutType data-type of the result (device_mdspan) + * @tparam Func the device-lambda performing the actual operation + * + * @param[in] res raft::device_resources + * @param[in] in1 the input 1 (the same size as the output) (device_mdspan) + * @param[in] in2 the input 2 (the same size as the output) (device_mdspan) + * @param[in] in3 the input 3 (the same size as the output) (device_mdspan) + * @param[out] out the output of the map operation (device_mdspan) + * @param[in] f device lambda + * (auto offset, InType1::value_type x1, InType2::value_type x2, InType3::value_type x3) + * -> OutType::value_type + */ +template , + typename = raft::enable_if_input_device_mdspan> +void map_offset( + const raft::device_resources& res, InType1 in1, InType2 in2, InType3 in3, OutType out, Func f) +{ + return detail::map(res, out, f, in1, in2, in3); } /** @} */ // end of map -} // namespace linalg -}; // namespace raft +} // namespace raft::linalg #endif diff --git a/cpp/include/raft/linalg/ternary_op.cuh b/cpp/include/raft/linalg/ternary_op.cuh index aa3859bc23..1e347d69be 100644 --- a/cpp/include/raft/linalg/ternary_op.cuh +++ b/cpp/include/raft/linalg/ternary_op.cuh @@ -19,11 +19,9 @@ #pragma once -#include "detail/ternary_op.cuh" - #include #include -#include +#include namespace raft { namespace linalg { @@ -50,7 +48,7 @@ void ternaryOp(out_t* out, Lambda op, cudaStream_t stream) { - detail::ternaryOp(out, in1, in2, in3, len, op, stream); + return detail::map(stream, out, len, op, in1, in2, in3); } /** @@ -80,33 +78,7 @@ template ::max()) { - ternaryOp(out.data_handle(), - in1.data_handle(), - in2.data_handle(), - in3.data_handle(), - out.size(), - op, - handle.get_stream()); - } else { - ternaryOp(out.data_handle(), - in1.data_handle(), - in2.data_handle(), - in3.data_handle(), - out.size(), - op, - handle.get_stream()); - } + return map(handle, in1, in2, in3, out, op); } /** @} */ // end of group ternary_op @@ -114,4 +86,4 @@ void ternary_op( }; // end namespace linalg }; // end namespace raft -#endif \ No newline at end of file +#endif diff --git a/cpp/include/raft/linalg/unary_op.cuh b/cpp/include/raft/linalg/unary_op.cuh index ce102adfd2..23f932d2f2 100644 --- a/cpp/include/raft/linalg/unary_op.cuh +++ b/cpp/include/raft/linalg/unary_op.cuh @@ -18,11 +18,9 @@ #pragma once -#include "detail/unary_op.cuh" - #include #include -#include +#include namespace raft { namespace linalg { @@ -48,7 +46,7 @@ template void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_t stream) { - detail::unaryOpCaller(out, in, len, op, stream); + return detail::map(stream, out, len, op, in); } /** @@ -71,7 +69,11 @@ void unaryOp(OutType* out, const InType* in, IdxType len, Lambda op, cudaStream_ template void writeOnlyUnaryOp(OutType* out, IdxType len, Lambda op, cudaStream_t stream) { - detail::writeOnlyUnaryOpCaller(out, len, op, stream); + return detail::map(stream, out, len, [op] __device__(IdxType offset) { + OutType r; + op(&r, offset); + return r; + }); } /** @@ -97,20 +99,7 @@ template > void unary_op(raft::device_resources const& handle, InType in, OutType out, Lambda op) { - RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); - RAFT_EXPECTS(raft::is_row_or_column_major(in), "Input must be contiguous"); - RAFT_EXPECTS(out.size() == in.size(), "Size mismatch between 
Output and Input"); - - using in_value_t = typename InType::value_type; - using out_value_t = typename OutType::value_type; - - if (out.size() <= std::numeric_limits::max()) { - unaryOp( - out.data_handle(), in.data_handle(), out.size(), op, handle.get_stream()); - } else { - unaryOp( - out.data_handle(), in.data_handle(), out.size(), op, handle.get_stream()); - } + return map(handle, in, out, op); } /** @@ -130,17 +119,7 @@ template > void write_only_unary_op(const raft::device_resources& handle, OutType out, Lambda op) { - RAFT_EXPECTS(raft::is_row_or_column_major(out), "Output must be contiguous"); - - using out_value_t = typename OutType::value_type; - - if (out.size() <= std::numeric_limits::max()) { - writeOnlyUnaryOp( - out.data_handle(), out.size(), op, handle.get_stream()); - } else { - writeOnlyUnaryOp( - out.data_handle(), out.size(), op, handle.get_stream()); - } + return writeOnlyUnaryOp(out.data_handle(), out.size(), op, handle.get_stream()); } /** @} */ // end of group unary_op diff --git a/cpp/include/raft/matrix/init.cuh b/cpp/include/raft/matrix/init.cuh index f597bbd1c6..ed2fb4d209 100644 --- a/cpp/include/raft/matrix/init.cuh +++ b/cpp/include/raft/matrix/init.cuh @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -65,9 +66,7 @@ void fill(raft::device_resources const& handle, raft::device_mdspan inout, math_t scalar) { - RAFT_EXPECTS(raft::is_row_or_column_major(inout), "Data layout not supported"); - detail::setValue( - inout.data_handle(), inout.data_handle(), scalar, inout.size(), handle.get_stream()); + linalg::map(handle, inout, raft::const_op{scalar}); } /** @} */ // end of group matrix_init diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh index 4c6ef0e30b..4b6e6f5e31 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh @@ -42,11 +42,11 @@ #include #include -#include -#include #include +#include + namespace raft::neighbors::ivf_pq::detail { /** @@ -1311,13 +1311,13 @@ void ivfpq_search_worker(raft::device_resources const& handle, // of a cluster by processing the cluster at the same time as much as // possible. 
index_list_sorted_buf.resize(n_queries * n_probes, stream); - rmm::device_uvector index_list_buf(n_queries * n_probes, stream, mr); + auto index_list_buf = + make_device_mdarray(handle, mr, make_extents(n_queries * n_probes)); rmm::device_uvector cluster_labels_out(n_queries * n_probes, stream, mr); - auto index_list = index_list_buf.data(); + auto index_list = index_list_buf.data_handle(); index_list_sorted = index_list_sorted_buf.data(); - thrust::sequence(handle.get_thrust_policy(), - thrust::device_pointer_cast(index_list), - thrust::device_pointer_cast(index_list + n_queries * n_probes)); + + linalg::map_offset(handle, index_list_buf.view(), identity_op{}); int begin_bit = 0; int end_bit = sizeof(uint32_t) * 8; @@ -1376,13 +1376,15 @@ void ivfpq_search_worker(raft::device_resources const& handle, topK); rmm::device_uvector device_lut(search_instance.device_lut_size, stream, mr); - rmm::device_uvector query_kths(0, stream, mr); + std::optional> query_kths_buf{std::nullopt}; + float* query_kths = nullptr; if (manage_local_topk) { - query_kths.resize(n_queries, stream); - thrust::fill_n(handle.get_thrust_policy(), - query_kths.data(), - n_queries, - float(dummy_block_sort_t::queue_t::kDummy)); + query_kths_buf.emplace( + make_device_mdarray(handle, mr, make_extents(n_queries))); + linalg::map(handle, + query_kths_buf->view(), + raft::const_op{dummy_block_sort_t::queue_t::kDummy}); + query_kths = query_kths_buf->data_handle(); } search_instance(stream, index.size(), @@ -1401,7 +1403,7 @@ void ivfpq_search_worker(raft::device_resources const& handle, chunk_index.data(), query, index_list_sorted, - query_kths.data(), + query_kths, device_lut.data(), distances_buf.data(), neighbors_ptr); diff --git a/cpp/test/linalg/map.cu b/cpp/test/linalg/map.cu index 5b52374789..15b40808ee 100644 --- a/cpp/test/linalg/map.cu +++ b/cpp/test/linalg/map.cu @@ -37,13 +37,15 @@ void mapLaunch(OutType* out, raft::device_resources handle{stream}; auto out_view = raft::make_device_vector_view(out, len); auto in1_view = raft::make_device_vector_view(in1, len); + auto in2_view = raft::make_device_vector_view(in2, len); + auto in3_view = raft::make_device_vector_view(in3, len); map( handle, - in1_view, out_view, [=] __device__(InType a, InType b, InType c) { return a + b + c + scalar; }, - in2, - in3); + in1_view, + in2_view, + in3_view); } template
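With the detail headers for unary_op, binary_op, ternary_op and init removed, all of the element-wise wrappers above (unaryOp, binaryOp, ternaryOp, writeOnlyUnaryOp, range, matrix::fill) funnel into raft::linalg::map / map_offset or detail::map. A minimal usage sketch of the consolidated public API (the resource handle res, the length n, the example() function and the vector names are illustrative, not part of the patch):

  #include <raft/core/device_mdarray.hpp>
  #include <raft/core/device_resources.hpp>
  #include <raft/core/operators.hpp>
  #include <raft/linalg/binary_op.cuh>
  #include <raft/linalg/map.cuh>

  void example(raft::device_resources const& res, int n)
  {
    auto a   = raft::make_device_vector<float>(res, n);
    auto b   = raft::make_device_vector<float>(res, n);
    auto out = raft::make_device_vector<float>(res, n);

    // mdspan form of an element-wise binary op: out[i] = a[i] + b[i]
    raft::linalg::map(res, a.view(), b.view(), out.view(), raft::add_op{});

    // zero-input map_offset: out[i] = float(i)
    raft::linalg::map_offset(res, out.view(), raft::cast_op<float>{});

    // zero-input map: fill with a constant (this is what matrix::fill now does)
    raft::linalg::map(res, out.view(), raft::const_op{42.0f});

    // the raw-pointer wrapper keeps its signature and now forwards to detail::map
    raft::linalg::binaryOp(
      out.data_handle(), a.data_handle(), b.data_handle(), n, raft::add_op{}, res.get_stream());
  }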
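The performance note in the map documentation reflects the structure of detail::map_kernel: a scalar pass over the few elements before the first 16-byte-aligned output address (skip_cnt_left), a TxN_t-vectorized main loop, and a scalar pass over the remaining tail, with ratio_selector shrinking the vector width at runtime until all pointers share the same alignment. A stripped-down sketch of that pattern for float data and a fixed width of four (an illustration of the idea only, not the kernel added here):

  #include <cstdint>

  // Assumes in and out have the same address modulo 16 bytes; the real code guarantees
  // the equivalent property by shrinking the vector width until the alignments match.
  // Launch with at least len / 4 + 4 threads in total.
  __global__ void scale_vec4(float* out, const float* in, size_t len, float alpha)
  {
    const size_t tid = blockIdx.x * size_t(blockDim.x) + threadIdx.x;

    // Scalar head: elements before the first 16-byte-aligned output address.
    size_t head = (16 - reinterpret_cast<std::uintptr_t>(out) % 16) % 16 / sizeof(float);
    if (head > len) { head = len; }
    if (tid < head) { out[tid] = alpha * in[tid]; }

    // Vectorized main loop: four elements per thread via one 16-byte load/store.
    const size_t i = head + 4 * tid;
    if (i + 4 <= len) {
      float4 v = *reinterpret_cast<const float4*>(in + i);
      v.x *= alpha; v.y *= alpha; v.z *= alpha; v.w *= alpha;
      *reinterpret_cast<float4*>(out + i) = v;
    }

    // Scalar tail: the (len - head) % 4 elements after the last full float4.
    const size_t j = head + (len - head) / 4 * 4 + tid;
    if (tid < (len - head) % 4) { out[j] = alpha * in[j]; }
  }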
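linalg::range keeps its semantics under the rewrite: detail::map with the element offset as the only argument composes add_const_op{start} with cast_op, so out[i] ends up as start + i converted to T, the same sequence the removed thrust::counting_iterator / thrust::copy implementation produced. For instance (vector size and values illustrative):

  #include <raft/core/device_mdarray.hpp>
  #include <raft/core/device_resources.hpp>
  #include <raft/linalg/init.cuh>

  void make_indices(raft::device_resources const& res)
  {
    auto idx = raft::make_device_vector<int>(res, 10);
    // Fills idx with 5, 6, ..., 14 (the signature of range() itself is unchanged).
    raft::linalg::range(idx.data_handle(), 5, 15, res.get_stream());
  }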