diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml index 051dad7eaa..2c92ad0a99 100644 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -29,12 +29,15 @@ dependencies: - libcusolver=11.4.1.48 - libcusparse-dev=11.7.5.86 - libcusparse=11.7.5.86 +- matplotlib - nccl>=2.9.9 - ninja - nlohmann_json>=3.11.2 - nvcc_linux-64=11.8 - openblas -- rmm=23.12.* +- pandas +- pyyaml +- rmm==23.12.* - scikit-build>=0.13.1 - sysroot_linux-64==2.17 name: bench_ann_cuda-118_arch-x86_64 diff --git a/cpp/bench/ann/src/raft/raft_benchmark.cu b/cpp/bench/ann/src/raft/raft_benchmark.cu index 7b859269b0..fa20c5c223 100644 --- a/cpp/bench/ann/src/raft/raft_benchmark.cu +++ b/cpp/bench/ann/src/raft/raft_benchmark.cu @@ -154,6 +154,7 @@ void parse_build_param(const nlohmann::json& conf, param.build_algo = raft::neighbors::cagra::graph_build_algo::NN_DESCENT; } } + if (conf.contains("nn_descent_niter")) { param.nn_descent_niter = conf.at("nn_descent_niter"); } } template diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt index ca4b0f099d..5da2cd916b 100644 --- a/cpp/bench/prims/CMakeLists.txt +++ b/cpp/bench/prims/CMakeLists.txt @@ -32,6 +32,7 @@ function(ConfigureBench) PRIVATE raft::raft raft_internal $<$:raft::compiled> + ${RAFT_CTK_MATH_DEPENDENCIES} benchmark::benchmark Threads::Threads $ @@ -73,11 +74,14 @@ function(ConfigureBench) endfunction() if(BUILD_PRIMS_BENCH) + ConfigureBench( + NAME CORE_BENCH PATH bench/prims/core/bitset.cu bench/prims/core/copy.cu bench/prims/main.cpp + ) + ConfigureBench( NAME CLUSTER_BENCH PATH bench/prims/cluster/kmeans_balanced.cu bench/prims/cluster/kmeans.cu bench/prims/main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY ) - ConfigureBench(NAME CORE_BENCH PATH bench/prims/core/bitset.cu bench/prims/main.cpp) ConfigureBench( NAME TUNE_DISTANCE PATH bench/prims/distance/tune_pairwise/kernel.cu diff --git a/cpp/bench/prims/cluster/kmeans_balanced.cu b/cpp/bench/prims/cluster/kmeans_balanced.cu index effe2a55a4..129578c303 100644 --- a/cpp/bench/prims/cluster/kmeans_balanced.cu +++ b/cpp/bench/prims/cluster/kmeans_balanced.cu @@ -50,10 +50,10 @@ struct KMeansBalanced : public fixture { constexpr T kRangeMin = std::is_integral_v ? std::numeric_limits::min() : T(-1); if constexpr (std::is_integral_v) { raft::random::uniformInt( - rng, X.data_handle(), params.data.rows * params.data.cols, kRangeMin, kRangeMax, stream); + handle, rng, X.data_handle(), params.data.rows * params.data.cols, kRangeMin, kRangeMax); } else { raft::random::uniform( - rng, X.data_handle(), params.data.rows * params.data.cols, kRangeMin, kRangeMax, stream); + handle, rng, X.data_handle(), params.data.rows * params.data.cols, kRangeMin, kRangeMax); } resource::sync_stream(handle, stream); } diff --git a/cpp/bench/prims/core/copy.cu b/cpp/bench/prims/core/copy.cu new file mode 100644 index 0000000000..31ee83b924 --- /dev/null +++ b/cpp/bench/prims/core/copy.cu @@ -0,0 +1,401 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace raft::bench::core { + +template +auto constexpr const default_dims = []() { + auto dims = std::array{}; + std::fill(dims.begin(), dims.end(), 2); + return dims; +}(); + +template +auto constexpr const default_dims = std::array{3000000}; + +template +auto constexpr const default_dims = std::array{1000, 3000}; + +template +auto constexpr const default_dims = std::array{20, 300, 500}; + +template > +struct bench_array_type; + +template +struct bench_array_type> { + template + auto static constexpr const extent_type = raft::dynamic_extent; + + using type = + std::conditional_t...>, LayoutPolicy>, + device_mdarray...>, LayoutPolicy>>; +}; + +template +struct params { + std::array dims = default_dims; + using src_array_type = + typename bench_array_type::type; + using dst_array_type = + typename bench_array_type::type; +}; + +template +struct CopyBench : public fixture { + using params_type = + params; + using src_array_type = typename params_type::src_array_type; + using dst_array_type = typename params_type::dst_array_type; + explicit CopyBench(const params_type& ps) + : fixture{true}, + res_{}, + params_{ps}, + src_{ + res_, + typename src_array_type::mapping_type{ + std::apply([](auto... exts) { return make_extents(exts...); }, ps.dims)}, + typename src_array_type::container_policy_type{}, + }, + dst_{ + res_, + typename dst_array_type::mapping_type{ + std::apply([](auto... 
exts) { return make_extents(exts...); }, ps.dims)}, + typename dst_array_type::container_policy_type{}, + } + { + res_.get_cublas_handle(); // initialize cublas handle + auto src_data = std::vector(src_.size()); + std::iota(src_data.begin(), src_data.end(), SrcT{}); + raft::copy(src_.data_handle(), src_data.data(), src_.size(), res_.get_stream()); + } + + void run_benchmark(::benchmark::State& state) override + { + loop_on_state(state, [this]() { raft::copy(res_, dst_.view(), src_.view()); }); + } + + private: + raft::device_resources res_; + params_type params_; + src_array_type src_; + dst_array_type dst_; +}; + +template +auto static const inputs = std::vector{ParamsT{}}; + +#define COPY_REGISTER(BenchT) \ + RAFT_BENCH_REGISTER(BenchT, "BenchT", inputs) + +using copy_bench_device_device_1d_same_dtype_same_layout = CopyBench; +using copy_bench_device_device_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_device_device_1d_diff_dtype_diff_layout = CopyBench; +using copy_bench_device_device_2d_same_dtype_diff_layout = CopyBench; +using copy_bench_device_device_2d_same_dtype_diff_layout_cublas = CopyBench; +using copy_bench_device_device_3d_diff_dtype_diff_layout = CopyBench; +using copy_bench_device_device_3d_diff_dtype_same_layout = CopyBench; + +using copy_bench_host_host_1d_same_dtype_same_layout = CopyBench; +using copy_bench_host_host_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_host_1d_diff_dtype_diff_layout = CopyBench; +using copy_bench_host_host_2d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_host_2d_same_dtype_diff_layout_float_float = CopyBench; +using copy_bench_host_host_3d_diff_dtype_same_layout = CopyBench; +using copy_bench_host_host_3d_diff_dtype_diff_layout = CopyBench; + +using copy_bench_device_host_1d_same_dtype_same_layout = CopyBench; +using copy_bench_device_host_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_device_host_1d_diff_dtype_diff_layout = CopyBench; +using copy_bench_device_host_2d_same_dtype_diff_layout = CopyBench; +using copy_bench_device_host_2d_same_dtype_diff_layout_cublas = CopyBench; +using copy_bench_device_host_3d_diff_dtype_same_layout = CopyBench; +using copy_bench_device_host_3d_diff_dtype_diff_layout = CopyBench; + +using copy_bench_host_device_1d_same_dtype_same_layout = CopyBench; +using copy_bench_host_device_1d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_device_1d_diff_dtype_diff_layout = CopyBench; +using copy_bench_host_device_2d_same_dtype_diff_layout = CopyBench; +using copy_bench_host_device_2d_same_dtype_diff_layout_cublas = CopyBench; +using copy_bench_host_device_3d_diff_dtype_diff_layout = CopyBench; +using copy_bench_host_device_3d_diff_dtype_same_layout = CopyBench; + +// COPY_REGISTER(copy_bench_same_dtype_1d_host_host); +COPY_REGISTER(copy_bench_device_device_1d_same_dtype_same_layout); +COPY_REGISTER(copy_bench_device_device_1d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_device_1d_diff_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_device_2d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_device_2d_same_dtype_diff_layout_cublas); +COPY_REGISTER(copy_bench_device_device_3d_diff_dtype_same_layout); +COPY_REGISTER(copy_bench_device_device_3d_diff_dtype_diff_layout); + +COPY_REGISTER(copy_bench_host_host_1d_same_dtype_same_layout); +COPY_REGISTER(copy_bench_host_host_1d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_host_1d_diff_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_host_2d_same_dtype_diff_layout); 
+COPY_REGISTER(copy_bench_host_host_2d_same_dtype_diff_layout_float_float); +COPY_REGISTER(copy_bench_host_host_3d_diff_dtype_same_layout); +COPY_REGISTER(copy_bench_host_host_3d_diff_dtype_diff_layout); + +COPY_REGISTER(copy_bench_device_host_1d_same_dtype_same_layout); +COPY_REGISTER(copy_bench_device_host_1d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_host_1d_diff_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_host_2d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_device_host_2d_same_dtype_diff_layout_cublas); +COPY_REGISTER(copy_bench_device_host_3d_diff_dtype_same_layout); +COPY_REGISTER(copy_bench_device_host_3d_diff_dtype_diff_layout); + +COPY_REGISTER(copy_bench_host_device_1d_same_dtype_same_layout); +COPY_REGISTER(copy_bench_host_device_1d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_device_1d_diff_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_device_2d_same_dtype_diff_layout); +COPY_REGISTER(copy_bench_host_device_2d_same_dtype_diff_layout_cublas); +COPY_REGISTER(copy_bench_host_device_3d_diff_dtype_same_layout); +COPY_REGISTER(copy_bench_host_device_3d_diff_dtype_diff_layout); + +} // namespace raft::bench::core diff --git a/cpp/bench/prims/distance/kernels.cu b/cpp/bench/prims/distance/kernels.cu index 7d916e6ce0..3f74759665 100644 --- a/cpp/bench/prims/distance/kernels.cu +++ b/cpp/bench/prims/distance/kernels.cu @@ -46,9 +46,9 @@ struct GramMatrix : public fixture { A.resize(params.m * params.k, stream); B.resize(params.k * params.n, stream); C.resize(params.m * params.n, stream); - raft::random::Rng r(123456ULL); - r.uniform(A.data(), params.m * params.k, T(-1.0), T(1.0), stream); - r.uniform(B.data(), params.k * params.n, T(-1.0), T(1.0), stream); + raft::random::RngState rng(123456ULL); + raft::random::uniform(handle, rng, A.data(), params.m * params.k, T(-1.0), T(1.0)); + raft::random::uniform(handle, rng, B.data(), params.k * params.n, T(-1.0), T(1.0)); } ~GramMatrix() diff --git a/cpp/bench/prims/linalg/norm.cu b/cpp/bench/prims/linalg/norm.cu index f83953f8e4..1db23e4ca4 100644 --- a/cpp/bench/prims/linalg/norm.cu +++ b/cpp/bench/prims/linalg/norm.cu @@ -42,7 +42,7 @@ struct rowNorm : public fixture { rowNorm(const norm_input& p) : params(p), in(p.rows * p.cols, stream), dots(p.rows, stream) { raft::random::RngState rng{1234}; - raft::random::uniform(rng, in.data(), p.rows * p.cols, (T)-10.0, (T)10.0, stream); + raft::random::uniform(handle, rng, in.data(), p.rows * p.cols, (T)-10.0, (T)10.0); } void run_benchmark(::benchmark::State& state) override diff --git a/cpp/bench/prims/linalg/normalize.cu b/cpp/bench/prims/linalg/normalize.cu index ad9052a008..91319e774c 100644 --- a/cpp/bench/prims/linalg/normalize.cu +++ b/cpp/bench/prims/linalg/normalize.cu @@ -41,7 +41,7 @@ struct rowNormalize : public fixture { : params(p), in(p.rows * p.cols, stream), out(p.rows * p.cols, stream) { raft::random::RngState rng{1234}; - raft::random::uniform(rng, in.data(), p.rows * p.cols, (T)-10.0, (T)10.0, stream); + raft::random::uniform(handle, rng, in.data(), p.rows * p.cols, (T)-10.0, (T)10.0); } void run_benchmark(::benchmark::State& state) override diff --git a/cpp/bench/prims/linalg/reduce_cols_by_key.cu b/cpp/bench/prims/linalg/reduce_cols_by_key.cu index ac0c612ee4..1b584e80c8 100644 --- a/cpp/bench/prims/linalg/reduce_cols_by_key.cu +++ b/cpp/bench/prims/linalg/reduce_cols_by_key.cu @@ -42,7 +42,7 @@ struct reduce_cols_by_key : public fixture { : params(p), in(p.rows * p.cols, stream), out(p.rows * p.keys, stream), keys(p.cols, stream) { 
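    // Seed an RNG and generate random keys; note the updated API below,
    // which takes the resources handle (and its stream) rather than an
    // explicit stream argument.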
raft::random::RngState rng{42}; - raft::random::uniformInt(rng, keys.data(), p.cols, (KeyT)0, (KeyT)p.keys, stream); + raft::random::uniformInt(handle, rng, keys.data(), p.cols, (KeyT)0, (KeyT)p.keys); } void run_benchmark(::benchmark::State& state) override diff --git a/cpp/bench/prims/linalg/reduce_rows_by_key.cu b/cpp/bench/prims/linalg/reduce_rows_by_key.cu index aa9c9a1f62..b68cefc274 100644 --- a/cpp/bench/prims/linalg/reduce_rows_by_key.cu +++ b/cpp/bench/prims/linalg/reduce_rows_by_key.cu @@ -37,7 +37,7 @@ struct reduce_rows_by_key : public fixture { workspace(p.rows, stream) { raft::random::RngState rng{42}; - raft::random::uniformInt(rng, keys.data(), p.rows, (KeyT)0, (KeyT)p.keys, stream); + raft::random::uniformInt(handle, rng, keys.data(), p.rows, (KeyT)0, (KeyT)p.keys); } void run_benchmark(::benchmark::State& state) override diff --git a/cpp/bench/prims/matrix/argmin.cu b/cpp/bench/prims/matrix/argmin.cu index a8f667257a..afee81aa00 100644 --- a/cpp/bench/prims/matrix/argmin.cu +++ b/cpp/bench/prims/matrix/argmin.cu @@ -40,7 +40,7 @@ struct Argmin : public fixture { raft::random::RngState rng{1234}; raft::random::uniform( - rng, matrix.data_handle(), params.rows * params.cols, T(-1), T(1), stream); + handle, rng, matrix.data_handle(), params.rows * params.cols, T(-1), T(1)); resource::sync_stream(handle, stream); } diff --git a/cpp/bench/prims/matrix/gather.cu b/cpp/bench/prims/matrix/gather.cu index ca6a2830bd..00a145ffa9 100644 --- a/cpp/bench/prims/matrix/gather.cu +++ b/cpp/bench/prims/matrix/gather.cu @@ -52,11 +52,11 @@ struct Gather : public fixture { raft::random::RngState rng{1234}; raft::random::uniform( - rng, matrix.data_handle(), params.rows * params.cols, T(-1), T(1), stream); + handle, rng, matrix.data_handle(), params.rows * params.cols, T(-1), T(1)); raft::random::uniformInt( handle, rng, map.data_handle(), params.map_length, (MapT)0, (MapT)params.rows); if constexpr (Conditional) { - raft::random::uniform(rng, stencil.data_handle(), params.map_length, T(-1), T(1), stream); + raft::random::uniform(handle, rng, stencil.data_handle(), params.map_length, T(-1), T(1)); } resource::sync_stream(handle, stream); } diff --git a/cpp/bench/prims/neighbors/cagra_bench.cuh b/cpp/bench/prims/neighbors/cagra_bench.cuh index 63f6c14686..07e93a3473 100644 --- a/cpp/bench/prims/neighbors/cagra_bench.cuh +++ b/cpp/bench/prims/neighbors/cagra_bench.cuh @@ -62,20 +62,20 @@ struct CagraBench : public fixture { constexpr T kRangeMin = std::is_integral_v ? 
std::numeric_limits::min() : T(-1); if constexpr (std::is_integral_v) { raft::random::uniformInt( - state, dataset_.data_handle(), dataset_.size(), kRangeMin, kRangeMax, stream); + handle, state, dataset_.data_handle(), dataset_.size(), kRangeMin, kRangeMax); raft::random::uniformInt( - state, queries_.data_handle(), queries_.size(), kRangeMin, kRangeMax, stream); + handle, state, queries_.data_handle(), queries_.size(), kRangeMin, kRangeMax); } else { raft::random::uniform( - state, dataset_.data_handle(), dataset_.size(), kRangeMin, kRangeMax, stream); + handle, state, dataset_.data_handle(), dataset_.size(), kRangeMin, kRangeMax); raft::random::uniform( - state, queries_.data_handle(), queries_.size(), kRangeMin, kRangeMax, stream); + handle, state, queries_.data_handle(), queries_.size(), kRangeMin, kRangeMax); } // Generate random knn graph raft::random::uniformInt( - state, knn_graph_.data_handle(), knn_graph_.size(), 0, ps.n_samples - 1, stream); + handle, state, knn_graph_.data_handle(), knn_graph_.size(), 0, ps.n_samples - 1); auto metric = raft::distance::DistanceType::L2Expanded; diff --git a/cpp/bench/prims/neighbors/knn.cuh b/cpp/bench/prims/neighbors/knn.cuh index e580b20fdc..31ac869b37 100644 --- a/cpp/bench/prims/neighbors/knn.cuh +++ b/cpp/bench/prims/neighbors/knn.cuh @@ -260,9 +260,9 @@ struct knn : public fixture { constexpr T kRangeMax = std::is_integral_v ? std::numeric_limits::max() : T(1); constexpr T kRangeMin = std::is_integral_v ? std::numeric_limits::min() : T(-1); if constexpr (std::is_integral_v) { - raft::random::uniformInt(state, vec.data(), n, kRangeMin, kRangeMax, stream); + raft::random::uniformInt(handle, state, vec.data(), n, kRangeMin, kRangeMax); } else { - raft::random::uniform(state, vec.data(), n, kRangeMin, kRangeMax, stream); + raft::random::uniform(handle, state, vec.data(), n, kRangeMin, kRangeMax); } } diff --git a/cpp/include/raft/core/copy.cuh b/cpp/include/raft/core/copy.cuh new file mode 100644 index 0000000000..f256f9ea0f --- /dev/null +++ b/cpp/include/raft/core/copy.cuh @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +namespace raft { +/** + * @brief Copy data from one mdspan to another with the same extents + * + * This function copies data from one mdspan to another, regardless of whether + * or not the mdspans have the same layout, memory type (host/device/managed) + * or data type. So long as it is possible to convert the data type from source + * to destination, and the extents are equal, this function should be able to + * perform the copy. Any necessary device operations will be stream-ordered via the CUDA stream + * provided by the `raft::resources` argument. + * + * This header includes a custom kernel used for copying data between + * completely arbitrary mdspans on device. To compile this function in a + * non-CUDA translation unit, `raft/core/copy.hpp` may be used instead. 
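+ *
+ * As a usage sketch (names and extents are illustrative only), the
+ * following copies a device matrix into a host matrix with a different
+ * element type, a case that may require the custom kernel:
+ * @code{.cpp}
+ * raft::resources res;
+ * auto src = raft::make_device_matrix<float, int>(res, 128, 64);
+ * auto dst = raft::make_host_matrix<double, int>(128, 64);
+ * raft::copy(res, dst.view(), src.view());
+ * @endcode
+ *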
The + * pure C++ header will correctly compile even without a CUDA compiler. + * Depending on the specialization, this CUDA header may invoke the kernel and + * therefore require a CUDA compiler. + * + * Limitations: Currently this function does not support copying directly + * between two arbitrary mdspans on different CUDA devices. It is assumed that the caller sets the + * correct CUDA device. Furthermore, host-to-host copies that require a transformation of the + * underlying memory layout are currently not performant, although they are supported. + * + * Note that when copying to an mdspan with a non-unique layout (i.e. the same + * underlying memory is addressed by different element indexes), the source + * data must contain non-unique values for every non-unique destination + * element. If this is not the case, the behavior is undefined. Some copies + * to non-unique layouts which are well-defined will nevertheless fail with an + * exception to avoid race conditions in the underlying copy. + * + * @tparam DstType An mdspan type for the destination container. + * @tparam SrcType An mdspan type for the source container + * @param res raft::resources used to provide a stream for copies involving the + * device. + * @param dst The destination mdspan. + * @param src The source mdspan. + */ +template +detail::mdspan_copyable_with_kernel_t copy(resources const& res, + DstType&& dst, + SrcType&& src) +{ + detail::copy(res, std::forward(dst), std::forward(src)); +} + +#ifndef RAFT_NON_CUDA_COPY_IMPLEMENTED +#define RAFT_NON_CUDA_COPY_IMPLEMENTED +template +detail::mdspan_copyable_not_with_kernel_t copy(resources const& res, + DstType&& dst, + SrcType&& src) +{ + detail::copy(res, std::forward(dst), std::forward(src)); +} +#endif +} // namespace raft diff --git a/cpp/include/raft/core/copy.hpp b/cpp/include/raft/core/copy.hpp new file mode 100644 index 0000000000..0a16b742a2 --- /dev/null +++ b/cpp/include/raft/core/copy.hpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#include +namespace raft { + +#ifndef RAFT_NON_CUDA_COPY_IMPLEMENTED +#define RAFT_NON_CUDA_COPY_IMPLEMENTED +/** + * @brief Copy data from one mdspan to another with the same extents + * + * This function copies data from one mdspan to another, regardless of whether + * or not the mdspans have the same layout, memory type (host/device/managed) + * or data type. So long as it is possible to convert the data type from source + * to destination, and the extents are equal, this function should be able to + * perform the copy. + * + * This header does _not_ include the custom kernel used for copying data + * between completely arbitrary mdspans on device. For arbitrary copies of this + * kind, `#include ` instead. Specializations of this + * function that require the custom kernel will be SFINAE-omitted when this + * header is used instead of `copy.cuh`. 
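+ *
+ * For example (illustrative names only), a host-to-host copy that changes
+ * only the memory layout compiles without a CUDA compiler:
+ * @code{.cpp}
+ * raft::resources res;
+ * auto src = raft::make_host_matrix<float, int>(128, 64);
+ * auto dst = raft::make_host_matrix<float, int, raft::layout_f_contiguous>(128, 64);
+ * raft::copy(res, dst.view(), src.view());
+ * @endcode
+ *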
This header _does_ support + * device-to-device copies that can be performed with cuBLAS or a + * straightforward cudaMemcpy. Any necessary device operations will be stream-ordered via the CUDA + * stream provided by the `raft::resources` argument. + * + * Limitations: Currently this function does not support copying directly + * between two arbitrary mdspans on different CUDA devices. It is assumed that the caller sets the + * correct CUDA device. Furthermore, host-to-host copies that require a transformation of the + * underlying memory layout are currently not performant, although they are supported. + * + * Note that when copying to an mdspan with a non-unique layout (i.e. the same + * underlying memory is addressed by different element indexes), the source + * data must contain non-unique values for every non-unique destination + * element. If this is not the case, the behavior is undefined. Some copies + * to non-unique layouts which are well-defined will nevertheless fail with an + * exception to avoid race conditions in the underlying copy. + * + * @tparam DstType An mdspan type for the destination container. + * @tparam SrcType An mdspan type for the source container + * @param res raft::resources used to provide a stream for copies involving the + * device. + * @param dst The destination mdspan. + * @param src The source mdspan. + */ +template +detail::mdspan_copyable_not_with_kernel_t copy(resources const& res, + DstType&& dst, + SrcType&& src) +{ + detail::copy(res, std::forward(dst), std::forward(src)); +} +#endif + +} // namespace raft diff --git a/cpp/include/raft/core/cuda_support.hpp b/cpp/include/raft/core/cuda_support.hpp new file mode 100644 index 0000000000..07fb95a921 --- /dev/null +++ b/cpp/include/raft/core/cuda_support.hpp @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +namespace raft { +#ifndef RAFT_DISABLE_CUDA +auto constexpr static const CUDA_ENABLED = true; +#else +auto constexpr static const CUDA_ENABLED = false; +#endif +} // namespace raft diff --git a/cpp/include/raft/core/detail/copy.hpp b/cpp/include/raft/core/detail/copy.hpp new file mode 100644 index 0000000000..b23660fefe --- /dev/null +++ b/cpp/include/raft/core/detail/copy.hpp @@ -0,0 +1,541 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef RAFT_DISABLE_CUDA +#include +#include +#include +#include +#ifdef __CUDACC__ +#include +#endif +#endif + +namespace raft { +namespace detail { + +template +struct mdspan_copyable : std::false_type { + auto static constexpr const custom_kernel_allowed = false; + auto static constexpr const custom_kernel_not_allowed = false; +}; + +/* + * A helper struct used to determine whether one mdspan type can be copied to + * another and if so how + */ +template +struct mdspan_copyable>>, + std::bool_constant>>>>> { + using dst_type = std::remove_reference_t; + using src_type = std::remove_reference_t; + + // Extents properties + using dst_extents_type = typename dst_type::extents_type; + using src_extents_type = typename src_type::extents_type; + using index_type = + std::conditional_t<(std::numeric_limits::max() > + std::numeric_limits::max()), + typename dst_extents_type::index_type, + typename src_extents_type::index_type>; + + // Dtype properties + using dst_value_type = typename dst_type::value_type; + using src_value_type = typename src_type::value_type; + using dst_element_type = typename dst_type::element_type; + using src_element_type = typename src_type::element_type; + auto static constexpr const same_dtype = std::is_same_v; + auto static constexpr const compatible_dtype = + std::is_assignable_v; + + auto static constexpr const dst_float = std::is_same_v; + auto static constexpr const src_float = std::is_same_v; + auto static constexpr const dst_double = std::is_same_v; + auto static constexpr const src_double = std::is_same_v; + + auto static constexpr const both_float = dst_float && src_float; + auto static constexpr const both_double = dst_double && src_double; + auto static constexpr const both_float_or_both_double = both_float || both_double; + + // Ranks + auto static constexpr const dst_rank = dst_extents_type::rank(); + auto static constexpr const src_rank = src_extents_type::rank(); + auto static constexpr const compatible_rank = (dst_rank == src_rank); + auto static constexpr const has_vector_rank = (dst_rank == 1); + auto static constexpr const has_matrix_rank = (dst_rank == 2); + + // Layout properties + using dst_layout_type = typename dst_type::layout_type; + using src_layout_type = typename src_type::layout_type; + + auto static constexpr const same_layout = std::is_same_v; + + auto static check_for_unique_dst(dst_type dst) + { + if constexpr (!dst_type::is_always_unique()) { + RAFT_EXPECTS(dst.is_unique(), "Destination mdspan must be unique for parallelized copies"); + } + } + + auto static constexpr const src_contiguous = + std::disjunction_v, + std::is_same>; + + auto static constexpr const dst_contiguous = + std::disjunction_v, + std::is_same>; + + auto static constexpr const both_contiguous = src_contiguous && dst_contiguous; + + auto static constexpr const same_underlying_layout = + std::disjunction_v, + std::bool_constant>; + // Layout for intermediate tile if copying through custom kernel + using tile_layout_type = + std::conditional_t>; + + // Accessibility + auto static constexpr const dst_device_accessible = is_device_mdspan_v; + auto static constexpr const src_device_accessible = is_device_mdspan_v; + auto static constexpr const both_device_accessible = + dst_device_accessible && src_device_accessible; + + auto static constexpr const dst_host_accessible = is_host_mdspan_v; + auto static constexpr const src_host_accessible = 
is_host_mdspan_v; + auto static constexpr const both_host_accessible = dst_host_accessible && src_host_accessible; + + // Allowed copy codepaths + auto static constexpr const can_use_host = both_host_accessible; + +#if (defined(__AVX__) || defined(__SSE__) || defined(__ARM_NEON)) + // TODO(wphicks): Following should be only necessary restrictions. Test if + // perf actually improves once fully implemented. + // auto static constexpr const can_use_simd = can_use_host && both_contiguous && + // both_float_or_both_double; + auto static constexpr const can_use_simd = + can_use_host && both_contiguous && both_float && has_matrix_rank; +#else + auto static constexpr const can_use_simd = false; +#endif + + auto static constexpr const can_use_std_copy = + std::conjunction_v, + std::bool_constant, + std::bool_constant, + std::bool_constant>; + auto static constexpr const can_use_raft_copy = + std::conjunction_v, + std::bool_constant, + std::bool_constant, + std::bool_constant>; + + // Do we need intermediate storage on device in order to perform + // non-trivial layout or dtype conversions after copying source from host or + // before copying converted results back to host? + auto static constexpr const requires_intermediate = + !both_host_accessible && !both_device_accessible && !can_use_raft_copy; + + auto static constexpr const use_intermediate_dst = + std::conjunction_v, + std::bool_constant>; + + auto static constexpr const use_intermediate_src = + std::conjunction_v, + std::bool_constant>; + auto static constexpr const can_use_device = + std::conjunction_v, + std::disjunction, + std::bool_constant, + std::bool_constant>>; + + auto static constexpr const can_use_cublas = + std::conjunction_v, + std::bool_constant, + std::bool_constant, + std::bool_constant, + std::bool_constant, + std::bool_constant>; + + auto static constexpr const custom_kernel_allowed = + std::conjunction_v, + std::bool_constant>; + + auto static constexpr const custom_kernel_not_allowed = !custom_kernel_allowed; + auto static constexpr const custom_kernel_required = + std::conjunction_v, + std::bool_constant>; + + // Viable overload? 
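+  // `value` gates the `enable_if` below; the *_with_kernel_t aliases that
+  // follow let copy.cuh and copy.hpp declare mutually exclusive overloads.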
+ auto static constexpr const value = + std::conjunction_v>, + std::bool_constant>, + std::bool_constant>; + using type = std::enable_if_t; +}; + +template +using mdspan_copyable_t = typename mdspan_copyable::type; +template +auto static constexpr const mdspan_copyable_v = + mdspan_copyable::value; + +template +auto static constexpr const mdspan_copyable_with_kernel_v = + mdspan_copyable::custom_kernel_allowed; +template +auto static constexpr const mdspan_copyable_not_with_kernel_v = + mdspan_copyable::custom_kernel_not_allowed; + +template +using mdspan_copyable_with_kernel_t = + std::enable_if_t, T>; + +template +using mdspan_copyable_not_with_kernel_t = + std::enable_if_t, T>; + +#ifdef __CUDACC__ +auto static constexpr const mdspan_copy_tile_dim = 32; +auto static constexpr const mdspan_copy_tile_elems = mdspan_copy_tile_dim * mdspan_copy_tile_dim; + +// Helper struct to work around lack of CUDA-native std::apply +template +struct index_sequence {}; + +template +struct make_index_sequence + : std::conditional_t, + make_index_sequence> {}; + +/* template +__host__ __device__ decltype(auto) apply(LambdaT&& lambda, ContainerT&& args, index_sequence) +{ + return lambda(args[Idx]...); +} + +template +__host__ __device__ decltype(auto) apply(LambdaT&& lambda, ContainerT&& args) +{ + return apply(std::forward(lambda), std::forward(args), +make_index_sequence{}); +} */ + +/* + * Given an mdspan and an array of indices, return a reference to the + * indicated element. + */ +template +__device__ decltype(auto) get_mdspan_elem(MdspanType md, + IdxType const* indices, + index_sequence) +{ + return md(indices[Idx]...); +} + +template +__device__ decltype(auto) get_mdspan_elem(MdspanType md, IdxType const* indices) +{ + return get_mdspan_elem( + md, indices, make_index_sequence{}); +} + +/* Advance old_indices forward by the number of mdspan elements specified + * by increment. Store the result in indices. Return true if the new + * indices are valid for the input mdspan. + */ +template +__device__ auto increment_indices(IdxType* indices, + MdspanType const& md, + IdxType const* old_indices, + IdxType const* index_strides, + IncrType increment) +{ +#pragma unroll + for (auto i = typename MdspanType::extents_type::rank_type{}; i < md.rank(); ++i) { + increment += index_strides[i] * old_indices[i]; + } + +#pragma unroll + for (auto i = typename MdspanType::extents_type::rank_type{}; i < md.rank(); ++i) { + // Iterate through dimensions in order from slowest to fastest varying for + // layout_right and layout_left. Otherwise, just iterate through dimensions + // in order. + // + // TODO(wphicks): It is possible to always iterate through dimensions in + // the slowest to fastest order. Consider this or at minimum expanding to + // padded layouts. + auto const real_index = [](auto ind) { + if constexpr (std::is_same_v) { + return MdspanType::rank() - ind - 1; + } else { + return ind; + } + }(i); + + auto cur_index = IdxType{}; + + while (cur_index < md.extent(real_index) - 1 && increment >= index_strides[real_index]) { + increment -= index_strides[real_index]; + ++cur_index; + } + indices[real_index] = cur_index; + } + + return increment == IdxType{}; +} + +/* + * WARNING: This kernel _must_ be launched with mdspan_copy_tile_dim x + * mdspan_copy_tile_dim threads per block. This restriction allows for + * additional optimizations at the expense of generalized launch + * parameters. 
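+ *
+ * Elements are staged through a shared-memory tile (padded by one element
+ * per row, a standard trick to avoid bank conflicts) so that global-memory
+ * reads and writes can both be coalesced even when the source and
+ * destination layouts differ.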
+ */ +template +__global__ mdspan_copyable_with_kernel_t mdspan_copy_kernel(DstType dst, + SrcType src) +{ + using config = mdspan_copyable; + + // An intermediate storage location for the data to be copied. + __shared__ typename config::dst_value_type tile[mdspan_copy_tile_dim][mdspan_copy_tile_dim + 1]; + + // Compute the cumulative product of extents in order from fastest to + // slowest varying extent + typename config::index_type index_strides[config::dst_rank]; + auto cur_stride = typename config::index_type{1}; +#pragma unroll + for (auto i = typename SrcType::extents_type::rank_type{}; i < config::src_rank; ++i) { + // Iterate through dimensions in order from fastest to slowest varying + auto const real_index = [](auto ind) { + if constexpr (std::is_same_v) { + return config::src_rank - ind - 1; + } else { + return ind; + } + }(i); + + index_strides[real_index] = cur_stride; + cur_stride *= src.extent(real_index); + } + + // The index of the first element in the mdspan which will be copied via + // the current tile for this block. + typename config::index_type tile_offset[config::dst_rank] = {0}; + typename config::index_type cur_indices[config::dst_rank]; + auto valid_tile = increment_indices( + tile_offset, src, tile_offset, index_strides, blockIdx.x * mdspan_copy_tile_elems); + + while (valid_tile) { + auto tile_read_x = std::is_same_v + ? threadIdx.x + : threadIdx.y; + auto tile_read_y = std::is_same_v + ? threadIdx.y + : threadIdx.x; + + auto valid_index = increment_indices(cur_indices, + src, + tile_offset, + index_strides, + tile_read_x * mdspan_copy_tile_dim + tile_read_y); + + if constexpr (config::same_underlying_layout || !config::dst_contiguous) { + if (valid_index) { + tile[tile_read_x][tile_read_y] = get_mdspan_elem(src, cur_indices); + get_mdspan_elem(dst, cur_indices) = tile[tile_read_x][tile_read_y]; + } + } else { + if (valid_index) { tile[tile_read_x][tile_read_y] = get_mdspan_elem(src, cur_indices); } + __syncthreads(); + + valid_index = increment_indices(cur_indices, + src, + tile_offset, + index_strides, + tile_read_y * mdspan_copy_tile_dim + tile_read_x); + if (valid_index) { get_mdspan_elem(dst, cur_indices) = tile[tile_read_y][tile_read_x]; } + __syncthreads(); + } + valid_tile = increment_indices( + tile_offset, src, tile_offset, index_strides, blockDim.x * mdspan_copy_tile_elems); + } +} +#endif + +template +mdspan_copyable_t copy(resources const& res, DstType&& dst, SrcType&& src) +{ + using config = mdspan_copyable; + for (auto i = std::size_t{}; i < config::src_rank; ++i) { + RAFT_EXPECTS(src.extent(i) == dst.extent(i), "Must copy between mdspans of the same shape"); + } + + if constexpr (config::use_intermediate_src) { +#ifndef RAFT_DISABLE_CUDA + // Copy to intermediate source on device, then perform necessary + // changes in layout on device, directly into final destination + using mdarray_t = device_mdarray; + auto intermediate = mdarray_t(res, + typename mdarray_t::mapping_type{src.extents()}, + typename mdarray_t::container_policy_type{}); + detail::copy(res, intermediate.view(), src); + detail::copy(res, dst, intermediate.view()); +#else + // Not possible to reach this due to enable_ifs. Included for safety. 
+ throw(raft::non_cuda_build_error("Copying to device in non-CUDA build")); +#endif + + } else if constexpr (config::use_intermediate_dst) { +#ifndef RAFT_DISABLE_CUDA + // Perform necessary changes in layout on device, then copy to final + // destination on host + using mdarray_t = device_mdarray; + auto intermediate = mdarray_t(res, + typename mdarray_t::mapping_type{dst.extents()}, + typename mdarray_t::container_policy_type{}); + detail::copy(res, intermediate.view(), src); + detail::copy(res, dst, intermediate.view()); +#else + throw(raft::non_cuda_build_error("Copying from device in non-CUDA build")); +#endif + } else if constexpr (config::can_use_raft_copy) { +#ifndef RAFT_DISABLE_CUDA + raft::copy(dst.data_handle(), src.data_handle(), dst.size(), resource::get_cuda_stream(res)); +#else + // Not possible to reach this due to enable_ifs. Included for safety. + throw(raft::non_cuda_build_error("Copying to from or on device in non-CUDA build")); +#endif + } else if constexpr (config::can_use_cublas) { +#ifndef RAFT_DISABLE_CUDA + auto constexpr const alpha = typename std::remove_reference_t::value_type{1}; + auto constexpr const beta = typename std::remove_reference_t::value_type{0}; + if constexpr (std::is_same_v) { + CUBLAS_TRY(linalg::detail::cublasgeam(resource::get_cublas_handle(res), + CUBLAS_OP_T, + CUBLAS_OP_N, + dst.extent(1), + dst.extent(0), + &alpha, + src.data_handle(), + src.extent(0), + &beta, + dst.data_handle(), + dst.extent(1), + dst.data_handle(), + dst.extent(1), + resource::get_cuda_stream(res))); + } else { + CUBLAS_TRY(linalg::detail::cublasgeam(resource::get_cublas_handle(res), + CUBLAS_OP_T, + CUBLAS_OP_N, + dst.extent(0), + dst.extent(1), + &alpha, + src.data_handle(), + src.extent(1), + &beta, + dst.data_handle(), + dst.extent(0), + dst.data_handle(), + dst.extent(0), + resource::get_cuda_stream(res))); + } +#else + // Not possible to reach this due to enable_ifs. Included for safety. + throw(raft::non_cuda_build_error("Copying to from or on device in non-CUDA build")); +#endif + } else if constexpr (config::custom_kernel_allowed) { +#ifdef __CUDACC__ + config::check_for_unique_dst(dst); + auto const blocks = std::min( + // This maximum is somewhat arbitrary. Could query the device to see + // how many blocks we could reasonably allow, but this is probably + // sufficient considering that this kernel will likely overlap with + // real computations for most use cases. + typename config::index_type{32}, + raft::ceildiv(typename config::index_type(dst.size()), + typename config::index_type(mdspan_copy_tile_elems))); + auto constexpr const threads = dim3{mdspan_copy_tile_dim, mdspan_copy_tile_dim, 1}; + mdspan_copy_kernel<<>>(dst, src); +#else + // Should never actually reach this because of enable_ifs. Included for + // safety. + RAFT_FAIL( + "raft::copy called in a way that requires custom kernel. 
Please use " + "raft/core/copy.cuh and include the header in a .cu file"); +#endif + } else if constexpr (config::can_use_std_copy) { + std::copy(src.data_handle(), src.data_handle() + dst.size(), dst.data_handle()); + } else { + // TODO(wphicks): Make the following cache-oblivious and add SIMD support + auto indices = std::array{}; + for (auto i = std::size_t{}; i < dst.size(); ++i) { + if (i != 0) { + if constexpr (std::is_same_v) { + // For layout_right/layout_c_contiguous, we iterate over the + // rightmost extent fastest + auto dim = config::src_rank - 1; + while ((++indices[dim]) == src.extent(dim)) { + indices[dim] = typename config::index_type{}; + --dim; + } + } else { + // For layout_left/layout_f_contiguous (and currently all other + // layouts), we iterate over the leftmost extent fastest. The + // cache-oblivious implementation should work through dimensions in + // order of increasing stride. + auto dim = std::size_t{}; + while ((++indices[dim]) == src.extent(dim)) { + indices[dim] = typename config::index_type{}; + ++dim; + } + } + } + std::apply(dst, indices) = std::apply(src, indices); + } + } +} +} // namespace detail +} // namespace raft diff --git a/cpp/include/raft/core/error.hpp b/cpp/include/raft/core/error.hpp index 84b244f4dc..9045c5c871 100644 --- a/cpp/include/raft/core/error.hpp +++ b/cpp/include/raft/core/error.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -98,6 +98,16 @@ struct logic_error : public raft::exception { explicit logic_error(std::string const& message) : raft::exception(message) {} }; +/** + * @brief Exception thrown when attempting to use CUDA features from a non-CUDA + * build + * + */ +struct non_cuda_build_error : public raft::exception { + explicit non_cuda_build_error(char const* const message) : raft::exception(message) {} + explicit non_cuda_build_error(std::string const& message) : raft::exception(message) {} +}; + /** * @} */ diff --git a/cpp/include/raft/core/resource/resource_types.hpp b/cpp/include/raft/core/resource/resource_types.hpp index 8e331293bf..c30f2e81e8 100644 --- a/cpp/include/raft/core/resource/resource_types.hpp +++ b/cpp/include/raft/core/resource/resource_types.hpp @@ -39,6 +39,8 @@ enum resource_type { SUB_COMMUNICATOR, // raft sub communicator DEVICE_PROPERTIES, // cuda device properties DEVICE_ID, // cuda device id + STREAM_VIEW, // view of a cuda stream or a placeholder in + // CUDA-free builds THRUST_POLICY, // thrust execution policy WORKSPACE_RESOURCE, // rmm device memory resource diff --git a/cpp/include/raft/core/resource/stream_view.hpp b/cpp/include/raft/core/resource/stream_view.hpp new file mode 100644 index 0000000000..ccf516076f --- /dev/null +++ b/cpp/include/raft/core/resource/stream_view.hpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#ifndef RAFT_DISABLE_CUDA +#include +#endif + +namespace raft::resource { +struct stream_view_resource : public resource { + stream_view_resource(raft::stream_view view = raft::stream_view_per_thread) : stream(view) {} + void* get_resource() override { return &stream; } + + ~stream_view_resource() override {} + + private: + raft::stream_view stream; +}; + +/** + * Factory that knows how to construct a specific raft::resource to populate + * the resources instance. + */ +struct stream_view_resource_factory : public resource_factory { + public: + stream_view_resource_factory(raft::stream_view view = raft::stream_view_per_thread) : stream(view) + { + } + resource_type get_resource_type() override { return resource_type::STREAM_VIEW; } + resource* make_resource() override { return new stream_view_resource(stream); } + + private: + raft::stream_view stream; +}; + +/** + * @defgroup resource_stream_view stream resource functions compatible with + * non-CUDA builds + * @{ + */ +/** + * Load a raft::stream_view from a resources instance (and populate it on the res + * if needed). + * @param res raft res object for managing resources + * @return + */ +inline raft::stream_view get_stream_view(resources const& res) +{ + if (!res.has_resource_factory(resource_type::STREAM_VIEW)) { + res.add_resource_factory(std::make_shared()); + } + return *res.get_resource(resource_type::STREAM_VIEW); +}; + +/** + * Load a raft::stream__view from a resources instance (and populate it on the res + * if needed). + * @param[in] res raft resources object for managing resources + * @param[in] view raft stream view + */ +inline void set_stream_view(resources const& res, raft::stream_view view) +{ + res.add_resource_factory(std::make_shared(view)); +}; + +/** + * @brief synchronize a specific stream + * + * @param[in] res the raft resources object + * @param[in] stream stream to synchronize + */ +inline void sync_stream_view(const resources& res, raft::stream_view stream) +{ + stream.interruptible_synchronize(); +} + +/** + * @brief synchronize main stream on the resources instance + */ +inline void sync_stream_view(const resources& res) { sync_stream_view(res, get_stream_view(res)); } + +/** + * @} + */ + +} // namespace raft::resource diff --git a/cpp/include/raft/core/stream_view.hpp b/cpp/include/raft/core/stream_view.hpp new file mode 100644 index 0000000000..f7e7934dbf --- /dev/null +++ b/cpp/include/raft/core/stream_view.hpp @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#ifndef RAFT_DISABLE_CUDA +#include +#include +#endif + +namespace raft { + +namespace detail { +struct fail_stream_view { + constexpr fail_stream_view() = default; + constexpr fail_stream_view(fail_stream_view const&) = default; + constexpr fail_stream_view(fail_stream_view&&) = default; + auto constexpr operator=(fail_stream_view const&) -> fail_stream_view& = default; + auto constexpr operator=(fail_stream_view&&) -> fail_stream_view& = default; + auto value() { throw non_cuda_build_error{"Attempted to access CUDA stream in non-CUDA build"}; } + [[nodiscard]] auto is_per_thread_default() const { return false; } + [[nodiscard]] auto is_default() const { return false; } + void synchronize() const + { + throw non_cuda_build_error{"Attempted to sync CUDA stream in non-CUDA build"}; + } + void synchronize_no_throw() const + { + RAFT_LOG_ERROR("Attempted to sync CUDA stream in non-CUDA build"); + } +}; +} // namespace detail + +/** A lightweight wrapper around rmm::cuda_stream_view that can be used in + * CUDA-free builds + * + * While CUDA-free builds should never actually make use of a CUDA stream at + * runtime, it is sometimes useful to have a symbol that can stand in place of + * a CUDA stream to avoid excessive ifdef directives interspersed with other + * logic. This struct's methods invoke the underlying rmm::cuda_stream_view in + * CUDA-enabled builds but throw runtime exceptions if any non-trivial method + * is called from a CUDA-free build */ +struct stream_view { +#ifndef RAFT_DISABLE_CUDA + using underlying_view_type = rmm::cuda_stream_view; +#else + using underlying_view_type = detail::fail_stream_view; +#endif + + constexpr stream_view( + underlying_view_type base_view = stream_view::get_underlying_per_thread_default()) + : base_view_{base_view} + { + } + constexpr stream_view(stream_view const&) = default; + constexpr stream_view(stream_view&&) = default; + auto operator=(stream_view const&) -> stream_view& = default; + auto operator=(stream_view&&) -> stream_view& = default; + auto value() { return base_view_.value(); } + operator underlying_view_type() const noexcept { return base_view_; } + [[nodiscard]] auto is_per_thread_default() const { return base_view_.is_per_thread_default(); } + [[nodiscard]] auto is_default() const { return base_view_.is_default(); } + void synchronize() const { base_view_.synchronize(); } + void synchronize_no_throw() const { base_view_.synchronize_no_throw(); } + void interruptible_synchronize() const + { +#ifndef RAFT_DISABLE_CUDA + interruptible::synchronize(base_view_); +#else + synchronize(); +#endif + } + + auto underlying() { return base_view_; } + void synchronize_if_cuda_enabled() + { + if constexpr (raft::CUDA_ENABLED) { base_view_.synchronize(); } + } + + private: + underlying_view_type base_view_; + auto static get_underlying_per_thread_default() -> underlying_view_type + { +#ifndef RAFT_DISABLE_CUDA + return rmm::cuda_stream_per_thread; +#else + auto static constexpr const default_fail_stream = underlying_view_type{}; + return default_fail_stream; +#endif + } +}; + +auto static const stream_view_per_thread = stream_view{}; + +} // namespace raft diff --git a/cpp/include/raft/neighbors/cagra.cuh b/cpp/include/raft/neighbors/cagra.cuh index f9682a973f..1efb4da95e 100644 --- a/cpp/include/raft/neighbors/cagra.cuh +++ b/cpp/include/raft/neighbors/cagra.cuh @@ -318,6 +318,7 @@ index build(raft::resources const& res, auto nn_descent_params = experimental::nn_descent::index_params(); 
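  // Configure the NN-descent graph build; the new `nn_descent_niter`
  // index_param below caps the number of NN-descent iterations.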
nn_descent_params.graph_degree = intermediate_degree; nn_descent_params.intermediate_graph_degree = 1.5 * intermediate_degree; + nn_descent_params.max_iterations = params.nn_descent_niter; build_knn_graph(res, dataset, knn_graph->view(), nn_descent_params); } diff --git a/cpp/include/raft/neighbors/cagra_types.hpp b/cpp/include/raft/neighbors/cagra_types.hpp index 5061d6082d..4db08110b9 100644 --- a/cpp/include/raft/neighbors/cagra_types.hpp +++ b/cpp/include/raft/neighbors/cagra_types.hpp @@ -58,6 +58,8 @@ struct index_params : ann::index_params { size_t graph_degree = 64; /** ANN algorithm to build knn graph. */ graph_build_algo build_algo = graph_build_algo::IVF_PQ; + /** Number of Iterations to run if building with NN_DESCENT */ + size_t nn_descent_niter = 20; }; enum class search_algo { diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh index 47c10de200..975ae9ec00 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh @@ -124,7 +124,7 @@ inline void make_rotation_matrix(raft::resources const& handle, uint32_t n_rows, uint32_t n_cols, float* rotation_matrix, - raft::random::Rng rng = raft::random::Rng(7ULL)) + raft::random::RngState rng = raft::random::RngState(7ULL)) { common::nvtx::range fun_scope( "ivf_pq::make_rotation_matrix(%u * %u)", n_rows, n_cols); @@ -134,7 +134,7 @@ inline void make_rotation_matrix(raft::resources const& handle, if (force_random_rotation || !inplace) { rmm::device_uvector buf(inplace ? 0 : n * n, stream); float* mat = inplace ? rotation_matrix : buf.data(); - rng.normal(mat, n * n, 0.0f, 1.0f, stream); + raft::random::normal(handle, rng, mat, n * n, 0.0f, 1.0f); linalg::detail::qrGetQ_inplace(handle, mat, n, n, stream); if (!inplace) { RAFT_CUDA_TRY(cudaMemcpy2DAsync(rotation_matrix, diff --git a/cpp/include/raft/neighbors/detail/nn_descent.cuh b/cpp/include/raft/neighbors/detail/nn_descent.cuh index 1fb568a934..ce77cdc3de 100644 --- a/cpp/include/raft/neighbors/detail/nn_descent.cuh +++ b/cpp/include/raft/neighbors/detail/nn_descent.cuh @@ -362,28 +362,28 @@ class GNND { GnndGraph graph_; std::atomic update_counter_; - Index_t nrow_; - const int ndim_; + size_t nrow_; + size_t ndim_; - raft::device_matrix<__half, Index_t, raft::row_major> d_data_; - raft::device_vector l2_norms_; + raft::device_matrix<__half, size_t, raft::row_major> d_data_; + raft::device_vector l2_norms_; - raft::device_matrix graph_buffer_; - raft::device_matrix dists_buffer_; + raft::device_matrix graph_buffer_; + raft::device_matrix dists_buffer_; // TODO: Investigate using RMM/RAFT types https://github.com/rapidsai/raft/issues/1827 thrust::host_vector> graph_host_buffer_; thrust::host_vector> dists_host_buffer_; - raft::device_vector d_locks_; + raft::device_vector d_locks_; thrust::host_vector> h_rev_graph_new_; thrust::host_vector> h_graph_old_; thrust::host_vector> h_rev_graph_old_; // int2.x is the number of forward edges, int2.y is the number of reverse edges - raft::device_vector d_list_sizes_new_; - raft::device_vector d_list_sizes_old_; + raft::device_vector d_list_sizes_new_; + raft::device_vector d_list_sizes_old_; }; constexpr int TILE_ROW_WIDTH = 64; @@ -1143,21 +1143,21 @@ GNND::GNND(raft::resources const& res, const BuildConfig& build NUM_SAMPLES), nrow_(build_config.max_dataset_size), ndim_(build_config.dataset_dim), - d_data_{raft::make_device_matrix<__half, Index_t, raft::row_major>( + d_data_{raft::make_device_matrix<__half, 
size_t, raft::row_major>( res, nrow_, build_config.dataset_dim)}, - l2_norms_{raft::make_device_vector(res, nrow_)}, + l2_norms_{raft::make_device_vector(res, nrow_)}, graph_buffer_{ - raft::make_device_matrix(res, nrow_, DEGREE_ON_DEVICE)}, + raft::make_device_matrix(res, nrow_, DEGREE_ON_DEVICE)}, dists_buffer_{ - raft::make_device_matrix(res, nrow_, DEGREE_ON_DEVICE)}, + raft::make_device_matrix(res, nrow_, DEGREE_ON_DEVICE)}, graph_host_buffer_(nrow_ * DEGREE_ON_DEVICE), dists_host_buffer_(nrow_ * DEGREE_ON_DEVICE), - d_locks_{raft::make_device_vector(res, nrow_)}, + d_locks_{raft::make_device_vector(res, nrow_)}, h_rev_graph_new_(nrow_ * NUM_SAMPLES), h_graph_old_(nrow_ * NUM_SAMPLES), h_rev_graph_old_(nrow_ * NUM_SAMPLES), - d_list_sizes_new_{raft::make_device_vector(res, nrow_)}, - d_list_sizes_old_{raft::make_device_vector(res, nrow_)} + d_list_sizes_new_{raft::make_device_vector(res, nrow_)}, + d_list_sizes_old_{raft::make_device_vector(res, nrow_)} { static_assert(NUM_SAMPLES <= 32); @@ -1342,8 +1342,8 @@ void GNND::build(Data_t* data, const Index_t nrow, Index_t* out for (size_t i = 0; i < (size_t)nrow_; i++) { for (size_t j = 0; j < build_config_.node_degree; j++) { size_t idx = i * graph_.node_degree + j; - Index_t id = graph_.h_graph[idx].id(); - if (id < nrow_) { + int id = graph_.h_graph[idx].id(); + if (id < static_cast(nrow_)) { graph_shrink_buffer[i * build_config_.node_degree + j] = id; } else { graph_shrink_buffer[i * build_config_.node_degree + j] = diff --git a/cpp/include/raft/stats/detail/neighborhood_recall.cuh b/cpp/include/raft/stats/detail/neighborhood_recall.cuh new file mode 100644 index 0000000000..78cd64538e --- /dev/null +++ b/cpp/include/raft/stats/detail/neighborhood_recall.cuh @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +namespace raft::stats::detail { + +template +__global__ void neighborhood_recall( + raft::device_matrix_view indices, + raft::device_matrix_view ref_indices, + std::optional> + distances, + std::optional> + ref_distances, + raft::device_scalar_view recall_score, + DistanceValueType const eps) +{ + auto constexpr kThreadsPerBlock = 32; + IndexType const row_idx = blockIdx.x; + auto const lane_idx = threadIdx.x % kThreadsPerBlock; + + // Each warp stores a recall score computed across the columns per row + IndexType thread_recall_score = 0; + for (IndexType col_idx = lane_idx; col_idx < indices.extent(1); col_idx += kThreadsPerBlock) { + for (IndexType ref_col_idx = 0; ref_col_idx < ref_indices.extent(1); ref_col_idx++) { + if (indices(row_idx, col_idx) == ref_indices(row_idx, ref_col_idx)) { + thread_recall_score += 1; + break; + } else if (distances.has_value()) { + auto dist = distances.value()(row_idx, col_idx); + auto ref_dist = ref_distances.value()(row_idx, ref_col_idx); + DistanceValueType diff = raft::abs(dist - ref_dist); + DistanceValueType m = std::max(raft::abs(dist), raft::abs(ref_dist)); + DistanceValueType ratio = diff > eps ? diff / m : diff; + + if (ratio <= eps) { + thread_recall_score += 1; + break; + } + } + } + } + + // Reduce across a warp for row score + typedef cub::BlockReduce BlockReduce; + + __shared__ typename BlockReduce::TempStorage temp_storage; + + ScalarType row_recall_score = BlockReduce(temp_storage).Sum(thread_recall_score); + + // Reduce across all rows for global score + if (lane_idx == 0) { + cuda::atomic_ref device_recall_score{ + *recall_score.data_handle()}; + std::size_t const total_count = indices.extent(0) * indices.extent(1); + device_recall_score.fetch_add(row_recall_score / total_count); + } +} + +template +void neighborhood_recall( + raft::resources const& res, + raft::device_matrix_view indices, + raft::device_matrix_view ref_indices, + std::optional> + distances, + std::optional> + ref_distances, + raft::device_scalar_view recall_score, + DistanceValueType const eps) +{ + // One warp per row, launch a warp-width block per-row kernel + auto constexpr kThreadsPerBlock = 32; + auto const num_blocks = indices.extent(0); + + neighborhood_recall<<>>( + indices, ref_indices, distances, ref_distances, recall_score, eps); +} + +} // end namespace raft::stats::detail diff --git a/cpp/include/raft/stats/neighborhood_recall.cuh b/cpp/include/raft/stats/neighborhood_recall.cuh new file mode 100644 index 0000000000..e082bc87b4 --- /dev/null +++ b/cpp/include/raft/stats/neighborhood_recall.cuh @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
diff --git a/cpp/include/raft/stats/neighborhood_recall.cuh b/cpp/include/raft/stats/neighborhood_recall.cuh new file mode 100644 index 0000000000..e082bc87b4 --- /dev/null +++ b/cpp/include/raft/stats/neighborhood_recall.cuh @@ -0,0 +1,194 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "detail/neighborhood_recall.cuh"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+namespace raft::stats {
+
+/**
+ * @defgroup stats_neighborhood_recall Neighborhood Recall Score
+ * @{
+ */
+
+/**
+ * @brief Calculate the Neighborhood Recall score on the device for indices and distances computed
+ * by any Nearest Neighbors algorithm against reference indices and distances. The recall score is
+ * calculated by counting the matching indices and dividing that count by the total size of the
+ * indices matrix of dimensions (D, k). If distance matrices are provided, a non-matching index is
+ * still counted as a match if abs(dist - ref_dist) < eps.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   raft::device_resources res;
+ *   // assume D rows and N column dataset
+ *   auto k = 64;
+ *   auto indices = raft::make_device_matrix<int>(res, D, k);
+ *   auto distances = raft::make_device_matrix<float>(res, D, k);
+ *   // run ANN algorithm of choice
+ *
+ *   auto ref_indices = raft::make_device_matrix<int>(res, D, k);
+ *   auto ref_distances = raft::make_device_matrix<float>(res, D, k);
+ *   // run brute-force KNN for reference
+ *
+ *   auto scalar = 0.0f;
+ *   auto recall_score = raft::make_device_scalar(res, scalar);
+ *
+ *   raft::stats::neighborhood_recall(res,
+ *                                    raft::make_const_mdspan(indices.view()),
+ *                                    raft::make_const_mdspan(ref_indices.view()),
+ *                                    recall_score.view(),
+ *                                    raft::make_const_mdspan(distances.view()),
+ *                                    raft::make_const_mdspan(ref_distances.view()));
+ * @endcode
+ *
+ * @tparam IndicesValueType data-type of the indices
+ * @tparam IndexType data-type to index all matrices
+ * @tparam ScalarType data-type to store recall score
+ * @tparam DistanceValueType data-type of the distances
+ * @param res raft::resources object to manage resources
+ * @param[in] indices raft::device_matrix_view indices of neighbors
+ * @param[in] ref_indices raft::device_matrix_view reference indices of neighbors
+ * @param[out] recall_score raft::device_scalar_view output recall score
+ * @param[in] distances (optional) raft::device_matrix_view distances of neighbors
+ * @param[in] ref_distances (optional) raft::device_matrix_view reference distances of neighbors
+ * @param[in] eps (optional, default = 0.001) value within which distances are considered matching
+ */
+template <typename IndicesValueType,
+          typename IndexType,
+          typename ScalarType,
+          typename DistanceValueType = float>
+void neighborhood_recall(
+  raft::resources const& res,
+  raft::device_matrix_view<const IndicesValueType, IndexType, raft::row_major> indices,
+  raft::device_matrix_view<const IndicesValueType, IndexType, raft::row_major> ref_indices,
+  raft::device_scalar_view<ScalarType> recall_score,
+  std::optional<raft::device_matrix_view<const DistanceValueType, IndexType, raft::row_major>>
+    distances = std::nullopt,
+  std::optional<raft::device_matrix_view<const DistanceValueType, IndexType, raft::row_major>>
+    ref_distances = std::nullopt,
+  std::optional<raft::host_scalar_view<DistanceValueType>> eps = std::nullopt)
+{
+  RAFT_EXPECTS(indices.extent(0) == ref_indices.extent(0),
+               "The number of rows in indices and reference indices should be equal");
+  RAFT_EXPECTS(indices.extent(1) == ref_indices.extent(1),
+               "The number of columns in indices and reference indices should be equal");
+
+  if (distances.has_value() or ref_distances.has_value()) {
+    RAFT_EXPECTS(distances.has_value() and ref_distances.has_value(),
+                 "Both distances and reference distances should have values");
+
+    RAFT_EXPECTS(distances.value().extent(0) == ref_distances.value().extent(0),
+                 "The number of rows in distances and reference distances should be equal");
+    RAFT_EXPECTS(distances.value().extent(1) == ref_distances.value().extent(1),
+                 "The number of columns in distances and reference distances should be equal");
+
+    RAFT_EXPECTS(indices.extent(0) == distances.value().extent(0),
+                 "The number of rows in indices and distances
should be equal"); + RAFT_EXPECTS(indices.extent(1) == distances.value().extent(1), + "The number of columns in indices and distances should be equal"); + } + + DistanceValueType eps_val = 0.001; + if (eps.has_value()) { eps_val = *eps.value().data_handle(); } + + detail::neighborhood_recall( + res, indices, ref_indices, distances, ref_distances, recall_score, eps_val); +} + +/** + * @brief Calculate Neighborhood Recall score on the host for indices, distances computed by any + * Nearest Neighbors Algorithm against reference indices, distances. Recall score is calculated by + * comparing the total number of matching indices and dividing that value by the total size of the + * indices matrix of dimensions (D, k). If distance matrices are provided, then non-matching indices + * could be considered a match if abs(dist, ref_dist) < eps. + * + * Usage example: + * @code{.cpp} + * raft::device_resources res; + * // assume D rows and N column dataset + * auto k = 64; + * auto indices = raft::make_device_matrix(res, D, k); + * auto distances = raft::make_device_matrix(res, D, k); + * // run ANN algorithm of choice + * + * auto ref_indices = raft::make_device_matrix(res, D, k); + * auto ref_distances = raft::make_device_matrix(res, D, k); + * // run brute-force KNN for reference + * + * auto scalar = 0.0f; + * auto recall_score = raft::make_host_scalar(scalar); + * + * raft::stats::neighborhood_recall(res, + raft::make_const_mdspan(indices.view()), + raft::make_const_mdspan(ref_indices.view()), + recall_score.view(), + raft::make_const_mdspan(distances.view()), + raft::make_const_mdspan(ref_distances.view())); + * @endcode + * + * @tparam IndicesValueType data-type of the indices + * @tparam IndexType data-type to index all matrices + * @tparam ScalarType data-type to store recall score + * @tparam DistanceValueType data-type of the distances + * @param res raft::resources object to manage resources + * @param[in] indices raft::device_matrix_view indices of neighbors + * @param[in] ref_indices raft::device_matrix_view reference indices of neighbors + * @param[out] recall_score raft::host_scalar_view output recall score + * @param[in] distances (optional) raft::device_matrix_view distances of neighbors + * @param[in] ref_distances (optional) raft::device_matrix_view reference distances of neighbors + * @param[in] eps (optional, default = 0.001) value within which distances are considered matching + */ +template +void neighborhood_recall( + raft::resources const& res, + raft::device_matrix_view indices, + raft::device_matrix_view ref_indices, + raft::host_scalar_view recall_score, + std::optional> + distances = std::nullopt, + std::optional> + ref_distances = std::nullopt, + std::optional> eps = std::nullopt) +{ + auto recall_score_d = raft::make_device_scalar(res, *recall_score.data_handle()); + neighborhood_recall( + res, indices, ref_indices, recall_score_d.view(), distances, ref_distances, eps); + raft::update_host(recall_score.data_handle(), + recall_score_d.data_handle(), + 1, + raft::resource::get_cuda_stream(res)); + raft::resource::sync_stream(res); +} + +/** @} */ // end group stats_recall + +} // end namespace raft::stats diff --git a/cpp/internal/raft_internal/neighbors/refine_helper.cuh b/cpp/internal/raft_internal/neighbors/refine_helper.cuh index ee06d90851..4a06116877 100644 --- a/cpp/internal/raft_internal/neighbors/refine_helper.cuh +++ b/cpp/internal/raft_internal/neighbors/refine_helper.cuh @@ -61,16 +61,20 @@ class RefineHelper { refined_distances_host(handle), 
refined_indices_host(handle) { - raft::random::Rng r(1234ULL); + raft::random::RngState rng(1234ULL); dataset = raft::make_device_matrix(handle_, p.n_rows, p.dim); queries = raft::make_device_matrix(handle_, p.n_queries, p.dim); if constexpr (std::is_same{}) { - r.uniform(dataset.data_handle(), dataset.size(), DataT(-10.0), DataT(10.0), stream_); - r.uniform(queries.data_handle(), queries.size(), DataT(-10.0), DataT(10.0), stream_); + raft::random::uniform( + handle, rng, dataset.data_handle(), dataset.size(), DataT(-10.0), DataT(10.0)); + raft::random::uniform( + handle, rng, queries.data_handle(), queries.size(), DataT(-10.0), DataT(10.0)); } else { - r.uniformInt(dataset.data_handle(), dataset.size(), DataT(1), DataT(20), stream_); - r.uniformInt(queries.data_handle(), queries.size(), DataT(1), DataT(20), stream_); + raft::random::uniformInt( + handle, rng, dataset.data_handle(), dataset.size(), DataT(1), DataT(20)); + raft::random::uniformInt( + handle, rng, queries.data_handle(), queries.size(), DataT(1), DataT(20)); } refined_distances = raft::make_device_matrix(handle_, p.n_queries, p.k); diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 0651ccac86..9b9b882d1d 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -21,7 +21,7 @@ rapids_test_init() function(ConfigureTest) - set(options OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY) + set(options OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY NOCUDA) set(oneValueArgs NAME GPUS PERCENT) set(multiValueArgs PATH TARGETS CONFIGURATIONS) @@ -37,7 +37,11 @@ function(ConfigureTest) set(_RAFT_TEST_PERCENT 100) endif() - set(TEST_NAME ${_RAFT_TEST_NAME}) + if(_RAFT_TEST_NOCUDA) + set(TEST_NAME "${_RAFT_TEST_NAME}_NOCUDA") + else() + set(TEST_NAME ${_RAFT_TEST_NAME}) + endif() add_executable(${TEST_NAME} ${_RAFT_TEST_PATH}) target_link_libraries( @@ -68,6 +72,9 @@ function(ConfigureTest) if(_RAFT_TEST_EXPLICIT_INSTANTIATE_ONLY) target_compile_definitions(${TEST_NAME} PRIVATE "RAFT_EXPLICIT_INSTANTIATE_ONLY") endif() + if(_RAFT_TEST_NOCUDA) + target_compile_definitions(${TEST_NAME} PRIVATE "RAFT_DISABLE_CUDA") + endif() target_include_directories(${TEST_NAME} PUBLIC "$") @@ -117,6 +124,8 @@ if(BUILD_TESTS) test/core/interruptible.cu test/core/nvtx.cpp test/core/mdarray.cu + test/core/mdspan_copy.cpp + test/core/mdspan_copy.cu test/core/mdspan_utils.cu test/core/numpy_serializer.cu test/core/memory_type.cpp @@ -124,12 +133,18 @@ if(BUILD_TESTS) test/core/sparse_matrix.cpp test/core/span.cpp test/core/span.cu + test/core/stream_view.cpp test/core/temporary_device_buffer.cu test/test.cpp LIB EXPLICIT_INSTANTIATE_ONLY ) + ConfigureTest( + NAME CORE_TEST PATH test/core/stream_view.cpp test/core/mdspan_copy.cpp LIB + EXPLICIT_INSTANTIATE_ONLY NOCUDA + ) + ConfigureTest( NAME DISTANCE_TEST @@ -420,6 +435,7 @@ if(BUILD_TESTS) test/stats/mean_center.cu test/stats/minmax.cu test/stats/mutual_info_score.cu + test/stats/neighborhood_recall.cu test/stats/r2_score.cu test/stats/rand_index.cu test/stats/regression_metrics.cu diff --git a/cpp/test/core/mdspan_copy.cpp b/cpp/test/core/mdspan_copy.cpp new file mode 100644 index 0000000000..2f938e3035 --- /dev/null +++ b/cpp/test/core/mdspan_copy.cpp @@ -0,0 +1,301 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../test_utils.h" +#include +#include +#include +#ifndef RAFT_DISABLE_CUDA +#include +#include +#endif +#include +#include + +namespace raft { +TEST(MDSpanCopy, Mdspan1DHostHost) +{ + auto res = resources{}; + auto cols = std::uint32_t{2}; + auto in_left = make_host_vector(res, cols); + + auto gen_unique_entry = [](auto&& x) { return x; }; + for (auto i = std::uint32_t{}; i < cols; ++i) { + in_left(i) = gen_unique_entry(i); + } + + auto out_right = make_host_vector(res, cols); + static_assert(detail::mdspan_copyable::can_use_std_copy, + "Current implementation should use std::copy for this copy"); + copy(res, out_right.view(), in_left.view()); + for (auto i = std::uint32_t{}; i < cols; ++i) { + ASSERT_TRUE(match(out_right(i), double(gen_unique_entry(i)), CompareApprox{0.0001})); + } +} + +#ifndef RAFT_DISABLE_CUDA +TEST(MDSpanCopy, Mdspan1DHostDevice) +{ + auto res = device_resources{}; + auto cols = std::uint32_t{2}; + auto in_left = make_host_vector(res, cols); + + auto gen_unique_entry = [](auto&& x) { return x; }; + for (auto i = std::uint32_t{}; i < cols; ++i) { + in_left(i) = gen_unique_entry(i); + } + + auto out_right = make_device_vector(res, cols); + static_assert(detail::mdspan_copyable::can_use_raft_copy, + "Current implementation should use raft::copy for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < cols; ++i) { + ASSERT_TRUE( + match(float(out_right(i)), float(gen_unique_entry(i)), CompareApprox{0.0001f})); + } +} + +TEST(MDSpanCopy, Mdspan1DDeviceHost) +{ + auto res = device_resources{}; + auto cols = std::uint32_t{2}; + auto in_left = make_device_vector(res, cols); + + auto gen_unique_entry = [](auto&& x) { return x; }; + for (auto i = std::uint32_t{}; i < cols; ++i) { + in_left(i) = gen_unique_entry(i); + } + + auto out_right = make_host_vector(res, cols); + static_assert(detail::mdspan_copyable::can_use_raft_copy, + "Current implementation should use raft::copy for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < cols; ++i) { + ASSERT_TRUE( + match(float(out_right(i)), float(gen_unique_entry(i)), CompareApprox{0.0001f})); + } +} +#endif + +TEST(MDSpanCopy, Mdspan3DHostHost) +{ + auto res = resources{}; + auto constexpr depth = std::uint32_t{500}; + auto constexpr rows = std::uint32_t{300}; + auto constexpr cols = std::uint32_t{200}; + auto in_left = make_host_mdarray( + res, extents{}); + auto in_right = make_host_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; + + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + in_left(i, j, k) = gen_unique_entry(i, j, k); + in_right(i, j, k) = gen_unique_entry(i, j, k); + } + } + } + + auto out_left = make_host_mdarray( + res, extents{}); + auto out_right = make_host_mdarray( + res, extents{}); + + static_assert(detail::mdspan_copyable::can_use_std_copy, + "Current implementation should 
use std::copy for this copy"); + copy(res, out_right.view(), in_right.view()); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + out_right(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + } + } + } + + copy(res, out_right.view(), in_left.view()); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + out_right(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + } + } + } + + copy(res, out_left.view(), in_right.view()); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + out_left(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + } + } + } + + static_assert(detail::mdspan_copyable:: + can_use_std_copy, + "Current implementation should use std::copy for this copy"); + copy(res, out_left.view(), in_left.view()); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match( + out_left(i, j, k), double(gen_unique_entry(i, j, k)), CompareApprox{0.0001})); + } + } + } +} + +#ifndef RAFT_DISABLE_CUDA +TEST(MDSpanCopy, Mdspan3DHostDevice) +{ + auto res = device_resources{}; + // Use smaller values here since host/device copy takes awhile. + // Non-trivial logic is tested in the other cases. + auto constexpr depth = std::uint32_t{5}; + auto constexpr rows = std::uint32_t{3}; + auto constexpr cols = std::uint32_t{2}; + auto in_left = make_host_mdarray( + res, extents{}); + auto in_right = make_host_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; + + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + in_left(i, j, k) = gen_unique_entry(i, j, k); + in_right(i, j, k) = gen_unique_entry(i, j, k); + } + } + } + + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = + make_device_mdarray( + res, extents{}); + + static_assert(detail::mdspan_copyable::can_use_raft_copy, + "Current implementation should use raft::copy for this copy"); + copy(res, out_right.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match(float(out_right(i, j, k)), + float(gen_unique_entry(i, j, k)), + CompareApprox{0.0001})); + } + } + } + + static_assert(detail::mdspan_copyable:: + can_use_raft_copy, + "Current implementation should use raft::copy for this copy"); + copy(res, out_left.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_TRUE(match(float(out_left(i, j, k)), + float(gen_unique_entry(i, j, k)), + CompareApprox{0.0001})); + } + } + } +} + +TEST(MDSpanCopy, Mdspan2DDeviceDevice) +{ + auto res = device_resources{}; + auto constexpr rows = std::uint32_t{300}; + auto constexpr cols = std::uint32_t{200}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = 
make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y) { return x * 7 + y * 11; }; + + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + in_left(i, j) = gen_unique_entry(i, j); + in_right(i, j) = gen_unique_entry(i, j); + } + } + + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + static_assert(detail::mdspan_copyable::can_use_raft_copy, + "Current implementation should use raft::copy for this copy"); + copy(res, out_right.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE( + match(float(out_right(i, j)), float(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + + static_assert(detail::mdspan_copyable::can_use_cublas, + "Current implementation should use cuBLAS for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE( + match(float(out_right(i, j)), float(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + + static_assert(detail::mdspan_copyable::can_use_cublas, + "Current implementation should use cuBLAS for this copy"); + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE( + match(float(out_left(i, j)), float(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } +} +#endif + +} // namespace raft diff --git a/cpp/test/core/mdspan_copy.cu b/cpp/test/core/mdspan_copy.cu new file mode 100644 index 0000000000..95d7d3befd --- /dev/null +++ b/cpp/test/core/mdspan_copy.cu @@ -0,0 +1,433 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "../test_utils.h" +#include +#include +#include +#include +#include +#include + +namespace raft { +TEST(MDSpanCopy, Mdspan3DDeviceDeviceCuda) +{ + auto res = device_resources{}; + auto constexpr const depth = std::uint32_t{50}; + auto constexpr const rows = std::uint32_t{30}; + auto constexpr const cols = std::uint32_t{20}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; + + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + in_left(i, j, k) = gen_unique_entry(i, j, k); + in_right(i, j, k) = gen_unique_entry(i, j, k); + } + } + } + res.sync_stream(); + + // Test dtype conversion without transpose + auto out_long = + make_device_mdarray( + res, extents{}); + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_long.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(std::int64_t(out_long(i, j, k)), std::int64_t(gen_unique_entry(i, j, k))); + } + } + } + + // Test transpose + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_right(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } + + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_left(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } +} + +TEST(MDSpanCopy, Mdspan2DDeviceDeviceCuda) +{ + auto res = device_resources{}; + auto constexpr rows = std::uint32_t{30}; + auto constexpr cols = std::uint32_t{20}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y) { return x * 7 + y * 11; }; + + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + in_left(i, j) = gen_unique_entry(i, j); + in_right(i, j) = gen_unique_entry(i, j); + } + } + + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + res.sync_stream(); + + // Test dtype conversion without transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + + // Test dtype 
conversion with transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_left(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } +} +TEST(MDSpanCopy, Mdspan3DDeviceHostCuda) +{ + auto res = device_resources{}; + auto constexpr const depth = std::uint32_t{50}; + auto constexpr const rows = std::uint32_t{30}; + auto constexpr const cols = std::uint32_t{20}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; + + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + in_left(i, j, k) = gen_unique_entry(i, j, k); + in_right(i, j, k) = gen_unique_entry(i, j, k); + } + } + } + res.sync_stream(); + + // Test dtype conversion without transpose + auto out_long = + make_host_mdarray( + res, extents{}); + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_long.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(std::int64_t(out_long(i, j, k)), std::int64_t(gen_unique_entry(i, j, k))); + } + } + } + + // Test transpose + auto out_left = make_host_mdarray( + res, extents{}); + auto out_right = make_host_mdarray( + res, extents{}); + + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_right(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } + + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_left(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } +} + +TEST(MDSpanCopy, Mdspan2DDeviceHostCuda) +{ + auto res = device_resources{}; + auto constexpr rows = std::uint32_t{30}; + auto constexpr cols = std::uint32_t{20}; + auto in_left = make_host_mdarray( + res, extents{}); + auto in_right = make_host_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y) { return x * 7 + y * 11; }; + + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + 
in_left(i, j) = gen_unique_entry(i, j); + in_right(i, j) = gen_unique_entry(i, j); + } + } + + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + res.sync_stream(); + + // Test dtype conversion without transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + + // Test dtype conversion with transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_left(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } +} + +TEST(MDSpanCopy, Mdspan3DHostDeviceCuda) +{ + auto res = device_resources{}; + auto constexpr const depth = std::uint32_t{50}; + auto constexpr const rows = std::uint32_t{30}; + auto constexpr const cols = std::uint32_t{20}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y, auto&& z) { return x * 7 + y * 11 + z * 13; }; + + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + in_left(i, j, k) = gen_unique_entry(i, j, k); + in_right(i, j, k) = gen_unique_entry(i, j, k); + } + } + } + res.sync_stream(); + + // Test dtype conversion without transpose + auto out_long = + make_device_mdarray( + res, extents{}); + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_long.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(std::int64_t(out_long(i, j, k)), std::int64_t(gen_unique_entry(i, j, k))); + } + } + } + + // Test transpose + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_right(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } + + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_left.view(), in_right.view()); + 
res.sync_stream(); + for (auto i = std::uint32_t{}; i < depth; ++i) { + for (auto j = std::uint32_t{}; j < rows; ++j) { + for (auto k = std::uint32_t{}; k < cols; ++k) { + ASSERT_EQ(int(out_left(i, j, k)), int(gen_unique_entry(i, j, k))); + } + } + } +} + +TEST(MDSpanCopy, Mdspan2DHostDeviceCuda) +{ + auto res = device_resources{}; + auto constexpr rows = std::uint32_t{30}; + auto constexpr cols = std::uint32_t{20}; + auto in_left = make_device_mdarray( + res, extents{}); + auto in_right = make_device_mdarray( + res, extents{}); + auto gen_unique_entry = [](auto&& x, auto&& y) { return x * 7 + y * 11; }; + + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + in_left(i, j) = gen_unique_entry(i, j); + in_right(i, j) = gen_unique_entry(i, j); + } + } + + auto out_left = make_device_mdarray( + res, extents{}); + auto out_right = make_device_mdarray( + res, extents{}); + + res.sync_stream(); + + // Test dtype conversion without transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + + // Test dtype conversion with transpose + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_right.view(), in_left.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_right(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } + static_assert( + detail::mdspan_copyable_with_kernel_v, + "Current implementation should use kernel for this copy"); + copy(res, out_left.view(), in_right.view()); + res.sync_stream(); + for (auto i = std::uint32_t{}; i < rows; ++i) { + for (auto j = std::uint32_t{}; j < cols; ++j) { + ASSERT_TRUE(match( + double(out_left(i, j)), double(gen_unique_entry(i, j)), CompareApprox{0.0001})); + } + } +} + +} // namespace raft diff --git a/cpp/test/core/stream_view.cpp b/cpp/test/core/stream_view.cpp new file mode 100644 index 0000000000..715c53fe21 --- /dev/null +++ b/cpp/test/core/stream_view.cpp @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#ifndef RAFT_DISABLE_CUDA +#include +#endif +namespace raft { +TEST(StreamView, Default) +{ + auto stream = stream_view_per_thread; + ASSERT_EQ(stream.is_per_thread_default(), raft::CUDA_ENABLED); + ASSERT_FALSE(stream.is_default()); + if (raft::CUDA_ENABLED) { + EXPECT_NO_THROW(stream.synchronize()); + EXPECT_NO_THROW(stream.interruptible_synchronize()); + } else { + EXPECT_THROW(stream.synchronize(), raft::non_cuda_build_error); + EXPECT_THROW(stream.interruptible_synchronize(), raft::non_cuda_build_error); + } + EXPECT_NO_THROW(stream.synchronize_no_throw()); + EXPECT_NO_THROW(stream.synchronize_if_cuda_enabled()); +#ifndef RAFT_DISABLE_CUDA + static_assert(std::is_same_v, + "underlying should return rmm::cuda_stream_view"); +#endif +} +} // namespace raft diff --git a/cpp/test/distance/gram.cu b/cpp/test/distance/gram.cu index d5fecd93c6..a9dbd8328f 100644 --- a/cpp/test/distance/gram.cu +++ b/cpp/test/distance/gram.cu @@ -99,9 +99,9 @@ class GramMatrixTest : public ::testing::TestWithParam { gram_host.resize(gram.size()); std::fill(gram_host.begin(), gram_host.end(), 0); - raft::random::Rng r(42137ULL); - r.uniform(x1.data(), x1.size(), math_t(0), math_t(1), stream); - r.uniform(x2.data(), x2.size(), math_t(0), math_t(1), stream); + raft::random::RngState rng(42137ULL); + raft::random::uniform(handle, rng, x1.data(), x1.size(), math_t(0), math_t(1)); + raft::random::uniform(handle, rng, x2.data(), x2.size(), math_t(0), math_t(1)); } ~GramMatrixTest() override {} diff --git a/cpp/test/linalg/reduce.cu b/cpp/test/linalg/reduce.cu index fd1b4e7b45..8578fe9637 100644 --- a/cpp/test/linalg/reduce.cu +++ b/cpp/test/linalg/reduce.cu @@ -124,7 +124,7 @@ class ReduceTest : public ::testing::TestWithParam(std::floor((24 - std::log2(dim)) / 2)); - rng.uniformInt(reinterpret_cast(ptr), size, 0u, resolution - 1, cuda_stream); + raft::random::uniformInt(handle, rng, reinterpret_cast(ptr), size, 0u, resolution - 1); GenerateRoundingErrorFreeDataset_kernel<<>>( ptr, size, resolution); @@ -293,13 +294,16 @@ class AnnCagraTest : public ::testing::TestWithParam { { database.resize(((size_t)ps.n_rows) * ps.dim, stream_); search_queries.resize(ps.n_queries * ps.dim, stream_); - raft::random::Rng r(1234ULL); + raft::random::RngState r(1234ULL); if constexpr (std::is_same{}) { - r.normal(database.data(), ps.n_rows * ps.dim, DataT(0.1), DataT(2.0), stream_); - r.normal(search_queries.data(), ps.n_queries * ps.dim, DataT(0.1), DataT(2.0), stream_); + raft::random::normal(handle_, r, database.data(), ps.n_rows * ps.dim, DataT(0.1), DataT(2.0)); + raft::random::normal( + handle_, r, search_queries.data(), ps.n_queries * ps.dim, DataT(0.1), DataT(2.0)); } else { - r.uniformInt(database.data(), ps.n_rows * ps.dim, DataT(1), DataT(20), stream_); - r.uniformInt(search_queries.data(), ps.n_queries * ps.dim, DataT(1), DataT(20), stream_); + raft::random::uniformInt( + handle_, r, database.data(), ps.n_rows * ps.dim, DataT(1), DataT(20)); + raft::random::uniformInt( + handle_, r, search_queries.data(), ps.n_queries * ps.dim, DataT(1), DataT(20)); } resource::sync_stream(handle_); } @@ -379,11 +383,12 @@ class AnnCagraSortTest : public ::testing::TestWithParam { void SetUp() override { database.resize(((size_t)ps.n_rows) * ps.dim, handle_.get_stream()); - raft::random::Rng r(1234ULL); + raft::random::RngState r(1234ULL); if constexpr (std::is_same{}) { - GenerateRoundingErrorFreeDataset(database.data(), ps.n_rows, ps.dim, r, handle_.get_stream()); + 
GenerateRoundingErrorFreeDataset(handle_, database.data(), ps.n_rows, ps.dim, r); } else { - r.uniformInt(database.data(), ps.n_rows * ps.dim, DataT(1), DataT(20), handle_.get_stream()); + raft::random::uniformInt( + handle_, r, database.data(), ps.n_rows * ps.dim, DataT(1), DataT(20)); } handle_.sync_stream(); } @@ -643,13 +648,16 @@ class AnnCagraFilterTest : public ::testing::TestWithParam { { database.resize(((size_t)ps.n_rows) * ps.dim, stream_); search_queries.resize(ps.n_queries * ps.dim, stream_); - raft::random::Rng r(1234ULL); + raft::random::RngState r(1234ULL); if constexpr (std::is_same{}) { - r.normal(database.data(), ps.n_rows * ps.dim, DataT(0.1), DataT(2.0), stream_); - r.normal(search_queries.data(), ps.n_queries * ps.dim, DataT(0.1), DataT(2.0), stream_); + raft::random::normal(handle_, r, database.data(), ps.n_rows * ps.dim, DataT(0.1), DataT(2.0)); + raft::random::normal( + handle_, r, search_queries.data(), ps.n_queries * ps.dim, DataT(0.1), DataT(2.0)); } else { - r.uniformInt(database.data(), ps.n_rows * ps.dim, DataT(1), DataT(20), stream_); - r.uniformInt(search_queries.data(), ps.n_queries * ps.dim, DataT(1), DataT(20), stream_); + raft::random::uniformInt( + handle_, r, database.data(), ps.n_rows * ps.dim, DataT(1), DataT(20)); + raft::random::uniformInt( + handle_, r, search_queries.data(), ps.n_queries * ps.dim, DataT(1), DataT(20)); } resource::sync_stream(handle_); } diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh index 71d48cdeb7..7b1d32ca83 100644 --- a/cpp/test/neighbors/ann_ivf_flat.cuh +++ b/cpp/test/neighbors/ann_ivf_flat.cuh @@ -411,13 +411,17 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { database.resize(ps.num_db_vecs * ps.dim, stream_); search_queries.resize(ps.num_queries * ps.dim, stream_); - raft::random::Rng r(1234ULL); + raft::random::RngState r(1234ULL); if constexpr (std::is_same{}) { - r.uniform(database.data(), ps.num_db_vecs * ps.dim, DataT(0.1), DataT(2.0), stream_); - r.uniform(search_queries.data(), ps.num_queries * ps.dim, DataT(0.1), DataT(2.0), stream_); + raft::random::uniform( + handle_, r, database.data(), ps.num_db_vecs * ps.dim, DataT(0.1), DataT(2.0)); + raft::random::uniform( + handle_, r, search_queries.data(), ps.num_queries * ps.dim, DataT(0.1), DataT(2.0)); } else { - r.uniformInt(database.data(), ps.num_db_vecs * ps.dim, DataT(1), DataT(20), stream_); - r.uniformInt(search_queries.data(), ps.num_queries * ps.dim, DataT(1), DataT(20), stream_); + raft::random::uniformInt( + handle_, r, database.data(), ps.num_db_vecs * ps.dim, DataT(1), DataT(20)); + raft::random::uniformInt( + handle_, r, search_queries.data(), ps.num_queries * ps.dim, DataT(1), DataT(20)); } resource::sync_stream(handle_); } diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh index e03d09ae50..d1f5ee5b03 100644 --- a/cpp/test/neighbors/ann_ivf_pq.cuh +++ b/cpp/test/neighbors/ann_ivf_pq.cuh @@ -170,13 +170,17 @@ class ivf_pq_test : public ::testing::TestWithParam { database.resize(size_t{ps.num_db_vecs} * size_t{ps.dim}, stream_); search_queries.resize(size_t{ps.num_queries} * size_t{ps.dim}, stream_); - raft::random::Rng r(1234ULL); + raft::random::RngState r(1234ULL); if constexpr (std::is_same{}) { - r.uniform(database.data(), ps.num_db_vecs * ps.dim, DataT(0.1), DataT(2.0), stream_); - r.uniform(search_queries.data(), ps.num_queries * ps.dim, DataT(0.1), DataT(2.0), stream_); + raft::random::uniform( + handle_, r, database.data(), ps.num_db_vecs * ps.dim, DataT(0.1), 
DataT(2.0)); + raft::random::uniform( + handle_, r, search_queries.data(), ps.num_queries * ps.dim, DataT(0.1), DataT(2.0)); } else { - r.uniformInt(database.data(), ps.num_db_vecs * ps.dim, DataT(1), DataT(20), stream_); - r.uniformInt(search_queries.data(), ps.num_queries * ps.dim, DataT(1), DataT(20), stream_); + raft::random::uniformInt( + handle_, r, database.data(), ps.num_db_vecs * ps.dim, DataT(1), DataT(20)); + raft::random::uniformInt( + handle_, r, search_queries.data(), ps.num_queries * ps.dim, DataT(1), DataT(20)); } resource::sync_stream(handle_); } diff --git a/cpp/test/neighbors/ann_nn_descent.cuh b/cpp/test/neighbors/ann_nn_descent.cuh index 948323cf6e..d62b863437 100644 --- a/cpp/test/neighbors/ann_nn_descent.cuh +++ b/cpp/test/neighbors/ann_nn_descent.cuh @@ -121,11 +121,12 @@ class AnnNNDescentTest : public ::testing::TestWithParam { void SetUp() override { database.resize(((size_t)ps.n_rows) * ps.dim, stream_); - raft::random::Rng r(1234ULL); + raft::random::RngState r(1234ULL); if constexpr (std::is_same{}) { - r.normal(database.data(), ps.n_rows * ps.dim, DataT(0.1), DataT(2.0), stream_); + raft::random::normal(handle_, r, database.data(), ps.n_rows * ps.dim, DataT(0.1), DataT(2.0)); } else { - r.uniformInt(database.data(), ps.n_rows * ps.dim, DataT(1), DataT(20), stream_); + raft::random::uniformInt( + handle_, r, database.data(), ps.n_rows * ps.dim, DataT(1), DataT(20)); } resource::sync_stream(handle_); } diff --git a/cpp/test/neighbors/ann_utils.cuh b/cpp/test/neighbors/ann_utils.cuh index be60ec5b6d..18860100f3 100644 --- a/cpp/test/neighbors/ann_utils.cuh +++ b/cpp/test/neighbors/ann_utils.cuh @@ -123,13 +123,13 @@ struct idx_dist_pair { idx_dist_pair(IdxT x, DistT y, CompareDist op) : idx(x), dist(y), eq_compare(op) {} }; +/** Calculate recall value using only neighbor indices + */ template -auto eval_recall(const std::vector& expected_idx, +auto calc_recall(const std::vector& expected_idx, const std::vector& actual_idx, size_t rows, - size_t cols, - double eps, - double min_recall) -> testing::AssertionResult + size_t cols) { size_t match_count = 0; size_t total_count = static_cast(rows) * static_cast(cols); @@ -147,8 +147,21 @@ auto eval_recall(const std::vector& expected_idx, } } } - double actual_recall = static_cast(match_count) / static_cast(total_count); - double error_margin = (actual_recall - min_recall) / std::max(1.0 - min_recall, eps); + return std::make_tuple( + static_cast(match_count) / static_cast(total_count), match_count, total_count); +} + +template +auto eval_recall(const std::vector& expected_idx, + const std::vector& actual_idx, + size_t rows, + size_t cols, + double eps, + double min_recall) -> testing::AssertionResult +{ + auto [actual_recall, match_count, total_count] = + calc_recall(expected_idx, actual_idx, rows, cols); + double error_margin = (actual_recall - min_recall) / std::max(1.0 - min_recall, eps); RAFT_LOG_INFO("Recall = %f (%zu/%zu), the error is %2.1f%% %s the threshold (eps = %f).", actual_recall, match_count, @@ -164,17 +177,16 @@ auto eval_recall(const std::vector& expected_idx, return testing::AssertionSuccess(); } -/** same as eval_recall, but in case indices do not match, - * then check distances as well, and accept match if actual dist is equal to expected_dist */ +/** Overload of calc_recall to account for distances + */ template -auto eval_neighbours(const std::vector& expected_idx, - const std::vector& actual_idx, - const std::vector& expected_dist, - const std::vector& actual_dist, - size_t rows, - size_t cols, 
- double eps, - double min_recall) -> testing::AssertionResult +auto calc_recall(const std::vector& expected_idx, + const std::vector& actual_idx, + const std::vector& expected_dist, + const std::vector& actual_dist, + size_t rows, + size_t cols, + double eps) { size_t match_count = 0; size_t total_count = static_cast(rows) * static_cast(cols); @@ -196,8 +208,25 @@ auto eval_neighbours(const std::vector& expected_idx, } } } - double actual_recall = static_cast(match_count) / static_cast(total_count); - double error_margin = (actual_recall - min_recall) / std::max(1.0 - min_recall, eps); + return std::make_tuple( + static_cast(match_count) / static_cast(total_count), match_count, total_count); +} + +/** same as eval_recall, but in case indices do not match, + * then check distances as well, and accept match if actual dist is equal to expected_dist */ +template +auto eval_neighbours(const std::vector& expected_idx, + const std::vector& actual_idx, + const std::vector& expected_dist, + const std::vector& actual_dist, + size_t rows, + size_t cols, + double eps, + double min_recall) -> testing::AssertionResult +{ + auto [actual_recall, match_count, total_count] = + calc_recall(expected_idx, actual_idx, expected_dist, actual_dist, rows, cols, eps); + double error_margin = (actual_recall - min_recall) / std::max(1.0 - min_recall, eps); RAFT_LOG_INFO("Recall = %f (%zu/%zu), the error is %2.1f%% %s the threshold (eps = %f).", actual_recall, match_count, diff --git a/cpp/test/random/rmat_rectangular_generator.cu b/cpp/test/random/rmat_rectangular_generator.cu index 1af3d2be31..77af44f133 100644 --- a/cpp/test/random/rmat_rectangular_generator.cu +++ b/cpp/test/random/rmat_rectangular_generator.cu @@ -178,7 +178,7 @@ class RmatGenTest : public ::testing::TestWithParam { max_scale{std::max(params.r_scale, params.c_scale)} { theta.resize(4 * max_scale, stream); - uniform(state, theta.data(), theta.size(), 0.0f, 1.0f, stream); + uniform(handle, state, theta.data(), theta.size(), 0.0f, 1.0f); normalize(theta.data(), theta.data(), max_scale, @@ -271,7 +271,7 @@ class RmatGenMdspanTest : public ::testing::TestWithParam { max_scale{std::max(params.r_scale, params.c_scale)} { theta.resize(4 * max_scale, stream); - uniform(state, theta.data(), theta.size(), 0.0f, 1.0f, stream); + uniform(handle, state, theta.data(), theta.size(), 0.0f, 1.0f); normalize(theta.data(), theta.data(), max_scale, diff --git a/cpp/test/sparse/gram.cu b/cpp/test/sparse/gram.cu index 7b4736a08c..ca43aa83b9 100644 --- a/cpp/test/sparse/gram.cu +++ b/cpp/test/sparse/gram.cu @@ -125,7 +125,7 @@ class GramMatrixTest : public ::testing::TestWithParam { protected: GramMatrixTest() : params(GetParam()), - stream(0), + stream(resource::get_cuda_stream(handle)), x1(0, stream), x2(0, stream), x1_csr_indptr(0, stream), @@ -137,8 +137,6 @@ class GramMatrixTest : public ::testing::TestWithParam { gram(0, stream), gram_host(0) { - RAFT_CUDA_TRY(cudaStreamCreate(&stream)); - if (params.ld1 == 0) { params.ld1 = params.is_row_major ? params.n_cols : params.n1; } if (params.ld2 == 0) { params.ld2 = params.is_row_major ? params.n_cols : params.n2; } if (params.ld_out == 0) { params.ld_out = params.is_row_major ? 
params.n2 : params.n1; } @@ -154,14 +152,14 @@ class GramMatrixTest : public ::testing::TestWithParam { gram_host.resize(gram.size()); std::fill(gram_host.begin(), gram_host.end(), 0); - raft::random::Rng r(42137ULL); - r.uniform(x1.data(), x1.size(), math_t(0), math_t(1), stream); - r.uniform(x2.data(), x2.size(), math_t(0), math_t(1), stream); + raft::random::RngState r(42137ULL); + raft::random::uniform(handle, r, x1.data(), x1.size(), math_t(0), math_t(1)); + raft::random::uniform(handle, r, x2.data(), x2.size(), math_t(0), math_t(1)); RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); } - ~GramMatrixTest() override { RAFT_CUDA_TRY_NO_THROW(cudaStreamDestroy(stream)); } + ~GramMatrixTest() override {} int prepareCsr(math_t* dense, int n_rows, int ld, int* indptr, int* indices, math_t* data) { diff --git a/cpp/test/stats/neighborhood_recall.cu b/cpp/test/stats/neighborhood_recall.cu new file mode 100644 index 0000000000..43ae7059bd --- /dev/null +++ b/cpp/test/stats/neighborhood_recall.cu @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "../neighbors/ann_utils.cuh" +#include "../test_utils.h" + +#include +#include +#include +#include + +#include + +#include +#include + +#include + +namespace raft::stats { + +struct NeighborhoodRecallInputs { + int n_rows; + int n_cols; + int k; +}; + +template +class NeighborhoodRecallTest : public ::testing::TestWithParam { + public: + NeighborhoodRecallTest() + : ps{::testing::TestWithParam::GetParam()}, + data_1{raft::make_device_matrix(res, ps.n_rows, ps.n_cols)}, + data_2{raft::make_device_matrix(res, ps.n_rows, ps.n_cols)} + { + } + + protected: + void test_recall() + { + size_t queries_size = ps.n_rows * ps.k; + + // calculate nn for dataset 1 + auto distances_1 = raft::make_device_matrix(res, ps.n_rows, ps.k); + auto indices_1 = raft::make_device_matrix(res, ps.n_rows, ps.k); + raft::neighbors::naive_knn( + res, + distances_1.data_handle(), + indices_1.data_handle(), + data_1.data_handle(), + data_1.data_handle(), + ps.n_rows, + ps.n_rows, + ps.n_cols, + ps.k, + raft::distance::DistanceType::L2Expanded); + std::vector distances_1_h(queries_size); + std::vector indices_1_h(queries_size); + raft::copy(distances_1_h.data(), + distances_1.data_handle(), + ps.n_rows * ps.k, + raft::resource::get_cuda_stream(res)); + raft::copy(indices_1_h.data(), + indices_1.data_handle(), + ps.n_rows * ps.k, + raft::resource::get_cuda_stream(res)); + + // calculate nn for dataset 2 + auto distances_2 = raft::make_device_matrix(res, ps.n_rows, ps.k); + auto indices_2 = raft::make_device_matrix(res, ps.n_rows, ps.k); + raft::neighbors::naive_knn( + res, + distances_2.data_handle(), + indices_2.data_handle(), + data_2.data_handle(), + data_2.data_handle(), + ps.n_rows, + ps.n_rows, + ps.n_cols, + ps.k, + raft::distance::DistanceType::L2Expanded); + std::vector distances_2_h(queries_size); + std::vector indices_2_h(queries_size); + raft::copy(distances_2_h.data(), + 
distances_2.data_handle(),
+               ps.n_rows * ps.k,
+               raft::resource::get_cuda_stream(res));
+    raft::copy(indices_2_h.data(),
+               indices_2.data_handle(),
+               ps.n_rows * ps.k,
+               raft::resource::get_cuda_stream(res));
+
+    raft::resource::sync_stream(res);
+
+    // find CPU recall scores
+    [[maybe_unused]] auto [indices_only_recall_h, mc1, tc1] =
+      raft::neighbors::calc_recall(indices_1_h, indices_2_h, ps.n_rows, ps.k);
+    [[maybe_unused]] auto [recall_h, mc2, tc2] = raft::neighbors::calc_recall(
+      indices_1_h, indices_2_h, distances_1_h, distances_2_h, ps.n_rows, ps.k, 0.001);
+
+    // find GPU recall scores
+    auto s1                         = 0;
+    auto indices_only_recall_scalar = raft::make_host_scalar(s1);
+    neighborhood_recall(res,
+                        raft::make_const_mdspan(indices_1.view()),
+                        raft::make_const_mdspan(indices_2.view()),
+                        indices_only_recall_scalar.view());
+
+    auto s2            = 0;
+    auto recall_scalar = raft::make_host_scalar(s2);
+    DistanceT s3       = 0.001;
+    auto eps_mda       = raft::make_host_scalar(s3);
+
+    neighborhood_recall(res,
+                        raft::make_const_mdspan(indices_1.view()),
+                        raft::make_const_mdspan(indices_2.view()),
+                        recall_scalar.view(),
+                        raft::make_const_mdspan(distances_1.view()),
+                        raft::make_const_mdspan(distances_2.view()));
+
+    // assert correctness
+    ASSERT_TRUE(raft::match(indices_only_recall_h,
+                            *indices_only_recall_scalar.data_handle(),
+                            raft::CompareApprox(0.01)));
+    ASSERT_TRUE(
+      raft::match(recall_h, *recall_scalar.data_handle(), raft::CompareApprox(0.01)));
+  }
+
+  void SetUp() override
+  {
+    // form two random datasets
+    raft::random::Rng r1(1234ULL);
+    r1.normal(data_1.data_handle(),
+              ps.n_rows * ps.n_cols,
+              DistanceT(0.1),
+              DistanceT(2.0),
+              raft::resource::get_cuda_stream(res));
+    raft::random::Rng r2(21111ULL);
+    r2.normal(data_2.data_handle(),
+              ps.n_rows * ps.n_cols,
+              DistanceT(0.1),
+              DistanceT(2.0),
+              raft::resource::get_cuda_stream(res));
+    resource::sync_stream(res);
+  }
+
+ private:
+  raft::resources res;
+  NeighborhoodRecallInputs ps;
+  raft::device_matrix data_1;
+  raft::device_matrix data_2;
+};
+
+const std::vector<NeighborhoodRecallInputs> inputs =
+  raft::util::itertools::product<NeighborhoodRecallInputs>({10, 50, 100},  // n_rows
+                                                           {80, 100},      // n_cols
+                                                           {32, 64});      // k
+
+using NeighborhoodRecallTestF_U32 = NeighborhoodRecallTest;
+TEST_P(NeighborhoodRecallTestF_U32, AnnCagra) { this->test_recall(); }
+
+INSTANTIATE_TEST_CASE_P(NeighborhoodRecallTest,
+                        NeighborhoodRecallTestF_U32,
+                        ::testing::ValuesIn(inputs));
+
+}  // end namespace raft::stats
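As a reading aid from us (not part of the patch): with $D$ query rows and $k$ neighbors per row, the score that both the host helpers and the device kernel compute is

```latex
\mathrm{recall} \;=\;
  \frac{\#\bigl\{(i,j) \;:\; \exists\, j' \text{ such that } \mathrm{indices}(i,j) = \mathrm{ref\_indices}(i,j')\bigr\}}{D \cdot k}
```

with the optional `eps` rule widening the numerator to include distance-based matches.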
diff --git a/cpp/test/test_utils.cuh b/cpp/test/test_utils.cuh index 5704eefae3..1afa7acc83 100644 --- a/cpp/test/test_utils.cuh +++ b/cpp/test/test_utils.cuh @@ -1,5 +1,5 @@ /*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -228,38 +228,39 @@ testing::AssertionResult diagonalMatch(
 }
 
 template <typename T, typename IdxT>
-typename std::enable_if_t<std::is_floating_point_v<T>> gen_uniform(T* out,
+typename std::enable_if_t<std::is_floating_point_v<T>> gen_uniform(const raft::resources& handle,
+                                                                   T* out,
                                                                    raft::random::RngState& rng,
                                                                    IdxT len,
-                                                                   cudaStream_t stream,
                                                                    T range_min = T(-1),
                                                                    T range_max = T(1))
 {
-  raft::random::uniform(rng, out, len, range_min, range_max, stream);
+  raft::random::uniform(handle, rng, out, len, range_min, range_max);
 }
 
 template <typename T, typename IdxT>
-typename std::enable_if_t<std::is_integral_v<T>> gen_uniform(T* out,
+typename std::enable_if_t<std::is_integral_v<T>> gen_uniform(const raft::resources& handle,
+                                                             T* out,
                                                              raft::random::RngState& rng,
                                                              IdxT len,
-                                                             cudaStream_t stream,
                                                              T range_min = T(0),
                                                              T range_max = T(100))
 {
-  raft::random::uniformInt(rng, out, len, range_min, range_max, stream);
+  raft::random::uniformInt(handle, rng, out, len, range_min, range_max);
 }
 
 template <typename T1, typename T2, typename IdxT>
-void gen_uniform(raft::KeyValuePair<T1, T2>* out,
+void gen_uniform(const raft::resources& handle,
+                 raft::KeyValuePair<T1, T2>* out,
                  raft::random::RngState& rng,
-                 IdxT len,
-                 cudaStream_t stream)
+                 IdxT len)
 {
+  auto stream = resource::get_cuda_stream(handle);
   rmm::device_uvector<T1> keys(len, stream);
   rmm::device_uvector<T2> values(len, stream);
-  gen_uniform(keys.data(), rng, len, stream);
-  gen_uniform(values.data(), rng, len, stream);
+  gen_uniform(handle, keys.data(), rng, len);
+  gen_uniform(handle, values.data(), rng, len);
 
   const T1* d_keys   = keys.data();
   const T2* d_values = values.data();
diff --git a/cpp/test/util/bitonic_sort.cu b/cpp/test/util/bitonic_sort.cu index 2cf5420334..f928480b54 100644 --- a/cpp/test/util/bitonic_sort.cu +++ b/cpp/test/util/bitonic_sort.cu @@ -109,6 +109,7 @@ class BitonicTest : public testing::TestWithParam {  // NOLINT
   std::vector<T> in;   // NOLINT
   std::vector<T> out;  // NOLINT
   std::vector<T> ref;  // NOLINT
+  raft::resources handle_;
 
   void segmented_sort(std::vector<T>& vec, int k, bool ascending)  // NOLINT
   {
@@ -128,14 +129,14 @@
     }
   }
 
-  void fill_random(rmm::device_uvector<T>& arr, rmm::cuda_stream_view stream)
+  void fill_random(rmm::device_uvector<T>& arr)
   {
-    raft::random::Rng rng(42);
+    raft::random::RngState rng(42);
     if constexpr (std::is_floating_point_v<T>) {
-      return rng.normal(arr.data(), arr.size(), T(10), T(100), stream);
+      return raft::random::normal(handle_, rng, arr.data(), arr.size(), T(10), T(100));
     }
     if constexpr (std::is_integral_v<T>) {
-      return rng.normalInt(arr.data(), arr.size(), T(10), T(100), stream);
+      return raft::random::normalInt(handle_, rng, arr.data(), arr.size(), T(10), T(100));
     }
   }
 
@@ -146,11 +147,11 @@
     out(spec.len()),
     ref(spec.len())
   {
-    auto stream = rmm::cuda_stream_default;
+    auto stream = resource::get_cuda_stream(handle_);
 
     // generate input
     rmm::device_uvector<T> arr_d(spec.len(), stream);
-    fill_random(arr_d, stream);
+    fill_random(arr_d);
     update_host(in.data(), arr_d.data(), arr_d.size(), stream);
 
     // calculate the results
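The recurring change in the test updates above (`gram.cu`, `bitonic_sort.cu`, `test_utils.cuh`, and the ANN test fixtures) replaces the stream-based `raft::random::Rng` member functions with handle-first free functions over a `raft::random::RngState`. A minimal before/after sketch (our own illustration, not part of the patch):

```cpp
#include <raft/core/device_resources.hpp>
#include <raft/random/rng.cuh>

#include <rmm/device_uvector.hpp>

void fill_example(raft::device_resources const& handle)
{
  auto stream = handle.get_stream();
  rmm::device_uvector<float> buf(1024, stream);

  // Old API (removed throughout this diff): seed an Rng and pass the stream explicitly.
  //   raft::random::Rng r(1234ULL);
  //   r.uniform(buf.data(), buf.size(), 0.0f, 1.0f, stream);

  // New API: a free function taking the resources handle and a persistent RngState.
  raft::random::RngState rng(1234ULL);
  raft::random::uniform(handle, rng, buf.data(), buf.size(), 0.0f, 1.0f);
}
```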
a/docs/source/ann_benchmarks_param_tuning.md +++ b/docs/source/ann_benchmarks_param_tuning.md @@ -53,6 +53,7 @@ CAGRA uses a graph-based index, which creates an intermediate, approximate kNN g | `graph_degree` | `build_param` | N | Positive Integer >0 | 64 | Degree of the final kNN graph index. | | `intermediate_graph_degree` | `build_param` | N | Positive Integer >0 | 128 | Degree of the intermediate kNN graph. | | `graph_build_algo` | `build_param` | N | ["IVF_PQ", "NN_DESCENT"] | "IVF_PQ" | Algorithm to use for building the kNN graph | +| `nn_descent_niter` | `build_param` | N | Positive Integer >0 | 20 | Number of iterations when using NN_DESCENT. | | `dataset_memory_type` | `build_param` | N | ["device", "host", "mmap"] | "device" | What memory type should the dataset reside in? | | `query_memory_type` | `search_params` | N | ["device", "host", "mmap"] | "device" | What memory type should the queries reside in? | | `itopk` | `search_params` | N | Positive Integer >0 | 64 | Number of intermediate search results retained during the search. Higher values improve search accuracy at the cost of speed. | diff --git a/docs/source/cpp_api/stats_neighborhood.rst b/docs/source/cpp_api/stats_neighborhood.rst index f80e349c3b..7c7ad90a49 100644 --- a/docs/source/cpp_api/stats_neighborhood.rst +++ b/docs/source/cpp_api/stats_neighborhood.rst @@ -16,3 +16,15 @@ namespace *raft::stats* :project: RAFT :members: :content-only: + +Neighborhood Recall +------------------- + +``#include <raft/stats/neighborhood_recall.cuh>`` + +namespace *raft::stats* + +.. doxygengroup:: stats_neighborhood_recall + :project: RAFT + :members: + :content-only: diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md index 132e38984c..2e8572c299 100644 --- a/docs/source/raft_ann_benchmarks.md +++ b/docs/source/raft_ann_benchmarks.md @@ -2,9 +2,26 @@ This project provides a benchmark program for various ANN search implementations. It's especially suitable for comparing GPU implementations as well as comparing GPU against CPU. -## Installing the benchmarks +## Table of Contents -The easiest way to install these benchmarks is through conda. We provide packages for GPU enabled systems, as well for systems without a GPU. We suggest using mamba as it generally leads to a faster install time: +- [Installing and Running the Benchmarks](#installing-and-running-the-benchmarks) + - [Using conda](#conda) + - [Using Docker](#docker) +- [End-to-end example: Million-scale](#end-to-end-example-million-scale) +- [End-to-end example: Billion-scale](#end-to-end-example-billion-scale) +- [Creating and customizing dataset configurations](#creating-and-customizing-dataset-configurations) +- [Adding a new ANN algorithm](#adding-a-new-ann-algorithm) + +## Installing and Running the Benchmarks + +There are two main ways the pre-compiled benchmarks are distributed: + +- [Conda](#conda): A great solution for users who don't use containers but want an easy-to-install and easy-to-use Python package. Pip wheels are planned as an alternative for users who cannot use conda and prefer not to use containers. +- [Docker](#docker): A great solution that requires only Docker and the NVIDIA Container Toolkit. Provides a single `docker run` command for basic dataset benchmarking, as well as all the functionality of the conda solution inside the containers. + +## Conda + +If containers are not an option or not preferred, the easiest way to install the ANN benchmarks is through conda. We provide packages for GPU-enabled systems, as well as for systems without a GPU.
We suggest using mamba as it generally leads to a faster install time: ```bash @@ -24,7 +41,7 @@ Please see the [build instructions](ann_benchmarks_build.md) to build the benchm ## Running the benchmarks -### Usage +### Python Package Usage There are 4 general steps to running the benchmarks and visualizing the results: 1. Prepare Dataset 2. Build Index and Search Index @@ -60,12 +77,112 @@ Configuration files already exist for the following list of the million-scale da - `fashion-mnist-784-euclidean` - `glove-50-angular` - `glove-100-angular` -- `lastfm-65-angular` - `mnist-784-euclidean` - `nytimes-256-angular` - `sift-128-euclidean` -### End-to-end example: Billion-scale +## Docker + +We provide images for GPU-enabled systems, as well as for systems without a GPU. The following images are available: + +- `raft-ann-bench`: Contains GPU and CPU benchmarks and can run all supported algorithms. Will download million-scale datasets as required. Best suited for users who prefer a smaller container size for GPU-based systems. Requires the NVIDIA Container Toolkit to run GPU algorithms; CPU algorithms can run without it. +- `raft-ann-bench-datasets`: Contains the GPU and CPU benchmarks with million-scale datasets already included in the container. Best suited for users who want to run multiple million-scale datasets already included in the image. +- `raft-ann-bench-cpu`: Contains only CPU benchmarks with minimal size. Best suited for users who want the smallest containers to reproduce benchmarks on systems without a GPU. + +Nightly images are located on [dockerhub](https://hub.docker.com/r/rapidsai/raft-ann-bench), while release (stable) versions are located in [NGC](https://hub.docker.com/r/rapidsai/raft-ann-bench), starting with release 23.10. + +- The following command pulls the nightly container for Python version 3.10, CUDA version 12.0, and RAFT version 23.10: + +```bash +docker pull rapidsai/raft-ann-bench:23.10a-cuda12.0-py3.10 # substitute raft-ann-bench for the exact desired container. +``` + +The CUDA and Python versions can be changed to any of the supported values: + +Supported CUDA versions: 11.2 and 12.0. +Supported Python versions: 3.9 and 3.10. + +You can also see the exact available versions on the dockerhub site: + +- [RAFT ANN Benchmark images](https://hub.docker.com/r/rapidsai/raft-ann-bench/tags) + +- [RAFT ANN Benchmark with datasets preloaded images](https://hub.docker.com/r/rapidsai/raft-ann-bench-datasets/tags) + +- [RAFT ANN Benchmark CPU only images](https://hub.docker.com/r/rapidsai/raft-ann-bench-cpu/tags) + +**Note:** GPU containers use the CUDA toolkit from inside the container; the only requirement is a driver installed on the host machine that supports that version. So, for example, CUDA 11.8 containers can run on systems with a CUDA 12.x capable driver. + +- The following command (only available after the RAPIDS 23.10 release) pulls the container: + +```bash +docker pull nvcr.io/nvidia/rapidsai/raft-ann-bench:23.10-cuda11.8-py3.10 # substitute raft-ann-bench for the exact desired container. +```
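+
+To quickly verify that the host driver supports the CUDA version of a given image, you can run `nvidia-smi` inside the container. This is a hypothetical smoke test, not part of the benchmark tooling; adjust the image tag to the one you pulled:
+
+```bash
+# override the image entrypoint so only the driver check runs
+docker run --gpus all --rm --entrypoint nvidia-smi \
+    rapidsai/raft-ann-bench:23.10a-cuda12.0-py3.10
+```
+
+If the driver is incompatible, this command fails immediately instead of partway through a benchmark run.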
+ +### Container Usage + +The container can be used in two different ways: + +1. **Automated benchmark with a single `docker run` (easy mode)**: Helper scripts are included to ease running the benchmarks end-to-end: + +For GPU systems, where `$DATA_FOLDER` is a local folder where you want datasets stored in `$DATA_FOLDER/datasets` and results in `$DATA_FOLDER/result` (we highly recommend that `$DATA_FOLDER` be a dedicated folder for the datasets and results of the containers): + +```bash +export DATA_FOLDER=path/to/store/datasets/and/results +docker run --gpus all --rm -it -u $(id -u) \ + -v $DATA_FOLDER:/home/rapids/benchmarks \ + rapidsai/raft-ann-bench:23.10a-cuda11.8-py3.10 \ + "--dataset deep-image-96-angular" \ + "--normalize" \ + "--algorithms raft_cagra,raft_ivf_pq" \ + "" +``` + +Where: + +```bash +export DATA_FOLDER=path/to/store/datasets/and/results # <- local folder to store datasets and results +docker run --gpus all --rm -it -u $(id -u) \ + -v $DATA_FOLDER:/home/rapids/benchmarks \ + rapidsai/raft-ann-bench:23.10a-cuda11.8-py3.10 \ # <- image to use, either `raft-ann-bench` or `raft-ann-bench-datasets`, can choose RAPIDS, CUDA, and Python versions. + "--dataset deep-image-96-angular" \ # <- dataset name + "--normalize" \ # <- whether to normalize the dataset, leave string empty ("") to not normalize. + "--algorithms raft_cagra" \ # <- what algorithm(s) to use as a comma-separated list, as well as any other argument to pass to `raft_ann_benchmarks.run` + "" # optional arguments to pass to `raft_ann_benchmarks.plot` +``` + +**Note about user and file permissions:** The flag `-u $(id -u)` allows the user inside the container to match the `uid` of the user outside the container, allowing the container to read and write to the mounted volume indicated by `$DATA_FOLDER`. + +For CPU systems the same interface applies, except that the `--gpus` argument is not needed and the CPU images are used: +```bash +export DATA_FOLDER=path/to/store/datasets/and/results +docker run --rm -it -u $(id -u) \ + -v $DATA_FOLDER:/home/rapids/benchmarks \ + rapidsai/raft-ann-bench-cpu:23.10a-py3.10 \ + "--dataset deep-image-96-angular" \ + "--normalize" \ + "--algorithms raft_cagra" \ + "" +``` + +**Note:** The user inside the containers is `root`. To work around this, the scripts in the containers fix the user of the output files after the benchmarks are run. If the benchmarks are interrupted, the owner of the `datasets/results` produced by the container will be wrong, and will need to be manually fixed by the user. + +2. **Using the preinstalled `raft_ann_benchmarks` python package (advanced mode)**: The docker containers are built using the conda packages described in the following section, so they can be used directly as if they had been installed manually. This is recommended for advanced users, and is the option that allows the full flexibility of the benchmarking scripts. To use the python scripts directly, use the following command: + +```bash +export DATA_FOLDER=path/to/store/datasets/and/results +docker run --gpus all --rm -it -u $(id -u) \ + --entrypoint /bin/bash \ + -v $DATA_FOLDER:/home/rapids/benchmarks \ + rapidsai/raft-ann-bench:23.10a-cuda11.8-py3.10 +``` + +This will drop you into a command line in the container, with the `raft_ann_benchmarks` python package ready to use, as was described in the prior [conda section](#conda): + +``` +(base) root@00b068fbb862:/home/rapids# +``` + +Additionally, the containers can be run in detached mode without any issue.
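+
+For example, here is a minimal detached-mode sketch. It assumes the same image tag and `$DATA_FOLDER` convention as above; the container name `raft-bench` is arbitrary:
+
+```bash
+export DATA_FOLDER=path/to/store/datasets/and/results
+# start the container in the background, kept alive by a long-running shell
+docker run --gpus all -d -u $(id -u) \
+    --name raft-bench \
+    --entrypoint /bin/bash \
+    -v $DATA_FOLDER:/home/rapids/benchmarks \
+    rapidsai/raft-ann-bench:23.10a-cuda11.8-py3.10 \
+    -c "sleep infinity"
+# attach an interactive shell whenever needed
+docker exec -it raft-bench /bin/bash
+# stop and remove the container when done
+docker stop raft-bench && docker rm raft-bench
+```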
+ +## End-to-end example: Billion-scale `raft-ann-bench.get_dataset` cannot be used to download the [billion-scale datasets](ann_benchmarks_dataset.md#billion-scale) because they are so large. You should instead use our billion-scale datasets guide to download and prepare them. All other python scripts mentioned below work as intended once the @@ -102,7 +219,7 @@ options: Path to billion-scale dataset groundtruth file (default: None) ``` -##### Step 1: Prepare Dataset +#### Step 1: Prepare Dataset The script `raft-ann-bench.get_dataset` will download and unpack the dataset in a directory that the user provides. As of now, only million-scale datasets are supported by this script. For more information, see [datasets and formats](ann_benchmarks_dataset.md). @@ -123,7 +240,7 @@ When option `normalize` is provided to the script, any dataset that has cosine d will be normalized to inner product. So, for example, the dataset `glove-100-angular` will be written at location `datasets/glove-100-inner/`. -#### Step 2: Build and Search Index +### Step 2: Build and Search Index The script `raft-ann-bench.run` will build and search indices for a given dataset and its specified configuration. To configure which algorithms are available, we use `algos.yaml`. @@ -180,7 +297,7 @@ it is assumed both are `True`. `indices` and `algorithms`: these parameters ensure that the algorithm specified for an index is available in `algos.yaml` and not disabled, as well as having an associated executable. -#### Step 3: Data Export +### Step 3: Data Export The script `raft-ann-bench.data_export` will convert the intermediate JSON outputs produced by `raft-ann-bench.run` to more easily readable CSV files, which are needed to build charts made by `raft-ann-bench.plot`. diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/mnist-784-euclidean.json b/python/raft-ann-bench/src/raft-ann-bench/run/conf/mnist-784-euclidean.json index fff3bca1d7..6e982873bb 100644 --- a/python/raft-ann-bench/src/raft-ann-bench/run/conf/mnist-784-euclidean.json +++ b/python/raft-ann-bench/src/raft-ann-bench/run/conf/mnist-784-euclidean.json @@ -1,4 +1,4 @@ -{ +{ "dataset": { "name": "mnist-784-euclidean", "base_file": "mnist-784-euclidean/base.fbin",