diff --git a/cpp/include/raft/matrix/detail/gather.cuh b/cpp/include/raft/matrix/detail/gather.cuh index 73072ec841..b41176b533 100644 --- a/cpp/include/raft/matrix/detail/gather.cuh +++ b/cpp/include/raft/matrix/detail/gather.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,7 +17,15 @@ #pragma once #include +#include +#include +#include +#include +#include #include +#include +#include +#include #include namespace raft { @@ -335,6 +343,70 @@ void gather_if(const InputIteratorT in, gatherImpl(in, D, N, map, stencil, map_length, out, pred_op, transform_op, stream); } +template +void gather_buff(host_matrix_view dataset, + host_vector_view indices, + IdxT offset, + pinned_matrix_view buff) +{ + raft::common::nvtx::range fun_scope("gather_host_buff"); + IdxT batch_size = std::min(buff.extent(0), indices.extent(0) - offset); + +#pragma omp for + for (IdxT i = 0; i < batch_size; i++) { + IdxT in_idx = indices(offset + i); + for (IdxT k = 0; k < buff.extent(1); k++) { + buff(i, k) = dataset(in_idx, k); + } + } +} + +template +void gather(raft::resources const& res, + host_matrix_view dataset, + device_vector_view indices, + raft::device_matrix_view output) +{ + raft::common::nvtx::range fun_scope("gather"); + IdxT n_dim = output.extent(1); + IdxT n_train = output.extent(0); + auto indices_host = raft::make_host_vector(n_train); + raft::copy( + indices_host.data_handle(), indices.data_handle(), n_train, resource::get_cuda_stream(res)); + resource::sync_stream(res); + + const size_t max_batch_size = 32768; + // Gather the vector on the host in tmp buffers. We use two buffers to overlap H2D sync + // and gathering the data. + raft::common::nvtx::push_range("gather::alloc_buffers"); + auto out_tmp1 = raft::make_pinned_matrix(res, max_batch_size, n_dim); + auto out_tmp2 = raft::make_pinned_matrix(res, max_batch_size, n_dim); + auto view1 = out_tmp1.view(); + auto view2 = out_tmp2.view(); + raft::common::nvtx::pop_range(); + + gather_buff(dataset, make_const_mdspan(indices_host.view()), (IdxT)0, view1); +#pragma omp parallel + for (IdxT device_offset = 0; device_offset < n_train; device_offset += max_batch_size) { + IdxT batch_size = std::min(max_batch_size, n_train - device_offset); +#pragma omp master + raft::copy(output.data_handle() + device_offset * n_dim, + view1.data_handle(), + batch_size * n_dim, + resource::get_cuda_stream(res)); + // Start gathering the next batch on the host. + IdxT host_offset = device_offset + batch_size; + batch_size = std::min(max_batch_size, n_train - host_offset); + if (batch_size > 0) { + gather_buff(dataset, make_const_mdspan(indices_host.view()), host_offset, view2); + } +#pragma omp master + resource::sync_stream(res); +#pragma omp barrier + std::swap(view1, view2); + } +} + } // namespace detail } // namespace matrix } // namespace raft diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh index a35cb9e1f1..ab30b4009d 100644 --- a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -361,28 +361,23 @@ inline auto build(raft::resources const& handle, // Train the kmeans clustering { + int random_seed = 137; auto trainset_ratio = std::max( 1, n_rows / std::max(params.kmeans_trainset_fraction * n_rows, index.n_lists())); auto n_rows_train = n_rows / trainset_ratio; - rmm::device_uvector trainset(n_rows_train * index.dim(), stream); - // TODO: a proper sampling - RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset.data(), - sizeof(T) * index.dim(), - dataset, - sizeof(T) * index.dim() * trainset_ratio, - sizeof(T) * index.dim(), - n_rows_train, - cudaMemcpyDefault, - stream)); - auto trainset_const_view = - raft::make_device_matrix_view(trainset.data(), n_rows_train, index.dim()); + auto trainset = make_device_matrix(handle, n_rows_train, index.dim()); + raft::spatial::knn::detail::utils::subsample( + handle, dataset, n_rows, trainset.view(), random_seed); auto centers_view = raft::make_device_matrix_view( index.centers().data_handle(), index.n_lists(), index.dim()); raft::cluster::kmeans_balanced_params kmeans_params; kmeans_params.n_iters = params.kmeans_n_iters; kmeans_params.metric = index.metric(); - raft::cluster::kmeans_balanced::fit( - handle, kmeans_params, trainset_const_view, centers_view, utils::mapping{}); + raft::cluster::kmeans_balanced::fit(handle, + kmeans_params, + make_const_mdspan(trainset.view()), + centers_view, + utils::mapping{}); } // add the data if necessary diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh index 2dfb261f32..cc94511fe7 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh @@ -25,9 +25,11 @@ #include #include +#include #include #include #include +#include #include #include #include @@ -46,7 +48,6 @@ #include #include -#include #include #include #include @@ -63,51 +64,6 @@ namespace raft::neighbors::ivf_pq::detail { using namespace raft::spatial::knn::detail; // NOLINT -template -__launch_bounds__(BlockDim) RAFT_KERNEL copy_warped_kernel( - T* out, uint32_t ld_out, const S* in, uint32_t ld_in, uint32_t n_cols, size_t n_rows) -{ - using warp = Pow2; - size_t row_ix = warp::div(size_t(threadIdx.x) + size_t(BlockDim) * size_t(blockIdx.x)); - uint32_t i = warp::mod(threadIdx.x); - if (row_ix >= n_rows) return; - out += row_ix * ld_out; - in += row_ix * ld_in; - auto f = utils::mapping{}; - for (uint32_t col_ix = i; col_ix < n_cols; col_ix += warp::Value) { - auto x = f(in[col_ix]); - __syncwarp(); - out[col_ix] = x; - } -} - -/** - * Copy the data one warp-per-row: - * - * 1. load the data per-warp - * 2. apply the `utils::mapping{}` - * 3. sync within warp - * 4. store the data. - * - * Assuming sizeof(T) >= sizeof(S) and the data is properly aligned (see the usage in `build`), this - * allows to re-structure the data within rows in-place. - */ -template -void copy_warped(T* out, - uint32_t ld_out, - const S* in, - uint32_t ld_in, - uint32_t n_cols, - size_t n_rows, - rmm::cuda_stream_view stream) -{ - constexpr uint32_t kBlockDim = 128; - dim3 threads(kBlockDim, 1, 1); - dim3 blocks(div_rounding_up_safe(n_rows, kBlockDim / WarpSize), 1, 1); - copy_warped_kernel - <<>>(out, ld_out, in, ld_in, n_cols, n_rows); -} - /** * @brief Fill-in a random orthogonal transformation matrix. * @@ -1754,76 +1710,46 @@ auto build(raft::resources const& handle, utils::memzero(index.inds_ptrs().data_handle(), index.inds_ptrs().size(), stream); { + int random_seed = 137; auto trainset_ratio = std::max( 1, size_t(n_rows) / std::max(params.kmeans_trainset_fraction * n_rows, index.n_lists())); size_t n_rows_train = n_rows / trainset_ratio; - auto* device_memory = resource::get_workspace_resource(handle); - rmm::mr::managed_memory_resource managed_memory_upstream; + auto* device_mr = resource::get_workspace_resource(handle); + rmm::mr::managed_memory_resource managed_mr; // Besides just sampling, we transform the input dataset into floats to make it easier // to use gemm operations from cublas. - rmm::device_uvector trainset(n_rows_train * index.dim(), stream, device_memory); - // TODO: a proper sampling + auto trainset = + make_device_mdarray(handle, device_mr, make_extents(n_rows_train, dim)); + if constexpr (std::is_same_v) { - RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset.data(), - sizeof(T) * index.dim(), - dataset, - sizeof(T) * index.dim() * trainset_ratio, - sizeof(T) * index.dim(), - n_rows_train, - cudaMemcpyDefault, - stream)); + raft::spatial::knn::detail::utils::subsample( + handle, dataset, n_rows, trainset.view(), random_seed); } else { - size_t dim = index.dim(); - cudaPointerAttributes dataset_attr; - RAFT_CUDA_TRY(cudaPointerGetAttributes(&dataset_attr, dataset)); - if (dataset_attr.devicePointer != nullptr) { - // data is available on device: just run the kernel to copy and map the data - auto p = reinterpret_cast(dataset_attr.devicePointer); - auto trainset_view = - raft::make_device_vector_view(trainset.data(), dim * n_rows_train); - linalg::map_offset(handle, trainset_view, [p, trainset_ratio, dim] __device__(size_t i) { - auto col = i % dim; - return utils::mapping{}(p[(i - col) * size_t(trainset_ratio) + col]); - }); - } else { - // data is not available: first copy, then map inplace - auto trainset_tmp = reinterpret_cast(reinterpret_cast(trainset.data()) + - (sizeof(float) - sizeof(T)) * index.dim()); - // We copy the data in strides, one row at a time, and place the smaller rows of type T - // at the end of float rows. - RAFT_CUDA_TRY(cudaMemcpy2DAsync(trainset_tmp, - sizeof(float) * index.dim(), - dataset, - sizeof(T) * index.dim() * trainset_ratio, - sizeof(T) * index.dim(), - n_rows_train, - cudaMemcpyDefault, - stream)); - // Transform the input `{T -> float}`, one row per warp. - // The threads in each warp copy the data synchronously; this and the layout of the data - // (content is aligned to the end of the rows) together allow doing the transform in-place. - copy_warped(trainset.data(), - index.dim(), - trainset_tmp, - index.dim() * sizeof(float) / sizeof(T), - index.dim(), - n_rows_train, - stream); - } + // TODO(tfeher): Enable codebook generation with any type T, and then remove + // trainset tmp. + auto trainset_tmp = + make_device_mdarray(handle, &managed_mr, make_extents(n_rows_train, dim)); + raft::spatial::knn::detail::utils::subsample( + handle, dataset, n_rows, trainset_tmp.view(), random_seed); + cudaDeviceSynchronize(); + raft::linalg::unaryOp(trainset.data_handle(), + trainset_tmp.data_handle(), + trainset.size(), + utils::mapping{}, // raft::cast_op(), + raft::resource::get_cuda_stream(handle)); } // NB: here cluster_centers is used as if it is [n_clusters, data_dim] not [n_clusters, // dim_ext]! rmm::device_uvector cluster_centers_buf( - index.n_lists() * index.dim(), stream, device_memory); + index.n_lists() * index.dim(), stream, device_mr); auto cluster_centers = cluster_centers_buf.data(); // Train balanced hierarchical kmeans clustering - auto trainset_const_view = - raft::make_device_matrix_view(trainset.data(), n_rows_train, index.dim()); + auto trainset_const_view = raft::make_const_mdspan(trainset.view()); auto centers_view = raft::make_device_matrix_view(cluster_centers, index.n_lists(), index.dim()); raft::cluster::kmeans_balanced_params kmeans_params; @@ -1833,7 +1759,7 @@ auto build(raft::resources const& handle, handle, kmeans_params, trainset_const_view, centers_view, utils::mapping{}); // Trainset labels are needed for training PQ codebooks - rmm::device_uvector labels(n_rows_train, stream, device_memory); + rmm::device_uvector labels(n_rows_train, stream, device_mr); auto centers_const_view = raft::make_device_matrix_view( cluster_centers, index.n_lists(), index.dim()); auto labels_view = raft::make_device_vector_view(labels.data(), n_rows_train); @@ -1859,19 +1785,19 @@ auto build(raft::resources const& handle, train_per_subset(handle, index, n_rows_train, - trainset.data(), + trainset.data_handle(), labels.data(), params.kmeans_n_iters, - &managed_memory_upstream); + &managed_mr); break; case codebook_gen::PER_CLUSTER: train_per_cluster(handle, index, n_rows_train, - trainset.data(), + trainset.data_handle(), labels.data(), params.kmeans_n_iters, - &managed_memory_upstream); + &managed_mr); break; default: RAFT_FAIL("Unreachable code"); } diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh index c7823c2d38..e55dc82f5d 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,8 +16,18 @@ #pragma once +#include +#include +#include +#include +#include + #include +#include #include +#include +#include +#include #include #include #include @@ -573,4 +583,50 @@ struct batch_load_iterator { size_type cur_pos_; }; +template +auto get_subsample_indices(raft::resources const& res, IdxT n_samples, IdxT n_subsamples, int seed) + -> raft::device_vector +{ + RAFT_EXPECTS(n_subsamples <= n_samples, "Cannot have more training samples than dataset vectors"); + + auto data_indices = raft::make_device_vector(res, n_samples); + raft::linalg::map_offset(res, data_indices.view(), identity_op()); + raft::random::RngState rng(seed); + auto train_indices = raft::make_device_vector(res, n_subsamples); + raft::random::sample_without_replacement(res, + rng, + raft::make_const_mdspan(data_indices.view()), + std::nullopt, + train_indices.view(), + std::nullopt); + return train_indices; +} + +/** Subsample the dataset to create a training set*/ +template +void subsample(raft::resources const& res, + const T* input, + IdxT n_samples, + raft::device_matrix_view output, + int seed) +{ + IdxT n_dim = output.extent(1); + IdxT n_train = output.extent(0); + + raft::device_vector train_indices = + get_subsample_indices(res, n_samples, n_train, seed); + + cudaPointerAttributes attr; + RAFT_CUDA_TRY(cudaPointerGetAttributes(&attr, input)); + T* ptr = reinterpret_cast(attr.devicePointer); + if (ptr != nullptr) { + raft::matrix::gather(res, + raft::make_device_matrix_view(ptr, n_samples, n_dim), + raft::make_const_mdspan(train_indices.view()), + output); + } else { + auto dataset = raft::make_host_matrix_view(input, n_samples, n_dim); + raft::matrix::detail::gather(res, dataset, make_const_mdspan(train_indices.view()), output); + } +} } // namespace raft::spatial::knn::detail::utils diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh index eb30b60eca..f7e85db1c2 100644 --- a/cpp/test/neighbors/ann_ivf_pq.cuh +++ b/cpp/test/neighbors/ann_ivf_pq.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -68,7 +68,7 @@ struct ivf_pq_inputs { ivf_pq_inputs() { index_params.n_lists = max(32u, min(1024u, num_db_vecs / 128u)); - index_params.kmeans_trainset_fraction = 1.0; + index_params.kmeans_trainset_fraction = 0.95; } };