diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh index 7c2fa05bfe..9cde1143e0 100644 --- a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -416,4 +417,77 @@ inline void fill_refinement_index(raft::resources const& handle, refinement_index->veclen()); RAFT_CUDA_TRY(cudaPeekAtLastError()); } + +template +__global__ void pack_interleaved_list_kernel( + const T* codes, + T* list_data, + uint32_t n_rows, + uint32_t dim, + uint32_t veclen, + std::variant offset_or_indices) +{ + uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; + const uint32_t dst_ix = std::holds_alternative(offset_or_indices) + ? std::get(offset_or_indices) + tid + : std::get(offset_or_indices)[tid]; + if (tid < n_rows) { codepacker::pack_1(codes + tid * dim, list_data, dim, veclen, dst_ix); } +} + +template +__global__ void unpack_interleaved_list_kernel( + const T* list_data, + T* codes, + uint32_t n_rows, + uint32_t dim, + uint32_t veclen, + std::variant offset_or_indices) +{ + uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; + const uint32_t src_ix = std::holds_alternative(offset_or_indices) + ? std::get(offset_or_indices) + tid + : std::get(offset_or_indices)[tid]; + if (tid < n_rows) { codepacker::unpack_1(list_data, codes + tid * dim, dim, veclen, src_ix); } +} + +template +void pack_list_data( + raft::resources const& res, + device_matrix_view codes, + uint32_t veclen, + std::variant offset_or_indices, + device_mdspan::list_extents, row_major> list_data) +{ + uint32_t n_rows = codes.extent(0); + uint32_t dim = codes.extent(1); + if (n_rows == 0 || dim == 0) return; + static constexpr uint32_t kBlockSize = 256; + dim3 blocks(div_rounding_up_safe(n_rows, kBlockSize), 1, 1); + dim3 threads(kBlockSize, 1, 1); + auto stream = resource::get_cuda_stream(res); + pack_interleaved_list_kernel<<>>( + codes.data_handle(), list_data.data_handle(), n_rows, dim, veclen, offset_or_indices); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +template +void unpack_list_data( + raft::resources const& res, + device_mdspan::list_extents, row_major> list_data, + uint32_t veclen, + std::variant offset_or_indices, + device_matrix_view codes) +{ + uint32_t n_rows = codes.extent(0); + uint32_t dim = codes.extent(1); + if (n_rows == 0 || dim == 0) return; + static constexpr uint32_t kBlockSize = 256; + dim3 blocks(div_rounding_up_safe(n_rows, kBlockSize), 1, 1); + dim3 threads(kBlockSize, 1, 1); + auto stream = resource::get_cuda_stream(res); + unpack_interleaved_list_kernel<<>>( + list_data.data_handle(), codes.data_handle(), n_rows, dim, veclen, offset_or_indices); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + } // namespace raft::neighbors::ivf_flat::detail diff --git a/cpp/include/raft/neighbors/ivf_flat_codepacker.hpp b/cpp/include/raft/neighbors/ivf_flat_codepacker.hpp new file mode 100644 index 0000000000..4594332fdf --- /dev/null +++ b/cpp/include/raft/neighbors/ivf_flat_codepacker.hpp @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#ifdef _RAFT_HAS_CUDA +#include +#else +#include +#endif + +namespace raft::neighbors::ivf_flat::codepacker { + +template +_RAFT_HOST_DEVICE inline auto roundDown(T x) +{ +#if defined(_RAFT_HAS_CUDA) + return Pow2::roundDown(x); +#else + return raft::round_down_safe(x, kIndexGroupSize); +#endif +} + +template +_RAFT_HOST_DEVICE inline auto mod(T x) +{ +#if defined(_RAFT_HAS_CUDA) + return Pow2::mod(x); +#else + return x % kIndexGroupSize; +#endif +} + +/** + * Write one flat code into a block by the given offset. The offset indicates the id of the record + * in the list. This function interleaves the code and is intended to later copy the interleaved + * codes over to the IVF list on device. NB: no memory allocation happens here; the block must fit + * the record (offset + 1). + * + * @tparam T + * + * @param[in] flat_code input flat code + * @param[out] block block of memory to write interleaved codes to + * @param[in] dim dimension of the flat code + * @param[in] veclen size of interleaved data chunks + * @param[in] offset how many records to skip before writing the data into the list + */ +template +_RAFT_HOST_DEVICE void pack_1( + const T* flat_code, T* block, uint32_t dim, uint32_t veclen, uint32_t offset) +{ + // The data is written in interleaved groups of `index::kGroupSize` vectors + // using interleaved_group = Pow2; + + // Interleave dimensions of the source vector while recording it. + // NB: such `veclen` is selected, that `dim % veclen == 0` + auto group_offset = roundDown(offset); + auto ingroup_id = mod(offset) * veclen; + + for (uint32_t l = 0; l < dim; l += veclen) { + for (uint32_t j = 0; j < veclen; j++) { + block[group_offset * dim + l * kIndexGroupSize + ingroup_id + j] = flat_code[l + j]; + } + } +} + +/** + * Unpack 1 record of a single list (cluster) in the index to fetch the flat code. The offset + * indicates the id of the record. This function fetches one flat code from an interleaved code. + * + * @tparam T + * + * @param[in] block interleaved block. The block can be thought of as the whole inverted list in + * interleaved format. + * @param[out] flat_code output flat code + * @param[in] dim dimension of the flat code + * @param[in] veclen size of interleaved data chunks + * @param[in] offset fetch the flat code by the given offset + */ +template +_RAFT_HOST_DEVICE void unpack_1( + const T* block, T* flat_code, uint32_t dim, uint32_t veclen, uint32_t offset) +{ + // The data is written in interleaved groups of `index::kGroupSize` vectors + // using interleaved_group = Pow2; + + // NB: such `veclen` is selected, that `dim % veclen == 0` + auto group_offset = roundDown(offset); + auto ingroup_id = mod(offset) * veclen; + + for (uint32_t l = 0; l < dim; l += veclen) { + for (uint32_t j = 0; j < veclen; j++) { + flat_code[l + j] = block[group_offset * dim + l * kIndexGroupSize + ingroup_id + j]; + } + } +} +} // namespace raft::neighbors::ivf_flat::codepacker \ No newline at end of file diff --git a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh new file mode 100644 index 0000000000..096e8051c3 --- /dev/null +++ b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +namespace raft::neighbors::ivf_flat::helpers { +/** + * @defgroup ivf_flat_helpers Helper functions for manipulationg IVF Flat Index + * @{ + */ + +namespace codepacker { + +/** + * Write flat codes into an existing list by the given offset. + * + * NB: no memory allocation happens here; the list must fit the data (offset + n_vec). + * + * Usage example: + * @code{.cpp} + * auto list_data = index.lists()[label]->data.view(); + * // allocate the buffer for the input codes + * auto codes = raft::make_device_matrix(res, n_vec, index.dim()); + * ... prepare n_vecs to pack into the list in codes ... + * // write codes into the list starting from the 42nd position + * ivf_pq::helpers::codepacker::pack( + * res, make_const_mdspan(codes.view()), index.veclen(), 42, list_data); + * @endcode + * + * @tparam T + * @tparam IdxT + * + * @param[in] res + * @param[in] codes flat codes [n_vec, dim] + * @param[in] veclen size of interleaved data chunks + * @param[in] offset how many records to skip before writing the data into the list + * @param[inout] list_data block to write into + */ +template +void pack( + raft::resources const& res, + device_matrix_view codes, + uint32_t veclen, + uint32_t offset, + device_mdspan::list_extents, row_major> list_data) +{ + raft::neighbors::ivf_flat::detail::pack_list_data(res, codes, veclen, offset, list_data); +} + +/** + * @brief Unpack `n_take` consecutive records of a single list (cluster) in the compressed index + * starting at given `offset`. + * + * Usage example: + * @code{.cpp} + * auto list_data = index.lists()[label]->data.view(); + * // allocate the buffer for the output + * uint32_t n_take = 4; + * auto codes = raft::make_device_matrix(res, n_take, index.dim()); + * uint32_t offset = 0; + * // unpack n_take elements from the list + * ivf_pq::helpers::codepacker::unpack(res, list_data, index.veclen(), offset, codes.view()); + * @endcode + * + * @tparam T + * @tparam IdxT + * + * @param[in] res raft resource + * @param[in] list_data block to read from + * @param[in] veclen size of interleaved data chunks + * @param[in] offset + * How many records in the list to skip. + * @param[inout] codes + * the destination buffer [n_take, index.dim()]. + * The length `n_take` defines how many records to unpack, + * it must be <= the list size. + */ +template +void unpack( + raft::resources const& res, + device_mdspan::list_extents, row_major> list_data, + uint32_t veclen, + uint32_t offset, + device_matrix_view codes) +{ + raft::neighbors::ivf_flat::detail::unpack_list_data( + res, list_data, veclen, offset, codes); +} +} // namespace codepacker +/** @} */ +} // namespace raft::neighbors::ivf_flat::helpers diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh index a252b26600..d72d73680a 100644 --- a/cpp/test/neighbors/ann_ivf_flat.cuh +++ b/cpp/test/neighbors/ann_ivf_flat.cuh @@ -17,15 +17,27 @@ #include "../test_utils.cuh" #include "ann_utils.cuh" +#include +#include +#include +#include #include #include +#include +#include +#include +#include +#include +#include #include #include #include #include +#include #include +#include #include #include #include @@ -36,6 +48,7 @@ #include +#include #include #include @@ -76,7 +89,6 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { { } - protected: void testIVFFlat() { size_t queries_size = ps.num_queries * ps.k; @@ -264,6 +276,136 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { } } + void testPacker() + { + ivf_flat::index_params index_params; + ivf_flat::search_params search_params; + index_params.n_lists = ps.nlist; + index_params.metric = ps.metric; + index_params.adaptive_centers = false; + search_params.n_probes = ps.nprobe; + + index_params.add_data_on_build = false; + index_params.kmeans_trainset_fraction = 1.0; + index_params.metric_arg = 0; + + auto database_view = raft::make_device_matrix_view( + (const DataT*)database.data(), ps.num_db_vecs, ps.dim); + + auto idx = ivf_flat::build(handle_, index_params, database_view); + + const std::optional> no_opt = std::nullopt; + index extend_index = ivf_flat::extend(handle_, database_view, no_opt, idx); + + auto list_sizes = raft::make_host_vector(idx.n_lists()); + update_host(list_sizes.data_handle(), + extend_index.list_sizes().data_handle(), + extend_index.n_lists(), + stream_); + resource::sync_stream(handle_); + + auto& lists = idx.lists(); + + // conservative memory allocation for codepacking + auto list_device_spec = list_spec{idx.dim(), false}; + + for (uint32_t label = 0; label < idx.n_lists(); label++) { + uint32_t list_size = list_sizes.data_handle()[label]; + + ivf::resize_list(handle_, lists[label], list_device_spec, list_size, 0); + } + + idx.recompute_internal_state(handle_); + + using interleaved_group = Pow2; + + for (uint32_t label = 0; label < idx.n_lists(); label++) { + uint32_t list_size = list_sizes.data_handle()[label]; + + if (list_size > 0) { + uint32_t padded_list_size = interleaved_group::roundUp(list_size); + uint32_t n_elems = padded_list_size * idx.dim(); + auto list_data = lists[label]->data; + auto list_inds = extend_index.lists()[label]->indices; + + // fetch the flat codes + auto flat_codes = make_device_matrix(handle_, list_size, idx.dim()); + + matrix::gather( + handle_, + make_device_matrix_view( + (const DataT*)database.data(), static_cast(ps.num_db_vecs), idx.dim()), + make_device_vector_view((const IdxT*)list_inds.data_handle(), + list_size), + flat_codes.view()); + + helpers::codepacker::pack( + handle_, make_const_mdspan(flat_codes.view()), idx.veclen(), 0, list_data.view()); + + { + auto mask = make_device_vector(handle_, n_elems); + + linalg::map_offset(handle_, + mask.view(), + [dim = idx.dim(), + list_size, + padded_list_size, + chunk_size = util::FastIntDiv(idx.veclen())] __device__(auto i) { + uint32_t max_group_offset = interleaved_group::roundDown(list_size); + if (i < max_group_offset * dim) { return true; } + uint32_t surplus = (i - max_group_offset * dim); + uint32_t ingroup_id = interleaved_group::mod(surplus / chunk_size); + return ingroup_id < (list_size - max_group_offset); + }); + + // ensure that the correct number of indices are masked out + ASSERT_TRUE(thrust::reduce(resource::get_thrust_policy(handle_), + mask.data_handle(), + mask.data_handle() + n_elems, + 0) == list_size * ps.dim); + + auto packed_list_data = make_device_vector(handle_, n_elems); + + linalg::map_offset(handle_, + packed_list_data.view(), + [mask = mask.data_handle(), + list_data = list_data.data_handle()] __device__(uint32_t i) { + if (mask[i]) return list_data[i]; + return DataT{0}; + }); + + auto extend_data = extend_index.lists()[label]->data; + auto extend_data_filtered = make_device_vector(handle_, n_elems); + linalg::map_offset(handle_, + extend_data_filtered.view(), + [mask = mask.data_handle(), + extend_data = extend_data.data_handle()] __device__(uint32_t i) { + if (mask[i]) return extend_data[i]; + return DataT{0}; + }); + + ASSERT_TRUE(raft::devArrMatch(packed_list_data.data_handle(), + extend_data_filtered.data_handle(), + n_elems, + raft::Compare(), + stream_)); + } + + auto unpacked_flat_codes = + make_device_matrix(handle_, list_size, idx.dim()); + + helpers::codepacker::unpack( + handle_, list_data.view(), idx.veclen(), 0, unpacked_flat_codes.view()); + + ASSERT_TRUE(raft::devArrMatch(flat_codes.data_handle(), + unpacked_flat_codes.data_handle(), + list_size * ps.dim, + raft::Compare(), + stream_)); + } + } + } + void SetUp() override { database.resize(ps.num_db_vecs * ps.dim, stream_); diff --git a/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu b/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu index f0988ca988..3bfea283e5 100644 --- a/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu +++ b/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu @@ -21,7 +21,11 @@ namespace raft::neighbors::ivf_flat { typedef AnnIVFFlatTest AnnIVFFlatTestF; -TEST_P(AnnIVFFlatTestF, AnnIVFFlat) { this->testIVFFlat(); } +TEST_P(AnnIVFFlatTestF, AnnIVFFlat) +{ + this->testIVFFlat(); + this->testPacker(); +} INSTANTIATE_TEST_CASE_P(AnnIVFFlatTest, AnnIVFFlatTestF, ::testing::ValuesIn(inputs));