From cc9cbd3f82f094a0c3bb197e4052ecd1f4c2abd8 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Fri, 30 Jun 2023 17:03:57 -0700 Subject: [PATCH 01/97] Unpack list data kernel --- .../raft/neighbors/detail/ivf_flat_build.cuh | 65 ++++++ .../raft/neighbors/ivf_flat_helpers.cuh | 216 ++++++++++++++++++ 2 files changed, 281 insertions(+) create mode 100644 cpp/include/raft/neighbors/ivf_flat_helpers.cuh diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh index 7c2fa05bfe..35135de7e7 100644 --- a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -416,4 +417,68 @@ inline void fill_refinement_index(raft::resources const& handle, refinement_index->veclen()); RAFT_CUDA_TRY(cudaPeekAtLastError()); } + + + +template +__launch_bounds__(BlockSize) __global__ void unpack_list_data_kernel_float32( + T* out_codes, + T* in_list_data, + uint32_t n_rows, + uint32_t dim, + uint32_t veclen) +{ + const IdxT i = IdxT(blockDim.x) * IdxT(blockIdx.x) + threadIdx.x; + if (i >= n_rows * dim) { return; } + + auto col = i % dim; + auto row = i / n_rows; + + // The data is written in interleaved groups of `index::kGroupSize` vectors + using interleaved_group = Pow2; + auto group_offset = interleaved_group::roundDown(row); + auto ingroup_id = interleaved_group::mod(row) * veclen; + + // The value of 4 was chosen because for float_32 dtype, calculate_veclen returns 4 + auto within_group_offset = Pow2<4>::quot(col); + + // Interleave dimensions of the source vector while recording it. + // NB: such `veclen` is selected, that `dim % veclen == 0` + out_codes[group_offset * dim + within_group_offset * kIndexGroupSize * veclen + ingroup_id + col % veclen] = in_list_data[i]; +} + +/** + * Unpack interleaved flat codes from an existing packed non-interleaved list by the given row offset. + * + * @param[out] codes flat PQ codes, one code per byte [n_rows, pq_dim] + * @param[in] packed_list_data the packed ivf::list data. + * @param[in] row_offset how many rows in the list to skip. + * @param[in] stream + */ +template +inline void unpack_list_data_float32( + raft::resources const& handle, + device_matrix_view codes, + device_matrix_view packed_list_data, + uint32_t row_offset) +{ + auto stream = raft::resource::get_cuda_stream(handle); + auto n_rows = packed_list_data.extent(0); + if (n_rows == 0) { return; } + + auto dim = packed_list_data.extent(1); + + n_rows -= row_offset; + constexpr uint32_t kBlockSize = 256; + dim3 blocks(div_rounding_up_safe(n_rows, kBlockSize), 1, 1); + dim3 threads(kBlockSize, 1, 1); + auto kernel = unpack_list_data_kernel_float32; + kernel<<>>(codes.data_handle(), + packed_list_data.data_handle(), + n_rows, + dim, + 4); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + } // namespace raft::neighbors::ivf_flat::detail diff --git a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh new file mode 100644 index 0000000000..2eb8085b71 --- /dev/null +++ b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include +#include + +namespace raft::neighbors::ivf_flat::helpers { +/** + * @defgroup ivf_flat_helpers Helper functions for manipulationg IVF Flat Index + * @{ + */ + +namespace codepacker { +/** + * @brief Unpack `n_take` consecutive records of a single non-interleaved list (cluster) starting at given `row_offset`. + * + * + * Usage example: + * @code{.cpp} + * auto list_data = index.lists()[label]->data.view(); + * // allocate the buffer for the output + * uint32_t n_take = 4; + * auto codes = raft::make_device_matrix(res, n_take, index.pq_dim()); + * uint32_t offset = 0; + * // unpack n_take elements from the list + * ivf_flat::helpers::codepacker::unpack(res, list_data, index.pq_bits(), offset, codes.view()); + * @endcode + * + * @tparam IdxT type of the indices in the source dataset + * + * @param[in] res raft resource + * @param[in] list_data block to read from + * @param[in] offset + * How many records in the list to skip. + * @param[out] codes + */ +template +inline void unpack( + raft::resources const& res, + device_matrix_view list_data, + uint32_t offset, + device_matrix_view codes) +{ + ivf_flat::detail::unpack_list_data_float32(res, codes, list_data, offset); +} + +// /** +// * Write flat PQ codes into an existing list by the given offset. +// * +// * NB: no memory allocation happens here; the list must fit the data (offset + n_vec). +// * +// * Usage example: +// * @code{.cpp} +// * auto list_data = index.lists()[label]->data.view(); +// * // allocate the buffer for the input codes +// * auto codes = raft::make_device_matrix(res, n_vec, index.pq_dim()); +// * ... prepare n_vecs to pack into the list in codes ... +// * // write codes into the list starting from the 42nd position +// * ivf_flat::helpers::codepacker::pack( +// * res, make_const_mdspan(codes.view()), index.pq_bits(), 42, list_data); +// * @endcode +// * +// * @param[in] res +// * @param[in] codes flat PQ codes, one code per byte [n_vec, pq_dim] +// * @param[in] offset how many records to skip before writing the data into the list +// * @param[in] list_data block to write into +// */ +// template +// inline void pack( +// raft::resources const& res, +// device_matrix_view codes, +// uint32_t offset, +// device_mdspan::list_extents, row_major> list_data) +// { +// ivf_flat::detail::pack_list_data(list_data, codes, offset, resource::get_cuda_stream(res)); +// } +// } // namespace codepacker + +// /** +// * Write flat PQ codes into an existing list by the given offset. +// * +// * The list is identified by its label. +// * +// * NB: no memory allocation happens here; the list must fit the data (offset + n_vec). +// * +// * Usage example: +// * @code{.cpp} +// * // We will write into the 137th cluster +// * uint32_t label = 137; +// * // allocate the buffer for the input codes +// * auto codes = raft::make_device_matrix(res, n_vec, index.pq_dim()); +// * ... prepare n_vecs to pack into the list in codes ... +// * // write codes into the list starting from the 42nd position +// * ivf_flat::helpers::pack_list_data(res, &index, codes_to_pack, label, 42); +// * @endcode +// * +// * @param[in] res +// * @param[inout] index IVF-PQ index. +// * @param[in] codes flat PQ codes, one code per byte [n_rows, pq_dim] +// * @param[in] label The id of the list (cluster) into which we write. +// * @param[in] offset how many records to skip before writing the data into the list +// */ +// template +// void pack_list_data(raft::resources const& res, +// index* index, +// device_matrix_view codes, +// uint32_t label, +// uint32_t offset) +// { +// ivf_flat::detail::pack_list_data(res, index, codes, label, offset); +// } + +// /** +// * @brief Unpack `n_take` consecutive records of a single list (cluster) in the compressed index +// * starting at given `offset`, one code per byte (independently of pq_bits). +// * +// * Usage example: +// * @code{.cpp} +// * // We will unpack the fourth cluster +// * uint32_t label = 3; +// * // Get the list size +// * uint32_t list_size = 0; +// * raft::copy(&list_size, index.list_sizes().data_handle() + label, 1, +// * resource::get_cuda_stream(res)); resource::sync_stream(res); +// * // allocate the buffer for the output +// * auto codes = raft::make_device_matrix(res, list_size, index.pq_dim()); +// * // unpack the whole list +// * ivf_flat::helpers::unpack_list_data(res, index, codes.view(), label, 0); +// * @endcode +// * +// * @tparam IdxT type of the indices in the source dataset +// * +// * @param[in] res +// * @param[in] index +// * @param[out] out_codes +// * the destination buffer [n_take, index.pq_dim()]. +// * The length `n_take` defines how many records to unpack, +// * it must be smaller than the list size. +// * @param[in] label +// * The id of the list (cluster) to decode. +// * @param[in] offset +// * How many records in the list to skip. +// */ +// template +// void unpack_list_data(raft::resources const& res, +// const index& index, +// device_matrix_view out_codes, +// uint32_t label, +// uint32_t offset) +// { +// return ivf_flat::detail::unpack_list_data(res, index, out_codes, label, offset); +// } + +// /** +// * @brief Unpack a series of records of a single list (cluster) in the compressed index +// * by their in-list offsets, one code per byte (independently of pq_bits). +// * +// * Usage example: +// * @code{.cpp} +// * // We will unpack the fourth cluster +// * uint32_t label = 3; +// * // Create the selection vector +// * auto selected_indices = raft::make_device_vector(res, 4); +// * ... fill the indices ... +// * resource::sync_stream(res); +// * // allocate the buffer for the output +// * auto codes = raft::make_device_matrix(res, selected_indices.size(), index.pq_dim()); +// * // decode the whole list +// * ivf_flat::helpers::unpack_list_data( +// * res, index, selected_indices.view(), codes.view(), label); +// * @endcode +// * +// * @tparam IdxT type of the indices in the source dataset +// * +// * @param[in] res +// * @param[in] index +// * @param[in] in_cluster_indices +// * The offsets of the selected indices within the cluster. +// * @param[out] out_codes +// * the destination buffer [n_take, index.pq_dim()]. +// * The length `n_take` defines how many records to unpack, +// * it must be smaller than the list size. +// * @param[in] label +// * The id of the list (cluster) to decode. +// */ +// template +// void unpack_list_data(raft::resources const& res, +// const index& index, +// device_vector_view in_cluster_indices, +// device_matrix_view out_codes, +// uint32_t label) +// { +// return ivf_flat::detail::unpack_list_data(res, index, out_codes, label, in_cluster_indices); +// } + +/** @} */ +} // namespace raft::neighbors::ivf_flat::helpers From e39ee5647dfe7ebcb06897f6676d0cfe5cce61bd Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Wed, 5 Jul 2023 15:42:48 -0700 Subject: [PATCH 02/97] update packing and unpacking functions --- .../raft/neighbors/detail/ivf_flat_build.cuh | 185 +++++++++---- .../raft/neighbors/ivf_flat_helpers.cuh | 261 ++++++------------ 2 files changed, 206 insertions(+), 240 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh index 35135de7e7..fcca19be0e 100644 --- a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh @@ -420,65 +420,130 @@ inline void fill_refinement_index(raft::resources const& handle, -template -__launch_bounds__(BlockSize) __global__ void unpack_list_data_kernel_float32( - T* out_codes, - T* in_list_data, - uint32_t n_rows, - uint32_t dim, - uint32_t veclen) -{ - const IdxT i = IdxT(blockDim.x) * IdxT(blockIdx.x) + threadIdx.x; - if (i >= n_rows * dim) { return; } - - auto col = i % dim; - auto row = i / n_rows; - - // The data is written in interleaved groups of `index::kGroupSize` vectors - using interleaved_group = Pow2; - auto group_offset = interleaved_group::roundDown(row); - auto ingroup_id = interleaved_group::mod(row) * veclen; - - // The value of 4 was chosen because for float_32 dtype, calculate_veclen returns 4 - auto within_group_offset = Pow2<4>::quot(col); - - // Interleave dimensions of the source vector while recording it. - // NB: such `veclen` is selected, that `dim % veclen == 0` - out_codes[group_offset * dim + within_group_offset * kIndexGroupSize * veclen + ingroup_id + col % veclen] = in_list_data[i]; -} - -/** - * Unpack interleaved flat codes from an existing packed non-interleaved list by the given row offset. - * - * @param[out] codes flat PQ codes, one code per byte [n_rows, pq_dim] - * @param[in] packed_list_data the packed ivf::list data. - * @param[in] row_offset how many rows in the list to skip. - * @param[in] stream - */ -template -inline void unpack_list_data_float32( - raft::resources const& handle, - device_matrix_view codes, - device_matrix_view packed_list_data, - uint32_t row_offset) -{ - auto stream = raft::resource::get_cuda_stream(handle); - auto n_rows = packed_list_data.extent(0); - if (n_rows == 0) { return; } - - auto dim = packed_list_data.extent(1); - - n_rows -= row_offset; - constexpr uint32_t kBlockSize = 256; - dim3 blocks(div_rounding_up_safe(n_rows, kBlockSize), 1, 1); - dim3 threads(kBlockSize, 1, 1); - auto kernel = unpack_list_data_kernel_float32; - kernel<<>>(codes.data_handle(), - packed_list_data.data_handle(), - n_rows, - dim, - 4); - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} +// template +// __launch_bounds__(BlockSize) __global__ void unpack_list_data_kernel_float32( +// T* out_codes, +// T* in_list_data, +// uint32_t n_rows, +// uint32_t dim, +// uint32_t veclen) +// { +// const IdxT i = IdxT(blockDim.x) * IdxT(blockIdx.x) + threadIdx.x; +// if (i >= n_rows * dim) { return; } + +// auto col = i % kIndexGroupSize * veclen; +// auto row = i / (kIndexGroupSize * veclen); + +// auto vec = + +// // The data is written in interleaved groups of `index::kGroupSize` vectors +// using interleaved_group = Pow2; +// auto group_offset = interleaved_group::roundDown(row); +// auto ingroup_id = interleaved_group::mod(row) * veclen; + +// // The value of 4 was chosen because for float_32 dtype, calculate_veclen returns 4 +// auto within_group_offset = Pow2<4>::quot(col); + +// // Interleave dimensions of the source vector while recording it. +// // NB: such `veclen` is selected, that `dim % veclen == 0` +// out_codes[] = in_list_data[i]; +// } + +// /** +// * Pack interleaved flat codes from an existing packed non-interleaved list by the given row offset. +// * +// * @param[out] codes flat codes, [n_rows, dim] +// * @param[in] list_data the packed ivf::list data. +// * @param[in] row_offset how many rows in the list to skip. +// * @param[in] stream +// */ +// template +// inline void unpack_list_data_float32( +// raft::resources const& handle, +// device_matrix_view codes, +// device_mdspan::list_extents, row_major> list_data, +// uint32_t row_offset) +// { +// auto stream = raft::resource::get_cuda_stream(handle); +// auto n_rows = codes.extent(0); +// if (n_rows == 0) { return; } + +// auto dim = codes.extent(1); + +// n_rows -= row_offset; +// constexpr uint32_t kBlockSize = 256; +// dim3 blocks(div_rounding_up_safe(n_rows, kBlockSize), 1, 1); +// dim3 threads(kBlockSize, 1, 1); +// auto kernel = pack_list_data_kernel_float32; +// kernel<<>>(codes.data_handle(), +// list_data.data_handle(), +// n_rows, +// dim, +// 4); +// RAFT_CUDA_TRY(cudaPeekAtLastError()); +// } + + + +// template +// __launch_bounds__(BlockSize) __global__ void pack_list_data_kernel_float32( +// T* list_data, +// T* codes, +// uint32_t n_rows, +// uint32_t dim, +// uint32_t veclen) +// { +// const IdxT i = IdxT(blockDim.x) * IdxT(blockIdx.x) + threadIdx.x; +// if (i >= n_rows * dim) { return; } + +// auto col = i % dim; +// auto row = i / n_rows; + +// // The data is written in interleaved groups of `index::kGroupSize` vectors +// using interleaved_group = Pow2; +// auto group_offset = interleaved_group::roundDown(row); +// auto ingroup_id = interleaved_group::mod(row) * veclen; + +// // The value of 4 was chosen because for float_32 dtype, calculate_veclen returns 4 +// auto within_group_offset = Pow2<4>::quot(col); + +// // Interleave dimensions of the source vector while recording it. +// // NB: such `veclen` is selected, that `dim % veclen == 0` +// list_data[group_offset * dim + within_group_offset * kIndexGroupSize * veclen + ingroup_id + col % veclen] = codes[i]; +// } + +// /** +// * Pack interleaved flat codes from an existing packed non-interleaved list by the given row offset. +// * +// * @param[out] codes flat codes, [n_rows, dim] +// * @param[in] list_data the packed ivf::list data. +// * @param[in] row_offset how many rows in the list to skip. +// * @param[in] stream +// */ +// template +// inline void pack_list_data_float32( +// raft::resources const& handle, +// device_mdspan::list_extents, row_major> list_data, +// device_matrix_view codes, +// uint32_t row_offset) +// { +// auto stream = raft::resource::get_cuda_stream(handle); +// auto n_rows = codes.extent(0); +// if (n_rows == 0) { return; } + +// auto dim = codes.extent(1); + +// n_rows -= row_offset; +// constexpr uint32_t kBlockSize = 256; +// dim3 blocks(div_rounding_up_safe(n_rows, kBlockSize), 1, 1); +// dim3 threads(kBlockSize, 1, 1); +// auto kernel = pack_list_data_kernel_float32; +// kernel<<>>(codes.data_handle(), +// list_data.data_handle(), +// n_rows, +// dim, +// 4); +// RAFT_CUDA_TRY(cudaPeekAtLastError()); +// } } // namespace raft::neighbors::ivf_flat::detail diff --git a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh index 2eb8085b71..da0990fdbf 100644 --- a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh +++ b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh @@ -29,188 +29,89 @@ namespace raft::neighbors::ivf_flat::helpers { * @{ */ -namespace codepacker { -/** - * @brief Unpack `n_take` consecutive records of a single non-interleaved list (cluster) starting at given `row_offset`. - * - * - * Usage example: - * @code{.cpp} - * auto list_data = index.lists()[label]->data.view(); - * // allocate the buffer for the output - * uint32_t n_take = 4; - * auto codes = raft::make_device_matrix(res, n_take, index.pq_dim()); - * uint32_t offset = 0; - * // unpack n_take elements from the list - * ivf_flat::helpers::codepacker::unpack(res, list_data, index.pq_bits(), offset, codes.view()); - * @endcode - * - * @tparam IdxT type of the indices in the source dataset - * - * @param[in] res raft resource - * @param[in] list_data block to read from - * @param[in] offset - * How many records in the list to skip. - * @param[out] codes - */ -template -inline void unpack( - raft::resources const& res, - device_matrix_view list_data, - uint32_t offset, - device_matrix_view codes) -{ - ivf_flat::detail::unpack_list_data_float32(res, codes, list_data, offset); +template +void unpackInterleaved( + const T* in, + T* out, + int numVecs, + int dim, + int veclen) { + + // The data is written in interleaved groups of `index::kGroupSize` vectors + using interleaved_group = Pow2; + + // Interleave dimensions of the source vector while recording it. + // NB: such `veclen` is selected, that `dim % veclen == 0` + #pragma omp parallel for + for (int i = 0; i < numVecs; i++) { + auto group_offset = interleaved_group::roundDown(i); + auto ingroup_id = interleaved_group::mod(i) * veclen; + + // Point to the location of the interleaved group of vectors + out += group_offset * dim; + for (uint32_t l = 0; l < dim; l += veclen) { + for (uint32_t j = 0; j < veclen; j++) { + out[l * kIndexGroupSize + ingroup_id + j] = in[i * dim + l + j]; + } + } + } } -// /** -// * Write flat PQ codes into an existing list by the given offset. -// * -// * NB: no memory allocation happens here; the list must fit the data (offset + n_vec). -// * -// * Usage example: -// * @code{.cpp} -// * auto list_data = index.lists()[label]->data.view(); -// * // allocate the buffer for the input codes -// * auto codes = raft::make_device_matrix(res, n_vec, index.pq_dim()); -// * ... prepare n_vecs to pack into the list in codes ... -// * // write codes into the list starting from the 42nd position -// * ivf_flat::helpers::codepacker::pack( -// * res, make_const_mdspan(codes.view()), index.pq_bits(), 42, list_data); -// * @endcode -// * -// * @param[in] res -// * @param[in] codes flat PQ codes, one code per byte [n_vec, pq_dim] -// * @param[in] offset how many records to skip before writing the data into the list -// * @param[in] list_data block to write into -// */ -// template -// inline void pack( -// raft::resources const& res, -// device_matrix_view codes, -// uint32_t offset, -// device_mdspan::list_extents, row_major> list_data) -// { -// ivf_flat::detail::pack_list_data(list_data, codes, offset, resource::get_cuda_stream(res)); -// } -// } // namespace codepacker - -// /** -// * Write flat PQ codes into an existing list by the given offset. -// * -// * The list is identified by its label. -// * -// * NB: no memory allocation happens here; the list must fit the data (offset + n_vec). -// * -// * Usage example: -// * @code{.cpp} -// * // We will write into the 137th cluster -// * uint32_t label = 137; -// * // allocate the buffer for the input codes -// * auto codes = raft::make_device_matrix(res, n_vec, index.pq_dim()); -// * ... prepare n_vecs to pack into the list in codes ... -// * // write codes into the list starting from the 42nd position -// * ivf_flat::helpers::pack_list_data(res, &index, codes_to_pack, label, 42); -// * @endcode -// * -// * @param[in] res -// * @param[inout] index IVF-PQ index. -// * @param[in] codes flat PQ codes, one code per byte [n_rows, pq_dim] -// * @param[in] label The id of the list (cluster) into which we write. -// * @param[in] offset how many records to skip before writing the data into the list -// */ -// template -// void pack_list_data(raft::resources const& res, -// index* index, -// device_matrix_view codes, -// uint32_t label, -// uint32_t offset) -// { -// ivf_flat::detail::pack_list_data(res, index, codes, label, offset); -// } - -// /** -// * @brief Unpack `n_take` consecutive records of a single list (cluster) in the compressed index -// * starting at given `offset`, one code per byte (independently of pq_bits). -// * -// * Usage example: -// * @code{.cpp} -// * // We will unpack the fourth cluster -// * uint32_t label = 3; -// * // Get the list size -// * uint32_t list_size = 0; -// * raft::copy(&list_size, index.list_sizes().data_handle() + label, 1, -// * resource::get_cuda_stream(res)); resource::sync_stream(res); -// * // allocate the buffer for the output -// * auto codes = raft::make_device_matrix(res, list_size, index.pq_dim()); -// * // unpack the whole list -// * ivf_flat::helpers::unpack_list_data(res, index, codes.view(), label, 0); -// * @endcode -// * -// * @tparam IdxT type of the indices in the source dataset -// * -// * @param[in] res -// * @param[in] index -// * @param[out] out_codes -// * the destination buffer [n_take, index.pq_dim()]. -// * The length `n_take` defines how many records to unpack, -// * it must be smaller than the list size. -// * @param[in] label -// * The id of the list (cluster) to decode. -// * @param[in] offset -// * How many records in the list to skip. -// */ -// template -// void unpack_list_data(raft::resources const& res, -// const index& index, -// device_matrix_view out_codes, -// uint32_t label, -// uint32_t offset) -// { -// return ivf_flat::detail::unpack_list_data(res, index, out_codes, label, offset); -// } - -// /** -// * @brief Unpack a series of records of a single list (cluster) in the compressed index -// * by their in-list offsets, one code per byte (independently of pq_bits). -// * -// * Usage example: -// * @code{.cpp} -// * // We will unpack the fourth cluster -// * uint32_t label = 3; -// * // Create the selection vector -// * auto selected_indices = raft::make_device_vector(res, 4); -// * ... fill the indices ... -// * resource::sync_stream(res); -// * // allocate the buffer for the output -// * auto codes = raft::make_device_matrix(res, selected_indices.size(), index.pq_dim()); -// * // decode the whole list -// * ivf_flat::helpers::unpack_list_data( -// * res, index, selected_indices.view(), codes.view(), label); -// * @endcode -// * -// * @tparam IdxT type of the indices in the source dataset -// * -// * @param[in] res -// * @param[in] index -// * @param[in] in_cluster_indices -// * The offsets of the selected indices within the cluster. -// * @param[out] out_codes -// * the destination buffer [n_take, index.pq_dim()]. -// * The length `n_take` defines how many records to unpack, -// * it must be smaller than the list size. -// * @param[in] label -// * The id of the list (cluster) to decode. -// */ -// template -// void unpack_list_data(raft::resources const& res, -// const index& index, -// device_vector_view in_cluster_indices, -// device_matrix_view out_codes, -// uint32_t label) -// { -// return ivf_flat::detail::unpack_list_data(res, index, out_codes, label, in_cluster_indices); -// } +template +void pack_host_interleaved( + const T* in, + T* out, + int numVecs, + int dim, + int veclen) { + + // The data is written in interleaved groups of `index::kGroupSize` vectors + using interleaved_group = Pow2; + + // Interleave dimensions of the source vector while recording it. + // NB: such `veclen` is selected, that `dim % veclen == 0` + #pragma omp parallel for + for (int i = 0; i < numVecs; i++) { + auto group_offset = interleaved_group::roundDown(i); + auto ingroup_id = interleaved_group::mod(i) * veclen; + + // Point to the location of the interleaved group of vectors + out += group_offset * dim; + for (uint32_t l = 0; l < dim; l += veclen) { + for (uint32_t j = 0; j < veclen; j++) { + out[l * kIndexGroupSize + ingroup_id + j] = in[i * dim + l + j]; + } + } + } +} + +template +void unpack_host_interleaved( + const T* in, + T* out, + int numVecs, + int dim, + int veclen) { + + // The data is written in interleaved groups of `index::kGroupSize` vectors + using interleaved_group = Pow2; + + // Interleave dimensions of the source vector while recording it. + // NB: such `veclen` is selected, that `dim % veclen == 0` + #pragma omp parallel for + for (int i = 0; i < numVecs; i++) { + auto group_offset = interleaved_group::roundDown(i); + auto ingroup_id = interleaved_group::mod(i) * veclen; + + // Point to the location of the interleaved group of vectors + out += group_offset * dim; + for (uint32_t l = 0; l < dim; l += veclen) { + for (uint32_t j = 0; j < veclen; j++) { + out[i * dim + l + j] = in[l * kIndexGroupSize + ingroup_id + j]; + } + } + } +} /** @} */ } // namespace raft::neighbors::ivf_flat::helpers From 78d63801d4b66c5e49aa832e163d768e8165c779 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Thu, 13 Jul 2023 21:53:46 -0700 Subject: [PATCH 03/97] Update codepacker --- .../raft/neighbors/ivf_flat_helpers.cuh | 104 ++++++++---------- .../raft/spatial/knn/detail/ann_quantized.cuh | 90 +++++++++++++++ 2 files changed, 135 insertions(+), 59 deletions(-) diff --git a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh index da0990fdbf..85272d01d1 100644 --- a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh +++ b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh @@ -23,95 +23,81 @@ #include #include +// #include + namespace raft::neighbors::ivf_flat::helpers { /** * @defgroup ivf_flat_helpers Helper functions for manipulationg IVF Flat Index * @{ */ +namespace codepacker { +/** + * Write one flat code into a block by the given offset. The offset indicates the id of the record in the list. This function interleaves the code and is intended to later copy the interleaved codes over to the IVF list on device. + * NB: no memory allocation happens here; the block must fit the record (offset + 1). + * + * @tparam T + * + * @param[in] flat_code input flat code + * @param[out] block block of memory to write interleaved codes to + * @param[in] dim dimension of the flat code + * @param[in] veclen size of interleaved data chunks + * @param[in] offset how many records to skip before writing the data into the list + */ template -void unpackInterleaved( - const T* in, - T* out, - int numVecs, - int dim, - int veclen) { - - // The data is written in interleaved groups of `index::kGroupSize` vectors - using interleaved_group = Pow2; - - // Interleave dimensions of the source vector while recording it. - // NB: such `veclen` is selected, that `dim % veclen == 0` - #pragma omp parallel for - for (int i = 0; i < numVecs; i++) { - auto group_offset = interleaved_group::roundDown(i); - auto ingroup_id = interleaved_group::mod(i) * veclen; - - // Point to the location of the interleaved group of vectors - out += group_offset * dim; - for (uint32_t l = 0; l < dim; l += veclen) { - for (uint32_t j = 0; j < veclen; j++) { - out[l * kIndexGroupSize + ingroup_id + j] = in[i * dim + l + j]; - } - } - } -} - - -template -void pack_host_interleaved( - const T* in, - T* out, - int numVecs, - int dim, - int veclen) { - +__host__ __device__ void pack_1_interleaved( + const T* flat_code, + T* block, + uint32_t dim, + uint32_t veclen, + uint32_t offset) { // The data is written in interleaved groups of `index::kGroupSize` vectors using interleaved_group = Pow2; // Interleave dimensions of the source vector while recording it. // NB: such `veclen` is selected, that `dim % veclen == 0` - #pragma omp parallel for - for (int i = 0; i < numVecs; i++) { - auto group_offset = interleaved_group::roundDown(i); - auto ingroup_id = interleaved_group::mod(i) * veclen; + auto group_offset = interleaved_group::roundDown(offset); + auto ingroup_id = interleaved_group::mod(offset) * veclen; - // Point to the location of the interleaved group of vectors - out += group_offset * dim; for (uint32_t l = 0; l < dim; l += veclen) { for (uint32_t j = 0; j < veclen; j++) { - out[l * kIndexGroupSize + ingroup_id + j] = in[i * dim + l + j]; + block[group_offset * dim + l * kIndexGroupSize + ingroup_id + j] = flat_code[l + j]; } } - } } +/** + * Unpack 1 record of a single list (cluster) in the index to fetch the flat code. The offset indicates the id of the record. This function fetches one flat code from an interleaved code. + * + * @tparam T + * + * @param[in] block interleaved block. The block can be thought of as the whole inverted list in interleaved format. + * @param[out] flat_code output flat code + * @param[in] dim dimension of the flat code + * @param[in] veclen size of interleaved data chunks + * @param[in] offset fetch the flat code by the given offset + */ template -void unpack_host_interleaved( - const T* in, - T* out, - int numVecs, - int dim, - int veclen) { +__host__ __device__ void unpack_1_interleaved( + const T* block, + T* flat_code, + uint32_t dim, + uint32_t veclen, + uint32_t offset) { // The data is written in interleaved groups of `index::kGroupSize` vectors using interleaved_group = Pow2; - // Interleave dimensions of the source vector while recording it. // NB: such `veclen` is selected, that `dim % veclen == 0` - #pragma omp parallel for - for (int i = 0; i < numVecs; i++) { - auto group_offset = interleaved_group::roundDown(i); - auto ingroup_id = interleaved_group::mod(i) * veclen; + auto group_offset = interleaved_group::roundDown(offset); + auto ingroup_id = interleaved_group::mod(offset) * veclen; - // Point to the location of the interleaved group of vectors - out += group_offset * dim; for (uint32_t l = 0; l < dim; l += veclen) { for (uint32_t j = 0; j < veclen; j++) { - out[i * dim + l + j] = in[l * kIndexGroupSize + ingroup_id + j]; + flat_code[l + j] = block[group_offset * dim + l * kIndexGroupSize + ingroup_id + j]; } } - } } +} // namespace codepacker /** @} */ } // namespace raft::neighbors::ivf_flat::helpers diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh index 964292f6cb..1d724bada7 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh @@ -18,9 +18,14 @@ #include "../ann_common.h" #include "../ivf_flat.cuh" +#include #include #include "processing.cuh" +#include "raft/core/host_mdarray.hpp" +#include "raft/neighbors/ivf_flat_types.hpp" +#include "raft/neighbors/ivf_flat_helpers.cuh" +#include "raft/util/pow2_utils.cuh" #include #include #include @@ -73,6 +78,91 @@ void approx_knn_build_index(raft::resources const& handle, auto new_params = from_legacy_index_params(*ivf_ft_pams, metric, metricArg); index->ivf_flat() = std::make_unique>( ivf_flat::build(handle, new_params, index_array, int64_t(n), D)); + + // raft::resource::sync_stream(handle); + + // auto old_list = index->ivf_flat()->lists()[0]; + // uint32_t n_rows = old_list->size.load(); + // uint32_t roundup = Pow2::roundUp(n_rows); + + // RAFT_LOG_INFO("roundup %d, n_rows %d", roundup, n_rows); + + // if (n_rows == 0) { return; } + + // auto dim = index->ivf_flat()->dim(); + // auto veclen = index -> ivf_flat()->veclen(); + // RAFT_LOG_INFO("roundup %d, n_rows %d, veclen %d, dim %d", roundup, n_rows, veclen, dim); + // auto codes = make_host_matrix(roundup, dim); + // auto block = make_host_matrix(roundup, dim); + + // T* firstArray; + // cudaMemcpy(&firstArray, index->ivf_flat()->data_ptrs().data_handle(), sizeof(float*), cudaMemcpyDeviceToHost); // Copy the pointer to the first array from device to host + + // raft::print_device_vector("codes_gpu", firstArray, 1, std::cout); + // raft::update_host(codes.data_handle(), firstArray, (size_t)(roundup * dim), stream); + // raft::resource::sync_stream(handle); + // raft::neighbors::ivf_flat::helpers::pack_host_interleaved( + // codes.data_handle(), + // block.data_handle(), + // n_rows, + // dim, + // veclen); + + // RAFT_LOG_INFO("veclen %d", veclen); + // raft::print_host_vector("codes", codes.data_handle(), roundup * dim, std::cout); + // raft::print_host_vector("block", block.data_handle(), roundup * dim, std::cout); + // // auto indices = make_device_vector(handle_, n_rows); + // copy(indices.data_handle(), old_list->indices.data_handle(), n_rows, stream_); + + // ivf_flat::helpers::pack_list_data(handle_, *index, codes.view(), label, 0); + // ivf_pq::helpers::erase_list(handle_, index, label); + // ivf_pq::helpers::extend_list_with_codes( + // handle_, index, codes.view(), indices.view(), label); + + // auto& new_list = index->lists()[label]; + // ASSERT_NE(old_list.get(), new_list.get()) + // << "The old list should have been shared and retained after ivf_pq index has erased the " + // "corresponding cluster."; + // auto list_data_size = (n_rows / ivf_pq::kIndexGroupSize) * new_list->data.extent(1) * + // new_list->data.extent(2) * new_list->data.extent(3); + + // ASSERT_TRUE(old_list->data.size() >= list_data_size); + // ASSERT_TRUE(new_list->data.size() >= list_data_size); + // ASSERT_TRUE(devArrMatch(old_list->data.data_handle(), + // new_list->data.data_handle(), + // list_data_size, + // Compare{})); + + // // Pack a few vectors back to the list. + // int row_offset = 9; + // int n_vec = 3; + // ASSERT_TRUE(row_offset + n_vec < n_rows); + // size_t offset = row_offset * index->pq_dim(); + // auto codes_to_pack = make_device_matrix_view( + // codes.data_handle() + offset, n_vec, index->pq_dim()); + // ivf_pq::helpers::pack_list_data(handle_, index, codes_to_pack, label, row_offset); + // ASSERT_TRUE(devArrMatch(old_list->data.data_handle(), + // new_list->data.data_handle(), + // list_data_size, + // Compare{})); + + // Another test with the API that take list_data directly + // auto list_data = index->lists()[label]->data.view(); + // uint32_t n_take = 4; + // ASSERT_TRUE(row_offset + n_take < n_rows); + // auto codes2 = raft::make_device_matrix(handle_, n_take, index->pq_dim()); + // ivf_pq::helpers::codepacker::unpack( + // handle_, list_data, index->pq_bits(), row_offset, codes2.view()); + + // // Write it back + // ivf_pq::helpers::codepacker::pack( + // handle_, make_const_mdspan(codes2.view()), index->pq_bits(), row_offset, list_data); + // ASSERT_TRUE(devArrMatch(old_list->data.data_handle(), + // new_list->data.data_handle(), + // list_data_size, + // Compare{})); + // } + } else if (ivf_pq_pams) { neighbors::ivf_pq::index_params params; params.metric = metric; From 897338e227ac184061ab98b11670f377c52b0bbb Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Mon, 17 Jul 2023 09:50:43 -0700 Subject: [PATCH 04/97] refactor codepacker (does not build) --- .../all_cuda-118_arch-x86_64.yaml | 60 ------ .../all_cuda-120_arch-x86_64.yaml | 56 ------ .../bench_ann_cuda-118_arch-x86_64.yaml | 38 ---- .../raft/neighbors/detail/ivf_flat_build.cuh | 179 ++++++------------ .../raft/neighbors/ivf_flat_codepacker.cuh | 89 +++++++++ .../raft/neighbors/ivf_flat_helpers.cuh | 82 ++------ .../raft/spatial/knn/detail/ann_quantized.cuh | 89 --------- cpp/test/neighbors/ann_ivf_flat.cuh | 90 ++++++++- 8 files changed, 252 insertions(+), 431 deletions(-) delete mode 100644 conda/environments/all_cuda-118_arch-x86_64.yaml delete mode 100644 conda/environments/all_cuda-120_arch-x86_64.yaml delete mode 100644 conda/environments/bench_ann_cuda-118_arch-x86_64.yaml create mode 100644 cpp/include/raft/neighbors/ivf_flat_codepacker.cuh diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml deleted file mode 100644 index 546728d2c6..0000000000 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ /dev/null @@ -1,60 +0,0 @@ -# This file is generated by `rapids-dependency-file-generator`. -# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. -channels: -- rapidsai -- rapidsai-nightly -- dask/label/dev -- conda-forge -- nvidia -dependencies: -- breathe -- c-compiler -- clang-tools=16.0.1 -- clang=16.0.1 -- cmake>=3.23.1,!=3.25.0 -- cuda-profiler-api=11.8.86 -- cuda-python>=11.7.1,<12.0a0 -- cuda-version=11.8 -- cudatoolkit -- cupy>=12.0.0 -- cxx-compiler -- cython>=0.29,<0.30 -- dask-core>=2023.5.1 -- dask-cuda==23.8.* -- dask>=2023.5.1 -- distributed>=2023.5.1 -- doxygen>=1.8.20 -- gcc_linux-64=11.* -- gmock>=1.13.0 -- graphviz -- gtest>=1.13.0 -- ipython -- joblib>=0.11 -- libcublas-dev=11.11.3.6 -- libcublas=11.11.3.6 -- libcurand-dev=10.3.0.86 -- libcurand=10.3.0.86 -- libcusolver-dev=11.4.1.48 -- libcusolver=11.4.1.48 -- libcusparse-dev=11.7.5.86 -- libcusparse=11.7.5.86 -- nccl>=2.9.9 -- ninja -- numba>=0.57 -- numpy>=1.21 -- numpydoc -- pydata-sphinx-theme -- pytest -- pytest-cov -- recommonmark -- rmm==23.8.* -- scikit-build>=0.13.1 -- scikit-learn -- scipy -- sphinx-copybutton -- sphinx-markdown-tables -- sysroot_linux-64==2.17 -- ucx-proc=*=gpu -- ucx-py==0.33.* -- ucx>=1.13.0 -name: all_cuda-118_arch-x86_64 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml deleted file mode 100644 index 0ae6154078..0000000000 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ /dev/null @@ -1,56 +0,0 @@ -# This file is generated by `rapids-dependency-file-generator`. -# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. -channels: -- rapidsai -- rapidsai-nightly -- dask/label/dev -- conda-forge -- nvidia -dependencies: -- breathe -- c-compiler -- clang-tools=16.0.1 -- clang=16.0.1 -- cmake>=3.23.1,!=3.25.0 -- cuda-cudart-dev -- cuda-profiler-api -- cuda-python>=12.0,<13.0a0 -- cuda-version=12.0 -- cupy>=12.0.0 -- cxx-compiler -- cython>=0.29,<0.30 -- dask-core>=2023.5.1 -- dask-cuda==23.8.* -- dask>=2023.5.1 -- distributed>=2023.5.1 -- doxygen>=1.8.20 -- gcc_linux-64=11.* -- gmock>=1.13.0 -- graphviz -- gtest>=1.13.0 -- ipython -- joblib>=0.11 -- libcublas-dev -- libcurand-dev -- libcusolver-dev -- libcusparse-dev -- nccl>=2.9.9 -- ninja -- numba>=0.57 -- numpy>=1.21 -- numpydoc -- pydata-sphinx-theme -- pytest -- pytest-cov -- recommonmark -- rmm==23.8.* -- scikit-build>=0.13.1 -- scikit-learn -- scipy -- sphinx-copybutton -- sphinx-markdown-tables -- sysroot_linux-64==2.17 -- ucx-proc=*=gpu -- ucx-py==0.33.* -- ucx>=1.13.0 -name: all_cuda-120_arch-x86_64 diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml deleted file mode 100644 index 74b966cc03..0000000000 --- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml +++ /dev/null @@ -1,38 +0,0 @@ -# This file is generated by `rapids-dependency-file-generator`. -# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. -channels: -- rapidsai -- rapidsai-nightly -- dask/label/dev -- conda-forge -- nvidia -dependencies: -- c-compiler -- clang-tools=16.0.1 -- clang=16.0.1 -- cmake>=3.23.1,!=3.25.0 -- cuda-profiler-api=11.8.86 -- cuda-version=11.8 -- cudatoolkit -- cxx-compiler -- cython>=0.29,<0.30 -- faiss-proc=*=cuda -- gcc_linux-64=11.* -- glog>=0.6.0 -- h5py>=3.8.0 -- hnswlib=0.7.0 -- libcublas-dev=11.11.3.6 -- libcublas=11.11.3.6 -- libcurand-dev=10.3.0.86 -- libcurand=10.3.0.86 -- libcusolver-dev=11.4.1.48 -- libcusolver=11.4.1.48 -- libcusparse-dev=11.7.5.86 -- libcusparse=11.7.5.86 -- libfaiss>=1.7.1 -- nccl>=2.9.9 -- ninja -- nlohmann_json>=3.11.2 -- scikit-build>=0.13.1 -- sysroot_linux-64==2.17 -name: bench_ann_cuda-118_arch-x86_64 diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh index fcca19be0e..4615ddba57 100644 --- a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -418,132 +419,60 @@ inline void fill_refinement_index(raft::resources const& handle, RAFT_CUDA_TRY(cudaPeekAtLastError()); } +template +__global__ void pack_interleaved_list_kernel( + const T* codes, T* list_data, uint32_t n_rows, uint32_t dim, uint32_t veclen) +{ + auto tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < n_rows) { + codepacker::pack_1_interleaved( + codes + tid * dim, list_data, dim, veclen, tid); + } +} +template +__global__ void unpack_interleaved_list_kernel( + const T* list_data, T* codes, uint32_t n_rows, uint32_t dim, uint32_t veclen) +{ + auto tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < n_rows) { + codepacker::unpack_1_interleaved( + codes + tid * dim, list_data, dim, veclen, tid); + } +} -// template -// __launch_bounds__(BlockSize) __global__ void unpack_list_data_kernel_float32( -// T* out_codes, -// T* in_list_data, -// uint32_t n_rows, -// uint32_t dim, -// uint32_t veclen) -// { -// const IdxT i = IdxT(blockDim.x) * IdxT(blockIdx.x) + threadIdx.x; -// if (i >= n_rows * dim) { return; } - -// auto col = i % kIndexGroupSize * veclen; -// auto row = i / (kIndexGroupSize * veclen); - -// auto vec = - -// // The data is written in interleaved groups of `index::kGroupSize` vectors -// using interleaved_group = Pow2; -// auto group_offset = interleaved_group::roundDown(row); -// auto ingroup_id = interleaved_group::mod(row) * veclen; - -// // The value of 4 was chosen because for float_32 dtype, calculate_veclen returns 4 -// auto within_group_offset = Pow2<4>::quot(col); - -// // Interleave dimensions of the source vector while recording it. -// // NB: such `veclen` is selected, that `dim % veclen == 0` -// out_codes[] = in_list_data[i]; -// } - -// /** -// * Pack interleaved flat codes from an existing packed non-interleaved list by the given row offset. -// * -// * @param[out] codes flat codes, [n_rows, dim] -// * @param[in] list_data the packed ivf::list data. -// * @param[in] row_offset how many rows in the list to skip. -// * @param[in] stream -// */ -// template -// inline void unpack_list_data_float32( -// raft::resources const& handle, -// device_matrix_view codes, -// device_mdspan::list_extents, row_major> list_data, -// uint32_t row_offset) -// { -// auto stream = raft::resource::get_cuda_stream(handle); -// auto n_rows = codes.extent(0); -// if (n_rows == 0) { return; } - -// auto dim = codes.extent(1); - -// n_rows -= row_offset; -// constexpr uint32_t kBlockSize = 256; -// dim3 blocks(div_rounding_up_safe(n_rows, kBlockSize), 1, 1); -// dim3 threads(kBlockSize, 1, 1); -// auto kernel = pack_list_data_kernel_float32; -// kernel<<>>(codes.data_handle(), -// list_data.data_handle(), -// n_rows, -// dim, -// 4); -// RAFT_CUDA_TRY(cudaPeekAtLastError()); -// } - - - -// template -// __launch_bounds__(BlockSize) __global__ void pack_list_data_kernel_float32( -// T* list_data, -// T* codes, -// uint32_t n_rows, -// uint32_t dim, -// uint32_t veclen) -// { -// const IdxT i = IdxT(blockDim.x) * IdxT(blockIdx.x) + threadIdx.x; -// if (i >= n_rows * dim) { return; } - -// auto col = i % dim; -// auto row = i / n_rows; - -// // The data is written in interleaved groups of `index::kGroupSize` vectors -// using interleaved_group = Pow2; -// auto group_offset = interleaved_group::roundDown(row); -// auto ingroup_id = interleaved_group::mod(row) * veclen; - -// // The value of 4 was chosen because for float_32 dtype, calculate_veclen returns 4 -// auto within_group_offset = Pow2<4>::quot(col); - -// // Interleave dimensions of the source vector while recording it. -// // NB: such `veclen` is selected, that `dim % veclen == 0` -// list_data[group_offset * dim + within_group_offset * kIndexGroupSize * veclen + ingroup_id + col % veclen] = codes[i]; -// } - -// /** -// * Pack interleaved flat codes from an existing packed non-interleaved list by the given row offset. -// * -// * @param[out] codes flat codes, [n_rows, dim] -// * @param[in] list_data the packed ivf::list data. -// * @param[in] row_offset how many rows in the list to skip. -// * @param[in] stream -// */ -// template -// inline void pack_list_data_float32( -// raft::resources const& handle, -// device_mdspan::list_extents, row_major> list_data, -// device_matrix_view codes, -// uint32_t row_offset) -// { -// auto stream = raft::resource::get_cuda_stream(handle); -// auto n_rows = codes.extent(0); -// if (n_rows == 0) { return; } - -// auto dim = codes.extent(1); - -// n_rows -= row_offset; -// constexpr uint32_t kBlockSize = 256; -// dim3 blocks(div_rounding_up_safe(n_rows, kBlockSize), 1, 1); -// dim3 threads(kBlockSize, 1, 1); -// auto kernel = pack_list_data_kernel_float32; -// kernel<<>>(codes.data_handle(), -// list_data.data_handle(), -// n_rows, -// dim, -// 4); -// RAFT_CUDA_TRY(cudaPeekAtLastError()); -// } +template +void pack_list_data( + raft::resources const& res, + device_matrix_view codes, + uint32_t veclen, + device_mdspan::list_extents, row_major> list_data) +{ + uint32_t n_rows = codes.extent(0); + uint32_t dim = codes.extent(1); + static constexpr uint32_t kBlockSize = 256; + dim3 blocks(div_rounding_up_safe(n_rows, kBlockSize), 1, 1); + dim3 threads(kBlockSize, 1, 1); + auto stream = resource::get_cuda_stream(res); + pack_interleaved_list_kernel<<>>(list_data.data_handle(), codes.data_handle(), n_rows, dim, veclen); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +template +void unpack_list_data( + raft::resources const& res, + device_mdspan::list_extents, row_major> list_data, + uint32_t veclen, + device_matrix_view codes) +{ + uint32_t n_rows = codes.extent(0); + uint32_t dim = codes.extent(1); + static constexpr uint32_t kBlockSize = 256; + dim3 blocks(div_rounding_up_safe(n_rows, kBlockSize), 1, 1); + dim3 threads(kBlockSize, 1, 1); + auto stream = resource::get_cuda_stream(res); + unpack_interleaved_list_kernel<<>>(codes.data_handle(), list_data.data_handle(), n_rows, dim, veclen); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} } // namespace raft::neighbors::ivf_flat::detail diff --git a/cpp/include/raft/neighbors/ivf_flat_codepacker.cuh b/cpp/include/raft/neighbors/ivf_flat_codepacker.cuh new file mode 100644 index 0000000000..2a6dc55c10 --- /dev/null +++ b/cpp/include/raft/neighbors/ivf_flat_codepacker.cuh @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace raft::neighbors::ivf_flat::codepacker { +/** + * Write one flat code into a block by the given offset. The offset indicates the id of the record + * in the list. This function interleaves the code and is intended to later copy the interleaved + * codes over to the IVF list on device. NB: no memory allocation happens here; the block must fit + * the record (offset + 1). + * + * @tparam T + * + * @param[in] flat_code input flat code + * @param[out] block block of memory to write interleaved codes to + * @param[in] dim dimension of the flat code + * @param[in] veclen size of interleaved data chunks + * @param[in] offset how many records to skip before writing the data into the list + */ +template +__host__ __device__ void pack_1_interleaved( + const T* flat_code, T* block, uint32_t dim, uint32_t veclen, uint32_t offset) +{ + // The data is written in interleaved groups of `index::kGroupSize` vectors + using interleaved_group = Pow2; + + // Interleave dimensions of the source vector while recording it. + // NB: such `veclen` is selected, that `dim % veclen == 0` + auto group_offset = interleaved_group::roundDown(offset); + auto ingroup_id = interleaved_group::mod(offset) * veclen; + + for (uint32_t l = 0; l < dim; l += veclen) { + for (uint32_t j = 0; j < veclen; j++) { + block[group_offset * dim + l * kIndexGroupSize + ingroup_id + j] = flat_code[l + j]; + } + } +} + +/** + * Unpack 1 record of a single list (cluster) in the index to fetch the flat code. The offset + * indicates the id of the record. This function fetches one flat code from an interleaved code. + * + * @tparam T + * + * @param[in] block interleaved block. The block can be thought of as the whole inverted list in + * interleaved format. + * @param[out] flat_code output flat code + * @param[in] dim dimension of the flat code + * @param[in] veclen size of interleaved data chunks + * @param[in] offset fetch the flat code by the given offset + */ +template +__host__ __device__ void unpack_1_interleaved( + const T* block, T* flat_code, uint32_t dim, uint32_t veclen, uint32_t offset) +{ + // The data is written in interleaved groups of `index::kGroupSize` vectors + using interleaved_group = Pow2; + + // NB: such `veclen` is selected, that `dim % veclen == 0` + auto group_offset = interleaved_group::roundDown(offset); + auto ingroup_id = interleaved_group::mod(offset) * veclen; + + for (uint32_t l = 0; l < dim; l += veclen) { + for (uint32_t j = 0; j < veclen; j++) { + flat_code[l + j] = block[group_offset * dim + l * kIndexGroupSize + ingroup_id + j]; + } + } +} +} \ No newline at end of file diff --git a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh index 85272d01d1..135467ad32 100644 --- a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh +++ b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh @@ -23,8 +23,6 @@ #include #include -// #include - namespace raft::neighbors::ivf_flat::helpers { /** * @defgroup ivf_flat_helpers Helper functions for manipulationg IVF Flat Index @@ -32,72 +30,32 @@ namespace raft::neighbors::ivf_flat::helpers { */ namespace codepacker { -/** - * Write one flat code into a block by the given offset. The offset indicates the id of the record in the list. This function interleaves the code and is intended to later copy the interleaved codes over to the IVF list on device. - * NB: no memory allocation happens here; the block must fit the record (offset + 1). - * - * @tparam T - * - * @param[in] flat_code input flat code - * @param[out] block block of memory to write interleaved codes to - * @param[in] dim dimension of the flat code - * @param[in] veclen size of interleaved data chunks - * @param[in] offset how many records to skip before writing the data into the list - */ -template -__host__ __device__ void pack_1_interleaved( - const T* flat_code, - T* block, - uint32_t dim, - uint32_t veclen, - uint32_t offset) { - // The data is written in interleaved groups of `index::kGroupSize` vectors - using interleaved_group = Pow2; - // Interleave dimensions of the source vector while recording it. - // NB: such `veclen` is selected, that `dim % veclen == 0` - auto group_offset = interleaved_group::roundDown(offset); - auto ingroup_id = interleaved_group::mod(offset) * veclen; - for (uint32_t l = 0; l < dim; l += veclen) { - for (uint32_t j = 0; j < veclen; j++) { - block[group_offset * dim + l * kIndexGroupSize + ingroup_id + j] = flat_code[l + j]; - } - } +template +inline void pack_full_list( + raft::resources const& res, + device_matrix_view codes, + uint32_t veclen, + device_mdspan::list_extents, row_major> list_data) +{ + raft::neighbors::ivf_flat::detail::pack_list_data(res, + codes, + veclen, + list_data); } -/** - * Unpack 1 record of a single list (cluster) in the index to fetch the flat code. The offset indicates the id of the record. This function fetches one flat code from an interleaved code. - * - * @tparam T - * - * @param[in] block interleaved block. The block can be thought of as the whole inverted list in interleaved format. - * @param[out] flat_code output flat code - * @param[in] dim dimension of the flat code - * @param[in] veclen size of interleaved data chunks - * @param[in] offset fetch the flat code by the given offset - */ template -__host__ __device__ void unpack_1_interleaved( - const T* block, - T* flat_code, - uint32_t dim, - uint32_t veclen, - uint32_t offset) { - - // The data is written in interleaved groups of `index::kGroupSize` vectors - using interleaved_group = Pow2; - - // NB: such `veclen` is selected, that `dim % veclen == 0` - auto group_offset = interleaved_group::roundDown(offset); - auto ingroup_id = interleaved_group::mod(offset) * veclen; - - for (uint32_t l = 0; l < dim; l += veclen) { - for (uint32_t j = 0; j < veclen; j++) { - flat_code[l + j] = block[group_offset * dim + l * kIndexGroupSize + ingroup_id + j]; - } - } +inline void unpack_full_list( + raft::resources const& res, + device_mdspan::list_extents, row_major> list_data, + uint32_t veclen, + device_matrix_view codes) +{ + raft::neighbors::ivf_flat::detail::unpack_list_data(res, + list_data, veclen, codes); } } // namespace codepacker /** @} */ } // namespace raft::neighbors::ivf_flat::helpers + diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh index 1d724bada7..25339a6391 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh @@ -22,10 +22,6 @@ #include #include "processing.cuh" -#include "raft/core/host_mdarray.hpp" -#include "raft/neighbors/ivf_flat_types.hpp" -#include "raft/neighbors/ivf_flat_helpers.cuh" -#include "raft/util/pow2_utils.cuh" #include #include #include @@ -78,91 +74,6 @@ void approx_knn_build_index(raft::resources const& handle, auto new_params = from_legacy_index_params(*ivf_ft_pams, metric, metricArg); index->ivf_flat() = std::make_unique>( ivf_flat::build(handle, new_params, index_array, int64_t(n), D)); - - // raft::resource::sync_stream(handle); - - // auto old_list = index->ivf_flat()->lists()[0]; - // uint32_t n_rows = old_list->size.load(); - // uint32_t roundup = Pow2::roundUp(n_rows); - - // RAFT_LOG_INFO("roundup %d, n_rows %d", roundup, n_rows); - - // if (n_rows == 0) { return; } - - // auto dim = index->ivf_flat()->dim(); - // auto veclen = index -> ivf_flat()->veclen(); - // RAFT_LOG_INFO("roundup %d, n_rows %d, veclen %d, dim %d", roundup, n_rows, veclen, dim); - // auto codes = make_host_matrix(roundup, dim); - // auto block = make_host_matrix(roundup, dim); - - // T* firstArray; - // cudaMemcpy(&firstArray, index->ivf_flat()->data_ptrs().data_handle(), sizeof(float*), cudaMemcpyDeviceToHost); // Copy the pointer to the first array from device to host - - // raft::print_device_vector("codes_gpu", firstArray, 1, std::cout); - // raft::update_host(codes.data_handle(), firstArray, (size_t)(roundup * dim), stream); - // raft::resource::sync_stream(handle); - // raft::neighbors::ivf_flat::helpers::pack_host_interleaved( - // codes.data_handle(), - // block.data_handle(), - // n_rows, - // dim, - // veclen); - - // RAFT_LOG_INFO("veclen %d", veclen); - // raft::print_host_vector("codes", codes.data_handle(), roundup * dim, std::cout); - // raft::print_host_vector("block", block.data_handle(), roundup * dim, std::cout); - // // auto indices = make_device_vector(handle_, n_rows); - // copy(indices.data_handle(), old_list->indices.data_handle(), n_rows, stream_); - - // ivf_flat::helpers::pack_list_data(handle_, *index, codes.view(), label, 0); - // ivf_pq::helpers::erase_list(handle_, index, label); - // ivf_pq::helpers::extend_list_with_codes( - // handle_, index, codes.view(), indices.view(), label); - - // auto& new_list = index->lists()[label]; - // ASSERT_NE(old_list.get(), new_list.get()) - // << "The old list should have been shared and retained after ivf_pq index has erased the " - // "corresponding cluster."; - // auto list_data_size = (n_rows / ivf_pq::kIndexGroupSize) * new_list->data.extent(1) * - // new_list->data.extent(2) * new_list->data.extent(3); - - // ASSERT_TRUE(old_list->data.size() >= list_data_size); - // ASSERT_TRUE(new_list->data.size() >= list_data_size); - // ASSERT_TRUE(devArrMatch(old_list->data.data_handle(), - // new_list->data.data_handle(), - // list_data_size, - // Compare{})); - - // // Pack a few vectors back to the list. - // int row_offset = 9; - // int n_vec = 3; - // ASSERT_TRUE(row_offset + n_vec < n_rows); - // size_t offset = row_offset * index->pq_dim(); - // auto codes_to_pack = make_device_matrix_view( - // codes.data_handle() + offset, n_vec, index->pq_dim()); - // ivf_pq::helpers::pack_list_data(handle_, index, codes_to_pack, label, row_offset); - // ASSERT_TRUE(devArrMatch(old_list->data.data_handle(), - // new_list->data.data_handle(), - // list_data_size, - // Compare{})); - - // Another test with the API that take list_data directly - // auto list_data = index->lists()[label]->data.view(); - // uint32_t n_take = 4; - // ASSERT_TRUE(row_offset + n_take < n_rows); - // auto codes2 = raft::make_device_matrix(handle_, n_take, index->pq_dim()); - // ivf_pq::helpers::codepacker::unpack( - // handle_, list_data, index->pq_bits(), row_offset, codes2.view()); - - // // Write it back - // ivf_pq::helpers::codepacker::pack( - // handle_, make_const_mdspan(codes2.view()), index->pq_bits(), row_offset, list_data); - // ASSERT_TRUE(devArrMatch(old_list->data.data_handle(), - // new_list->data.data_handle(), - // list_data_size, - // Compare{})); - // } - } else if (ivf_pq_pams) { neighbors::ivf_pq::index_params params; params.metric = metric; diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh index a252b26600..b2fa4de90a 100644 --- a/cpp/test/neighbors/ann_ivf_flat.cuh +++ b/cpp/test/neighbors/ann_ivf_flat.cuh @@ -17,6 +17,11 @@ #include "../test_utils.cuh" #include "ann_utils.cuh" +#include "raft/core/device_mdarray.hpp" +#include "raft/core/host_mdarray.hpp" +#include "raft/linalg/map.cuh" +#include "raft/util/cudart_utils.hpp" +#include "raft/util/fast_int_div.cuh" #include #include @@ -26,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -36,6 +42,7 @@ #include +#include #include #include @@ -65,6 +72,15 @@ template return os; } +template +void flat_codes_from_indices(raft::resources const& handle, IdxT* indices, DataT* data, uint32_t n_rows, uint32_t dim, DataT* flat_codes) { + raft::linalg::map_offset(handle, raft::make_device_vector_view(flat_codes, n_rows * dim), [dim = util::FastIntDiv(dim), indices, data]__device__ (auto idx) { + auto row = idx / dim; + auto col = idx % dim; + return data[indices[row] * dim + col]; + }); +} + template class AnnIVFFlatTest : public ::testing::TestWithParam> { public: @@ -264,6 +280,79 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { } } + void testPacker() { + ivf_flat::index_params index_params; + index_params.n_lists = ps.n_list; + index_params.metric = ps.metric; + index_params.adaptive_centers = false; + + index_params.add_data_on_build = true; + index_params.kmeans_trainset_fraction = 0.5; + index_params.metric_arg = 0; + + auto database_view = raft::make_device_matrix_view( + (const DataT*)database.data(), ps.num_db_vecs, ps.dim); + + auto index = ivf_flat::build(handle_, index_params, database_view); + + auto list_sizes = raft::make_host_vector(index.n_lists()); + update_host(list_sizes.data_handle(), + index.list_sizes().data_handle(), + index.n_lists(), + stream_); + auto data_ptrs = raft::make_host_vector(index.n_lists()); + update_host(data_ptrs.data_handle(), + index.data_ptrs().data_handle(), + index.n_lists(), + stream_); + auto inds_ptrs = raft::make_host_vector(index.n_lists()); + update_host(inds_ptrs.data_handle(), + index.inds_ptrs().data_handle(), + index.n_lists(), + stream_); + resource::sync_stream(handle_); + + auto list_device_spec = list_spec{ps.dim, true}; + + for(uint32_t label = 0; label < index.n_lists(); label++) { + + rmm::device_uvector interleaved_data(Pow2::roundUp(list_sizes[label] * ps.dim), stream_); + list_device_spec.make_list_extents(list_sizes[label]); + + // fetch the flat codes + rmm::device_uvector flat_codes(list_sizes[label] * ps.dim, stream_); + + flat_codes_from_indices(handle_, inds_ptrs + label, data_ptrs + label, list_sizes[label], ps.dim, flat_codes.data_handle()); + + helpers::codepacker::pack_full_list( + handle_, + flat_codes.view(), + index.veclen(), + make_device_vector_view(interleaved_data.data(), Pow2::roundUp(list_sizes[label]), ps.dim)); + + ASSERT_TRUE(raft::devArrMatch(interleaved_data.data(), + data_ptrs + label, + Pow2::roundUp(list_sizes[label]) * ps.dim, + raft::Compare(), + stream_)); + + rmm::device_uvector unpacked_flat_codes(list_sizes[label] * ps.dim, stream_); + + helpers::codepacker::unpack_full_list( + handle_, + raft::make_device_matrix_view(data_ptrs[label], Pow2::roundUp(list_sizes[label]), ps.dim), + index.veclen(), + make_device_vector_view(unpacked_flat_codes.data(), list_sizes[label], ps.dim)); + + ASSERT_TRUE(raft::devArrMatch(flat_codes.data(), + unpacked_flat_codes.data(), + list_sizes[label] * ps.dim, + raft::Compare(), + stream_)); + + } + } + void SetUp() override { database.resize(ps.num_db_vecs * ps.dim, stream_); @@ -356,5 +445,4 @@ const std::vector> inputs = { raft::matrix::detail::select::warpsort::kMaxCapacity * 4, raft::distance::DistanceType::InnerProduct, false}}; - } // namespace raft::neighbors::ivf_flat From 2a2ee515a323c93a5ab922adf7e8176fa7a8593a Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Mon, 17 Jul 2023 09:57:29 -0700 Subject: [PATCH 05/97] Undo deletions --- .../all_cuda-118_arch-x86_64.yaml | 60 +++++++++++++++++++ .../all_cuda-120_arch-x86_64.yaml | 57 ++++++++++++++++++ .../bench_ann_cuda-118_arch-x86_64.yaml | 38 ++++++++++++ 3 files changed, 155 insertions(+) create mode 100644 conda/environments/all_cuda-118_arch-x86_64.yaml create mode 100644 conda/environments/all_cuda-120_arch-x86_64.yaml create mode 100644 conda/environments/bench_ann_cuda-118_arch-x86_64.yaml diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml new file mode 100644 index 0000000000..67dd01ada9 --- /dev/null +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -0,0 +1,60 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- conda-forge +- nvidia +dependencies: +- breathe +- c-compiler +- clang-tools=16.0.1 +- clang=16.0.1 +- cmake>=3.23.1,!=3.25.0 +- cuda-profiler-api=11.8.86 +- cuda-python>=11.7.1,<12.0a0 +- cuda-version=11.8 +- cudatoolkit +- cupy>=12.0.0 +- cxx-compiler +- cython>=0.29,<0.30 +- dask-core>=2023.5.1 +- dask-cuda==23.8.* +- dask>=2023.5.1 +- distributed>=2023.5.1 +- doxygen>=1.8.20 +- gcc_linux-64=11.* +- gmock>=1.13.0 +- graphviz +- gtest>=1.13.0 +- ipython +- joblib>=0.11 +- libcublas-dev=11.11.3.6 +- libcublas=11.11.3.6 +- libcurand-dev=10.3.0.86 +- libcurand=10.3.0.86 +- libcusolver-dev=11.4.1.48 +- libcusolver=11.4.1.48 +- libcusparse-dev=11.7.5.86 +- libcusparse=11.7.5.86 +- nccl>=2.9.9 +- ninja +- numba>=0.57 +- numpy>=1.21 +- numpydoc +- pydata-sphinx-theme +- pytest +- pytest-cov +- recommonmark +- rmm==23.8.* +- scikit-build>=0.13.1 +- scikit-learn +- scipy +- sphinx-copybutton +- sphinx-markdown-tables +- sysroot_linux-64==2.17 +- ucx-proc=*=gpu +- ucx-py==0.33.* +- ucx>=1.13.0 +name: all_cuda-118_arch-x86_64 \ No newline at end of file diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml new file mode 100644 index 0000000000..f1d56d4ff6 --- /dev/null +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -0,0 +1,57 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- conda-forge +- nvidia +dependencies: +- breathe +- c-compiler +- clang-tools=16.0.1 +- clang=16.0.1 +- cmake>=3.23.1,!=3.25.0 +- cuda-cudart-dev +- cuda-profiler-api +- cuda-python>=12.0,<13.0a0 +- cuda-version=12.0 +- cupy>=12.0.0 +- cxx-compiler +- cython>=0.29,<0.30 +- dask-core>=2023.5.1 +- dask-cuda==23.8.* +- dask>=2023.5.1 +- distributed>=2023.5.1 +- doxygen>=1.8.20 +- gcc_linux-64=11.* +- gmock>=1.13.0 +- graphviz +- gtest>=1.13.0 +- ipython +- joblib>=0.11 +- libcublas-dev +- libcurand-dev +- libcusolver-dev +- libcusparse-dev +- nccl>=2.9.9 +- ninja +- numba>=0.57 +- numpy>=1.21 +- numpydoc +- pydata-sphinx-theme +- pytest +- pytest-cov +- recommonmark +- rmm==23.8.* +- scikit-build>=0.13.1 +- scikit-learn +- scipy +- sphinx-copybutton +- sphinx-markdown-tables +- sysroot_linux-64==2.17 +- ucx-proc=*=gpu +- ucx-py==0.33.* +- ucx>=1.13.0 +name: all_cuda-120_arch-x86_64 + diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml new file mode 100644 index 0000000000..0869be212f --- /dev/null +++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml @@ -0,0 +1,38 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- conda-forge +- nvidia +dependencies: +- c-compiler +- clang-tools=16.0.1 +- clang=16.0.1 +- cmake>=3.23.1,!=3.25.0 +- cuda-profiler-api=11.8.86 +- cuda-version=11.8 +- cudatoolkit +- cxx-compiler +- cython>=0.29,<0.30 +- faiss-proc=*=cuda +- gcc_linux-64=11.* +- glog>=0.6.0 +- h5py>=3.8.0 +- hnswlib=0.7.0 +- libcublas-dev=11.11.3.6 +- libcublas=11.11.3.6 +- libcurand-dev=10.3.0.86 +- libcurand=10.3.0.86 +- libcusolver-dev=11.4.1.48 +- libcusolver=11.4.1.48 +- libcusparse-dev=11.7.5.86 +- libcusparse=11.7.5.86 +- libfaiss>=1.7.1 +- nccl>=2.9.9 +- ninja +- nlohmann_json>=3.11.2 +- scikit-build>=0.13.1 +- sysroot_linux-64==2.17 +name: bench_ann_cuda-118_arch-x86_64 \ No newline at end of file From 834dd2c17c3c7820a2edbfcc9d293ce1b1b14fc8 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Mon, 17 Jul 2023 10:00:13 -0700 Subject: [PATCH 06/97] undo yaml changes --- conda/environments/all_cuda-120_arch-x86_64.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index f1d56d4ff6..589fb4920b 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -53,5 +53,4 @@ dependencies: - ucx-proc=*=gpu - ucx-py==0.33.* - ucx>=1.13.0 -name: all_cuda-120_arch-x86_64 - +name: all_cuda-120_arch-x86_64 \ No newline at end of file From 60134298533a92b22c7db8e3fa79b720b25800c4 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Mon, 17 Jul 2023 10:07:25 -0700 Subject: [PATCH 07/97] style --- .../raft/neighbors/detail/ivf_flat_build.cuh | 24 +-- .../raft/neighbors/ivf_flat_codepacker.cuh | 2 +- .../raft/neighbors/ivf_flat_helpers.cuh | 15 +- cpp/test/neighbors/ann_ivf_flat.cuh | 157 +++++++++--------- 4 files changed, 100 insertions(+), 98 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh index 4615ddba57..edd2b07701 100644 --- a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh @@ -425,9 +425,8 @@ __global__ void pack_interleaved_list_kernel( { auto tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < n_rows) { - codepacker::pack_1_interleaved( - codes + tid * dim, list_data, dim, veclen, tid); - } + codepacker::pack_1_interleaved(codes + tid * dim, list_data, dim, veclen, tid); + } } template @@ -436,9 +435,8 @@ __global__ void unpack_interleaved_list_kernel( { auto tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < n_rows) { - codepacker::unpack_1_interleaved( - codes + tid * dim, list_data, dim, veclen, tid); - } + codepacker::unpack_1_interleaved(codes + tid * dim, list_data, dim, veclen, tid); + } } template @@ -448,13 +446,14 @@ void pack_list_data( uint32_t veclen, device_mdspan::list_extents, row_major> list_data) { - uint32_t n_rows = codes.extent(0); - uint32_t dim = codes.extent(1); + uint32_t n_rows = codes.extent(0); + uint32_t dim = codes.extent(1); static constexpr uint32_t kBlockSize = 256; dim3 blocks(div_rounding_up_safe(n_rows, kBlockSize), 1, 1); dim3 threads(kBlockSize, 1, 1); auto stream = resource::get_cuda_stream(res); - pack_interleaved_list_kernel<<>>(list_data.data_handle(), codes.data_handle(), n_rows, dim, veclen); + pack_interleaved_list_kernel<<>>( + list_data.data_handle(), codes.data_handle(), n_rows, dim, veclen); RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -465,13 +464,14 @@ void unpack_list_data( uint32_t veclen, device_matrix_view codes) { - uint32_t n_rows = codes.extent(0); - uint32_t dim = codes.extent(1); + uint32_t n_rows = codes.extent(0); + uint32_t dim = codes.extent(1); static constexpr uint32_t kBlockSize = 256; dim3 blocks(div_rounding_up_safe(n_rows, kBlockSize), 1, 1); dim3 threads(kBlockSize, 1, 1); auto stream = resource::get_cuda_stream(res); - unpack_interleaved_list_kernel<<>>(codes.data_handle(), list_data.data_handle(), n_rows, dim, veclen); + unpack_interleaved_list_kernel<<>>( + codes.data_handle(), list_data.data_handle(), n_rows, dim, veclen); RAFT_CUDA_TRY(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/neighbors/ivf_flat_codepacker.cuh b/cpp/include/raft/neighbors/ivf_flat_codepacker.cuh index 2a6dc55c10..9b12761daf 100644 --- a/cpp/include/raft/neighbors/ivf_flat_codepacker.cuh +++ b/cpp/include/raft/neighbors/ivf_flat_codepacker.cuh @@ -86,4 +86,4 @@ __host__ __device__ void unpack_1_interleaved( } } } -} \ No newline at end of file +} // namespace raft::neighbors::ivf_flat::codepacker \ No newline at end of file diff --git a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh index 135467ad32..7ed0b7ec8e 100644 --- a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh +++ b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh @@ -31,7 +31,6 @@ namespace raft::neighbors::ivf_flat::helpers { namespace codepacker { - template inline void pack_full_list( raft::resources const& res, @@ -39,23 +38,19 @@ inline void pack_full_list( uint32_t veclen, device_mdspan::list_extents, row_major> list_data) { - raft::neighbors::ivf_flat::detail::pack_list_data(res, - codes, - veclen, - list_data); + raft::neighbors::ivf_flat::detail::pack_list_data(res, codes, veclen, list_data); } template inline void unpack_full_list( raft::resources const& res, - device_mdspan::list_extents, row_major> list_data, + device_mdspan::list_extents, row_major> + list_data, uint32_t veclen, device_matrix_view codes) { - raft::neighbors::ivf_flat::detail::unpack_list_data(res, - list_data, veclen, codes); + raft::neighbors::ivf_flat::detail::unpack_list_data(res, list_data, veclen, codes); } -} // namespace codepacker +} // namespace codepacker /** @} */ } // namespace raft::neighbors::ivf_flat::helpers - diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh index b2fa4de90a..17952c4019 100644 --- a/cpp/test/neighbors/ann_ivf_flat.cuh +++ b/cpp/test/neighbors/ann_ivf_flat.cuh @@ -73,12 +73,20 @@ template } template -void flat_codes_from_indices(raft::resources const& handle, IdxT* indices, DataT* data, uint32_t n_rows, uint32_t dim, DataT* flat_codes) { - raft::linalg::map_offset(handle, raft::make_device_vector_view(flat_codes, n_rows * dim), [dim = util::FastIntDiv(dim), indices, data]__device__ (auto idx) { - auto row = idx / dim; - auto col = idx % dim; - return data[indices[row] * dim + col]; - }); +void flat_codes_from_indices(raft::resources const& handle, + IdxT* indices, + DataT* data, + uint32_t n_rows, + uint32_t dim, + DataT* flat_codes) +{ + raft::linalg::map_offset(handle, + raft::make_device_vector_view(flat_codes, n_rows * dim), + [dim = util::FastIntDiv(dim), indices, data] __device__(auto idx) { + auto row = idx / dim; + auto col = idx % dim; + return data[indices[row] * dim + col]; + }); } template @@ -280,77 +288,76 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { } } - void testPacker() { + void testPacker() + { ivf_flat::index_params index_params; - index_params.n_lists = ps.n_list; - index_params.metric = ps.metric; - index_params.adaptive_centers = false; - - index_params.add_data_on_build = true; - index_params.kmeans_trainset_fraction = 0.5; - index_params.metric_arg = 0; - - auto database_view = raft::make_device_matrix_view( - (const DataT*)database.data(), ps.num_db_vecs, ps.dim); - - auto index = ivf_flat::build(handle_, index_params, database_view); - - auto list_sizes = raft::make_host_vector(index.n_lists()); - update_host(list_sizes.data_handle(), - index.list_sizes().data_handle(), - index.n_lists(), - stream_); - auto data_ptrs = raft::make_host_vector(index.n_lists()); - update_host(data_ptrs.data_handle(), - index.data_ptrs().data_handle(), - index.n_lists(), - stream_); - auto inds_ptrs = raft::make_host_vector(index.n_lists()); - update_host(inds_ptrs.data_handle(), - index.inds_ptrs().data_handle(), - index.n_lists(), - stream_); - resource::sync_stream(handle_); - - auto list_device_spec = list_spec{ps.dim, true}; - - for(uint32_t label = 0; label < index.n_lists(); label++) { - - rmm::device_uvector interleaved_data(Pow2::roundUp(list_sizes[label] * ps.dim), stream_); - list_device_spec.make_list_extents(list_sizes[label]); - - // fetch the flat codes - rmm::device_uvector flat_codes(list_sizes[label] * ps.dim, stream_); - - flat_codes_from_indices(handle_, inds_ptrs + label, data_ptrs + label, list_sizes[label], ps.dim, flat_codes.data_handle()); - - helpers::codepacker::pack_full_list( - handle_, - flat_codes.view(), - index.veclen(), - make_device_vector_view(interleaved_data.data(), Pow2::roundUp(list_sizes[label]), ps.dim)); - - ASSERT_TRUE(raft::devArrMatch(interleaved_data.data(), - data_ptrs + label, - Pow2::roundUp(list_sizes[label]) * ps.dim, - raft::Compare(), - stream_)); - - rmm::device_uvector unpacked_flat_codes(list_sizes[label] * ps.dim, stream_); - - helpers::codepacker::unpack_full_list( - handle_, - raft::make_device_matrix_view(data_ptrs[label], Pow2::roundUp(list_sizes[label]), ps.dim), - index.veclen(), - make_device_vector_view(unpacked_flat_codes.data(), list_sizes[label], ps.dim)); - - ASSERT_TRUE(raft::devArrMatch(flat_codes.data(), - unpacked_flat_codes.data(), - list_sizes[label] * ps.dim, - raft::Compare(), - stream_)); + index_params.n_lists = ps.n_list; + index_params.metric = ps.metric; + index_params.adaptive_centers = false; + + index_params.add_data_on_build = true; + index_params.kmeans_trainset_fraction = 0.5; + index_params.metric_arg = 0; + + auto database_view = raft::make_device_matrix_view( + (const DataT*)database.data(), ps.num_db_vecs, ps.dim); + + auto index = ivf_flat::build(handle_, index_params, database_view); + + auto list_sizes = raft::make_host_vector(index.n_lists()); + update_host( + list_sizes.data_handle(), index.list_sizes().data_handle(), index.n_lists(), stream_); + auto data_ptrs = raft::make_host_vector(index.n_lists()); + update_host(data_ptrs.data_handle(), index.data_ptrs().data_handle(), index.n_lists(), stream_); + auto inds_ptrs = raft::make_host_vector(index.n_lists()); + update_host(inds_ptrs.data_handle(), index.inds_ptrs().data_handle(), index.n_lists(), stream_); + resource::sync_stream(handle_); - } + auto list_device_spec = list_spec{ps.dim, true}; + + for (uint32_t label = 0; label < index.n_lists(); label++) { + rmm::device_uvector interleaved_data( + Pow2::roundUp(list_sizes[label] * ps.dim), stream_); + list_device_spec.make_list_extents(list_sizes[label]); + + // fetch the flat codes + rmm::device_uvector flat_codes(list_sizes[label] * ps.dim, stream_); + + flat_codes_from_indices(handle_, + inds_ptrs + label, + data_ptrs + label, + list_sizes[label], + ps.dim, + flat_codes.data_handle()); + + helpers::codepacker::pack_full_list( + handle_, + flat_codes.view(), + index.veclen(), + make_device_vector_view( + interleaved_data.data(), Pow2::roundUp(list_sizes[label]), ps.dim)); + + ASSERT_TRUE(raft::devArrMatch(interleaved_data.data(), + data_ptrs + label, + Pow2::roundUp(list_sizes[label]) * ps.dim, + raft::Compare(), + stream_)); + + rmm::device_uvector unpacked_flat_codes(list_sizes[label] * ps.dim, stream_); + + helpers::codepacker::unpack_full_list( + handle_, + raft::make_device_matrix_view( + data_ptrs[label], Pow2::roundUp(list_sizes[label]), ps.dim), + index.veclen(), + make_device_vector_view(unpacked_flat_codes.data(), list_sizes[label], ps.dim)); + + ASSERT_TRUE(raft::devArrMatch(flat_codes.data(), + unpacked_flat_codes.data(), + list_sizes[label] * ps.dim, + raft::Compare(), + stream_)); + } } void SetUp() override From ab6345a7cd4c459dd792403cfce22fd5976ce4b9 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Mon, 17 Jul 2023 17:29:27 -0700 Subject: [PATCH 08/97] Update tests, correct make_list_extents --- .../raft/neighbors/detail/ivf_flat_build.cuh | 12 ++-- .../raft/neighbors/ivf_flat_helpers.cuh | 12 ++-- cpp/include/raft/neighbors/ivf_flat_types.hpp | 6 +- cpp/test/neighbors/ann_ivf_flat.cuh | 65 ++++++++++--------- .../ann_ivf_flat/test_float_int64_t.cu | 2 +- 5 files changed, 50 insertions(+), 47 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh index edd2b07701..2c8b0cbe71 100644 --- a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh @@ -439,12 +439,12 @@ __global__ void unpack_interleaved_list_kernel( } } -template +template void pack_list_data( raft::resources const& res, - device_matrix_view codes, + device_matrix_view codes, uint32_t veclen, - device_mdspan::list_extents, row_major> list_data) + device_mdspan::list_extents, row_major> list_data) { uint32_t n_rows = codes.extent(0); uint32_t dim = codes.extent(1); @@ -457,12 +457,12 @@ void pack_list_data( RAFT_CUDA_TRY(cudaPeekAtLastError()); } -template +template void unpack_list_data( raft::resources const& res, - device_mdspan::list_extents, row_major> list_data, + device_mdspan::list_extents, row_major> list_data, uint32_t veclen, - device_matrix_view codes) + device_matrix_view codes) { uint32_t n_rows = codes.extent(0); uint32_t dim = codes.extent(1); diff --git a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh index 7ed0b7ec8e..b90797d4c5 100644 --- a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh +++ b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh @@ -31,23 +31,23 @@ namespace raft::neighbors::ivf_flat::helpers { namespace codepacker { -template +template inline void pack_full_list( raft::resources const& res, - device_matrix_view codes, + device_matrix_view codes, uint32_t veclen, - device_mdspan::list_extents, row_major> list_data) + device_mdspan::list_extents, row_major> list_data) { raft::neighbors::ivf_flat::detail::pack_list_data(res, codes, veclen, list_data); } -template +template inline void unpack_full_list( raft::resources const& res, - device_mdspan::list_extents, row_major> + device_mdspan::list_extents, row_major> list_data, uint32_t veclen, - device_matrix_view codes) + device_matrix_view codes) { raft::neighbors::ivf_flat::detail::unpack_list_data(res, list_data, veclen, codes); } diff --git a/cpp/include/raft/neighbors/ivf_flat_types.hpp b/cpp/include/raft/neighbors/ivf_flat_types.hpp index 2e2e49cdbc..d427e99e3e 100644 --- a/cpp/include/raft/neighbors/ivf_flat_types.hpp +++ b/cpp/include/raft/neighbors/ivf_flat_types.hpp @@ -94,9 +94,9 @@ struct list_spec { SizeT align_max; SizeT align_min; - uint32_t dim; + SizeT dim; - constexpr list_spec(uint32_t dim, bool conservative_memory_allocation) + constexpr list_spec(SizeT dim, bool conservative_memory_allocation) : dim(dim), align_min(kIndexGroupSize), align_max(conservative_memory_allocation ? kIndexGroupSize : 1024) @@ -113,7 +113,7 @@ struct list_spec { /** Determine the extents of an array enough to hold a given amount of data. */ constexpr auto make_list_extents(SizeT n_rows) const -> list_extents { - return make_extents(n_rows, dim); + return make_extents(div_rounding_up_safe(n_rows, kIndexGroupSize), dim); } }; diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh index 17952c4019..eb49e3d2d9 100644 --- a/cpp/test/neighbors/ann_ivf_flat.cuh +++ b/cpp/test/neighbors/ann_ivf_flat.cuh @@ -20,6 +20,7 @@ #include "raft/core/device_mdarray.hpp" #include "raft/core/host_mdarray.hpp" #include "raft/linalg/map.cuh" +#include "raft/neighbors/ivf_flat_types.hpp" #include "raft/util/cudart_utils.hpp" #include "raft/util/fast_int_div.cuh" #include @@ -31,7 +32,7 @@ #include #include #include -#include +#include #include #include #include @@ -73,18 +74,18 @@ template } template -void flat_codes_from_indices(raft::resources const& handle, +void flat_codes_from_list_indices(raft::resources const& handle, IdxT* indices, DataT* data, - uint32_t n_rows, - uint32_t dim, + IdxT n_rows, + IdxT dim, DataT* flat_codes) { raft::linalg::map_offset(handle, raft::make_device_vector_view(flat_codes, n_rows * dim), - [dim = util::FastIntDiv(dim), indices, data] __device__(auto idx) { - auto row = idx / dim; - auto col = idx % dim; + [dim, divisor = util::FastIntDiv(dim), indices, data] __device__(auto idx) { + auto row = idx / divisor; + auto col = idx % divisor; return data[indices[row] * dim + col]; }); } @@ -291,7 +292,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { void testPacker() { ivf_flat::index_params index_params; - index_params.n_lists = ps.n_list; + index_params.n_lists = ps.nlist; index_params.metric = ps.metric; index_params.adaptive_centers = false; @@ -313,49 +314,51 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { update_host(inds_ptrs.data_handle(), index.inds_ptrs().data_handle(), index.n_lists(), stream_); resource::sync_stream(handle_); - auto list_device_spec = list_spec{ps.dim, true}; + auto list_device_spec = list_spec{ps.dim, false}; for (uint32_t label = 0; label < index.n_lists(); label++) { - rmm::device_uvector interleaved_data( - Pow2::roundUp(list_sizes[label] * ps.dim), stream_); - list_device_spec.make_list_extents(list_sizes[label]); + uint32_t list_size = list_sizes.data_handle()[label]; + T* list_data_ptr = data_ptrs.data_handle()[label]; + IdxT* list_inds_ptr = inds_ptrs.data_handle()[label]; + + auto exts = list_device_spec.make_list_extents(static_cast(list_size)); + auto interleaved_data = make_device_mdarray(handle_, exts); // fetch the flat codes - rmm::device_uvector flat_codes(list_sizes[label] * ps.dim, stream_); + auto flat_codes = make_device_matrix(handle_, static_cast(list_size), ps.dim); - flat_codes_from_indices(handle_, - inds_ptrs + label, - data_ptrs + label, - list_sizes[label], + flat_codes_from_list_indices(handle_, + list_inds_ptr, + list_data_ptr, + static_cast(list_size), ps.dim, flat_codes.data_handle()); + auto l = ivf::list(handle_, list_device_spec, static_cast(list_size)); helpers::codepacker::pack_full_list( handle_, - flat_codes.view(), + make_device_matrix_view(flat_codes.data_handle(), static_cast(list_size), ps.dim), index.veclen(), - make_device_vector_view( - interleaved_data.data(), Pow2::roundUp(list_sizes[label]), ps.dim)); + interleaved_data.view()); - ASSERT_TRUE(raft::devArrMatch(interleaved_data.data(), - data_ptrs + label, - Pow2::roundUp(list_sizes[label]) * ps.dim, + ASSERT_TRUE(raft::devArrMatch(interleaved_data.data_handle(), + list_data_ptr, + Pow2::roundUp(list_size) * ps.dim, raft::Compare(), stream_)); - rmm::device_uvector unpacked_flat_codes(list_sizes[label] * ps.dim, stream_); + auto unpacked_flat_codes = make_device_matrix(handle_, static_cast(list_size), ps.dim); helpers::codepacker::unpack_full_list( handle_, - raft::make_device_matrix_view( - data_ptrs[label], Pow2::roundUp(list_sizes[label]), ps.dim), + make_const_mdspan(interleaved_data.view()), index.veclen(), - make_device_vector_view(unpacked_flat_codes.data(), list_sizes[label], ps.dim)); + unpacked_flat_codes.view()); - ASSERT_TRUE(raft::devArrMatch(flat_codes.data(), - unpacked_flat_codes.data(), - list_sizes[label] * ps.dim, - raft::Compare(), + ASSERT_TRUE(raft::devArrMatch(flat_codes.data_handle(), + unpacked_flat_codes.data_handle(), + list_size * ps.dim, + raft::Compare(), stream_)); } } diff --git a/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu b/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu index f0988ca988..1b4a5f36c3 100644 --- a/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu +++ b/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu @@ -21,7 +21,7 @@ namespace raft::neighbors::ivf_flat { typedef AnnIVFFlatTest AnnIVFFlatTestF; -TEST_P(AnnIVFFlatTestF, AnnIVFFlat) { this->testIVFFlat(); } +TEST_P(AnnIVFFlatTestF, AnnIVFFlat) { this->testIVFFlat(); this->testPacker();} INSTANTIATE_TEST_CASE_P(AnnIVFFlatTest, AnnIVFFlatTestF, ::testing::ValuesIn(inputs)); From ed80d1ad0109f11b81dbb7f8f6720aca627f3e13 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Wed, 19 Jul 2023 15:21:18 -0700 Subject: [PATCH 09/97] More changes --- .../raft/neighbors/detail/ivf_flat_build.cuh | 34 ++++---- .../raft/neighbors/ivf_flat_helpers.cuh | 37 ++++++--- cpp/include/raft/neighbors/ivf_flat_types.hpp | 7 +- cpp/test/neighbors/ann_ivf_flat.cuh | 82 ++++++++++--------- .../ann_ivf_flat/test_float_int64_t.cu | 2 +- log.txt | 2 + 6 files changed, 96 insertions(+), 68 deletions(-) create mode 100644 log.txt diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh index 2c8b0cbe71..c475e12b96 100644 --- a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh @@ -423,7 +423,7 @@ template __global__ void pack_interleaved_list_kernel( const T* codes, T* list_data, uint32_t n_rows, uint32_t dim, uint32_t veclen) { - auto tid = blockIdx.x * blockDim.x + threadIdx.x; + uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < n_rows) { codepacker::pack_1_interleaved(codes + tid * dim, list_data, dim, veclen, tid); } @@ -433,45 +433,49 @@ template __global__ void unpack_interleaved_list_kernel( const T* list_data, T* codes, uint32_t n_rows, uint32_t dim, uint32_t veclen) { - auto tid = blockIdx.x * blockDim.x + threadIdx.x; + uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < n_rows) { - codepacker::unpack_1_interleaved(codes + tid * dim, list_data, dim, veclen, tid); + codepacker::unpack_1_interleaved(list_data, codes + tid * dim, dim, veclen, tid); } } -template +template void pack_list_data( raft::resources const& res, - device_matrix_view codes, + T* codes, + uint32_t n_rows, + uint32_t dim, uint32_t veclen, - device_mdspan::list_extents, row_major> list_data) + T* list_data) { - uint32_t n_rows = codes.extent(0); - uint32_t dim = codes.extent(1); + // uint32_t n_rows = codes.extent(0); + // uint32_t dim = codes.extent(1); static constexpr uint32_t kBlockSize = 256; dim3 blocks(div_rounding_up_safe(n_rows, kBlockSize), 1, 1); dim3 threads(kBlockSize, 1, 1); auto stream = resource::get_cuda_stream(res); pack_interleaved_list_kernel<<>>( - list_data.data_handle(), codes.data_handle(), n_rows, dim, veclen); + list_data, codes, n_rows, dim, veclen); RAFT_CUDA_TRY(cudaPeekAtLastError()); } -template +template void unpack_list_data( raft::resources const& res, - device_mdspan::list_extents, row_major> list_data, + T* list_data, + uint32_t n_rows, + uint32_t dim, uint32_t veclen, - device_matrix_view codes) + T* codes) { - uint32_t n_rows = codes.extent(0); - uint32_t dim = codes.extent(1); + // uint32_t n_rows = codes.extent(0); + // uint32_t dim = codes.extent(1); static constexpr uint32_t kBlockSize = 256; dim3 blocks(div_rounding_up_safe(n_rows, kBlockSize), 1, 1); dim3 threads(kBlockSize, 1, 1); auto stream = resource::get_cuda_stream(res); unpack_interleaved_list_kernel<<>>( - codes.data_handle(), list_data.data_handle(), n_rows, dim, veclen); + codes, list_data, n_rows, dim, veclen); RAFT_CUDA_TRY(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh index b90797d4c5..e757be9b8a 100644 --- a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh +++ b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh @@ -31,25 +31,38 @@ namespace raft::neighbors::ivf_flat::helpers { namespace codepacker { -template -inline void pack_full_list( - raft::resources const& res, - device_matrix_view codes, +template +void pack_full_list( + raft::resources const& handle, + T* codes, + uint32_t n_rows, + uint32_t dim, uint32_t veclen, - device_mdspan::list_extents, row_major> list_data) + T* list_data) { - raft::neighbors::ivf_flat::detail::pack_list_data(res, codes, veclen, list_data); + raft::neighbors::ivf_flat::detail::pack_list_data(handle, codes, n_rows, dim, veclen, list_data); } -template -inline void unpack_full_list( +// template +// void unpack_full_list( +// raft::resources const& res, +// device_mdspan::list_extents, row_major> +// list_data, +// uint32_t veclen, +// device_matrix_view codes) +// { +// raft::neighbors::ivf_flat::detail::unpack_list_data(res, list_data, veclen, codes); +// } +template +void unpack_full_list( raft::resources const& res, - device_mdspan::list_extents, row_major> - list_data, + T* list_data, + uint32_t n_rows, + uint32_t dim, uint32_t veclen, - device_matrix_view codes) + T* codes) { - raft::neighbors::ivf_flat::detail::unpack_list_data(res, list_data, veclen, codes); + raft::neighbors::ivf_flat::detail::unpack_list_data(res, list_data, n_rows, dim, veclen, codes); } } // namespace codepacker /** @} */ diff --git a/cpp/include/raft/neighbors/ivf_flat_types.hpp b/cpp/include/raft/neighbors/ivf_flat_types.hpp index d427e99e3e..6032da8556 100644 --- a/cpp/include/raft/neighbors/ivf_flat_types.hpp +++ b/cpp/include/raft/neighbors/ivf_flat_types.hpp @@ -94,9 +94,9 @@ struct list_spec { SizeT align_max; SizeT align_min; - SizeT dim; + uint32_t dim; - constexpr list_spec(SizeT dim, bool conservative_memory_allocation) + constexpr list_spec(uint32_t dim, bool conservative_memory_allocation) : dim(dim), align_min(kIndexGroupSize), align_max(conservative_memory_allocation ? kIndexGroupSize : 1024) @@ -113,7 +113,8 @@ struct list_spec { /** Determine the extents of an array enough to hold a given amount of data. */ constexpr auto make_list_extents(SizeT n_rows) const -> list_extents { - return make_extents(div_rounding_up_safe(n_rows, kIndexGroupSize), dim); + // return make_extents(round_up_safe(n_rows, this->align_min), dim); + return make_extents(n_rows, dim); } }; diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh index eb49e3d2d9..b6bcb01d91 100644 --- a/cpp/test/neighbors/ann_ivf_flat.cuh +++ b/cpp/test/neighbors/ann_ivf_flat.cuh @@ -19,6 +19,8 @@ #include "ann_utils.cuh" #include "raft/core/device_mdarray.hpp" #include "raft/core/host_mdarray.hpp" +#include "raft/core/mdspan.hpp" +#include "raft/core/mdspan_types.hpp" #include "raft/linalg/map.cuh" #include "raft/neighbors/ivf_flat_types.hpp" #include "raft/util/cudart_utils.hpp" @@ -31,8 +33,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -73,23 +77,6 @@ template return os; } -template -void flat_codes_from_list_indices(raft::resources const& handle, - IdxT* indices, - DataT* data, - IdxT n_rows, - IdxT dim, - DataT* flat_codes) -{ - raft::linalg::map_offset(handle, - raft::make_device_vector_view(flat_codes, n_rows * dim), - [dim, divisor = util::FastIntDiv(dim), indices, data] __device__(auto idx) { - auto row = idx / divisor; - auto col = idx % divisor; - return data[indices[row] * dim + col]; - }); -} - template class AnnIVFFlatTest : public ::testing::TestWithParam> { public: @@ -151,6 +138,8 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { database.data(), ps.num_db_vecs, ps.dim); + + RAFT_LOG_INFO("Index build successfully"); resource::sync_stream(handle_); approx_knn_search(handle_, @@ -160,6 +149,8 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { ps.k, search_queries.data(), ps.num_queries); + + RAFT_LOG_INFO("search successful"); update_host(distances_ivfflat.data(), distances_ivfflat_dev.data(), queries_size, stream_); update_host(indices_ivfflat.data(), indices_ivfflat_dev.data(), queries_size, stream_); @@ -305,6 +296,9 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { auto index = ivf_flat::build(handle_, index_params, database_view); + raft::print_device_vector("list_sizes", index.list_sizes().data_handle(), ps.nlist, std::cout); + RAFT_LOG_INFO("total db size %lld", ps.num_db_vecs); + auto list_sizes = raft::make_host_vector(index.n_lists()); update_host( list_sizes.data_handle(), index.list_sizes().data_handle(), index.n_lists(), stream_); @@ -314,32 +308,44 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { update_host(inds_ptrs.data_handle(), index.inds_ptrs().data_handle(), index.n_lists(), stream_); resource::sync_stream(handle_); - auto list_device_spec = list_spec{ps.dim, false}; + auto list_device_spec = list_spec{static_cast(ps.dim), false}; for (uint32_t label = 0; label < index.n_lists(); label++) { uint32_t list_size = list_sizes.data_handle()[label]; T* list_data_ptr = data_ptrs.data_handle()[label]; IdxT* list_inds_ptr = inds_ptrs.data_handle()[label]; - auto exts = list_device_spec.make_list_extents(static_cast(list_size)); - auto interleaved_data = make_device_mdarray(handle_, exts); + auto exts = list_device_spec.make_list_extents(Pow2::roundUp(list_size)); + auto interleaved_data = make_device_mdarray(handle_, exts); - // fetch the flat codes - auto flat_codes = make_device_matrix(handle_, static_cast(list_size), ps.dim); - - flat_codes_from_list_indices(handle_, - list_inds_ptr, - list_data_ptr, - static_cast(list_size), - ps.dim, - flat_codes.data_handle()); - auto l = ivf::list(handle_, list_device_spec, static_cast(list_size)); + RAFT_LOG_INFO("interleaved_codes_extent(0) %u", interleaved_data.extent(0)); + RAFT_LOG_INFO("interleaved_codes_extent(1) %u", interleaved_data.extent(1)); - helpers::codepacker::pack_full_list( + // fetch the flat codes + auto flat_codes = make_device_matrix(handle_, list_size, static_cast(ps.dim)); + // auto flat_codes_view = raft::make_device_matrix_view( + // (const T*)flat_codes.data_handle(), list_size, ps.dim); + + // flat_codes_from_list_indices(handle_, + // list_inds_ptr, + // list_data_ptr, + // list_size, + // static_cast(ps.dim), + // flat_codes.data_handle()); + matrix::gather(handle_, + make_device_matrix_view((const DataT*)database.data(), static_cast(ps.num_db_vecs), static_cast(ps.dim)), + make_device_vector_view((const IdxT*)list_inds_ptr, list_size), + flat_codes.view()); + + // auto interleaved_codes_view = device_mdspan::list_extents, row_major>(interleaved_data.data_handle(), exts); + // auto interleaved_codes_view = raft::make_device_matrix_view(interleaved_data.data_handle(), list_size, (uint32_t)ps.dim); + detail::pack_list_data( handle_, - make_device_matrix_view(flat_codes.data_handle(), static_cast(list_size), ps.dim), + flat_codes.data_handle(), + list_size, + (uint32_t)ps.dim, index.veclen(), - interleaved_data.view()); + interleaved_data.data_handle()); ASSERT_TRUE(raft::devArrMatch(interleaved_data.data_handle(), list_data_ptr, @@ -347,13 +353,15 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { raft::Compare(), stream_)); - auto unpacked_flat_codes = make_device_matrix(handle_, static_cast(list_size), ps.dim); + auto unpacked_flat_codes = make_device_matrix(handle_, static_cast(list_size), ps.dim); - helpers::codepacker::unpack_full_list( + detail::unpack_list_data( handle_, - make_const_mdspan(interleaved_data.view()), + interleaved_data.data_handle(), + list_size, + (uint32_t)ps.dim, index.veclen(), - unpacked_flat_codes.view()); + unpacked_flat_codes.data_handle()); ASSERT_TRUE(raft::devArrMatch(flat_codes.data_handle(), unpacked_flat_codes.data_handle(), diff --git a/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu b/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu index 1b4a5f36c3..fc33621bf8 100644 --- a/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu +++ b/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu @@ -21,7 +21,7 @@ namespace raft::neighbors::ivf_flat { typedef AnnIVFFlatTest AnnIVFFlatTestF; -TEST_P(AnnIVFFlatTestF, AnnIVFFlat) { this->testIVFFlat(); this->testPacker();} +TEST_P(AnnIVFFlatTestF, AnnIVFFlat) { this->testIVFFlat();} INSTANTIATE_TEST_CASE_P(AnnIVFFlatTest, AnnIVFFlatTestF, ::testing::ValuesIn(inputs)); diff --git a/log.txt b/log.txt new file mode 100644 index 0000000000..733021b7ab --- /dev/null +++ b/log.txt @@ -0,0 +1,2 @@ +========= COMPUTE-SANITIZER +========= Target application doesn't exist or is not a valid executable From 7412272c17a245f9a4f00ba40b18764ab31aeabd Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Thu, 20 Jul 2023 09:04:05 -0700 Subject: [PATCH 10/97] debugging --- cpp/test/neighbors/ann_ivf_flat.cuh | 13 +++++++++++++ .../neighbors/ann_ivf_flat/test_float_int64_t.cu | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh index b6bcb01d91..eaf0858c66 100644 --- a/cpp/test/neighbors/ann_ivf_flat.cuh +++ b/cpp/test/neighbors/ann_ivf_flat.cuh @@ -19,6 +19,7 @@ #include "ann_utils.cuh" #include "raft/core/device_mdarray.hpp" #include "raft/core/host_mdarray.hpp" +#include "raft/core/logger-macros.hpp" #include "raft/core/mdspan.hpp" #include "raft/core/mdspan_types.hpp" #include "raft/linalg/map.cuh" @@ -332,6 +333,9 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { // list_size, // static_cast(ps.dim), // flat_codes.data_handle()); + + RAFT_LOG_INFO("static_cast %u", static_cast(ps.num_db_vecs)); + RAFT_LOG_INFO("static_cast %u", static_cast(ps.dim)); matrix::gather(handle_, make_device_matrix_view((const DataT*)database.data(), static_cast(ps.num_db_vecs), static_cast(ps.dim)), make_device_vector_view((const IdxT*)list_inds_ptr, list_size), @@ -346,6 +350,15 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { (uint32_t)ps.dim, index.veclen(), interleaved_data.data_handle()); + + raft::print_device_vector("indices", list_inds_ptr, list_size, std::cout); + raft::print_device_vector("flat_codes", flat_codes.data_handle(), list_size * ps.dim, std::cout); + raft::print_device_vector("interleaved_data", interleaved_data.data_handle(), Pow2::roundUp(list_size) * ps.dim, std::cout); + raft::print_device_vector("list_data", list_data_ptr, Pow2::roundUp(list_size) * ps.dim, std::cout); + auto inds = make_host_vector(handle_, list_size); + raft::update_host(inds.data_handle(), list_inds_ptr, list_size, stream_); + resource::sync_stream(handle_); + raft::print_device_vector("first_flat_code", database.data() + inds.data_handle()[0] * ps.dim, ps.dim, std::cout); ASSERT_TRUE(raft::devArrMatch(interleaved_data.data_handle(), list_data_ptr, diff --git a/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu b/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu index fc33621bf8..553c10e0f4 100644 --- a/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu +++ b/cpp/test/neighbors/ann_ivf_flat/test_float_int64_t.cu @@ -21,7 +21,7 @@ namespace raft::neighbors::ivf_flat { typedef AnnIVFFlatTest AnnIVFFlatTestF; -TEST_P(AnnIVFFlatTestF, AnnIVFFlat) { this->testIVFFlat();} +TEST_P(AnnIVFFlatTestF, AnnIVFFlat) { this->testPacker();} INSTANTIATE_TEST_CASE_P(AnnIVFFlatTest, AnnIVFFlatTestF, ::testing::ValuesIn(inputs)); From 700ea822e5a39f0a45fcb2cd1d615bfa3d56a7b1 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Thu, 20 Jul 2023 17:19:15 -0700 Subject: [PATCH 11/97] Working build --- .../raft/neighbors/detail/ivf_flat_build.cuh | 32 +++++++------- .../raft/neighbors/ivf_flat_helpers.cuh | 33 +++++--------- cpp/test/neighbors/ann_ivf_flat.cuh | 43 ++++++------------- 3 files changed, 39 insertions(+), 69 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh index c475e12b96..708f9ae549 100644 --- a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh @@ -439,43 +439,41 @@ __global__ void unpack_interleaved_list_kernel( } } -template +template void pack_list_data( - raft::resources const& res, - T* codes, - uint32_t n_rows, - uint32_t dim, +raft::resources const& res, + device_matrix_view codes, uint32_t veclen, - T* list_data) + device_mdspan::list_extents, row_major> + list_data) { - // uint32_t n_rows = codes.extent(0); - // uint32_t dim = codes.extent(1); + uint32_t n_rows = codes.extent(0); + uint32_t dim = codes.extent(1); static constexpr uint32_t kBlockSize = 256; dim3 blocks(div_rounding_up_safe(n_rows, kBlockSize), 1, 1); dim3 threads(kBlockSize, 1, 1); auto stream = resource::get_cuda_stream(res); pack_interleaved_list_kernel<<>>( - list_data, codes, n_rows, dim, veclen); + codes.data_handle(), list_data.data_handle(), n_rows, dim, veclen); RAFT_CUDA_TRY(cudaPeekAtLastError()); } -template +template void unpack_list_data( raft::resources const& res, - T* list_data, - uint32_t n_rows, - uint32_t dim, + device_mdspan::list_extents, row_major> + list_data, uint32_t veclen, - T* codes) + device_matrix_view codes) { - // uint32_t n_rows = codes.extent(0); - // uint32_t dim = codes.extent(1); + uint32_t n_rows = codes.extent(0); + uint32_t dim = codes.extent(1); static constexpr uint32_t kBlockSize = 256; dim3 blocks(div_rounding_up_safe(n_rows, kBlockSize), 1, 1); dim3 threads(kBlockSize, 1, 1); auto stream = resource::get_cuda_stream(res); unpack_interleaved_list_kernel<<>>( - codes, list_data, n_rows, dim, veclen); + list_data.data_handle(), codes.data_handle(), n_rows, dim, veclen); RAFT_CUDA_TRY(cudaPeekAtLastError()); } diff --git a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh index e757be9b8a..3f9d9325c4 100644 --- a/cpp/include/raft/neighbors/ivf_flat_helpers.cuh +++ b/cpp/include/raft/neighbors/ivf_flat_helpers.cuh @@ -16,7 +16,6 @@ #pragma once -#include #include #include @@ -31,38 +30,26 @@ namespace raft::neighbors::ivf_flat::helpers { namespace codepacker { -template +template void pack_full_list( - raft::resources const& handle, - T* codes, - uint32_t n_rows, - uint32_t dim, + raft::resources const& res, + device_matrix_view codes, uint32_t veclen, - T* list_data) + device_mdspan::list_extents, row_major> + list_data) { - raft::neighbors::ivf_flat::detail::pack_list_data(handle, codes, n_rows, dim, veclen, list_data); + raft::neighbors::ivf_flat::detail::pack_list_data(res, codes, veclen, list_data); } -// template -// void unpack_full_list( -// raft::resources const& res, -// device_mdspan::list_extents, row_major> -// list_data, -// uint32_t veclen, -// device_matrix_view codes) -// { -// raft::neighbors::ivf_flat::detail::unpack_list_data(res, list_data, veclen, codes); -// } template void unpack_full_list( raft::resources const& res, - T* list_data, - uint32_t n_rows, - uint32_t dim, + device_mdspan::list_extents, row_major> + list_data, uint32_t veclen, - T* codes) + device_matrix_view codes) { - raft::neighbors::ivf_flat::detail::unpack_list_data(res, list_data, n_rows, dim, veclen, codes); + raft::neighbors::ivf_flat::detail::unpack_list_data(res, list_data, veclen, codes); } } // namespace codepacker /** @} */ diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh index eaf0858c66..dccbedee98 100644 --- a/cpp/test/neighbors/ann_ivf_flat.cuh +++ b/cpp/test/neighbors/ann_ivf_flat.cuh @@ -22,10 +22,6 @@ #include "raft/core/logger-macros.hpp" #include "raft/core/mdspan.hpp" #include "raft/core/mdspan_types.hpp" -#include "raft/linalg/map.cuh" -#include "raft/neighbors/ivf_flat_types.hpp" -#include "raft/util/cudart_utils.hpp" -#include "raft/util/fast_int_div.cuh" #include #include @@ -37,7 +33,6 @@ #include #include #include -#include #include #include #include @@ -297,6 +292,8 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { auto index = ivf_flat::build(handle_, index_params, database_view); + // index index_2 = ivf_flat::extend(handle_, half_of_data_view, no_opt, idx); + raft::print_device_vector("list_sizes", index.list_sizes().data_handle(), ps.nlist, std::cout); RAFT_LOG_INFO("total db size %lld", ps.num_db_vecs); @@ -309,30 +306,21 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { update_host(inds_ptrs.data_handle(), index.inds_ptrs().data_handle(), index.n_lists(), stream_); resource::sync_stream(handle_); - auto list_device_spec = list_spec{static_cast(ps.dim), false}; + auto list_device_spec = list_spec{static_cast(ps.dim), false}; for (uint32_t label = 0; label < index.n_lists(); label++) { uint32_t list_size = list_sizes.data_handle()[label]; - T* list_data_ptr = data_ptrs.data_handle()[label]; + DataT* list_data_ptr = data_ptrs.data_handle()[label]; IdxT* list_inds_ptr = inds_ptrs.data_handle()[label]; auto exts = list_device_spec.make_list_extents(Pow2::roundUp(list_size)); - auto interleaved_data = make_device_mdarray(handle_, exts); + auto interleaved_data = make_device_mdarray(handle_, exts); RAFT_LOG_INFO("interleaved_codes_extent(0) %u", interleaved_data.extent(0)); RAFT_LOG_INFO("interleaved_codes_extent(1) %u", interleaved_data.extent(1)); // fetch the flat codes auto flat_codes = make_device_matrix(handle_, list_size, static_cast(ps.dim)); - // auto flat_codes_view = raft::make_device_matrix_view( - // (const T*)flat_codes.data_handle(), list_size, ps.dim); - - // flat_codes_from_list_indices(handle_, - // list_inds_ptr, - // list_data_ptr, - // list_size, - // static_cast(ps.dim), - // flat_codes.data_handle()); RAFT_LOG_INFO("static_cast %u", static_cast(ps.num_db_vecs)); RAFT_LOG_INFO("static_cast %u", static_cast(ps.dim)); @@ -342,14 +330,13 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { flat_codes.view()); // auto interleaved_codes_view = device_mdspan::list_extents, row_major>(interleaved_data.data_handle(), exts); - // auto interleaved_codes_view = raft::make_device_matrix_view(interleaved_data.data_handle(), list_size, (uint32_t)ps.dim); - detail::pack_list_data( + helpers::codepacker::pack_full_list( handle_, - flat_codes.data_handle(), - list_size, - (uint32_t)ps.dim, + make_const_mdspan(flat_codes.view()), index.veclen(), - interleaved_data.data_handle()); + interleaved_data.view()); + + resource::sync_stream(handle_); raft::print_device_vector("indices", list_inds_ptr, list_size, std::cout); raft::print_device_vector("flat_codes", flat_codes.data_handle(), list_size * ps.dim, std::cout); @@ -366,15 +353,13 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { raft::Compare(), stream_)); - auto unpacked_flat_codes = make_device_matrix(handle_, static_cast(list_size), ps.dim); + auto unpacked_flat_codes = make_device_matrix(handle_, list_size, static_cast(ps.dim)); - detail::unpack_list_data( + helpers::codepacker::unpack_full_list( handle_, - interleaved_data.data_handle(), - list_size, - (uint32_t)ps.dim, + interleaved_data.view(), index.veclen(), - unpacked_flat_codes.data_handle()); + unpacked_flat_codes.view()); ASSERT_TRUE(raft::devArrMatch(flat_codes.data_handle(), unpacked_flat_codes.data_handle(), From 9d742efe8296eaac1173c80506af853ccc4da1b5 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Fri, 21 Jul 2023 15:25:24 -0700 Subject: [PATCH 12/97] rename codepacking api --- cpp/include/raft/neighbors/detail/ivf_flat_build.cuh | 6 +++--- .../{ivf_flat_codepacker.cuh => ivf_flat_codepacker.hpp} | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) rename cpp/include/raft/neighbors/{ivf_flat_codepacker.cuh => ivf_flat_codepacker.hpp} (97%) diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh index 708f9ae549..f2959c5829 100644 --- a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include @@ -425,7 +425,7 @@ __global__ void pack_interleaved_list_kernel( { uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < n_rows) { - codepacker::pack_1_interleaved(codes + tid * dim, list_data, dim, veclen, tid); + codepacker::pack_1(codes + tid * dim, list_data, dim, veclen, tid); } } @@ -435,7 +435,7 @@ __global__ void unpack_interleaved_list_kernel( { uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < n_rows) { - codepacker::unpack_1_interleaved(list_data, codes + tid * dim, dim, veclen, tid); + codepacker::unpack_1(list_data, codes + tid * dim, dim, veclen, tid); } } diff --git a/cpp/include/raft/neighbors/ivf_flat_codepacker.cuh b/cpp/include/raft/neighbors/ivf_flat_codepacker.hpp similarity index 97% rename from cpp/include/raft/neighbors/ivf_flat_codepacker.cuh rename to cpp/include/raft/neighbors/ivf_flat_codepacker.hpp index 9b12761daf..430ca7d995 100644 --- a/cpp/include/raft/neighbors/ivf_flat_codepacker.cuh +++ b/cpp/include/raft/neighbors/ivf_flat_codepacker.hpp @@ -38,7 +38,7 @@ namespace raft::neighbors::ivf_flat::codepacker { * @param[in] offset how many records to skip before writing the data into the list */ template -__host__ __device__ void pack_1_interleaved( +_RAFT_HOST_DEVICE void pack_1( const T* flat_code, T* block, uint32_t dim, uint32_t veclen, uint32_t offset) { // The data is written in interleaved groups of `index::kGroupSize` vectors @@ -70,7 +70,7 @@ __host__ __device__ void pack_1_interleaved( * @param[in] offset fetch the flat code by the given offset */ template -__host__ __device__ void unpack_1_interleaved( +_RAFT_HOST_DEVICE void unpack_1( const T* block, T* flat_code, uint32_t dim, uint32_t veclen, uint32_t offset) { // The data is written in interleaved groups of `index::kGroupSize` vectors From d1ef8a16b5602dcb4f1e7df7a76c5e058cf57a48 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Thu, 27 Jul 2023 14:20:27 -0700 Subject: [PATCH 13/97] Updated gtest --- .../raft/neighbors/detail/ivf_flat_build.cuh | 10 + cpp/test/neighbors/ann_ivf_flat.cuh | 255 +++++++++++------- 2 files changed, 163 insertions(+), 102 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh index f2959c5829..6da3de9e6c 100644 --- a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh @@ -16,6 +16,7 @@ #pragma once +#include "raft/core/logger-macros.hpp" #include #include #include @@ -152,9 +153,15 @@ __global__ void build_index_kernel(const LabelT* labels, // NB: such `veclen` is selected, that `dim % veclen == 0` for (uint32_t l = 0; l < dim; l += veclen) { for (uint32_t j = 0; j < veclen; j++) { + if (list_id == 0) { + printf("l %u, j %u, dst_index %u, src_value %f\n", l, j, l * kIndexGroupSize + ingroup_id + j, (float)source_vecs[l + j]); + } list_data[l * kIndexGroupSize + ingroup_id + j] = source_vecs[l + j]; } } + if (list_id == 0) { + printf("list_id %u, inlist_id %u, group_offset %u, ingroup_id %u\n", list_id, inlist_id, group_offset, ingroup_id); + } } /** See raft::neighbors::ivf_flat::extend docs */ @@ -248,6 +255,7 @@ void extend(raft::resources const& handle, // Kernel to insert the new vectors const dim3 block_dim(256); const dim3 grid_dim(raft::ceildiv(n_rows, block_dim.x)); + build_index_kernel<<>>(new_labels.data_handle(), new_vectors, new_indices, @@ -449,6 +457,7 @@ raft::resources const& res, { uint32_t n_rows = codes.extent(0); uint32_t dim = codes.extent(1); + if (n_rows == 0 || dim == 0) return; static constexpr uint32_t kBlockSize = 256; dim3 blocks(div_rounding_up_safe(n_rows, kBlockSize), 1, 1); dim3 threads(kBlockSize, 1, 1); @@ -468,6 +477,7 @@ void unpack_list_data( { uint32_t n_rows = codes.extent(0); uint32_t dim = codes.extent(1); + if (n_rows == 0 || dim == 0) return; static constexpr uint32_t kBlockSize = 256; dim3 blocks(div_rounding_up_safe(n_rows, kBlockSize), 1, 1); dim3 threads(kBlockSize, 1, 1); diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh index dccbedee98..2702deef07 100644 --- a/cpp/test/neighbors/ann_ivf_flat.cuh +++ b/cpp/test/neighbors/ann_ivf_flat.cuh @@ -22,6 +22,10 @@ #include "raft/core/logger-macros.hpp" #include "raft/core/mdspan.hpp" #include "raft/core/mdspan_types.hpp" +#include "raft/linalg/map.cuh" +#include "raft/neighbors/detail/ivf_pq_build.cuh" +#include "raft/neighbors/ivf_flat_types.hpp" +#include "raft/neighbors/ivf_list.hpp" #include #include @@ -84,6 +88,21 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { { } + void construct_pad_mask(raft::device_matrix_view mask) { + uint32_t list_size = mask.extent(0); + uint32_t dim = mask.extent(1); + using interleaved_group = Pow2; + raft::linalg::map_offset(handle_, make_device_vector_view(mask.data_handle(), list_size * dim), [list_size, dim = util::FastIntDiv(dim)]__device__(uint32_t i) { + uint32_t row = i / dim; + uint32_t max_group_offset = interleaved_group::roundDown(list_size); + if (row < max_group_offset) { + return true; + } + uint32_t ingroup_id = interleaved_group::mod(row); + return ingroup_id < (list_size - max_group_offset); + }); +} + protected: void testIVFFlat() { @@ -283,91 +302,123 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { index_params.metric = ps.metric; index_params.adaptive_centers = false; - index_params.add_data_on_build = true; - index_params.kmeans_trainset_fraction = 0.5; + index_params.add_data_on_build = false; + index_params.kmeans_trainset_fraction = 1.0; index_params.metric_arg = 0; auto database_view = raft::make_device_matrix_view( (const DataT*)database.data(), ps.num_db_vecs, ps.dim); - auto index = ivf_flat::build(handle_, index_params, database_view); + auto idx = ivf_flat::build(handle_, index_params, database_view); - // index index_2 = ivf_flat::extend(handle_, half_of_data_view, no_opt, idx); + const std::optional> no_opt = std::nullopt; + index extend_index = ivf_flat::extend(handle_, database_view, no_opt, idx); - raft::print_device_vector("list_sizes", index.list_sizes().data_handle(), ps.nlist, std::cout); - RAFT_LOG_INFO("total db size %lld", ps.num_db_vecs); - - auto list_sizes = raft::make_host_vector(index.n_lists()); + auto list_sizes = raft::make_host_vector(idx.n_lists()); update_host( - list_sizes.data_handle(), index.list_sizes().data_handle(), index.n_lists(), stream_); - auto data_ptrs = raft::make_host_vector(index.n_lists()); - update_host(data_ptrs.data_handle(), index.data_ptrs().data_handle(), index.n_lists(), stream_); - auto inds_ptrs = raft::make_host_vector(index.n_lists()); - update_host(inds_ptrs.data_handle(), index.inds_ptrs().data_handle(), index.n_lists(), stream_); + list_sizes.data_handle(), idx.list_sizes().data_handle(), idx.n_lists(), stream_); + // auto data_ptrs = raft::make_host_vector(idx.n_lists()); + // update_host(data_ptrs.data_handle(), idx.data_ptrs().data_handle(), idx.n_lists(), stream_); + // auto inds_ptrs = raft::make_host_vector(idx.n_lists()); + // update_host(inds_ptrs.data_handle(), idx.inds_ptrs().data_handle(), idx.n_lists(), stream_); resource::sync_stream(handle_); + auto& lists = idx.lists(); + // conservative memory allocation for codepacking auto list_device_spec = list_spec{static_cast(ps.dim), false}; - for (uint32_t label = 0; label < index.n_lists(); label++) { + for (uint32_t label = 0; label < idx.n_lists(); label++) { uint32_t list_size = list_sizes.data_handle()[label]; - DataT* list_data_ptr = data_ptrs.data_handle()[label]; - IdxT* list_inds_ptr = inds_ptrs.data_handle()[label]; - - auto exts = list_device_spec.make_list_extents(Pow2::roundUp(list_size)); - auto interleaved_data = make_device_mdarray(handle_, exts); - RAFT_LOG_INFO("interleaved_codes_extent(0) %u", interleaved_data.extent(0)); - RAFT_LOG_INFO("interleaved_codes_extent(1) %u", interleaved_data.extent(1)); + ivf::resize_list(handle_, lists[label], list_device_spec, list_size, 0); + } + + idx.recompute_internal_state(handle_); + + using interleaved_group = Pow2; + + for (uint32_t label = 0; label < idx.n_lists(); label++) { + uint32_t list_size = list_sizes.data_handle()[label]; + uint32_t padded_list_size = interleaved_group::roundUp(list_size); + uint32_t n_elems = padded_list_size * static_cast(ps.dim); + auto list_data = lists[label]->data; + auto list_inds = lists[label]->indices; // fetch the flat codes auto flat_codes = make_device_matrix(handle_, list_size, static_cast(ps.dim)); - RAFT_LOG_INFO("static_cast %u", static_cast(ps.num_db_vecs)); - RAFT_LOG_INFO("static_cast %u", static_cast(ps.dim)); matrix::gather(handle_, make_device_matrix_view((const DataT*)database.data(), static_cast(ps.num_db_vecs), static_cast(ps.dim)), - make_device_vector_view((const IdxT*)list_inds_ptr, list_size), + make_device_vector_view((const IdxT*)list_inds.data_handle(), list_size), flat_codes.view()); - // auto interleaved_codes_view = device_mdspan::list_extents, row_major>(interleaved_data.data_handle(), exts); helpers::codepacker::pack_full_list( handle_, make_const_mdspan(flat_codes.view()), - index.veclen(), - interleaved_data.view()); + idx.veclen(), + list_data.view()); - resource::sync_stream(handle_); + { + auto mask = make_device_matrix(handle_,list_size, static_cast(ps.dim)); + + construct_pad_mask(mask.view()); + + auto packed_list_data = make_device_vector(handle_, n_elems); + + thrust::transform( + list_data.data_handle(), list_data.data_handle() + n_elems, + mask.data_handle(), + packed_list_data.data_handle(), + thrust::multiplies() + ); + auto extend_data = extend_index.lists()[label]->data; + auto extend_data_filtered = make_device_vector(handle_, n_elems); + thrust::transform( + extend_data.data_handle(), extend_data.data_handle() + n_elems, + mask.data_handle(), + extend_data_filtered.data_handle(), + thrust::multiplies() + ); + + ASSERT_TRUE(raft::devArrMatch(packed_list_data.data_handle(), + extend_data_filtered.data_handle(), + n_elems, + raft::Compare(), + stream_)); + } - raft::print_device_vector("indices", list_inds_ptr, list_size, std::cout); - raft::print_device_vector("flat_codes", flat_codes.data_handle(), list_size * ps.dim, std::cout); - raft::print_device_vector("interleaved_data", interleaved_data.data_handle(), Pow2::roundUp(list_size) * ps.dim, std::cout); - raft::print_device_vector("list_data", list_data_ptr, Pow2::roundUp(list_size) * ps.dim, std::cout); - auto inds = make_host_vector(handle_, list_size); - raft::update_host(inds.data_handle(), list_inds_ptr, list_size, stream_); - resource::sync_stream(handle_); - raft::print_device_vector("first_flat_code", database.data() + inds.data_handle()[0] * ps.dim, ps.dim, std::cout); + // raft::print_device_vector("indices", list_inds_ptr, list_size, std::cout); + // raft::print_device_vector("flat_codes", flat_codes.data_handle(), list_size * ps.dim, std::cout); + // raft::print_device_vector("interleaved_data", interleaved_data.data_handle(), Pow2::roundUp(list_size) * ps.dim, std::cout); + // auto database_data_host = raft::make_host_vector(handle_, (uint32_t)ps.num_db_vecs).data_handle(); + // auto list_data_host = raft::make_host_vector(handle_, Pow2::roundUp(list_size)).data_handle(); + // raft::update_host(database_data_host, database.data(), ps.num_db_vecs, stream_); + // raft::update_host(list_data_host, list_data_ptr, Pow2::roundUp(list_size), stream_); + // raft::resource::sync_stream(handle_); - ASSERT_TRUE(raft::devArrMatch(interleaved_data.data_handle(), - list_data_ptr, - Pow2::roundUp(list_size) * ps.dim, - raft::Compare(), - stream_)); + + // raft::print_device_vector("list_data", list_data_ptr, Pow2::roundUp(list_size) * ps.dim, std::cout); + // auto inds = make_host_vector(handle_, list_size); + // raft::update_host(inds.data_handle(), list_inds_ptr, list_size, stream_); + // resource::sync_stream(handle_); + // raft::print_device_vector("first_flat_code", database.data() + inds.data_handle()[0] * ps.dim, ps.dim, std::cout); auto unpacked_flat_codes = make_device_matrix(handle_, list_size, static_cast(ps.dim)); helpers::codepacker::unpack_full_list( handle_, - interleaved_data.view(), - index.veclen(), + list_data.view(), + idx.veclen(), unpacked_flat_codes.view()); ASSERT_TRUE(raft::devArrMatch(flat_codes.data_handle(), unpacked_flat_codes.data_handle(), list_size * ps.dim, - raft::Compare(), + raft::Compare(), stream_)); } - } + } + void SetUp() override { @@ -402,63 +453,63 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { const std::vector> inputs = { // test various dims (aligned and not aligned to vector sizes) - {1000, 10000, 1, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true}, - {1000, 10000, 2, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false}, - {1000, 10000, 3, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true}, - {1000, 10000, 4, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false}, - {1000, 10000, 5, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, false}, - {1000, 10000, 8, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, true}, - {1000, 10000, 5, 16, 40, 1024, raft::distance::DistanceType::L2SqrtExpanded, false}, - {1000, 10000, 8, 16, 40, 1024, raft::distance::DistanceType::L2SqrtExpanded, true}, + {100, 10000, 1, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true}}; + // {1000, 10000, 2, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false}, + // {1000, 10000, 3, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true}, + // {1000, 10000, 4, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false}, + // {1000, 10000, 5, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, false}, + // {1000, 10000, 8, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, true}, + // {1000, 10000, 5, 16, 40, 1024, raft::distance::DistanceType::L2SqrtExpanded, false}, + // {1000, 10000, 8, 16, 40, 1024, raft::distance::DistanceType::L2SqrtExpanded, true}, // test dims that do not fit into kernel shared memory limits - {1000, 10000, 2048, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false}, - {1000, 10000, 2049, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false}, - {1000, 10000, 2050, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, false}, - {1000, 10000, 2051, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, true}, - {1000, 10000, 2052, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, false}, - {1000, 10000, 2053, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true}, - {1000, 10000, 2056, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true}, - - // various random combinations - {1000, 10000, 16, 10, 40, 1024, raft::distance::DistanceType::L2Expanded, false}, - {1000, 10000, 16, 10, 50, 1024, raft::distance::DistanceType::L2Expanded, false}, - {1000, 10000, 16, 10, 70, 1024, raft::distance::DistanceType::L2Expanded, false}, - {100, 10000, 16, 10, 20, 512, raft::distance::DistanceType::L2Expanded, false}, - {20, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::L2Expanded, true}, - {1000, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::L2Expanded, true}, - {10000, 131072, 8, 10, 20, 1024, raft::distance::DistanceType::L2Expanded, false}, - - {1000, 10000, 16, 10, 40, 1024, raft::distance::DistanceType::InnerProduct, true}, - {1000, 10000, 16, 10, 50, 1024, raft::distance::DistanceType::InnerProduct, true}, - {1000, 10000, 16, 10, 70, 1024, raft::distance::DistanceType::InnerProduct, false}, - {100, 10000, 16, 10, 20, 512, raft::distance::DistanceType::InnerProduct, true}, - {20, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::InnerProduct, true}, - {1000, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::InnerProduct, false}, - {10000, 131072, 8, 10, 50, 1024, raft::distance::DistanceType::InnerProduct, true}, - - {1000, 10000, 4096, 20, 50, 1024, raft::distance::DistanceType::InnerProduct, false}, - - // test splitting the big query batches (> max gridDim.y) into smaller batches - {100000, 1024, 32, 10, 64, 64, raft::distance::DistanceType::InnerProduct, false}, - {1000000, 1024, 32, 10, 256, 256, raft::distance::DistanceType::InnerProduct, false}, - {98306, 1024, 32, 10, 64, 64, raft::distance::DistanceType::InnerProduct, true}, - - // test radix_sort for getting the cluster selection - {1000, - 10000, - 16, - 10, - raft::matrix::detail::select::warpsort::kMaxCapacity * 2, - raft::matrix::detail::select::warpsort::kMaxCapacity * 4, - raft::distance::DistanceType::L2Expanded, - false}, - {1000, - 10000, - 16, - 10, - raft::matrix::detail::select::warpsort::kMaxCapacity * 4, - raft::matrix::detail::select::warpsort::kMaxCapacity * 4, - raft::distance::DistanceType::InnerProduct, - false}}; + // {1000, 10000, 2048, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false}, + // {1000, 10000, 2049, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false}, + // {1000, 10000, 2050, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, false}, + // {1000, 10000, 2051, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, true}, + // {1000, 10000, 2052, 16, 40, 1024, raft::distance::DistanceType::InnerProduct, false}, + // {1000, 10000, 2053, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true}, + // {1000, 10000, 2056, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true}, + + // // various random combinations + // {1000, 10000, 16, 10, 40, 1024, raft::distance::DistanceType::L2Expanded, false}, + // {1000, 10000, 16, 10, 50, 1024, raft::distance::DistanceType::L2Expanded, false}, + // {1000, 10000, 16, 10, 70, 1024, raft::distance::DistanceType::L2Expanded, false}, + // {100, 10000, 16, 10, 20, 512, raft::distance::DistanceType::L2Expanded, false}, + // {20, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::L2Expanded, true}, + // {1000, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::L2Expanded, true}, + // {10000, 131072, 8, 10, 20, 1024, raft::distance::DistanceType::L2Expanded, false}, + + // {1000, 10000, 16, 10, 40, 1024, raft::distance::DistanceType::InnerProduct, true}, + // {1000, 10000, 16, 10, 50, 1024, raft::distance::DistanceType::InnerProduct, true}, + // {1000, 10000, 16, 10, 70, 1024, raft::distance::DistanceType::InnerProduct, false}, + // {100, 10000, 16, 10, 20, 512, raft::distance::DistanceType::InnerProduct, true}, + // {20, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::InnerProduct, true}, + // {1000, 100000, 16, 10, 20, 1024, raft::distance::DistanceType::InnerProduct, false}, + // {10000, 131072, 8, 10, 50, 1024, raft::distance::DistanceType::InnerProduct, true}, + + // {1000, 10000, 4096, 20, 50, 1024, raft::distance::DistanceType::InnerProduct, false}, + + // // test splitting the big query batches (> max gridDim.y) into smaller batches + // {100000, 1024, 32, 10, 64, 64, raft::distance::DistanceType::InnerProduct, false}, + // {1000000, 1024, 32, 10, 256, 256, raft::distance::DistanceType::InnerProduct, false}, + // {98306, 1024, 32, 10, 64, 64, raft::distance::DistanceType::InnerProduct, true}, + + // // test radix_sort for getting the cluster selection + // {1000, + // 10000, + // 16, + // 10, + // raft::matrix::detail::select::warpsort::kMaxCapacity * 2, + // raft::matrix::detail::select::warpsort::kMaxCapacity * 4, + // raft::distance::DistanceType::L2Expanded, + // false}, + // {1000, + // 10000, + // 16, + // 10, + // raft::matrix::detail::select::warpsort::kMaxCapacity * 4, + // raft::matrix::detail::select::warpsort::kMaxCapacity * 4, + // raft::distance::DistanceType::InnerProduct, + // false}}; } // namespace raft::neighbors::ivf_flat From 4ee99e3ef16ccac3d08212fe165620c2422f4de5 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Thu, 27 Jul 2023 15:21:24 -0700 Subject: [PATCH 14/97] updates --- cpp/include/raft/neighbors/detail/ivf_flat_build.cuh | 9 --------- cpp/test/neighbors/ann_ivf_flat.cuh | 7 +------ 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh index 6da3de9e6c..afb7c461e5 100644 --- a/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_flat_build.cuh @@ -16,7 +16,6 @@ #pragma once -#include "raft/core/logger-macros.hpp" #include #include #include @@ -33,7 +32,6 @@ #include #include #include -#include #include #include @@ -153,15 +151,9 @@ __global__ void build_index_kernel(const LabelT* labels, // NB: such `veclen` is selected, that `dim % veclen == 0` for (uint32_t l = 0; l < dim; l += veclen) { for (uint32_t j = 0; j < veclen; j++) { - if (list_id == 0) { - printf("l %u, j %u, dst_index %u, src_value %f\n", l, j, l * kIndexGroupSize + ingroup_id + j, (float)source_vecs[l + j]); - } list_data[l * kIndexGroupSize + ingroup_id + j] = source_vecs[l + j]; } } - if (list_id == 0) { - printf("list_id %u, inlist_id %u, group_offset %u, ingroup_id %u\n", list_id, inlist_id, group_offset, ingroup_id); - } } /** See raft::neighbors::ivf_flat::extend docs */ @@ -255,7 +247,6 @@ void extend(raft::resources const& handle, // Kernel to insert the new vectors const dim3 block_dim(256); const dim3 grid_dim(raft::ceildiv(n_rows, block_dim.x)); - build_index_kernel<<>>(new_labels.data_handle(), new_vectors, new_indices, diff --git a/cpp/test/neighbors/ann_ivf_flat.cuh b/cpp/test/neighbors/ann_ivf_flat.cuh index 2702deef07..289cd5960a 100644 --- a/cpp/test/neighbors/ann_ivf_flat.cuh +++ b/cpp/test/neighbors/ann_ivf_flat.cuh @@ -23,7 +23,6 @@ #include "raft/core/mdspan.hpp" #include "raft/core/mdspan_types.hpp" #include "raft/linalg/map.cuh" -#include "raft/neighbors/detail/ivf_pq_build.cuh" #include "raft/neighbors/ivf_flat_types.hpp" #include "raft/neighbors/ivf_list.hpp" #include @@ -154,8 +153,6 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { ps.num_db_vecs, ps.dim); - RAFT_LOG_INFO("Index build successfully"); - resource::sync_stream(handle_); approx_knn_search(handle_, distances_ivfflat_dev.data(), @@ -165,8 +162,6 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { search_queries.data(), ps.num_queries); - RAFT_LOG_INFO("search successful"); - update_host(distances_ivfflat.data(), distances_ivfflat_dev.data(), queries_size, stream_); update_host(indices_ivfflat.data(), indices_ivfflat_dev.data(), queries_size, stream_); resource::sync_stream(handle_); @@ -453,7 +448,7 @@ class AnnIVFFlatTest : public ::testing::TestWithParam> { const std::vector> inputs = { // test various dims (aligned and not aligned to vector sizes) - {100, 10000, 1, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true}}; + {1000, 10000, 1, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true}}; // {1000, 10000, 2, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false}, // {1000, 10000, 3, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, true}, // {1000, 10000, 4, 16, 40, 1024, raft::distance::DistanceType::L2Expanded, false}, From 22f4f801deea6d4f47c6ef9f4f734e3de3afcea0 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Fri, 28 Jul 2023 09:20:42 -0700 Subject: [PATCH 15/97] update testing --- cpp/include/raft/neighbors/ivf_list.hpp | 6 + cpp/include/raft/neighbors/ivf_list_types.hpp | 9 + cpp/test/neighbors/ann_ivf_flat.cuh | 290 ++++++++++-------- 3 files changed, 180 insertions(+), 125 deletions(-) diff --git a/cpp/include/raft/neighbors/ivf_list.hpp b/cpp/include/raft/neighbors/ivf_list.hpp index ad06a3ee71..de7d84da14 100644 --- a/cpp/include/raft/neighbors/ivf_list.hpp +++ b/cpp/include/raft/neighbors/ivf_list.hpp @@ -67,6 +67,12 @@ list::list(raft::resources const& res, indices.data_handle(), indices.size(), ivf::kInvalidRecord); + + // Fill the data buffer with a pre-defined marker for easier debugging + thrust::fill_n(resource::get_thrust_policy(res), + data.data_handle(), + data.size(), + ivf::kPadElem); } /** diff --git a/cpp/include/raft/neighbors/ivf_list_types.hpp b/cpp/include/raft/neighbors/ivf_list_types.hpp index 6317825201..9cbb36acda 100644 --- a/cpp/include/raft/neighbors/ivf_list_types.hpp +++ b/cpp/include/raft/neighbors/ivf_list_types.hpp @@ -34,6 +34,15 @@ template constexpr static IdxT kInvalidRecord = (std::is_signed_v ? IdxT{0} : std::numeric_limits::max()) - 1; +/** + * Default value filled in the `data` array. + * One may encounter it trying to access a record within a list that is outside of the + * `size` bound or whenever the list is allocated but not filled-in yet. + */ +template +constexpr static T kPadElem = + (std::is_signed_v ? T{0} : std::numeric_limits::max()) - T{1}; + /** The data for a single IVF list. */ template